/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

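/* Algorithm outline:
   1. Unless the first load would cross a page boundary, compare the
      first VEC_SIZE bytes against zero and return on a match.
   2. Otherwise check the next four vectors one at a time, then align
      the pointer to VEC_SIZE * 4 and enter the main loop.
   3. The main loop checks VEC_SIZE * 4 bytes per iteration, using
      VPMINU to fold pairs of vectors before testing for a zero char.
   For strnlen/wcsnlen the remaining length is checked at each stage.  */
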
	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null byte.  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length < CHAR_PER_VEC handle it specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

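	/* first_vec_x{1,2,3,4}: a null was found in the 1st..4th vector
	   checked after the first one.  The result is the character offset
	   from the original pointer saved in rdx (strlen/wcslen), or is
	   derived from the rcx adjustment computed at cross_page_continue
	   (strnlen/wcsnlen).  */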
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
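	/* rdx keeps the original pointer so the final character count can
	   be computed as an offset from it.  */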
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	test	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

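	/* Advance rdi by VEC_SIZE before it is rounded down to
	   VEC_SIZE * 4 below; the loop's first load at
	   (VEC_SIZE * 4)(%rdi) then never skips past the first unchecked
	   vector.  */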
	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if at last CHAR_PER_VEC * 4 characters.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4.  */
	andq	$-(VEC_SIZE * 4), %rdi

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.  Since
	   the matches in ymm2/ymm4 can only be returned if there were no
	   matches in ymm1/ymm3 respectively there is no issue with
	   overlap.  */
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4

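	/* A null char in YMM1/YMM3 also produces a zero in the VPMINU
	   results, so testing YMM2 and YMM4 against zero covers all four
	   vectors.  */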
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd	%k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half.  */
	kmovd	%k0, %eax
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
# ifdef USE_AS_WCSLEN
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

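	/* strnlen/wcsnlen only: handle the case where the length limit
	   ends before the next 4x VEC iteration (or before the loop could
	   be entered at all).  */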
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* If remaining length > CHAR_PER_VEC * 2.  This works even if esi
	   is off by CHAR_PER_VEC * 4.  */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by CHAR_PER_VEC * 4 in either direction
	   depending on where this was called from.  Masking fixes that.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data.  */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding.  */
	.p2align 4
L(second_vec_return):
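	/* eax was zero: no null in YMM1/YMM2, so the first null char is in
	   YMM3 or YMM4.  */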
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
# ifdef USE_AS_WCSLEN
	kunpckbw	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	kunpckdq	%k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data.  */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq	%r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
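	/* Align the pointer down to VEC_SIZE (the load then cannot cross
	   the page), compare the whole vector, and shift out the match
	   bits that precede the original pointer.  */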
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes.  */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
	sarxl	%SHIFT_REG, %eax, %eax
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
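	/* No null in the rest of this vector; if the length also extends
	   past it, continue on the aligned path, otherwise the whole
	   length is the result.  */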
#  ifndef USE_AS_WCSLEN
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null.  */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
#endif