/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
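
/* Overview: the first VEC_SIZE bytes are checked with a single
   compare, taking a cold path if that load would cross a page
   boundary.  If no null is found, up to four more single-VEC checks
   bring the pointer to 4 * VEC_SIZE alignment, after which the main
   loop scans 4 * VEC_SIZE bytes per iteration, folding the four
   vectors with VPMINU so that one compare and movemask per iteration
   detects a null anywhere in the block.  For strnlen/wcsnlen the
   remaining length is tracked in rsi and the maximum length is kept
   in r8.  */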

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If no null was found continue to aligned_more.  Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to get the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to get the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to get the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to get the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
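	/* rdi now points at the last byte of the VEC containing the
	   start of the string, so the loads below use a +1 displacement
	   to reach the next aligned VEC.  */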
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
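	/* An unsigned minimum is zero in a lane only if one of its
	   inputs is zero there, so folding the 4 VEC with VPMINU lets a
	   single compare and movemask detect a null anywhere in the
	   4 * VEC_SIZE bytes.  */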

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

	subq	$-(VEC_SIZE * 4), %rdi
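	/* NB: the subq above adds VEC_SIZE * 4 by subtracting -128,
	   which fits in a sign-extended 8-bit immediate, whereas
	   addq $(VEC_SIZE * 4) would need a 32-bit immediate.  */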
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It only decides
	   the result if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
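	/* Bits 0-31 now hold the VEC 3 mask and bits 32-63 the combined
	   mask; the high half only decides the result when VEC 3 has no
	   null, in which case its set bits come from VEC 4, so tzcntq
	   yields the null's byte offset from the start of VEC 3.  */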
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* Check if remaining length > VEC_SIZE * 2.  This works even if
	   esi is off by VEC_SIZE * 4.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4, in either direction,
	   depending on where this was called from.  Masking corrects
	   for that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64-bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64-bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
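	/* The load below starts at the VEC_SIZE-aligned address
	   containing the start of the string, so it stays within the
	   page and cannot fault even though it reads bytes before the
	   string.  */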
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [5:0] of COUNT
	   so no need to manually mod edx.  */
	sarxl	%edx, %eax, %eax
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check the length: if the chars just checked already cover it,
	   no null was found within the length, so return the max.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide by 4 to convert from byte-count to length.  */
	shrl	$2, %eax
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif