/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
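/* CHAR_PER_VEC is the number of characters per 32-byte vector: 32 for
   the byte variants and 8 for the wchar_t variants.  */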

	.section SECTION(.text),"ax",@progbits
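/* Register conventions: %rdi holds the current string pointer, %rdx a
   copy of the original pointer used to compute the final length, and
   %ymm0 an all-zero vector for the NUL comparisons.  For the strnlen
   variants, %rsi tracks the remaining length and %r8 preserves the
   caller's maximum.  */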
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
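	/* Zeroing %xmm0 with a VEX-encoded vpxor also zeroes the full
	   %ymm0; every VPCMPEQ below compares against this all-zero
	   vector to find NUL characters.  */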
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
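	/* E.g. eax = 4070: since 4070 > PAGE_SIZE - VEC_SIZE = 4064, a
	   32-byte load at this page offset could touch the next page, so
	   the slow path is taken.  */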

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC, handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If no NUL was found, continue to aligned_more.  Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
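	/* E.g. (byte variant) max len 5 with the first NUL at byte 9:
	   btsq sets bit 5, so the tzcnt below returns 5, i.e. the min of
	   max len and NUL position.  */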
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
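	/* rdi now points at the last byte of the VEC_SIZE-aligned block
	   containing the start of the string, so the checks below load
	   from 1(%rdi), the first aligned VEC past the start.  */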
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
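	/* A lane of VPMINU is zero iff that lane is zero in at least one
	   input, so comparing ymm5 (the min of all four VEC) against
	   zero detects a NUL anywhere in this 4 * VEC_SIZE window.  */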

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

	subq	$-(VEC_SIZE * 4), %rdi
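	/* NB: subq of -(VEC_SIZE * 4) fits in a sign-extended imm8,
	   whereas addq of VEC_SIZE * 4 = 128 would need an imm32.  */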
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)

	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It is only used
	   if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
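	/* rax now has the VEC 3 matches in its low 32 bits and the
	   combined matches in its high 32 bits, so tzcntq finds the
	   first NUL in the last 2 * VEC_SIZE bytes of the window.  */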
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* Check if the remaining length is > VEC_SIZE * 2.  This works
	   even if esi is off by VEC_SIZE * 4 since that does not affect
	   the tested bit.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from.  This normalizes it.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [5:0] of COUNT
	   so no need to manually mod rdx.  */
	sarxl	%edx, %eax, %eax
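	/* The aligned load above begins at most VEC_SIZE - 1 bytes
	   before the string and thus cannot cross the page; sarxl shifts
	   out the mask bits for the bytes preceding the start (the low
	   bits of edx).  */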
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide by 4 to convert from byte-count to length.  */
	shrl	$2, %eax
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif