1/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
#include <isa-level.h>

/* Only build when the configured minimum ISA level selects the EVEX
   (level 4) implementation set.  */
#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

/* Default exported symbol.  Wrapper files (strnlen/wcslen/wcsnlen)
   #define STRLEN to another name before including this file.  */
# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
/* wchar_t variant: compare/min operate on 32-bit elements, so each bit
   in a mask register represents one 4-byte character.  */
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
/* Byte-string variant: each mask bit represents one byte.  */
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

/* Only EVEX-encodable registers xmm16/ymm16 and up are used; no
   vzeroupper appears anywhere in this file (registers >= 16 are
   presumably outside the AVX upper state that vzeroupper clears).  */
# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE	32
# define PAGE_SIZE	4096
/* Characters per 32-byte vector: 32 for byte strings, 8 for wchar_t.  */
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
55
	.section .text.evex,"ax",@progbits
/*----------------------------------------------------------------------
   size_t STRLEN (const char *s)		       strlen / wcslen
   size_t STRLEN (const char *s, size_t maxlen)       strnlen / wcsnlen
						      (USE_AS_STRNLEN)
   ABI: SysV AMD64.  In: rdi = s; rsi = maxlen (USE_AS_STRNLEN only).
   Out: rax = length in characters (bytes, or wchar_t units when
	USE_AS_WCSLEN).

   Register roles inside the function:
     rdi  current (vector-aligned) read position
     rdx  copy of the original s, used to turn a position back into a
	  length (set at L(aligned_more) and L(cross_page_boundary));
	  temporarily reused for a mask value after the 4x loop exits
     rsi  remaining length in characters (strnlen paths only)
     r8	  saved maxlen, returned from L(max)/L(max_end) when no null
	  is found within range
     rcx  scratch: strnlen length adjustment / page-cross shift count
   ----------------------------------------------------------------------*/
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length. */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits. */
	movl	%esi, %esi
#  endif
	/* Save maxlen; it is the return value when no null terminator
	   lies within the first maxlen characters. */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi. Only keeping bits relevant to page
	   cross check. */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load. */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
	   null byte. */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length < CHAR_PER_VEC handle special. */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	/* Null found in the first vector: its bit index is the length. */
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  (rsi <= CHAR_PER_VEC <= 32 here,
	   so the bit lands in the low dword tzcntl examines.)  */
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	/* edx = original s; edi - edx = aligned byte offset. */
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	/* Keep the original pointer so positions can be converted back
	   to lengths later. */
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE. */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less.  rcx = chars consumed by alignment plus
	   the upcoming 4-vector probe; also reused by first_vec_x[1-4]. */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless. */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length. If near end handle specially.  jb consumes CF
	   from the subq: taken when rsi < rcx. */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	test	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if at last VEC_SIZE * 4 length. */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl	$2, %ecx
#  endif
	/* Readjust length: credit back the chars skipped by the
	   VEC_SIZE*4 alignment below. */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4. */
	andq	$-(VEC_SIZE * 4), %rdi

	/* Compare 4 * VEC at a time forward. */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless. */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length. */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load. Since
	   the matches in ymm2/ymm4 can only be returned if there were no
	   matches in ymm1/ymm3 respectively there is no issue with overlap.
	 */
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4

	/* k0/k1 flag zero chars in the min of vectors 4+5 / 6+7. */
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
	/* subq of -(VEC_SIZE * 4): -128 fits in a sign-extended imm8
	   while +128 would need an imm32, so this encodes smaller than
	   the equivalent addq. */
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd %k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half. */
	kmovd	%k0, %eax
	/* rdi becomes the byte offset of the current 4-vector group from
	   the start of the string. */
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	/* Null is in VEC1 or VEC2.  rdx (original pointer) is dead now
	   and is reused for the VEC1 match mask. */
	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax). */
# ifdef USE_AS_WCSLEN
	/* CHAR_PER_VEC == 8 mask bits per vector: both fit in 32 bits. */
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	/* CHAR_PER_VEC == 32 mask bits per vector: need 64-bit ops. */
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in YMM1. */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
	   VEC_SIZE * 4. */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* length may have been negative or positive by an offset of
	   CHAR_PER_VEC * 4 depending on where this was called from. This
	   fixes that. */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data. */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data. */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	/* No null within maxlen characters: return saved maxlen. */
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding. */
	.p2align 4
L(second_vec_return):
	/* Null is in VEC3 or VEC4 (k1 still holds VEC3/VEC4-min
	   matches). */
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1). */
# ifdef USE_AS_WCSLEN
	/* 8 mask bits each -> concatenate into 16 bits. */
	kunpckbw %k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	/* 32 mask bits each -> concatenate into 64 bits. */
	kunpckdq %k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
	/* rdi already holds the char offset of VEC1; VEC3 starts
	   2 * CHAR_PER_VEC further in. */
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data. */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally. */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length. */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data. */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data. */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data. */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	/* No null within maxlen characters: return saved maxlen. */
	movq	%r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load. */
	.p2align 4
L(cross_page_boundary):
	/* Keep the original pointer in rdx for length computations and
	   for the shift count below. */
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE. */
	andq	$-VEC_SIZE, %rdi
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes. */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represent 4
	   bytes. */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  sarx
	   uses only the low 5 bits of a 32-bit shift count, so edx needs
	   no explicit masking in the byte case. */
	sarxl	%SHIFT_REG, %eax, %eax
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	/* ecx = misalignment in chars (WCSLEN computed it above). */
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	/* eax = chars left in this vector after the misaligned start. */
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
	/* Check the end of data. */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	/* maxlen ends inside this vector and no null precedes it. */
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null. */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
491#endif
492