/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
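
/* A rough sketch of the structure below (illustrative only; the
   labelled cases differ in detail and the strnlen/wcsnlen length
   handling is interleaved throughout):

     1. If the first VEC_SIZE load cannot cross a page, compare it
	against zero and return on a match.
     2. Otherwise (cross_page_boundary) align the load down to
	VEC_SIZE, compare, and shift out the bits for CHARs before
	the start of the string.
     3. Check the next 4 VECs one at a time, then align to
	VEC_SIZE * 4 and check 4 VECs per iteration (loop_4x_vec)
	until a null CHAR is found.

   The result is equivalent to the scalar loop below (an illustrative
   C sketch only, not part of the implementation; CHAR stands for
   char in strlen and wchar_t in wcslen, and maxlen is SIZE_MAX for
   the non-n variants):

	static size_t
	scalar_len (const CHAR *s, size_t maxlen)
	{
	  size_t i = 0;
	  while (i < maxlen && s[i] != 0)
	    i++;
	  return i;
	}
 */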

	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null CHAR (byte for strlen, wchar_t for wcslen).  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
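	/* For example, with rsi = 5: if the first null CHAR is at index
	   3 the tzcnt below returns 3; if there is no null in the first
	   5 CHARs, bit 5 (set here) is the lowest set bit and tzcnt
	   returns 5, i.e. the maximum length.  */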
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx (computed earlier) to get the correct return value.  */
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx (computed earlier) to get the correct return value.  */
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx (computed earlier) to get the correct return value.  */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx (computed earlier) to get the correct return value.  */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
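	/* rcx now holds the number of CHARs from the original string
	   start (rdx) up to VEC_SIZE * 5 + CHAR_SIZE past the aligned
	   pointer.  It is used below to detect whether the length bound
	   falls within the next 4 VECs and by first_vec_x{1..4} to
	   recover the length.  */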
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near the end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if within the last CHAR_PER_VEC * 4 characters.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4.  */
	andq	$-(VEC_SIZE * 4), %rdi
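	/* The first loop load below (at VEC_SIZE * 4 above the newly
	   aligned rdi) may overlap CHARs that were already checked;
	   that is harmless since they contain no null CHAR.  */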

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
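	/* An unsigned min against zero stays zero, so a null CHAR in
	   either of the first two VECs leaves a zero CHAR in YMM2 and a
	   null in either of the last two leaves one in YMM4.  The two
	   compares plus kortest below therefore cover all four VECs.  */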

	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd %k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half.  */
	kmovd	%k0, %eax
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
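	/* NB: Divide bytes by 4 to get the wchar_t count.  */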
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
# ifdef USE_AS_WCSLEN
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
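	/* rdi now holds the CHAR offset of YMM1 (the first VEC of this
	   iteration) from the start of the string, so adding the
	   combined match index yields the length.  */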
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on the entry point, adjust rdi and/or prepare the
	   first VEC in YMM1.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* Branch to last_4x_vec if the remaining length is at least
	   CHAR_PER_VEC * 2.  This works even if esi is off by a
	   multiple of CHAR_PER_VEC * 4, since only the CHAR_PER_VEC * 2
	   bit is tested.  */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* Depending on where this was called from, the length may be
	   off by CHAR_PER_VEC * 4 in either direction.  The mask fixes
	   that.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data.  */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding.  */
	.p2align 4
L(second_vec_return):
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
# ifdef USE_AS_WCSLEN
	kunpckbw %k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	kunpckdq %k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
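	/* kunpck placed the YMM3 matches in the low half of the mask
	   and the YMM4 matches in the high half, so rax is the CHAR
	   index within the YMM3/YMM4 window.  rdi holds the CHAR offset
	   of YMM1, hence the CHAR_PER_VEC * 2 adjustment below.  */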
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data.  */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq	%r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes.  */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
	sarxl	%SHIFT_REG, %eax, %eax
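	/* The low bits of eax are now the match bits for the CHARs from
	   the start of the string to the end of the aligned VEC.  */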
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null.  */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
#endif