1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. |
2 | Copyright (C) 2018-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (3) |
22 | |
23 | # ifndef STRCMP_ISA |
24 | # define STRCMP_ISA _avx2 |
25 | # endif |
26 | |
27 | # include "strcmp-naming.h" |
28 | |
29 | # include <sysdep.h> |
30 | |
31 | # if defined USE_AS_STRCASECMP_L |
32 | # include "locale-defines.h" |
33 | # endif |
34 | |
35 | # ifndef STRCMP |
36 | # define STRCMP __strcmp_avx2 |
37 | # endif |
38 | |
39 | # define PAGE_SIZE 4096 |
40 | |
41 | /* VEC_SIZE = Number of bytes in a ymm register. */ |
42 | # define VEC_SIZE 32 |
43 | |
44 | # define VMOVU vmovdqu |
45 | # define VMOVA vmovdqa |
46 | |
47 | # ifdef USE_AS_WCSCMP |
48 | /* Compare packed dwords. */ |
49 | # define VPCMPEQ vpcmpeqd |
50 | /* Compare packed dwords and store minimum. */ |
51 | # define VPMINU vpminud |
52 | /* 1 dword char == 4 bytes. */ |
53 | # define SIZE_OF_CHAR 4 |
54 | # else |
55 | /* Compare packed bytes. */ |
56 | # define VPCMPEQ vpcmpeqb |
57 | /* Compare packed bytes and store minimum. */ |
58 | # define VPMINU vpminub |
59 | /* 1 byte char == 1 byte. */ |
60 | # define SIZE_OF_CHAR 1 |
61 | # endif |
62 | |
63 | # ifdef USE_AS_STRNCMP |
64 | # define LOOP_REG r9d |
65 | # define LOOP_REG64 r9 |
66 | |
67 | # define OFFSET_REG8 r9b |
68 | # define OFFSET_REG r9d |
69 | # define OFFSET_REG64 r9 |
70 | # else |
71 | # define LOOP_REG edx |
72 | # define LOOP_REG64 rdx |
73 | |
74 | # define OFFSET_REG8 dl |
75 | # define OFFSET_REG edx |
76 | # define OFFSET_REG64 rdx |
77 | # endif |
78 | |
79 | # ifndef VZEROUPPER |
80 | # define VZEROUPPER vzeroupper |
81 | # endif |
82 | |
83 | # if defined USE_AS_STRNCMP |
84 | # define VEC_OFFSET 0 |
85 | # else |
86 | # define VEC_OFFSET (-VEC_SIZE) |
87 | # endif |
88 | |
89 | # ifdef USE_AS_STRCASECMP_L |
90 | # define BYTE_LOOP_REG OFFSET_REG |
91 | # else |
92 | # define BYTE_LOOP_REG ecx |
93 | # endif |
94 | |
95 | # ifdef USE_AS_STRCASECMP_L |
96 | # ifdef USE_AS_STRNCMP |
97 | # define LOCALE_REG rcx |
98 | # define LOCALE_REG_LP RCX_LP |
99 | # else |
100 | # define LOCALE_REG rdx |
101 | # define LOCALE_REG_LP RDX_LP |
102 | # endif |
103 | # endif |
104 | |
105 | # define xmmZERO xmm15 |
106 | # define ymmZERO ymm15 |
107 | |
108 | # define LCASE_MIN_ymm %ymm10 |
109 | # define LCASE_MAX_ymm %ymm11 |
110 | # define CASE_ADD_ymm %ymm12 |
111 | |
112 | # define LCASE_MIN_xmm %xmm10 |
113 | # define LCASE_MAX_xmm %xmm11 |
114 | # define CASE_ADD_xmm %xmm12 |
115 | |
/* r11 is never used elsewhere so this is safe to maintain.  */
117 | # define TOLOWER_BASE %r11 |
118 | |
119 | # ifndef SECTION |
120 | # define SECTION(p) p##.avx |
121 | # endif |
122 | |
123 | # ifdef USE_AS_STRCASECMP_L |
124 | # define REG(x, y) x ## y |
125 | # define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ |
126 | vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ |
127 | vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ |
128 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ |
129 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ |
130 | vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ |
131 | vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ |
132 | vpaddb REG(%ext, 8), reg1_in, reg1_out; \ |
133 | vpaddb REG(%ext, 9), reg2_in, reg2_out |
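/* Illustrative only, not part of the build: a rough scalar C sketch of
   the range-based tolower done by the TOLOWER macro above, for the byte
   (non-wide) case.  Variable names here are hypothetical.

       int8_t  biased    = (int8_t) (c + 0x3f);       -- vpaddb  LCASE_MIN
       int     not_upper = biased > (int8_t) 0x99;    -- vpcmpgtb LCASE_MAX
       uint8_t add       = not_upper ? 0 : 0x20;      -- vpandn  CASE_ADD
       uint8_t lower     = (uint8_t) (c + add);       -- vpaddb

   Only 'A'..'Z' (0x41..0x5a) land in the signed range [0x80, 0x99] after
   the 0x3f bias, so only they receive the +0x20 adjustment.  */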
134 | |
135 | # define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst |
136 | # define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) |
137 | # define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) |
138 | |
139 | # define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ |
140 | TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ |
141 | VPCMPEQ scratch_reg, s2_reg, reg_out |
142 | |
143 | # define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ |
144 | VMOVU s2_mem, reg_out; \ |
145 | CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) |
146 | |
147 | # define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) |
148 | # define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) |
149 | |
150 | # define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) |
151 | # define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) |
152 | |
153 | # else |
154 | # define TOLOWER_gpr(...) |
155 | # define TOLOWER_ymm(...) |
156 | # define TOLOWER_xmm(...) |
157 | |
158 | # define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ |
159 | VPCMPEQ s2_reg, s1_reg, reg_out |
160 | |
161 | # define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) |
162 | |
163 | # define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) |
164 | # define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) |
165 | # endif |
166 | |
167 | /* Warning! |
168 | wcscmp/wcsncmp have to use SIGNED comparison for elements. |
169 | strcmp/strncmp have to use UNSIGNED comparison for elements. |
170 | */ |
171 | |
/* The main idea of the string comparison (byte or dword) using AVX2
   is to compare (VPCMPEQ) two ymm vectors.  The comparison operates on
   either packed bytes or packed dwords depending on USE_AS_WCSCMP.  In
   order to check for the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instruction, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) per iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */
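/* As an illustration only (C-like sketch, not assembled; helper names
   such as cmpeq_bytes and movemask are hypothetical stand-ins for
   VPCMPEQ/vpmovmskb), one VEC_SIZE check in the byte case is roughly:

       eq   = cmpeq_bytes (s1_vec, s2_vec);    -- 0xff where bytes equal
       nz   = cmpeq_bytes (s1_vec, zero);      -- 0xff at null bytes
       ok   = eq & ~nz;                        -- vpandn: equal and not null
       mask = movemask (ok);                   -- 1 bit per byte
       if (mask != 0xffffffff)
           return difference at byte index tzcnt (mask + 1);

   i.e. all mask bits set means "keep going"; otherwise the lowest clear
   bit gives the first mismatch or null.  */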
186 | |
187 | .section SECTION(.text), "ax" , @progbits |
188 | .align 16 |
189 | .type STRCMP, @function |
190 | .globl STRCMP |
191 | |
192 | # ifdef USE_AS_STRCASECMP_L |
193 | ENTRY (STRCASECMP) |
194 | movq __libc_tsd_LOCALE@gottpoff(%rip), %rax |
195 | mov %fs:(%rax), %LOCALE_REG_LP |
196 | |
	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
198 | .p2align 4 |
199 | END (STRCASECMP) |
200 | /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ |
201 | # endif |
202 | |
203 | .p2align 4 |
204 | STRCMP: |
205 | cfi_startproc |
206 | _CET_ENDBR |
207 | CALL_MCOUNT |
208 | |
209 | # if defined USE_AS_STRCASECMP_L |
210 | /* We have to fall back on the C implementation for locales with |
211 | encodings not matching ASCII for single bytes. */ |
212 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 |
213 | mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP |
214 | # else |
215 | mov (%LOCALE_REG), %RAX_LP |
216 | # endif |
217 | testb $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) |
218 | jne STRCASECMP_L_NONASCII |
219 | leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE |
220 | # endif |
221 | |
222 | # ifdef USE_AS_STRNCMP |
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
226 | # ifdef __ILP32__ |
227 | /* Clear the upper 32 bits. */ |
228 | movl %edx, %edx |
229 | # endif |
230 | cmp $1, %RDX_LP |
	/* The signed comparison is intentional.  We use this branch to
	   also handle lengths >= 2^63.  Such very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
235 | jle L(one_or_less) |
236 | # ifdef USE_AS_WCSCMP |
237 | movq %rdx, %rcx |
238 | |
	/* Multiplying the length by sizeof(wchar_t) can result in
	   overflow.  Check if that is possible.  All cases where
	   overflow is possible are cases where the length is large
	   enough that it can never bound valid memory, so just use
	   wcscmp.  */
243 | shrq $56, %rcx |
244 | jnz OVERFLOW_STRCMP |
245 | |
246 | leaq (, %rdx, 4), %rdx |
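	/* Illustratively (C-like, not assembled), the check and scaling
	   above amount to:
	       if (n >> 56)
	           return wcscmp (s1, s2);
	       n *= sizeof (wchar_t);
	   Any n >= 2^56 can never bound real memory, so punting to
	   wcscmp is safe and avoids any risk of the multiply
	   overflowing.  */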
247 | # endif |
248 | # endif |
249 | vpxor %xmmZERO, %xmmZERO, %xmmZERO |
250 | # if defined USE_AS_STRCASECMP_L |
251 | .section .rodata.cst32, "aM" , @progbits, 32 |
252 | .align 32 |
253 | L(lcase_min): |
254 | .quad 0x3f3f3f3f3f3f3f3f |
255 | .quad 0x3f3f3f3f3f3f3f3f |
256 | .quad 0x3f3f3f3f3f3f3f3f |
257 | .quad 0x3f3f3f3f3f3f3f3f |
258 | L(lcase_max): |
259 | .quad 0x9999999999999999 |
260 | .quad 0x9999999999999999 |
261 | .quad 0x9999999999999999 |
262 | .quad 0x9999999999999999 |
263 | L(case_add): |
264 | .quad 0x2020202020202020 |
265 | .quad 0x2020202020202020 |
266 | .quad 0x2020202020202020 |
267 | .quad 0x2020202020202020 |
268 | .previous |
269 | |
270 | vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm |
271 | vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm |
272 | vmovdqa L(case_add)(%rip), CASE_ADD_ymm |
273 | # endif |
274 | movl %edi, %eax |
275 | orl %esi, %eax |
276 | sall $20, %eax |
	/* Check if s1 or s2 may cross a page in the next 4x VEC loads.  */
278 | cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax |
279 | ja L(page_cross) |
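	/* The check above is, illustratively (C-like, not assembled):
	       if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
	           > PAGE_SIZE - VEC_SIZE * 4)
	           goto page_cross;
	   The shift by 20 keeps only the low 12 page-offset bits, so the
	   unsigned compare works on page offsets.  The OR can exceed
	   both offsets, so this may give false positives; the page cross
	   path re-checks.  */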
280 | |
281 | L(no_page_cross): |
282 | /* Safe to compare 4x vectors. */ |
283 | VMOVU (%rdi), %ymm0 |
	/* 1s where s1 and s2 are equal.  This is just VPCMPEQ if it is
	   not strcasecmp; otherwise it converts ymm0 and the load from
	   rsi to lowercase first.  ymm2 is scratch and ymm1 is the
	   result.  */
287 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
288 | /* 1s at null CHAR. */ |
289 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
290 | /* 1s where s1 and s2 equal AND not null CHAR. */ |
291 | vpandn %ymm1, %ymm2, %ymm1 |
292 | |
293 | /* All 1s -> keep going, any 0s -> return. */ |
294 | vpmovmskb %ymm1, %ecx |
295 | # ifdef USE_AS_STRNCMP |
296 | cmpq $VEC_SIZE, %rdx |
297 | jbe L(vec_0_test_len) |
298 | # endif |
299 | |
	/* All 1s means everything matched.  incl will overflow to zero
	   in the all-equal case.  Otherwise the carry only propagates up
	   to the position of the first mismatch.  */
303 | incl %ecx |
304 | jz L(more_3x_vec) |
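	/* Worked example of the trick above (illustrative only): if all
	   32 bytes match and none is null, the mask is 0xffffffff and
	   incl wraps it to 0, so we keep going.  If byte 3 is the first
	   mismatch or null, bits 0-2 are set and bit 3 is clear, so
	   after incl the lowest set bit is bit 3 and the tzcnt below
	   returns 3, the index of the first differing CHAR.  */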
305 | |
306 | .p2align 4,, 4 |
307 | L(return_vec_0): |
308 | tzcntl %ecx, %ecx |
309 | # ifdef USE_AS_WCSCMP |
310 | movl (%rdi, %rcx), %edx |
311 | xorl %eax, %eax |
312 | cmpl (%rsi, %rcx), %edx |
313 | je L(ret0) |
314 | setl %al |
315 | negl %eax |
316 | orl $1, %eax |
317 | # else |
318 | movzbl (%rdi, %rcx), %eax |
319 | movzbl (%rsi, %rcx), %ecx |
320 | TOLOWER_gpr (%rax, %eax) |
321 | TOLOWER_gpr (%rcx, %ecx) |
322 | subl %ecx, %eax |
323 | # endif |
324 | L(ret0): |
325 | L(return_vzeroupper): |
326 | ZERO_UPPER_VEC_REGISTERS_RETURN |
327 | |
328 | # ifdef USE_AS_STRNCMP |
329 | .p2align 4,, 8 |
330 | L(vec_0_test_len): |
331 | notl %ecx |
332 | bzhil %edx, %ecx, %eax |
333 | jnz L(return_vec_0) |
	/* Align if it will cross a fetch block.  */
335 | .p2align 4,, 2 |
336 | L(ret_zero): |
337 | xorl %eax, %eax |
338 | VZEROUPPER_RETURN |
339 | |
340 | .p2align 4,, 5 |
341 | L(one_or_less): |
342 | # ifdef USE_AS_STRCASECMP_L |
343 | /* Set locale argument for strcasecmp. */ |
344 | movq %LOCALE_REG, %rdx |
345 | # endif |
346 | jb L(ret_zero) |
347 | /* 'nbe' covers the case where length is negative (large |
348 | unsigned). */ |
349 | jnbe OVERFLOW_STRCMP |
350 | # ifdef USE_AS_WCSCMP |
351 | movl (%rdi), %edx |
352 | xorl %eax, %eax |
353 | cmpl (%rsi), %edx |
354 | je L(ret1) |
355 | setl %al |
356 | negl %eax |
357 | orl $1, %eax |
358 | # else |
359 | movzbl (%rdi), %eax |
360 | movzbl (%rsi), %ecx |
361 | TOLOWER_gpr (%rax, %eax) |
362 | TOLOWER_gpr (%rcx, %ecx) |
363 | subl %ecx, %eax |
364 | # endif |
365 | L(ret1): |
366 | ret |
367 | # endif |
368 | |
369 | .p2align 4,, 10 |
370 | L(return_vec_1): |
371 | tzcntl %ecx, %ecx |
372 | # ifdef USE_AS_STRNCMP |
	/* rdx must be > VEC_SIZE here, so it is safe to subtract
	   without fear of wrapping.  */
375 | addq $-VEC_SIZE, %rdx |
376 | cmpq %rcx, %rdx |
377 | jbe L(ret_zero) |
378 | # endif |
379 | # ifdef USE_AS_WCSCMP |
380 | movl VEC_SIZE(%rdi, %rcx), %edx |
381 | xorl %eax, %eax |
382 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
383 | je L(ret2) |
384 | setl %al |
385 | negl %eax |
386 | orl $1, %eax |
387 | # else |
388 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
389 | movzbl VEC_SIZE(%rsi, %rcx), %ecx |
390 | TOLOWER_gpr (%rax, %eax) |
391 | TOLOWER_gpr (%rcx, %ecx) |
392 | subl %ecx, %eax |
393 | # endif |
394 | L(ret2): |
395 | VZEROUPPER_RETURN |
396 | |
397 | .p2align 4,, 10 |
398 | # ifdef USE_AS_STRNCMP |
399 | L(return_vec_3): |
400 | salq $32, %rcx |
401 | # endif |
402 | |
403 | L(return_vec_2): |
404 | # ifndef USE_AS_STRNCMP |
405 | tzcntl %ecx, %ecx |
406 | # else |
407 | tzcntq %rcx, %rcx |
408 | cmpq %rcx, %rdx |
409 | jbe L(ret_zero) |
410 | # endif |
411 | |
412 | # ifdef USE_AS_WCSCMP |
413 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
414 | xorl %eax, %eax |
415 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
416 | je L(ret3) |
417 | setl %al |
418 | negl %eax |
419 | orl $1, %eax |
420 | # else |
421 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
422 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx |
423 | TOLOWER_gpr (%rax, %eax) |
424 | TOLOWER_gpr (%rcx, %ecx) |
425 | subl %ecx, %eax |
426 | # endif |
427 | L(ret3): |
428 | VZEROUPPER_RETURN |
429 | |
430 | # ifndef USE_AS_STRNCMP |
431 | .p2align 4,, 10 |
432 | L(return_vec_3): |
433 | tzcntl %ecx, %ecx |
434 | # ifdef USE_AS_WCSCMP |
435 | movl (VEC_SIZE * 3)(%rdi, %rcx), %edx |
436 | xorl %eax, %eax |
437 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx |
438 | je L(ret4) |
439 | setl %al |
440 | negl %eax |
441 | orl $1, %eax |
442 | # else |
443 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
444 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx |
445 | TOLOWER_gpr (%rax, %eax) |
446 | TOLOWER_gpr (%rcx, %ecx) |
447 | subl %ecx, %eax |
448 | # endif |
449 | L(ret4): |
450 | VZEROUPPER_RETURN |
451 | # endif |
452 | |
453 | .p2align 4,, 10 |
454 | L(more_3x_vec): |
455 | /* Safe to compare 4x vectors. */ |
456 | VMOVU VEC_SIZE(%rdi), %ymm0 |
457 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
458 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
459 | vpandn %ymm1, %ymm2, %ymm1 |
460 | vpmovmskb %ymm1, %ecx |
461 | incl %ecx |
462 | jnz L(return_vec_1) |
463 | |
464 | # ifdef USE_AS_STRNCMP |
465 | subq $(VEC_SIZE * 2), %rdx |
466 | jbe L(ret_zero) |
467 | # endif |
468 | |
469 | VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 |
470 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) |
471 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
472 | vpandn %ymm1, %ymm2, %ymm1 |
473 | vpmovmskb %ymm1, %ecx |
474 | incl %ecx |
475 | jnz L(return_vec_2) |
476 | |
477 | VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 |
478 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) |
479 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
480 | vpandn %ymm1, %ymm2, %ymm1 |
481 | vpmovmskb %ymm1, %ecx |
482 | incl %ecx |
483 | jnz L(return_vec_3) |
484 | |
485 | # ifdef USE_AS_STRNCMP |
486 | cmpq $(VEC_SIZE * 2), %rdx |
487 | jbe L(ret_zero) |
488 | # endif |
489 | |
490 | # ifdef USE_AS_WCSCMP |
	/* Any non-zero positive value that doesn't interfere with 0x1.  */
493 | movl $2, %r8d |
494 | |
495 | # else |
496 | xorl %r8d, %r8d |
497 | # endif |
498 | |
499 | /* The prepare labels are various entry points from the page |
500 | cross logic. */ |
501 | L(prepare_loop): |
502 | |
503 | # ifdef USE_AS_STRNCMP |
	/* Store N + (VEC_SIZE * 4) and place the check at the beginning
	   of the loop.  */
506 | leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx |
507 | # endif |
508 | L(prepare_loop_no_len): |
509 | |
510 | /* Align s1 and adjust s2 accordingly. */ |
511 | subq %rdi, %rsi |
512 | andq $-(VEC_SIZE * 4), %rdi |
513 | addq %rdi, %rsi |
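	/* Equivalent pointer arithmetic, as an illustrative C sketch
	   (not assembled):
	       delta = s2 - s1;
	       s1    = (char *) ((uintptr_t) s1 & -(VEC_SIZE * 4));
	       s2    = s1 + delta;
	   i.e. round s1 down to a 4 * VEC_SIZE boundary while keeping
	   the original s2 - s1 distance.  */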
514 | |
515 | # ifdef USE_AS_STRNCMP |
516 | subq %rdi, %rdx |
517 | # endif |
518 | |
519 | L(prepare_loop_aligned): |
	/* eax stores the distance from rsi to the next page cross.
	   These cases need to be handled specially as the 4x loop could
	   potentially read memory past the length of s1 or s2 and across
	   a page boundary.  */
524 | movl $-(VEC_SIZE * 4), %eax |
525 | subl %esi, %eax |
526 | andl $(PAGE_SIZE - 1), %eax |
527 | |
528 | /* Loop 4x comparisons at a time. */ |
529 | .p2align 4 |
530 | L(loop): |
531 | |
532 | /* End condition for strncmp. */ |
533 | # ifdef USE_AS_STRNCMP |
534 | subq $(VEC_SIZE * 4), %rdx |
535 | jbe L(ret_zero) |
536 | # endif |
537 | |
538 | subq $-(VEC_SIZE * 4), %rdi |
539 | subq $-(VEC_SIZE * 4), %rsi |
540 | |
541 | /* Check if rsi loads will cross a page boundary. */ |
542 | addl $-(VEC_SIZE * 4), %eax |
543 | jnb L(page_cross_during_loop) |
544 | |
545 | /* Loop entry after handling page cross during loop. */ |
546 | L(loop_skip_page_cross_check): |
547 | VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 |
548 | VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 |
549 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 |
550 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 |
551 | |
552 | /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ |
553 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) |
554 | CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) |
555 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
556 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) |
557 | |
	/* A lane becomes zero CHAR if there was a mismatch or a null
	   CHAR, and stays non-zero otherwise.  */
560 | vpand %ymm0, %ymm1, %ymm1 |
561 | |
562 | |
563 | vpand %ymm2, %ymm3, %ymm3 |
564 | vpand %ymm4, %ymm5, %ymm5 |
565 | vpand %ymm6, %ymm7, %ymm7 |
566 | |
567 | VPMINU %ymm1, %ymm3, %ymm3 |
568 | VPMINU %ymm5, %ymm7, %ymm7 |
569 | |
570 | /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ |
571 | VPMINU %ymm3, %ymm7, %ymm7 |
572 | |
573 | /* If any 0 CHAR then done. */ |
574 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 |
575 | vpmovmskb %ymm7, %LOOP_REG |
576 | testl %LOOP_REG, %LOOP_REG |
577 | jz L(loop) |
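	/* Illustrative C-like sketch of the 4x reduction above (not
	   assembled; min_u/cmpeq/movemask are hypothetical stand-ins for
	   VPMINU/VPCMPEQ/vpmovmskb):
	       v0 = s1_0 & eq_0;  v1 = s1_1 & eq_1;   -- 0 at mismatch/null
	       v2 = s1_2 & eq_2;  v3 = s1_3 & eq_3;
	       m  = min_u (min_u (v0, v1), min_u (v2, v3));
	       if (movemask (cmpeq (m, 0)) == 0)
	           continue;                          -- no mismatch, no null
	   A zero lane in m means some vector had a mismatch or null in
	   that lane, so the code below narrows down which one.  */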
578 | |
	/* Find which VEC has the mismatch or end of string.  */
580 | VPCMPEQ %ymm1, %ymmZERO, %ymm1 |
581 | vpmovmskb %ymm1, %ecx |
582 | testl %ecx, %ecx |
583 | jnz L(return_vec_0_end) |
584 | |
585 | |
586 | VPCMPEQ %ymm3, %ymmZERO, %ymm3 |
587 | vpmovmskb %ymm3, %ecx |
588 | testl %ecx, %ecx |
589 | jnz L(return_vec_1_end) |
590 | |
591 | L(return_vec_2_3_end): |
592 | # ifdef USE_AS_STRNCMP |
593 | subq $(VEC_SIZE * 2), %rdx |
594 | jbe L(ret_zero_end) |
595 | # endif |
596 | |
597 | VPCMPEQ %ymm5, %ymmZERO, %ymm5 |
598 | vpmovmskb %ymm5, %ecx |
599 | testl %ecx, %ecx |
600 | jnz L(return_vec_2_end) |
601 | |
	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must be entirely from VEC 3, which is fully
	   represented by LOOP_REG.  */
606 | tzcntl %LOOP_REG, %LOOP_REG |
607 | |
608 | # ifdef USE_AS_STRNCMP |
609 | subl $-(VEC_SIZE), %LOOP_REG |
610 | cmpq %LOOP_REG64, %rdx |
611 | jbe L(ret_zero_end) |
612 | # endif |
613 | |
614 | # ifdef USE_AS_WCSCMP |
615 | movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx |
616 | xorl %eax, %eax |
617 | cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
618 | je L(ret5) |
619 | setl %al |
620 | negl %eax |
621 | xorl %r8d, %eax |
622 | # else |
623 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax |
624 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
625 | TOLOWER_gpr (%rax, %eax) |
626 | TOLOWER_gpr (%rcx, %ecx) |
627 | subl %ecx, %eax |
628 | xorl %r8d, %eax |
629 | subl %r8d, %eax |
630 | # endif |
631 | L(ret5): |
632 | VZEROUPPER_RETURN |
633 | |
634 | # ifdef USE_AS_STRNCMP |
635 | .p2align 4,, 2 |
636 | L(ret_zero_end): |
637 | xorl %eax, %eax |
638 | VZEROUPPER_RETURN |
639 | # endif |
640 | |
641 | |
	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
	   that they use the value of `r8` to negate the return value.
	   This is because the page cross logic can swap `rdi` and
	   `rsi`.  */
645 | .p2align 4,, 10 |
646 | # ifdef USE_AS_STRNCMP |
647 | L(return_vec_1_end): |
648 | salq $32, %rcx |
649 | # endif |
650 | L(return_vec_0_end): |
651 | # ifndef USE_AS_STRNCMP |
652 | tzcntl %ecx, %ecx |
653 | # else |
654 | tzcntq %rcx, %rcx |
655 | cmpq %rcx, %rdx |
656 | jbe L(ret_zero_end) |
657 | # endif |
658 | |
659 | # ifdef USE_AS_WCSCMP |
660 | movl (%rdi, %rcx), %edx |
661 | xorl %eax, %eax |
662 | cmpl (%rsi, %rcx), %edx |
663 | je L(ret6) |
664 | setl %al |
665 | negl %eax |
666 | xorl %r8d, %eax |
667 | # else |
668 | movzbl (%rdi, %rcx), %eax |
669 | movzbl (%rsi, %rcx), %ecx |
670 | TOLOWER_gpr (%rax, %eax) |
671 | TOLOWER_gpr (%rcx, %ecx) |
672 | subl %ecx, %eax |
673 | xorl %r8d, %eax |
674 | subl %r8d, %eax |
675 | # endif |
676 | L(ret6): |
677 | VZEROUPPER_RETURN |
678 | |
679 | # ifndef USE_AS_STRNCMP |
680 | .p2align 4,, 10 |
681 | L(return_vec_1_end): |
682 | tzcntl %ecx, %ecx |
683 | # ifdef USE_AS_WCSCMP |
684 | movl VEC_SIZE(%rdi, %rcx), %edx |
685 | xorl %eax, %eax |
686 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
687 | je L(ret7) |
688 | setl %al |
689 | negl %eax |
690 | xorl %r8d, %eax |
691 | # else |
692 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
693 | movzbl VEC_SIZE(%rsi, %rcx), %ecx |
694 | TOLOWER_gpr (%rax, %eax) |
695 | TOLOWER_gpr (%rcx, %ecx) |
696 | subl %ecx, %eax |
697 | xorl %r8d, %eax |
698 | subl %r8d, %eax |
699 | # endif |
700 | L(ret7): |
701 | VZEROUPPER_RETURN |
702 | # endif |
703 | |
704 | .p2align 4,, 10 |
705 | L(return_vec_2_end): |
706 | tzcntl %ecx, %ecx |
707 | # ifdef USE_AS_STRNCMP |
708 | cmpq %rcx, %rdx |
709 | jbe L(ret_zero_page_cross) |
710 | # endif |
711 | # ifdef USE_AS_WCSCMP |
712 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
713 | xorl %eax, %eax |
714 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
715 | je L(ret11) |
716 | setl %al |
717 | negl %eax |
718 | xorl %r8d, %eax |
719 | # else |
720 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
721 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx |
722 | TOLOWER_gpr (%rax, %eax) |
723 | TOLOWER_gpr (%rcx, %ecx) |
724 | subl %ecx, %eax |
725 | xorl %r8d, %eax |
726 | subl %r8d, %eax |
727 | # endif |
728 | L(ret11): |
729 | VZEROUPPER_RETURN |
730 | |
731 | |
732 | /* Page cross in rsi in next 4x VEC. */ |
733 | |
734 | /* TODO: Improve logic here. */ |
735 | .p2align 4,, 10 |
736 | L(page_cross_during_loop): |
737 | /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ |
738 | |
	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any special logic here.  */
741 | cmpl $-(VEC_SIZE * 4), %eax |
	/* Don't adjust eax before jumping back to the loop, so we will
	   never hit the page cross case again.  */
744 | je L(loop_skip_page_cross_check) |
745 | |
746 | /* Check if we can safely load a VEC. */ |
747 | cmpl $-(VEC_SIZE * 3), %eax |
748 | jle L(less_1x_vec_till_page_cross) |
749 | |
750 | VMOVA (%rdi), %ymm0 |
751 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
752 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
753 | vpandn %ymm1, %ymm2, %ymm1 |
754 | vpmovmskb %ymm1, %ecx |
755 | incl %ecx |
756 | jnz L(return_vec_0_end) |
757 | |
	/* If the distance is >= 2x VEC then eax > -(VEC_SIZE * 2).  */
759 | cmpl $-(VEC_SIZE * 2), %eax |
760 | jg L(more_2x_vec_till_page_cross) |
761 | |
762 | .p2align 4,, 4 |
763 | L(less_1x_vec_till_page_cross): |
764 | subl $-(VEC_SIZE * 4), %eax |
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration, if the incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page, we already aligned up to the nearest
	   VEC_SIZE * 4 so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so it must be valid.  */
772 | VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 |
773 | CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) |
774 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
775 | vpandn %ymm1, %ymm2, %ymm1 |
776 | vpmovmskb %ymm1, %ecx |
777 | |
	/* Mask of potentially valid bits.  The lower bits can correspond
	   to out-of-range comparisons (but are safe regarding page
	   crosses).  */
780 | movl $-1, %r10d |
781 | shlxl %esi, %r10d, %r10d |
782 | notl %ecx |
783 | |
784 | # ifdef USE_AS_STRNCMP |
785 | cmpq %rax, %rdx |
786 | jbe L(return_page_cross_end_check) |
787 | # endif |
788 | movl %eax, %OFFSET_REG |
789 | addl $(PAGE_SIZE - VEC_SIZE * 4), %eax |
790 | |
791 | andl %r10d, %ecx |
792 | jz L(loop_skip_page_cross_check) |
793 | |
794 | .p2align 4,, 3 |
795 | L(return_page_cross_end): |
796 | tzcntl %ecx, %ecx |
797 | |
798 | # ifdef USE_AS_STRNCMP |
799 | leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx |
800 | L(return_page_cross_cmp_mem): |
801 | # else |
802 | addl %OFFSET_REG, %ecx |
803 | # endif |
804 | # ifdef USE_AS_WCSCMP |
805 | movl VEC_OFFSET(%rdi, %rcx), %edx |
806 | xorl %eax, %eax |
807 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
808 | je L(ret8) |
809 | setl %al |
810 | negl %eax |
811 | xorl %r8d, %eax |
812 | # else |
813 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
814 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx |
815 | TOLOWER_gpr (%rax, %eax) |
816 | TOLOWER_gpr (%rcx, %ecx) |
817 | subl %ecx, %eax |
818 | xorl %r8d, %eax |
819 | subl %r8d, %eax |
820 | # endif |
821 | L(ret8): |
822 | VZEROUPPER_RETURN |
823 | |
824 | # ifdef USE_AS_STRNCMP |
825 | .p2align 4,, 10 |
826 | L(return_page_cross_end_check): |
827 | andl %r10d, %ecx |
828 | tzcntl %ecx, %ecx |
829 | leal -VEC_SIZE(%rax, %rcx), %ecx |
830 | cmpl %ecx, %edx |
831 | ja L(return_page_cross_cmp_mem) |
832 | xorl %eax, %eax |
833 | VZEROUPPER_RETURN |
834 | # endif |
835 | |
836 | |
837 | .p2align 4,, 10 |
838 | L(more_2x_vec_till_page_cross): |
	/* If there are more than 2x VEC until the page cross, we will
	   complete a full loop iteration here.  */
841 | |
842 | VMOVU VEC_SIZE(%rdi), %ymm0 |
843 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
844 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
845 | vpandn %ymm1, %ymm2, %ymm1 |
846 | vpmovmskb %ymm1, %ecx |
847 | incl %ecx |
848 | jnz L(return_vec_1_end) |
849 | |
850 | # ifdef USE_AS_STRNCMP |
851 | cmpq $(VEC_SIZE * 2), %rdx |
852 | jbe L(ret_zero_in_loop_page_cross) |
853 | # endif |
854 | |
855 | subl $-(VEC_SIZE * 4), %eax |
856 | |
857 | /* Safe to include comparisons from lower bytes. */ |
858 | VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 |
859 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) |
860 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
861 | vpandn %ymm1, %ymm2, %ymm1 |
862 | vpmovmskb %ymm1, %ecx |
863 | incl %ecx |
864 | jnz L(return_vec_page_cross_0) |
865 | |
866 | VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 |
867 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) |
868 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
869 | vpandn %ymm1, %ymm2, %ymm1 |
870 | vpmovmskb %ymm1, %ecx |
871 | incl %ecx |
872 | jnz L(return_vec_page_cross_1) |
873 | |
874 | # ifdef USE_AS_STRNCMP |
	/* Must check the length here as it might preclude reading the
	   next page.  */
877 | cmpq %rax, %rdx |
878 | jbe L(ret_zero_in_loop_page_cross) |
879 | # endif |
880 | |
881 | /* Finish the loop. */ |
882 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 |
883 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 |
884 | |
885 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
886 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) |
887 | vpand %ymm4, %ymm5, %ymm5 |
888 | vpand %ymm6, %ymm7, %ymm7 |
889 | VPMINU %ymm5, %ymm7, %ymm7 |
890 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 |
891 | vpmovmskb %ymm7, %LOOP_REG |
892 | testl %LOOP_REG, %LOOP_REG |
893 | jnz L(return_vec_2_3_end) |
894 | |
	/* For code size it is best to include the unconditional jump
	   here.  If this case is hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison.  */
899 | subq $-(VEC_SIZE * 4), %rdi |
900 | subq $-(VEC_SIZE * 4), %rsi |
901 | addl $(PAGE_SIZE - VEC_SIZE * 8), %eax |
902 | # ifdef USE_AS_STRNCMP |
903 | subq $(VEC_SIZE * 4), %rdx |
904 | ja L(loop_skip_page_cross_check) |
905 | L(ret_zero_in_loop_page_cross): |
906 | xorl %eax, %eax |
907 | VZEROUPPER_RETURN |
908 | # else |
909 | jmp L(loop_skip_page_cross_check) |
910 | # endif |
911 | |
912 | |
913 | .p2align 4,, 10 |
914 | L(return_vec_page_cross_0): |
915 | addl $-VEC_SIZE, %eax |
916 | L(return_vec_page_cross_1): |
917 | tzcntl %ecx, %ecx |
918 | # ifdef USE_AS_STRNCMP |
919 | leal -VEC_SIZE(%rax, %rcx), %ecx |
920 | cmpq %rcx, %rdx |
921 | jbe L(ret_zero_in_loop_page_cross) |
922 | # else |
923 | addl %eax, %ecx |
924 | # endif |
925 | |
926 | # ifdef USE_AS_WCSCMP |
927 | movl VEC_OFFSET(%rdi, %rcx), %edx |
928 | xorl %eax, %eax |
929 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
930 | je L(ret9) |
931 | setl %al |
932 | negl %eax |
933 | xorl %r8d, %eax |
934 | # else |
935 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
936 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx |
937 | TOLOWER_gpr (%rax, %eax) |
938 | TOLOWER_gpr (%rcx, %ecx) |
939 | subl %ecx, %eax |
940 | xorl %r8d, %eax |
941 | subl %r8d, %eax |
942 | # endif |
943 | L(ret9): |
944 | VZEROUPPER_RETURN |
945 | |
946 | |
947 | .p2align 4,, 10 |
948 | L(page_cross): |
949 | # ifndef USE_AS_STRNCMP |
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp, where the stop condition is guaranteed
	   to be reachable just by reading memory.  */
953 | testl $((VEC_SIZE - 1) << 20), %eax |
954 | jz L(no_page_cross) |
955 | # endif |
956 | |
957 | movl %edi, %eax |
958 | movl %esi, %ecx |
959 | andl $(PAGE_SIZE - 1), %eax |
960 | andl $(PAGE_SIZE - 1), %ecx |
961 | |
962 | xorl %OFFSET_REG, %OFFSET_REG |
963 | |
964 | /* Check which is closer to page cross, s1 or s2. */ |
965 | cmpl %eax, %ecx |
966 | jg L(page_cross_s2) |
967 | |
	/* The previous page cross check has false positives.  Check for
	   a true positive, as the page cross logic is very expensive.  */
970 | subl $(PAGE_SIZE - VEC_SIZE * 4), %eax |
971 | jbe L(no_page_cross) |
972 | |
973 | /* Set r8 to not interfere with normal return value (rdi and rsi |
974 | did not swap). */ |
975 | # ifdef USE_AS_WCSCMP |
	/* Any non-zero positive value that doesn't interfere with 0x1.  */
978 | movl $2, %r8d |
979 | # else |
980 | xorl %r8d, %r8d |
981 | # endif |
982 | |
983 | /* Check if less than 1x VEC till page cross. */ |
984 | subl $(VEC_SIZE * 3), %eax |
985 | jg L(less_1x_vec_till_page) |
986 | |
	/* If there is more than 1x VEC until the page cross, loop
	   through safely loadable memory until within 1x VEC of the page
	   cross.  */
989 | |
990 | .p2align 4,, 10 |
991 | L(page_cross_loop): |
992 | |
993 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 |
994 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
995 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
996 | vpandn %ymm1, %ymm2, %ymm1 |
997 | vpmovmskb %ymm1, %ecx |
998 | incl %ecx |
999 | |
1000 | jnz L(check_ret_vec_page_cross) |
1001 | addl $VEC_SIZE, %OFFSET_REG |
1002 | # ifdef USE_AS_STRNCMP |
1003 | cmpq %OFFSET_REG64, %rdx |
1004 | jbe L(ret_zero_page_cross) |
1005 | # endif |
1006 | addl $VEC_SIZE, %eax |
1007 | jl L(page_cross_loop) |
1008 | |
1009 | subl %eax, %OFFSET_REG |
	/* OFFSET_REG has the distance to the page cross minus VEC_SIZE.
	   It is guaranteed not to cross the page, so it is safe to load.
	   Since we have already loaded at least 1 VEC from rsi it is
	   also guaranteed to be safe.  */
1014 | |
1015 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 |
1016 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
1017 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
1018 | vpandn %ymm1, %ymm2, %ymm1 |
1019 | vpmovmskb %ymm1, %ecx |
1020 | |
1021 | # ifdef USE_AS_STRNCMP |
1022 | leal VEC_SIZE(%OFFSET_REG64), %eax |
1023 | cmpq %rax, %rdx |
1024 | jbe L(check_ret_vec_page_cross2) |
1025 | addq %rdi, %rdx |
1026 | # endif |
1027 | incl %ecx |
1028 | jz L(prepare_loop_no_len) |
1029 | |
1030 | .p2align 4,, 4 |
1031 | L(ret_vec_page_cross): |
1032 | # ifndef USE_AS_STRNCMP |
1033 | L(check_ret_vec_page_cross): |
1034 | # endif |
1035 | tzcntl %ecx, %ecx |
1036 | addl %OFFSET_REG, %ecx |
1037 | L(ret_vec_page_cross_cont): |
1038 | # ifdef USE_AS_WCSCMP |
1039 | movl (%rdi, %rcx), %edx |
1040 | xorl %eax, %eax |
1041 | cmpl (%rsi, %rcx), %edx |
1042 | je L(ret12) |
1043 | setl %al |
1044 | negl %eax |
1045 | xorl %r8d, %eax |
1046 | # else |
1047 | movzbl (%rdi, %rcx), %eax |
1048 | movzbl (%rsi, %rcx), %ecx |
1049 | TOLOWER_gpr (%rax, %eax) |
1050 | TOLOWER_gpr (%rcx, %ecx) |
1051 | subl %ecx, %eax |
1052 | xorl %r8d, %eax |
1053 | subl %r8d, %eax |
1054 | # endif |
1055 | L(ret12): |
1056 | VZEROUPPER_RETURN |
1057 | |
1058 | # ifdef USE_AS_STRNCMP |
1059 | .p2align 4,, 10 |
1060 | L(check_ret_vec_page_cross2): |
1061 | incl %ecx |
1062 | L(check_ret_vec_page_cross): |
1063 | tzcntl %ecx, %ecx |
1064 | addl %OFFSET_REG, %ecx |
1065 | cmpq %rcx, %rdx |
1066 | ja L(ret_vec_page_cross_cont) |
1067 | .p2align 4,, 2 |
1068 | L(ret_zero_page_cross): |
1069 | xorl %eax, %eax |
1070 | VZEROUPPER_RETURN |
1071 | # endif |
1072 | |
1073 | .p2align 4,, 4 |
1074 | L(page_cross_s2): |
1075 | /* Ensure this is a true page cross. */ |
1076 | subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx |
1077 | jbe L(no_page_cross) |
1078 | |
1079 | |
1080 | movl %ecx, %eax |
1081 | movq %rdi, %rcx |
1082 | movq %rsi, %rdi |
1083 | movq %rcx, %rsi |
1084 | |
	/* Set r8 to negate the return value, as rdi and rsi are
	   swapped.  */
1086 | # ifdef USE_AS_WCSCMP |
1087 | movl $-4, %r8d |
1088 | # else |
1089 | movl $-1, %r8d |
1090 | # endif |
1091 | xorl %OFFSET_REG, %OFFSET_REG |
1092 | |
1093 | /* Check if more than 1x VEC till page cross. */ |
1094 | subl $(VEC_SIZE * 3), %eax |
1095 | jle L(page_cross_loop) |
1096 | |
1097 | .p2align 4,, 6 |
1098 | L(less_1x_vec_till_page): |
1099 | /* Find largest load size we can use. */ |
1100 | cmpl $16, %eax |
1101 | ja L(less_16_till_page) |
1102 | |
1103 | VMOVU (%rdi), %xmm0 |
1104 | CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) |
1105 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1106 | vpandn %xmm1, %xmm2, %xmm1 |
1107 | vpmovmskb %ymm1, %ecx |
1108 | incw %cx |
1109 | jnz L(check_ret_vec_page_cross) |
1110 | movl $16, %OFFSET_REG |
1111 | # ifdef USE_AS_STRNCMP |
1112 | cmpq %OFFSET_REG64, %rdx |
1113 | jbe L(ret_zero_page_cross_slow_case0) |
1114 | subl %eax, %OFFSET_REG |
1115 | # else |
1116 | /* Explicit check for 16 byte alignment. */ |
1117 | subl %eax, %OFFSET_REG |
1118 | jz L(prepare_loop) |
1119 | # endif |
1120 | |
1121 | VMOVU (%rdi, %OFFSET_REG64), %xmm0 |
1122 | CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) |
1123 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1124 | vpandn %xmm1, %xmm2, %xmm1 |
1125 | vpmovmskb %ymm1, %ecx |
1126 | incw %cx |
1127 | jnz L(check_ret_vec_page_cross) |
1128 | |
1129 | # ifdef USE_AS_STRNCMP |
1130 | addl $16, %OFFSET_REG |
1131 | subq %OFFSET_REG64, %rdx |
1132 | jbe L(ret_zero_page_cross_slow_case0) |
1133 | subq $-(VEC_SIZE * 4), %rdx |
1134 | |
1135 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1136 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1137 | # else |
1138 | leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1139 | leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1140 | # endif |
1141 | jmp L(prepare_loop_aligned) |
1142 | |
1143 | # ifdef USE_AS_STRNCMP |
1144 | .p2align 4,, 2 |
1145 | L(ret_zero_page_cross_slow_case0): |
1146 | xorl %eax, %eax |
1147 | ret |
1148 | # endif |
1149 | |
1150 | |
1151 | .p2align 4,, 10 |
1152 | L(less_16_till_page): |
1153 | /* Find largest load size we can use. */ |
1154 | cmpl $24, %eax |
1155 | ja L(less_8_till_page) |
1156 | |
1157 | vmovq (%rdi), %xmm0 |
1158 | vmovq (%rsi), %xmm1 |
1159 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1160 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1161 | vpandn %xmm1, %xmm2, %xmm1 |
1162 | vpmovmskb %ymm1, %ecx |
1163 | incb %cl |
1164 | jnz L(check_ret_vec_page_cross) |
1165 | |
1166 | |
1167 | # ifdef USE_AS_STRNCMP |
1168 | cmpq $8, %rdx |
1169 | jbe L(ret_zero_page_cross_slow_case0) |
1170 | # endif |
1171 | movl $24, %OFFSET_REG |
1172 | /* Explicit check for 16 byte alignment. */ |
1173 | subl %eax, %OFFSET_REG |
1174 | |
1175 | |
1176 | |
1177 | vmovq (%rdi, %OFFSET_REG64), %xmm0 |
1178 | vmovq (%rsi, %OFFSET_REG64), %xmm1 |
1179 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1180 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1181 | vpandn %xmm1, %xmm2, %xmm1 |
1182 | vpmovmskb %ymm1, %ecx |
1183 | incb %cl |
1184 | jnz L(check_ret_vec_page_cross) |
1185 | |
1186 | # ifdef USE_AS_STRNCMP |
1187 | addl $8, %OFFSET_REG |
1188 | subq %OFFSET_REG64, %rdx |
1189 | jbe L(ret_zero_page_cross_slow_case0) |
1190 | subq $-(VEC_SIZE * 4), %rdx |
1191 | |
1192 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1193 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1194 | # else |
1195 | leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1196 | leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1197 | # endif |
1198 | jmp L(prepare_loop_aligned) |
1199 | |
1200 | |
1201 | .p2align 4,, 10 |
1202 | L(less_8_till_page): |
1203 | # ifdef USE_AS_WCSCMP |
1204 | /* If using wchar then this is the only check before we reach |
1205 | the page boundary. */ |
1206 | movl (%rdi), %eax |
1207 | movl (%rsi), %ecx |
1208 | cmpl %ecx, %eax |
1209 | jnz L(ret_less_8_wcs) |
1210 | # ifdef USE_AS_STRNCMP |
1211 | addq %rdi, %rdx |
	/* We already checked for len <= 1 so we cannot hit that case
	   here.  */
1214 | # endif |
1215 | testl %eax, %eax |
1216 | jnz L(prepare_loop_no_len) |
1217 | ret |
1218 | |
1219 | .p2align 4,, 8 |
1220 | L(ret_less_8_wcs): |
1221 | setl %OFFSET_REG8 |
1222 | negl %OFFSET_REG |
1223 | movl %OFFSET_REG, %eax |
1224 | xorl %r8d, %eax |
1225 | ret |
1226 | |
1227 | # else |
1228 | |
1229 | /* Find largest load size we can use. */ |
1230 | cmpl $28, %eax |
1231 | ja L(less_4_till_page) |
1232 | |
1233 | vmovd (%rdi), %xmm0 |
1234 | vmovd (%rsi), %xmm1 |
1235 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1236 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1237 | vpandn %xmm1, %xmm2, %xmm1 |
1238 | vpmovmskb %ymm1, %ecx |
1239 | subl $0xf, %ecx |
1240 | jnz L(check_ret_vec_page_cross) |
1241 | |
1242 | # ifdef USE_AS_STRNCMP |
1243 | cmpq $4, %rdx |
1244 | jbe L(ret_zero_page_cross_slow_case1) |
1245 | # endif |
1246 | movl $28, %OFFSET_REG |
1247 | /* Explicit check for 16 byte alignment. */ |
1248 | subl %eax, %OFFSET_REG |
1249 | |
1250 | |
1251 | |
1252 | vmovd (%rdi, %OFFSET_REG64), %xmm0 |
1253 | vmovd (%rsi, %OFFSET_REG64), %xmm1 |
1254 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1255 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
1256 | vpandn %xmm1, %xmm2, %xmm1 |
1257 | vpmovmskb %ymm1, %ecx |
1258 | subl $0xf, %ecx |
1259 | jnz L(check_ret_vec_page_cross) |
1260 | |
1261 | # ifdef USE_AS_STRNCMP |
1262 | addl $4, %OFFSET_REG |
1263 | subq %OFFSET_REG64, %rdx |
1264 | jbe L(ret_zero_page_cross_slow_case1) |
1265 | subq $-(VEC_SIZE * 4), %rdx |
1266 | |
1267 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1268 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1269 | # else |
1270 | leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1271 | leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi |
1272 | # endif |
1273 | jmp L(prepare_loop_aligned) |
1274 | |
1275 | # ifdef USE_AS_STRNCMP |
1276 | .p2align 4,, 2 |
1277 | L(ret_zero_page_cross_slow_case1): |
1278 | xorl %eax, %eax |
1279 | ret |
1280 | # endif |
1281 | |
1282 | .p2align 4,, 10 |
1283 | L(less_4_till_page): |
1284 | subq %rdi, %rsi |
1285 | /* Extremely slow byte comparison loop. */ |
1286 | L(less_4_loop): |
1287 | movzbl (%rdi), %eax |
1288 | movzbl (%rsi, %rdi), %ecx |
1289 | TOLOWER_gpr (%rax, %eax) |
1290 | TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) |
1291 | subl %BYTE_LOOP_REG, %eax |
1292 | jnz L(ret_less_4_loop) |
1293 | testl %ecx, %ecx |
1294 | jz L(ret_zero_4_loop) |
1295 | # ifdef USE_AS_STRNCMP |
1296 | decq %rdx |
1297 | jz L(ret_zero_4_loop) |
1298 | # endif |
1299 | incq %rdi |
	/* The end condition is reaching the page boundary (rdi is
	   aligned).  */
1301 | testl $31, %edi |
1302 | jnz L(less_4_loop) |
1303 | leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi |
1304 | addq $-(VEC_SIZE * 4), %rdi |
1305 | # ifdef USE_AS_STRNCMP |
1306 | subq $-(VEC_SIZE * 4), %rdx |
1307 | # endif |
1308 | jmp L(prepare_loop_aligned) |
1309 | |
1310 | L(ret_zero_4_loop): |
1311 | xorl %eax, %eax |
1312 | ret |
1313 | L(ret_less_4_loop): |
1314 | xorl %r8d, %eax |
1315 | subl %r8d, %eax |
1316 | ret |
1317 | # endif |
1318 | cfi_endproc |
1319 | .size STRCMP, .-STRCMP |
1320 | #endif |
1321 | |