/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT 7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
# define VPCMPEQ vpcmpeqd
/* Compare packed dwords and store minimum.  */
# define VPMINU vpminud
/* 1 dword char == 4 bytes.  */
# define SIZE_OF_CHAR 4
# else
/* Compare packed bytes.  */
# define VPCMPEQ vpcmpeqb
/* Compare packed bytes and store minimum.  */
# define VPMINU vpminub
/* 1 byte char == 1 byte.  */
# define SIZE_OF_CHAR 1
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.  */

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison is
   done on either packed bytes or packed dwords depending on
   USE_AS_WCSCMP.  In order to check the null char, the algorithm keeps
   the matched bytes/dwords, requiring two more AVX2 instructions
   (VPMINU and VPCMPEQ).  In general, the cost of comparing VEC_SIZE
   bytes (32 bytes) is two VPCMPEQ and one VPMINU instructions, together
   with vmovdqu and testl instructions.  The main loop (away from the
   page boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If
   the maximum offset is reached before a difference is found, zero is
   returned.  */

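/* Illustrative only (not part of the build): the VPCMPEQ/VPMINU null and
   mismatch test used throughout this file can be sketched with AVX2
   intrinsics roughly as follows, assuming byte (strcmp) elements and
   hypothetical string pointers P1/P2:

	__m256i s1   = _mm256_loadu_si256 ((const __m256i *) p1);
	__m256i s2   = _mm256_loadu_si256 ((const __m256i *) p2);
	__m256i eq   = _mm256_cmpeq_epi8 (s1, s2);
	__m256i min  = _mm256_min_epu8 (s1, eq);
	__m256i zero = _mm256_cmpeq_epi8 (min, _mm256_setzero_si256 ());
	unsigned int mask = _mm256_movemask_epi8 (zero);

   EQ is 0xff where the bytes are equal, so MIN becomes 0 exactly at a
   mismatch or at a null byte; each set bit in MASK therefore marks a
   position that needs the scalar re-check done at the return labels
   below (found with tzcnt).  */
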
	.section .text.avx,"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp $1, %RDX_LP
	je L(char0)
	jb L(zero)
# ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.  */
	shl $2, %RDX_LP
# endif
	/* Register %r11 tracks the maximum offset.  */
	mov %RDX_LP, %R11_LP
# endif
	movl %edi, %eax
	xorl %edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function.  */
	vpxor %xmm7, %xmm7, %xmm7
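	/* If either source pointer is within VEC_SIZE * 4 bytes of the end
	   of a page, loading four vectors below could touch the next
	   (possibly unmapped) page, so take the cross-page path.  ORing the
	   two page offsets is conservative: it may take the slow path even
	   when neither pointer is actually that close to a page end.  */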
	orl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg L(cross_page)
	/* Start comparing 4 vectors.  */
	vmovdqu (%rdi), %ymm1
	VPCMPEQ (%rsi), %ymm1, %ymm0
	VPMINU %ymm1, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	je L(next_3_vectors)
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).  */
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	je L(return)
L(wcscmp_return):
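	/* EFLAGS still hold the result of comparing the first string's
	   dword (in %ecx) with the second string's; the dwords are not
	   equal here.  setl treats the comparison as signed, and the
	   neg/or sequence maps "less" to -1 and "greater" to 1.  */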
	setl %al
	negl %eax
	orl $1, %eax
L(return):
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(return_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq $VEC_SIZE, %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl VEC_SIZE(%rdi, %rdx), %ecx
	cmpl VEC_SIZE(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl VEC_SIZE(%rdi, %rdx), %eax
	movzbl VEC_SIZE(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(return_2_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq $(VEC_SIZE * 2), %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(return_3_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq $(VEC_SIZE * 3), %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(next_3_vectors):
	vmovdqu VEC_SIZE(%rdi), %ymm6
	VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU %ymm6, %ymm3, %ymm3
	VPCMPEQ %ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl %ecx, %ecx
	jne L(return_vec_size)
	vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU %ymm5, %ymm2, %ymm2
	VPCMPEQ %ymm4, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl %ecx, %ecx
	jne L(return_2_vec_size)
	VPMINU %ymm4, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	jne L(return_3_vec_size)
L(main_loop_header):
	leaq (VEC_SIZE * 4)(%rdi), %rdx
	movl $PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq $-(VEC_SIZE * 4), %rdx
	subq %rdi, %rdx
	leaq (%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset (or simply the
	   'offset') DECREASES by the same amount as the base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.  */
	subq %rdx, %r11
	jbe L(zero)
# endif
	addq %rsi, %rdx
	movq %rdx, %rsi
	andl $(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq %rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
	movl %ecx, %esi
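	/* Only the second string can cross a page inside the main loop:
	   loads via %rax are (VEC_SIZE * 4)-byte aligned, while %rdx keeps
	   the original misalignment of %rsi, hence the block count above
	   tracks %rdx.  */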
	jmp L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq $(VEC_SIZE * 4), %r11
	jbe L(zero)
# endif
	addq $(VEC_SIZE * 4), %rax
	addq $(VEC_SIZE * 4), %rdx
L(loop_start):
	testl %esi, %esi
	leal -1(%esi), %esi
	je L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	vmovdqa (%rax), %ymm0
	vmovdqa VEC_SIZE(%rax), %ymm3
	VPCMPEQ (%rdx), %ymm0, %ymm4
	VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU %ymm0, %ymm4, %ymm4
	VPMINU %ymm3, %ymm1, %ymm1
	vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
	VPMINU %ymm1, %ymm4, %ymm0
	vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU %ymm2, %ymm5, %ymm5
	VPMINU %ymm3, %ymm6, %ymm6
	VPMINU %ymm5, %ymm0, %ymm0
	VPMINU %ymm6, %ymm0, %ymm0
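	/* %ymm0 now holds the element-wise minimum across all four
	   null/mismatch vectors, so a single test below covers the whole
	   4 * VEC_SIZE byte block.  */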
	VPCMPEQ %ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because with VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all bits
	   in a 64-bit integer register, differing from SSE2 strcmp
	   where ORing is possible.  */
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	je L(loop)
	VPCMPEQ %ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl %edi, %edi
	je L(test_vec)
	tzcntl %edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq $VEC_SIZE, %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	je L(test_2_vec)
	tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
	addq $VEC_SIZE, %rdi
	cmpq %rdi, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rdi), %ecx
	cmpl (%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rdi), %eax
	movzbl (%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl VEC_SIZE(%rsi, %rdi), %ecx
	cmpl VEC_SIZE(%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl VEC_SIZE(%rax, %rdi), %eax
	movzbl VEC_SIZE(%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq $(VEC_SIZE * 2), %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl %ecx, %ecx
	je L(test_3_vec)
	tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 2), %rdi
	cmpq %rdi, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rdi), %ecx
	cmpl (%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rdi), %eax
	movzbl (%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq $(VEC_SIZE * 3), %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl %esi, %ecx
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 3), %rcx
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %esi
	cmpl (%rdx, %rcx), %esi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(loop_cross_page):
	xorl %r10d, %r10d
	movq %rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl $((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq %rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
# error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl $(VEC_SIZE * 2), %ecx
	jge L(loop_cross_page_2_vec)

	vmovdqu (%rax, %r10), %ymm2
	vmovdqu VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU %ymm2, %ymm0, %ymm0
	VPMINU %ymm3, %ymm1, %ymm1
	VPCMPEQ %ymm7, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

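	/* Combine both VEC_SIZE-byte masks into one 64-bit mask: bits
	   0..31 come from the first vector, bits 32..63 from the second.  */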
	salq $32, %rsi
	xorq %rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq %cl, %rdi

	testq %rdi, %rdi
	je L(loop_cross_page_2_vec)
	tzcntq %rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU %ymm2, %ymm5, %ymm5
	VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ %ymm7, %ymm5, %ymm5
	VPMINU %ymm3, %ymm6, %ymm6
	VPCMPEQ %ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq $32, %rsi
	xorq %rsi, %rdi

	xorl %r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl $(VEC_SIZE * 2), %ecx
	jle 1f
	/* Skip ECX bytes.  */
	shrq %cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl %ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq %rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* If %rdi is 0, the VEC_SIZE * 4 + %r10 bytes starting at %rax
	   have already been tested.  The target label additionally checks
	   whether the strncmp maximum offset has been reached.  */
	je L(string_nbyte_offset_check)
# else
	je L(back_to_loop)
# endif
	tzcntq %rdi, %rcx
	addq %r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq %r8, %rcx
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 2), %rcx
	subq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq (VEC_SIZE * 4)(%r10), %r10
	cmpq %r10, %r11
	jbe L(zero)
	jmp L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
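	/* On entry %eax and %ecx hold the characters at the current offset
	   %rdx; they are compared first, then the next pair is loaded.  */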
# ifdef USE_AS_WCSCMP
	cmpl %ecx, %eax
# else
	subl %ecx, %eax
# endif
	jne L(different)
	addl $SIZE_OF_CHAR, %edx
	cmpl $(VEC_SIZE * 4), %edx
	je L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rdx), %eax
	movl (%rsi, %rdx), %ecx
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl %eax, %eax
	jne L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl %ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl $0, %eax
	setl %al
	negl %eax
	orl $1, %eax
	VZEROUPPER
	ret
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl %eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(char0):
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi), %ecx
	cmpl (%rsi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rsi), %ecx
	movzbl (%rdi), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER
	ret
# endif

	.p2align 4
L(last_vector):
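	/* %ecx holds the null/mismatch mask of the chunk loaded at offset
	   %rdx; rebase both string pointers so the tzcnt result indexes
	   the differing character directly.  */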
	addq %rdx, %rdi
	addq %rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq %rdx, %r11
# endif
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
	VZEROUPPER
	ret

	/* Comparing on a page boundary region requires special treatment:
	   it must be done one vector at a time, starting with the wider
	   ymm vector if possible, if not, with xmm.  If fetching 16 bytes
	   (xmm) still crosses the boundary, byte comparison must be done.  */
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_1_vector)
L(loop_1_vector):
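	/* %eax holds the OR of both strings' page offsets (computed at
	   entry); it is advanced together with %edx so the boundary checks
	   below know how many bytes can still be loaded before the page
	   end.  */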
	vmovdqu (%rdi, %rdx), %ymm1
	VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
	VPMINU %ymm1, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $VEC_SIZE, %edx

	addl $VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq %r11, %rdx
	jae L(zero)
# endif
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jle L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl $(PAGE_SIZE - 16), %eax
	jg L(cross_page_1_xmm)
	vmovdqu (%rdi, %rdx), %xmm1
	VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $16, %edx
# ifndef USE_AS_WCSCMP
	addl $16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
	   for wcscmp nor wcsncmp since wide char is 4 bytes.  */
	cmpl $(PAGE_SIZE - 8), %eax
	jg L(cross_page_8bytes)
	vmovq (%rdi, %rdx), %xmm1
	vmovq (%rsi, %rdx), %xmm0
	VPCMPEQ %xmm0, %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 8 bits are valid.  */
	andl $0xff, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $8, %edx
	addl $8, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector.  */
	cmpl $(PAGE_SIZE - 4), %eax
	jg L(cross_page_4bytes)
	vmovd (%rdi, %rdx), %xmm1
	vmovd (%rsi, %rdx), %xmm0
	VPCMPEQ %xmm0, %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 4 bits are valid.  */
	andl $0xf, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $4, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rdx), %eax
	movl (%rsi, %rdx), %ecx
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
	testl %eax, %eax
	jne L(cross_page_loop)
	subl %ecx, %eax
	VZEROUPPER
	ret
END (STRCMP)
#endif