/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register */
# define VEC_SIZE 32

/* Shift for dividing by (VEC_SIZE * 4). */
# define DIVIDE_BY_VEC_4_SHIFT 7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif
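
/* With VEC_SIZE == 32, (VEC_SIZE * 4) == 128 == 1 << 7, so shifting right
   by DIVIDE_BY_VEC_4_SHIFT converts a byte count into a count of 4-vector
   blocks; e.g. PAGE_SIZE >> 7 == 32 such blocks per 4 KiB page.  This is
   used below when computing the number of blocks before a page crossing. */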

# ifdef USE_AS_WCSCMP
/* Compare packed dwords. */
# define VPCMPEQ vpcmpeqd
/* Unsigned minimum of packed dwords. */
# define VPMINU vpminud
/* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
/* Compare packed bytes. */
# define VPCMPEQ vpcmpeqb
/* Unsigned minimum of packed bytes. */
# define VPMINU vpminub
/* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/
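
/* As a quick illustration of the convention above, a hypothetical
   standalone C check (not part of this file or of the build):

     #include <assert.h>
     #include <string.h>
     #include <wchar.h>

     int
     main (void)
     {
       // Byte elements compare as unsigned char: 0x80 (128) > 0x01.
       assert (strcmp ("\x80", "\x01") > 0);
       // Wide elements compare as (signed) wchar_t values.
       wchar_t a[] = { (wchar_t) -2, 0 };
       wchar_t b[] = { 1, 0 };
       assert (wcscmp (a, b) < 0);
       return 0;
     }
 */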

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison
   operates on either packed bytes or dwords depending on USE_AS_WCSCMP.
   In order to check the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the costs of comparing VEC_SIZE bytes (32
   bytes) are two VPCMPEQ and one VPMINU instructions, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned. */
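
/* As an illustration of the single-vector check described above, here is
   a rough C sketch using AVX2 intrinsics (strcmp flavour only; a
   hypothetical helper, not code used by this file):

     #include <immintrin.h>
     #include <stdint.h>

     // Returns a 32-bit mask with a bit set at every byte position of the
     // next VEC_SIZE bytes holding either a mismatch or a null byte in s1.
     static inline uint32_t
     check_one_vec (const char *s1, const char *s2)
     {
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
       // VPCMPEQ: 0xff where the bytes are equal, 0x00 where they differ.
       __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
       // VPMINU: a byte is zero iff s1 has a null there or the bytes differ.
       __m256i ok = _mm256_min_epu8 (v1, eq);
       // VPCMPEQ against zero, then VPMOVMSKB to get a testable mask.
       __m256i zero = _mm256_cmpeq_epi8 (ok, _mm256_setzero_si256 ());
       return (uint32_t) _mm256_movemask_epi8 (zero);
     }

   A zero return value means the 32 bytes are equal and contain no
   terminator, so the comparison can advance by VEC_SIZE; otherwise tzcnt
   of the mask gives the index of the first mismatch or terminator.  */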

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset. */
	cmp $1, %RDX_LP
	je L(char0)
	jb L(zero)
# ifdef USE_AS_WCSCMP
# ifndef __ILP32__
	movq %rdx, %rcx
	/* Check if the length could overflow when multiplied by
	   sizeof(wchar_t).  Checking the top 8 bits covers all potential
	   overflow cases as well as cases where it is impossible for the
	   length to bound a valid memory region.  In those cases just use
	   'wcscmp', which is bounded by the null terminator. */
	shrq $56, %rcx
	jnz __wcscmp_avx2
# endif
	/* Convert units: from wide to byte char. */
	shl $2, %RDX_LP
# endif
	/* Register %r11 tracks the maximum offset. */
	mov %RDX_LP, %R11_LP
# endif
	movl %edi, %eax
	xorl %edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function. */
	vpxor %xmm7, %xmm7, %xmm7
	orl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg L(cross_page)
	/* Start comparing 4 vectors. */
	vmovdqu (%rdi), %ymm1
	VPCMPEQ (%rsi), %ymm1, %ymm0
	VPMINU %ymm1, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	je L(next_3_vectors)
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	je L(return)
L(wcscmp_return):
	setl %al
	negl %eax
	orl $1, %eax
L(return):
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11). */
	addq $VEC_SIZE, %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl VEC_SIZE(%rdi, %rdx), %ecx
	cmpl VEC_SIZE(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl VEC_SIZE(%rdi, %rdx), %eax
	movzbl VEC_SIZE(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11). */
	addq $(VEC_SIZE * 2), %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11). */
	addq $(VEC_SIZE * 3), %rdx
	cmpq %r11, %rdx
	jae L(zero)
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu VEC_SIZE(%rdi), %ymm6
	VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU %ymm6, %ymm3, %ymm3
	VPCMPEQ %ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl %ecx, %ecx
	jne L(return_vec_size)
	vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU %ymm5, %ymm2, %ymm2
	VPCMPEQ %ymm4, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl %ecx, %ecx
	jne L(return_2_vec_size)
	VPMINU %ymm4, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	jne L(return_3_vec_size)
L(main_loop_header):
	leaq (VEC_SIZE * 4)(%rdi), %rdx
	movl $PAGE_SIZE, %ecx
	/* Align load via RAX. */
	andq $-(VEC_SIZE * 4), %rdx
	subq %rdi, %rdx
	leaq (%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: offset <= the matched vector index.
	     2) On mismatch: offset is before the mismatched index.
	 */
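	/* For example (illustrative values), if the alignment adjustment
	   just computed in %rdx is 96 bytes and the remaining maximum
	   offset in %r11 is 64, the limit lies inside the 4 * VEC_SIZE
	   bytes already compared (and found equal) above; %r11 <= %rdx,
	   so the check below returns 0. */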
	subq %rdx, %r11
	jbe L(zero)
# endif
	addq %rsi, %rdx
	movq %rdx, %rsi
	andl $(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing. */
	subq %rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing. */
	shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
	movl %ecx, %esi
	jmp L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
	   the maximum offset (%r11) by the same amount. */
	subq $(VEC_SIZE * 4), %r11
	jbe L(zero)
# endif
	addq $(VEC_SIZE * 4), %rax
	addq $(VEC_SIZE * 4), %rdx
L(loop_start):
	testl %esi, %esi
	leal -1(%esi), %esi
	je L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time. */
	vmovdqa (%rax), %ymm0
	vmovdqa VEC_SIZE(%rax), %ymm3
	VPCMPEQ (%rdx), %ymm0, %ymm4
	VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU %ymm0, %ymm4, %ymm4
	VPMINU %ymm3, %ymm1, %ymm1
	vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
	VPMINU %ymm1, %ymm4, %ymm0
	vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU %ymm2, %ymm5, %ymm5
	VPMINU %ymm3, %ymm6, %ymm6
	VPMINU %ymm5, %ymm0, %ymm0
	VPMINU %ymm6, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0

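	/* The four per-vector results above are folded together with
	   VPMINU so that a single VPCMPEQ/vpmovmskb/testl decides whether
	   any of the 4 * VEC_SIZE bytes holds a mismatch or a terminator.
	   Roughly, in intrinsic form (hypothetical sketch, strcmp flavour,
	   same idea as the helper sketched before ENTRY):

	     #include <immintrin.h>

	     static inline int
	     check_four_vec (const char *s1, const char *s2)
	     {
	       __m256i any = _mm256_set1_epi8 (-1);  // all-ones: min identity
	       for (int i = 0; i < 4; i++)
	         {
	           __m256i v1
	             = _mm256_loadu_si256 ((const __m256i *) (s1 + 32 * i));
	           __m256i v2
	             = _mm256_loadu_si256 ((const __m256i *) (s2 + 32 * i));
	           __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
	           any = _mm256_min_epu8 (any, _mm256_min_epu8 (v1, eq));
	         }
	       __m256i zero = _mm256_cmpeq_epi8 (any, _mm256_setzero_si256 ());
	       // 0 iff all 128 bytes match and contain no null byte.
	       return _mm256_movemask_epi8 (zero);
	     }

	   The code below additionally keeps the intermediate per-vector
	   minima so the first offending vector can be identified without
	   reloading.  */
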
	/* Test each mask (32 bits) individually because for VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register (4 x 32 = 128 bits), unlike
	   SSE2 strcmp where the four 16-bit masks fit in 64 bits and
	   ORing is possible. */
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	je L(loop)
	VPCMPEQ %ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl %edi, %edi
	je L(test_vec)
	tzcntl %edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched. Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE. */
	cmpq $VEC_SIZE, %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	je L(test_2_vec)
	tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
	addq $VEC_SIZE, %rdi
	cmpq %rdi, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rdi), %ecx
	cmpl (%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rdi), %eax
	movzbl (%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl VEC_SIZE(%rsi, %rdi), %ecx
	cmpl VEC_SIZE(%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl VEC_SIZE(%rax, %rdi), %eax
	movzbl VEC_SIZE(%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched. Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE. */
	cmpq $(VEC_SIZE * 2), %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl %ecx, %ecx
	je L(test_3_vec)
	tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 2), %rdi
	cmpq %rdi, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rdi), %ecx
	cmpl (%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rdi), %eax
	movzbl (%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched. Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE. */
	cmpq $(VEC_SIZE * 3), %r11
	jbe L(zero)
# endif
	VPCMPEQ %ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl %esi, %ecx
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 3), %rcx
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %esi
	cmpl (%rdx, %rcx), %esi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl %r10d, %r10d
	movq %rdx, %rcx
	/* Align load via RDX. We load the extra ECX bytes which should
	   be ignored. */
	andl $((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX. */
	subq %rcx, %r10
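	/* For example (illustrative values), if the low bits of %rdx give
	   %ecx == 0x30, then %r10 == -0x30 and (%rdx, %r10) is the previous
	   (VEC_SIZE * 4)-aligned address; the 0x30 bytes re-read below the
	   current position are shifted out of the mask further down. */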

	/* This works only if VEC_SIZE * 2 == 64. */
# if (VEC_SIZE * 2) != 64
# error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored. */
	cmpl $(VEC_SIZE * 2), %ecx
	jge L(loop_cross_page_2_vec)

	vmovdqu (%rax, %r10), %ymm2
	vmovdqu VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU %ymm2, %ymm0, %ymm0
	VPMINU %ymm3, %ymm1, %ymm1
	VPCMPEQ %ymm7, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq $32, %rsi
	xorq %rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
	shrq %cl, %rdi

	testq %rdi, %rdi
	je L(loop_cross_page_2_vec)
	tzcntq %rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored. */
	vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU %ymm2, %ymm5, %ymm5
	VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ %ymm7, %ymm5, %ymm5
	VPMINU %ymm3, %ymm6, %ymm6
	VPCMPEQ %ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq $32, %rsi
	xorq %rsi, %rdi

	xorl %r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
	subl $(VEC_SIZE * 2), %ecx
	jle 1f
	/* Skip ECX bytes. */
	shrq %cl, %rdi
	/* R8 has number of bytes skipped. */
	movl %ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing. */
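	/* With PAGE_SIZE == 4096 and VEC_SIZE == 32 this constant is
	   4096 / 128 - 1 == 31 full blocks. */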
	movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq %rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, then VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested (note %r10 is
	   negative).  The branch target below checks whether the strncmp
	   maximum offset has been reached. */
	je L(string_nbyte_offset_check)
# else
	je L(back_to_loop)
# endif
	tzcntq %rdi, %rcx
	addq %r10, %rcx
	/* Adjust for number of bytes skipped. */
	addq %r8, %rcx
# ifdef USE_AS_STRNCMP
	addq $(VEC_SIZE * 2), %rcx
	subq %rcx, %r11
	jbe L(zero)
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (%rsi, %rcx), %edi
	cmpl (%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (%rax, %rcx), %eax
	movzbl (%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
	movq %rax, %rsi
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne L(wcscmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl %edx, %eax
# endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq (VEC_SIZE * 4)(%r10), %r10
	cmpq %r10, %r11
	jbe L(zero)
	jmp L(back_to_loop)
# endif

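/* The byte/dword-at-a-time fallback below behaves roughly like this scalar
   sketch (strcmp flavour; a hypothetical helper that ignores the strncmp
   limit and the switch back to the vector path at 4 * VEC_SIZE):

     #include <stddef.h>

     static int
     byte_tail_cmp (const unsigned char *s1, const unsigned char *s2,
                    size_t ofs)
     {
       for (;; ofs++)
         {
           unsigned int c1 = s1[ofs], c2 = s2[ofs];
           if (c1 != c2)
             return (int) c1 - (int) c2;   // first difference decides
           if (c1 == 0)
             return 0;                     // common terminator: equal
         }
     }
 */
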
	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time. */
# ifdef USE_AS_WCSCMP
	cmpl %ecx, %eax
# else
	subl %ecx, %eax
# endif
	jne L(different)
	addl $SIZE_OF_CHAR, %edx
	cmpl $(VEC_SIZE * 4), %edx
	je L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rdx), %eax
	movl (%rsi, %rdx), %ecx
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
	/* Check null char. */
	testl %eax, %eax
	jne L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons. */
	subl %ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS. */
	movl $0, %eax
	setl %al
	negl %eax
	orl $1, %eax
	VZEROUPPER_RETURN
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl %eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi), %ecx
	cmpl (%rsi), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rsi), %ecx
	movzbl (%rdi), %eax
	subl %ecx, %eax
# endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq %rdx, %rdi
	addq %rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq %rdx, %r11
# endif
	tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl %eax, %eax
	movl (%rdi, %rdx), %ecx
	cmpl (%rsi, %rdx), %ecx
	jne L(wcscmp_return)
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %edx
	subl %edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing in the page boundary region requires special
	   treatment: it must be done one vector at a time, starting with
	   the wider ymm vector if possible, otherwise with xmm.  If
	   fetching 16 bytes (xmm) would still cross the boundary, the
	   comparison falls back to 8-byte and 4-byte loads and finally
	   to byte/dword-at-a-time checks. */
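	/* With PAGE_SIZE == 4096 and VEC_SIZE == 32, the page-offset
	   boundaries tested below work out to 4064 for 32-byte loads,
	   4080 for 16-byte, 4088 for 8-byte and 4092 for 4-byte loads. */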
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time. */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu (%rdi, %rdx), %ymm1
	VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
	VPMINU %ymm1, %ymm0, %ymm0
	VPCMPEQ %ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $VEC_SIZE, %edx

	addl $VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jle L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector. */
	cmpl $(PAGE_SIZE - 16), %eax
	jg L(cross_page_1_xmm)
	vmovdqu (%rdi, %rdx), %xmm1
	VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $16, %edx
# ifndef USE_AS_WCSCMP
	addl $16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try 8 byte vector. NB: No need
	   for wcscmp nor wcsncmp since wide char is 4 bytes. */
	cmpl $(PAGE_SIZE - 8), %eax
	jg L(cross_page_8bytes)
	vmovq (%rdi, %rdx), %xmm1
	vmovq (%rsi, %rdx), %xmm0
	VPCMPEQ %xmm0, %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 8 bits are valid. */
	andl $0xff, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $8, %edx
	addl $8, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector. */
	cmpl $(PAGE_SIZE - 4), %eax
	jg L(cross_page_4bytes)
	vmovd (%rdi, %rdx), %xmm1
	vmovd (%rsi, %rdx), %xmm0
	VPCMPEQ %xmm0, %xmm1, %xmm0
	VPMINU %xmm1, %xmm0, %xmm0
	VPCMPEQ %xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 4 bits are valid. */
	andl $0xf, %ecx
	testl %ecx, %ecx
	jne L(last_vector)

	addl $4, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11). */
	cmpq %r11, %rdx
	jae L(zero)
# endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time. */
# ifdef USE_AS_STRNCMP
	cmpq %r11, %rdx
	jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rdx), %eax
	movl (%rsi, %rdx), %ecx
# else
	movzbl (%rdi, %rdx), %eax
	movzbl (%rsi, %rdx), %ecx
# endif
	testl %eax, %eax
	jne L(cross_page_loop)
	subl %ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif