/* memcmp with SSE4.1, wmemcmp with SSE4.1
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse4_1
# endif

#ifdef USE_AS_WMEMCMP
# define CMPEQ pcmpeqd
# define CHAR_SIZE 4
#else
# define CMPEQ pcmpeqb
# define CHAR_SIZE 1
#endif


/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/

	.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl $2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	mov %edx, %edx
# endif
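	/* Dispatch on length: sizes of at most 79 bytes are handled below
	   with a few branchy, possibly overlapping compares; larger sizes
	   go through the vector loops at L(79bytesormore).  */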
	cmp $79, %RDX_LP
	ja L(79bytesormore)

	cmp $CHAR_SIZE, %RDX_LP
	jbe L(firstbyte)

	/* N in (CHAR_SIZE, 79) bytes.  */
	cmpl $32, %edx
	ja L(more_32_bytes)

	cmpl $16, %edx
	jae L(16_to_32_bytes)

# ifndef USE_AS_WMEMCMP
	cmpl $8, %edx
	jae L(8_to_16_bytes)

	cmpl $4, %edx
	jb L(2_to_3_bytes)

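	/* 4..7 bytes: load the first and the last 4 bytes of each buffer
	   (the loads may overlap), byte-swap them so a 64-bit subtraction
	   orders the data lexicographically, then turn the difference into
	   a positive/zero/negative return value with cmovne/sbbl/orl.  */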
	movl (%rdi), %eax
	movl (%rsi), %ecx

	bswap %eax
	bswap %ecx

	shlq $32, %rax
	shlq $32, %rcx

	movl -4(%rdi, %rdx), %edi
	movl -4(%rsi, %rdx), %esi

	bswap %edi
	bswap %esi

	orq %rdi, %rax
	orq %rsi, %rcx
	subq %rcx, %rax
	cmovne %edx, %eax
	sbbl %ecx, %ecx
	orl %ecx, %eax
	ret

	.p2align 4,, 8
L(2_to_3_bytes):
	movzwl (%rdi), %eax
	movzwl (%rsi), %ecx
	shll $8, %eax
	shll $8, %ecx
	bswap %eax
	bswap %ecx
	movzbl -1(%rdi, %rdx), %edi
	movzbl -1(%rsi, %rdx), %esi
	orl %edi, %eax
	orl %esi, %ecx
	subl %ecx, %eax
	ret

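	/* 8..15 bytes: same idea with two overlapping 8-byte loads, the
	   first 8 bytes and the last 8 bytes of each buffer.  */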
	.p2align 4,, 8
L(8_to_16_bytes):
	movq (%rdi), %rax
	movq (%rsi), %rcx

	bswap %rax
	bswap %rcx

	subq %rcx, %rax
	jne L(8_to_16_bytes_done)

	movq -8(%rdi, %rdx), %rax
	movq -8(%rsi, %rdx), %rcx

	bswap %rax
	bswap %rcx

	subq %rcx, %rax

L(8_to_16_bytes_done):
	cmovne %edx, %eax
	sbbl %ecx, %ecx
	orl %ecx, %eax
	ret
# else
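	/* wmemcmp with 2 or 3 wide characters: compare element by element;
	   the ordering must be signed, so the dword compares feed the setg
	   sequence at L(8_to_16_bytes_done).  The last load may revisit the
	   second element.  */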
	xorl %eax, %eax
	movl (%rdi), %ecx
	cmpl (%rsi), %ecx
	jne L(8_to_16_bytes_done)
	movl 4(%rdi), %ecx
	cmpl 4(%rsi), %ecx
	jne L(8_to_16_bytes_done)
	movl -4(%rdi, %rdx), %ecx
	cmpl -4(%rsi, %rdx), %ecx
	jne L(8_to_16_bytes_done)
	ret
# endif

	.p2align 4,, 3
L(ret_zero):
	xorl %eax, %eax
L(zero):
	ret

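	/* Length <= CHAR_SIZE.  The flags from 'cmp $CHAR_SIZE, %RDX_LP'
	   above are still live: a zero length returns zero, otherwise a
	   single element is compared.  */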
	.p2align 4,, 8
L(firstbyte):
	jb L(ret_zero)
# ifdef USE_AS_WMEMCMP
	xorl %eax, %eax
	movl (%rdi), %ecx
	cmpl (%rsi), %ecx
	je L(zero)
L(8_to_16_bytes_done):
	setg %al
	leal -1(%rax, %rax), %eax
# else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	sub %ecx, %eax
# endif
	ret

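	/* Return paths for the 16-byte compares.  On entry %eax holds the
	   equality mask plus one, so bsf yields the byte index of the first
	   difference; the _16/_32/_48 variants account for the offset of
	   the vector that mismatched, and the _end variants index backwards
	   from the end of the buffers.  For wmemcmp the differing elements
	   are recompared signed and mapped to -1/1.  */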
	.p2align 4
L(vec_return_begin_48):
	addq $16, %rdi
	addq $16, %rsi
L(vec_return_begin_32):
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl 32(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl 32(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl 32(%rsi, %rax), %ecx
	movzbl 32(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4
L(vec_return_begin_16):
	addq $16, %rdi
	addq $16, %rsi
L(vec_return_begin):
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4
L(vec_return_end_16):
	subl $16, %edx
L(vec_return_end):
	bsfl %eax, %eax
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -16(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl -16(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -16(%rsi, %rax), %ecx
	movzbl -16(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

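	/* CMPEQ leaves all-ones in equal elements, so pmovmskb gives 0xffff
	   when a whole 16-byte chunk matches.  'incw %ax' wraps that to
	   zero; any other value signals a mismatch and is already in the
	   mask-plus-one form the return paths above expect.  */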
	.p2align 4,, 8
L(more_32_bytes):
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqu 16(%rdi), %xmm0
	movdqu 16(%rsi), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	cmpl $64, %edx
	jbe L(32_to_64_bytes)
	movdqu 32(%rdi), %xmm0
	movdqu 32(%rsi), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	.p2align 4,, 6
L(32_to_64_bytes):
	movdqu -32(%rdi, %rdx), %xmm0
	movdqu -32(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end_16)

	movdqu -16(%rdi, %rdx), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end)
	ret

	.p2align 4
L(16_to_32_bytes):
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqu -16(%rdi, %rdx), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end)
	ret


	.p2align 4
L(79bytesormore):
	movdqu (%rdi), %xmm0
	movdqu (%rsi), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)


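	/* Round %rsi up to the next 16-byte boundary and adjust %rdi and
	   %rdx by the same amount so the comparison resumes with an aligned
	   source; the skipped bytes were covered by the 16-byte compare
	   above.  If %rdi ends up aligned too, use the aligned-load path at
	   L(2aligned).  */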
	mov %rsi, %rcx
	and $-16, %rsi
	add $16, %rsi
	sub %rsi, %rcx

	sub %rcx, %rdi
	add %rcx, %rdx
	test $0xf, %rdi
	jz L(2aligned)

	cmp $128, %rdx
	ja L(128bytesormore)

	.p2align 4,, 6
L(less128bytes):
	movdqu (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqu 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqu 32(%rdi), %xmm1
	CMPEQ 32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	movdqu 48(%rdi), %xmm1
	CMPEQ 48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_48)

	cmp $96, %rdx
	jb L(32_to_64_bytes)

	addq $64, %rdi
	addq $64, %rsi
	subq $64, %rdx

	.p2align 4,, 6
L(last_64_bytes):
	movdqu (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqu 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqu -32(%rdi, %rdx), %xmm0
	movdqu -32(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end_16)

	movdqu -16(%rdi, %rdx), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end)
	ret

	.p2align 4
L(128bytesormore):
	cmp $256, %rdx
	ja L(unaligned_loop)
L(less256bytes):
	movdqu (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqu 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqu 32(%rdi), %xmm1
	CMPEQ 32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	movdqu 48(%rdi), %xmm1
	CMPEQ 48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_48)

	addq $64, %rdi
	addq $64, %rsi

	movdqu (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqu 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqu 32(%rdi), %xmm1
	CMPEQ 32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	movdqu 48(%rdi), %xmm1
	CMPEQ 48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_48)

	addq $-128, %rdx
	subq $-64, %rsi
	subq $-64, %rdi

	cmp $64, %rdx
	ja L(less128bytes)

	cmp $32, %rdx
	ja L(last_64_bytes)

	movdqu -32(%rdi, %rdx), %xmm0
	movdqu -32(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end_16)

	movdqu -16(%rdi, %rdx), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end)
	ret

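	/* %r8 is set to three times __x86_data_cache_size_half, i.e. 1.5x
	   the data cache size.  Larger inputs take the loop with
	   non-temporal prefetches, presumably to limit cache pollution.  */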
	.p2align 4
L(unaligned_loop):
# ifdef DATA_CACHE_SIZE_HALF
	mov $DATA_CACHE_SIZE_HALF, %R8_LP
# else
	mov __x86_data_cache_size_half(%rip), %R8_LP
# endif
	movq %r8, %r9
	addq %r8, %r8
	addq %r9, %r8
	cmpq %r8, %rdx
	ja L(L2_L3_cache_unaligned)
	sub $64, %rdx
	.p2align 4
L(64bytesormore_loop):
	movdqu (%rdi), %xmm0
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm2
	movdqu 48(%rdi), %xmm3

	CMPEQ (%rsi), %xmm0
	CMPEQ 16(%rsi), %xmm1
	CMPEQ 32(%rsi), %xmm2
	CMPEQ 48(%rsi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw %ax
	jnz L(64bytesormore_loop_end)

	add $64, %rsi
	add $64, %rdi
	sub $64, %rdx
	ja L(64bytesormore_loop)

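	/* Fewer than 64 bytes remain (%rdx is the remaining length minus
	   64, so non-positive).  Compare the final 64 bytes of both
	   buffers; these loads may overlap data the loop already checked.  */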
	.p2align 4,, 6
L(loop_tail):
	addq %rdx, %rdi
	movdqu (%rdi), %xmm0
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm2
	movdqu 48(%rdi), %xmm3

	addq %rdx, %rsi
	movdqu (%rsi), %xmm4
	movdqu 16(%rsi), %xmm5
	movdqu 32(%rsi), %xmm6
	movdqu 48(%rsi), %xmm7

	CMPEQ %xmm4, %xmm0
	CMPEQ %xmm5, %xmm1
	CMPEQ %xmm6, %xmm2
	CMPEQ %xmm7, %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw %ax
	jnz L(64bytesormore_loop_end)
	ret

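	/* Same 64-byte loop as above, but prefetching 0x1c0 (448) bytes
	   ahead of both buffers with non-temporal hints; used for inputs
	   larger than 1.5x the data cache size.  */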
L(L2_L3_cache_unaligned):
	subq $64, %rdx
	.p2align 4
L(L2_L3_unaligned_128bytes_loop):
	prefetchnta 0x1c0(%rdi)
	prefetchnta 0x1c0(%rsi)

	movdqu (%rdi), %xmm0
	movdqu 16(%rdi), %xmm1
	movdqu 32(%rdi), %xmm2
	movdqu 48(%rdi), %xmm3

	CMPEQ (%rsi), %xmm0
	CMPEQ 16(%rsi), %xmm1
	CMPEQ 32(%rsi), %xmm2
	CMPEQ 48(%rsi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw %ax
	jnz L(64bytesormore_loop_end)

	add $64, %rsi
	add $64, %rdi
	sub $64, %rdx
	ja L(L2_L3_unaligned_128bytes_loop)
	jmp L(loop_tail)


	/* This case is for machines which are sensitive to unaligned
	   loads.  */
	.p2align 4
L(2aligned):
	cmp $128, %rdx
	ja L(128bytesormorein2aligned)
L(less128bytesin2aligned):
	movdqa (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqa 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqa 32(%rdi), %xmm1
	CMPEQ 32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	movdqa 48(%rdi), %xmm1
	CMPEQ 48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_48)

	cmp $96, %rdx
	jb L(32_to_64_bytes)

	addq $64, %rdi
	addq $64, %rsi
	subq $64, %rdx

	.p2align 4,, 6
L(aligned_last_64_bytes):
	movdqa (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqa 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqu -32(%rdi, %rdx), %xmm0
	movdqu -32(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end_16)

	movdqu -16(%rdi, %rdx), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end)
	ret

	.p2align 4
L(128bytesormorein2aligned):
	cmp $256, %rdx
	ja L(aligned_loop)
L(less256bytesin2aligned):
	movdqa (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqa 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqa 32(%rdi), %xmm1
	CMPEQ 32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	movdqa 48(%rdi), %xmm1
	CMPEQ 48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_48)

	addq $64, %rdi
	addq $64, %rsi

	movdqa (%rdi), %xmm1
	CMPEQ (%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin)

	movdqa 16(%rdi), %xmm1
	CMPEQ 16(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_16)

	movdqa 32(%rdi), %xmm1
	CMPEQ 32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_32)

	movdqa 48(%rdi), %xmm1
	CMPEQ 48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_begin_48)

	addq $-128, %rdx
	subq $-64, %rsi
	subq $-64, %rdi

	cmp $64, %rdx
	ja L(less128bytesin2aligned)

	cmp $32, %rdx
	ja L(aligned_last_64_bytes)

	movdqu -32(%rdi, %rdx), %xmm0
	movdqu -32(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end_16)

	movdqu -16(%rdi, %rdx), %xmm0
	movdqu -16(%rsi, %rdx), %xmm1
	CMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	incw %ax
	jnz L(vec_return_end)
	ret

	.p2align 4
L(aligned_loop):
# ifdef DATA_CACHE_SIZE_HALF
	mov $DATA_CACHE_SIZE_HALF, %R8_LP
# else
	mov __x86_data_cache_size_half(%rip), %R8_LP
# endif
	movq %r8, %r9
	addq %r8, %r8
	addq %r9, %r8
	cmpq %r8, %rdx
	ja L(L2_L3_cache_aligned)

	sub $64, %rdx
	.p2align 4
L(64bytesormore_loopin2aligned):
	movdqa (%rdi), %xmm0
	movdqa 16(%rdi), %xmm1
	movdqa 32(%rdi), %xmm2
	movdqa 48(%rdi), %xmm3

	CMPEQ (%rsi), %xmm0
	CMPEQ 16(%rsi), %xmm1
	CMPEQ 32(%rsi), %xmm2
	CMPEQ 48(%rsi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw %ax
	jnz L(64bytesormore_loop_end)
	add $64, %rsi
	add $64, %rdi
	sub $64, %rdx
	ja L(64bytesormore_loopin2aligned)
	jmp L(loop_tail)

L(L2_L3_cache_aligned):
	subq $64, %rdx
	.p2align 4
L(L2_L3_aligned_128bytes_loop):
	prefetchnta 0x1c0(%rdi)
	prefetchnta 0x1c0(%rsi)
	movdqa (%rdi), %xmm0
	movdqa 16(%rdi), %xmm1
	movdqa 32(%rdi), %xmm2
	movdqa 48(%rdi), %xmm3

	CMPEQ (%rsi), %xmm0
	CMPEQ 16(%rsi), %xmm1
	CMPEQ 32(%rsi), %xmm2
	CMPEQ 48(%rsi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	incw %ax
	jnz L(64bytesormore_loop_end)

	addq $64, %rsi
	addq $64, %rdi
	subq $64, %rdx
	ja L(L2_L3_aligned_128bytes_loop)
	jmp L(loop_tail)

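	/* A difference was found in the 64 bytes just compared.  %xmm1
	   holds the combined equality mask of vectors 0-1 and %xmm3 (whose
	   mask+1 is in %rax) that of all four; narrow the mismatch down to
	   one 16-byte chunk so that the lowest set bit of %rcx gives the
	   byte offset of the first difference.  */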
	.p2align 4
L(64bytesormore_loop_end):
	pmovmskb %xmm0, %ecx
	incw %cx
	jnz L(loop_end_ret)

	pmovmskb %xmm1, %ecx
	notw %cx
	sall $16, %ecx
	jnz L(loop_end_ret)

	pmovmskb %xmm2, %ecx
	notw %cx
	shlq $32, %rcx
	jnz L(loop_end_ret)

	addq $48, %rdi
	addq $48, %rsi
	movq %rax, %rcx

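	/* The lowest set bit of %rcx is the offset of the first differing
	   byte relative to the current %rdi/%rsi.  */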
	.p2align 4,, 6
L(loop_end_ret):
	bsfq %rcx, %rcx
# ifdef USE_AS_WMEMCMP
	movl (%rdi, %rcx), %eax
	xorl %edx, %edx
	cmpl (%rsi, %rcx), %eax
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	subl %ecx, %eax
# endif
	ret
END (MEMCMP)
#endif