1 | /* memcmp with SSE4.1, wmemcmp with SSE4.1 |
2 | Copyright (C) 2010-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef MEMCMP |
24 | # define MEMCMP __memcmp_sse4_1 |
25 | # endif |
26 | |
# ifdef USE_AS_WMEMCMP
#  define CMPEQ	pcmpeqd
#  define CHAR_SIZE	4
# else
#  define CMPEQ	pcmpeqb
#  define CHAR_SIZE	1
# endif
34 | |
35 | |
/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
40 | |
	.section .text.sse4.1,"ax",@progbits
42 | ENTRY (MEMCMP) |
43 | # ifdef USE_AS_WMEMCMP |
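	/* Convert the wchar_t count in %rdx to a byte count.  */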
44 | shl $2, %RDX_LP |
45 | # elif defined __ILP32__ |
46 | /* Clear the upper 32 bits. */ |
47 | mov %edx, %edx |
48 | # endif |
49 | cmp $79, %RDX_LP |
50 | ja L(79bytesormore) |
51 | |
52 | cmp $CHAR_SIZE, %RDX_LP |
53 | jbe L(firstbyte) |
54 | |
	/* N in (CHAR_SIZE, 79] bytes.  */
56 | cmpl $32, %edx |
57 | ja L(more_32_bytes) |
58 | |
59 | cmpl $16, %edx |
60 | jae L(16_to_32_bytes) |
61 | |
62 | # ifndef USE_AS_WMEMCMP |
63 | cmpl $8, %edx |
64 | jae L(8_to_16_bytes) |
65 | |
66 | cmpl $4, %edx |
67 | jb L(2_to_3_bytes) |
68 | |
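	/* 4 to 7 bytes: load the first and last (possibly overlapping)
	   dwords of each buffer, byte-swap them so unsigned numeric order
	   matches memcmp's byte order, and combine them into one 64-bit
	   value per buffer (first dword in the high half).  The sign of
	   the result is derived from the subtraction below.  */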
69 | movl (%rdi), %eax |
70 | movl (%rsi), %ecx |
71 | |
72 | bswap %eax |
73 | bswap %ecx |
74 | |
75 | shlq $32, %rax |
76 | shlq $32, %rcx |
77 | |
78 | movl -4(%rdi, %rdx), %edi |
79 | movl -4(%rsi, %rdx), %esi |
80 | |
81 | bswap %edi |
82 | bswap %esi |
83 | |
84 | orq %rdi, %rax |
85 | orq %rsi, %rcx |
86 | subq %rcx, %rax |
87 | cmovne %edx, %eax |
88 | sbbl %ecx, %ecx |
89 | orl %ecx, %eax |
90 | ret |
91 | |
92 | .p2align 4,, 8 |
93 | L(2_to_3_bytes): |
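	/* 2 or 3 bytes: build byte0 << 16 | byte1 << 8 | lastbyte for
	   each buffer (for N == 2 the last byte duplicates byte1) and
	   subtract; the values fit in 24 bits, so the difference already
	   has the right sign.  */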
94 | movzwl (%rdi), %eax |
95 | movzwl (%rsi), %ecx |
96 | shll $8, %eax |
97 | shll $8, %ecx |
98 | bswap %eax |
99 | bswap %ecx |
100 | movzbl -1(%rdi, %rdx), %edi |
101 | movzbl -1(%rsi, %rdx), %esi |
102 | orl %edi, %eax |
103 | orl %esi, %ecx |
104 | subl %ecx, %eax |
105 | ret |
106 | |
107 | .p2align 4,, 8 |
108 | L(8_to_16_bytes): |
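	/* 8 to 15 bytes: compare the first and the last (possibly
	   overlapping) 8 bytes, byte-swapped so unsigned order matches
	   memcmp's byte order.  */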
109 | movq (%rdi), %rax |
110 | movq (%rsi), %rcx |
111 | |
112 | bswap %rax |
113 | bswap %rcx |
114 | |
115 | subq %rcx, %rax |
116 | jne L(8_to_16_bytes_done) |
117 | |
118 | movq -8(%rdi, %rdx), %rax |
119 | movq -8(%rsi, %rdx), %rcx |
120 | |
121 | bswap %rax |
122 | bswap %rcx |
123 | |
124 | subq %rcx, %rax |
125 | |
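	/* Convert the 64-bit unsigned difference into memcmp's int
	   result: when the operands differ, cmovne loads the nonzero
	   positive length into %eax, and sbb/or overrides it with -1 if
	   the subtraction borrowed (first buffer below the second).  */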
126 | L(8_to_16_bytes_done): |
127 | cmovne %edx, %eax |
128 | sbbl %ecx, %ecx |
129 | orl %ecx, %eax |
130 | ret |
131 | # else |
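	/* wmemcmp with 8 or 12 bytes left: compare the first two and the
	   last 32-bit elements.  */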
132 | xorl %eax, %eax |
133 | movl (%rdi), %ecx |
134 | cmpl (%rsi), %ecx |
135 | jne L(8_to_16_bytes_done) |
136 | movl 4(%rdi), %ecx |
137 | cmpl 4(%rsi), %ecx |
138 | jne L(8_to_16_bytes_done) |
139 | movl -4(%rdi, %rdx), %ecx |
140 | cmpl -4(%rsi, %rdx), %ecx |
141 | jne L(8_to_16_bytes_done) |
142 | ret |
143 | # endif |
144 | |
145 | .p2align 4,, 3 |
146 | L(ret_zero): |
147 | xorl %eax, %eax |
148 | L(zero): |
149 | ret |
150 | |
151 | .p2align 4,, 8 |
152 | L(firstbyte): |
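	/* N <= CHAR_SIZE.  The flags from the `cmp $CHAR_SIZE' above are
	   still live: N < CHAR_SIZE (only possible for N == 0) returns
	   zero, otherwise compare the single element.  */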
153 | jb L(ret_zero) |
154 | # ifdef USE_AS_WMEMCMP |
155 | xorl %eax, %eax |
156 | movl (%rdi), %ecx |
157 | cmpl (%rsi), %ecx |
158 | je L(zero) |
159 | L(8_to_16_bytes_done): |
160 | setg %al |
161 | leal -1(%rax, %rax), %eax |
162 | # else |
163 | movzbl (%rdi), %eax |
164 | movzbl (%rsi), %ecx |
165 | sub %ecx, %eax |
166 | # endif |
167 | ret |
168 | |
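	/* The L(vec_return_*) helpers receive the pmovmskb compare mask
	   plus one in %eax; its lowest set bit is the index of the first
	   differing byte within the 16-byte chunk.  bsf extracts that
	   index, then the differing element is reloaded and compared:
	   signed and mapped to +1/-1 for wmemcmp, or subtracted as
	   unsigned bytes for memcmp.  */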
169 | .p2align 4 |
170 | L(vec_return_begin_48): |
171 | addq $16, %rdi |
172 | addq $16, %rsi |
173 | L(vec_return_begin_32): |
174 | bsfl %eax, %eax |
175 | # ifdef USE_AS_WMEMCMP |
176 | movl 32(%rdi, %rax), %ecx |
177 | xorl %edx, %edx |
178 | cmpl 32(%rsi, %rax), %ecx |
179 | setg %dl |
180 | leal -1(%rdx, %rdx), %eax |
181 | # else |
182 | movzbl 32(%rsi, %rax), %ecx |
183 | movzbl 32(%rdi, %rax), %eax |
184 | subl %ecx, %eax |
185 | # endif |
186 | ret |
187 | |
188 | .p2align 4 |
189 | L(vec_return_begin_16): |
190 | addq $16, %rdi |
191 | addq $16, %rsi |
192 | L(vec_return_begin): |
193 | bsfl %eax, %eax |
194 | # ifdef USE_AS_WMEMCMP |
195 | movl (%rdi, %rax), %ecx |
196 | xorl %edx, %edx |
197 | cmpl (%rsi, %rax), %ecx |
198 | setg %dl |
199 | leal -1(%rdx, %rdx), %eax |
200 | # else |
201 | movzbl (%rsi, %rax), %ecx |
202 | movzbl (%rdi, %rax), %eax |
203 | subl %ecx, %eax |
204 | # endif |
205 | ret |
206 | |
207 | .p2align 4 |
208 | L(vec_return_end_16): |
209 | subl $16, %edx |
210 | L(vec_return_end): |
211 | bsfl %eax, %eax |
212 | addl %edx, %eax |
213 | # ifdef USE_AS_WMEMCMP |
214 | movl -16(%rdi, %rax), %ecx |
215 | xorl %edx, %edx |
216 | cmpl -16(%rsi, %rax), %ecx |
217 | setg %dl |
218 | leal -1(%rdx, %rdx), %eax |
219 | # else |
220 | movzbl -16(%rsi, %rax), %ecx |
221 | movzbl -16(%rdi, %rax), %eax |
222 | subl %ecx, %eax |
223 | # endif |
224 | ret |
225 | |
226 | .p2align 4,, 8 |
227 | L(more_32_bytes): |
228 | movdqu (%rdi), %xmm0 |
229 | movdqu (%rsi), %xmm1 |
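	/* CMPEQ sets each matching element to all-ones and pmovmskb
	   collects the byte sign bits, so a fully matching chunk yields
	   0xffff and `incw %ax' wraps it to zero; any mismatch leaves a
	   nonzero value whose lowest set bit marks the first differing
	   byte.  */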
230 | CMPEQ %xmm0, %xmm1 |
231 | pmovmskb %xmm1, %eax |
232 | incw %ax |
233 | jnz L(vec_return_begin) |
234 | |
235 | movdqu 16(%rdi), %xmm0 |
236 | movdqu 16(%rsi), %xmm1 |
237 | CMPEQ %xmm0, %xmm1 |
238 | pmovmskb %xmm1, %eax |
239 | incw %ax |
240 | jnz L(vec_return_begin_16) |
241 | |
242 | cmpl $64, %edx |
243 | jbe L(32_to_64_bytes) |
244 | movdqu 32(%rdi), %xmm0 |
245 | movdqu 32(%rsi), %xmm1 |
246 | CMPEQ %xmm0, %xmm1 |
247 | pmovmskb %xmm1, %eax |
248 | incw %ax |
249 | jnz L(vec_return_begin_32) |
250 | |
251 | .p2align 4,, 6 |
252 | L(32_to_64_bytes): |
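	/* Compare the last 32 bytes; everything before them has already
	   been covered by the caller.  */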
253 | movdqu -32(%rdi, %rdx), %xmm0 |
254 | movdqu -32(%rsi, %rdx), %xmm1 |
255 | CMPEQ %xmm0, %xmm1 |
256 | pmovmskb %xmm1, %eax |
257 | incw %ax |
258 | jnz L(vec_return_end_16) |
259 | |
260 | movdqu -16(%rdi, %rdx), %xmm0 |
261 | movdqu -16(%rsi, %rdx), %xmm1 |
262 | CMPEQ %xmm0, %xmm1 |
263 | pmovmskb %xmm1, %eax |
264 | incw %ax |
265 | jnz L(vec_return_end) |
266 | ret |
267 | |
268 | .p2align 4 |
269 | L(16_to_32_bytes): |
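	/* 16 to 32 bytes: the first and the (possibly overlapping) last
	   16 bytes cover the whole buffer.  */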
270 | movdqu (%rdi), %xmm0 |
271 | movdqu (%rsi), %xmm1 |
272 | CMPEQ %xmm0, %xmm1 |
273 | pmovmskb %xmm1, %eax |
274 | incw %ax |
275 | jnz L(vec_return_begin) |
276 | |
277 | movdqu -16(%rdi, %rdx), %xmm0 |
278 | movdqu -16(%rsi, %rdx), %xmm1 |
279 | CMPEQ %xmm0, %xmm1 |
280 | pmovmskb %xmm1, %eax |
281 | incw %ax |
282 | jnz L(vec_return_end) |
283 | ret |
284 | |
285 | |
286 | .p2align 4 |
287 | L(79bytesormore): |
288 | movdqu (%rdi), %xmm0 |
289 | movdqu (%rsi), %xmm1 |
290 | CMPEQ %xmm0, %xmm1 |
291 | pmovmskb %xmm1, %eax |
292 | incw %ax |
293 | jnz L(vec_return_begin) |
294 | |
295 | |
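	/* The first 16 bytes match.  Round %rsi up to the next 16-byte
	   boundary; %rcx holds the negative number of bytes skipped, so
	   %rdi is advanced by the same amount and %rdx is reduced
	   accordingly.  If %rdi ends up 16-byte aligned too, use the
	   aligned-load path.  */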
296 | mov %rsi, %rcx |
297 | and $-16, %rsi |
298 | add $16, %rsi |
299 | sub %rsi, %rcx |
300 | |
301 | sub %rcx, %rdi |
302 | add %rcx, %rdx |
303 | test $0xf, %rdi |
304 | jz L(2aligned) |
305 | |
306 | cmp $128, %rdx |
307 | ja L(128bytesormore) |
308 | |
309 | .p2align 4,, 6 |
310 | L(less128bytes): |
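	/* 64 to 128 bytes remain.  */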
311 | movdqu (%rdi), %xmm1 |
312 | CMPEQ (%rsi), %xmm1 |
313 | pmovmskb %xmm1, %eax |
314 | incw %ax |
315 | jnz L(vec_return_begin) |
316 | |
317 | movdqu 16(%rdi), %xmm1 |
318 | CMPEQ 16(%rsi), %xmm1 |
319 | pmovmskb %xmm1, %eax |
320 | incw %ax |
321 | jnz L(vec_return_begin_16) |
322 | |
323 | movdqu 32(%rdi), %xmm1 |
324 | CMPEQ 32(%rsi), %xmm1 |
325 | pmovmskb %xmm1, %eax |
326 | incw %ax |
327 | jnz L(vec_return_begin_32) |
328 | |
329 | movdqu 48(%rdi), %xmm1 |
330 | CMPEQ 48(%rsi), %xmm1 |
331 | pmovmskb %xmm1, %eax |
332 | incw %ax |
333 | jnz L(vec_return_begin_48) |
334 | |
335 | cmp $96, %rdx |
336 | jb L(32_to_64_bytes) |
337 | |
338 | addq $64, %rdi |
339 | addq $64, %rsi |
340 | subq $64, %rdx |
341 | |
342 | .p2align 4,, 6 |
343 | L(last_64_bytes): |
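	/* 32 to 64 bytes remain: the first 32 and the (possibly
	   overlapping) last 32 bytes cover them.  */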
344 | movdqu (%rdi), %xmm1 |
345 | CMPEQ (%rsi), %xmm1 |
346 | pmovmskb %xmm1, %eax |
347 | incw %ax |
348 | jnz L(vec_return_begin) |
349 | |
350 | movdqu 16(%rdi), %xmm1 |
351 | CMPEQ 16(%rsi), %xmm1 |
352 | pmovmskb %xmm1, %eax |
353 | incw %ax |
354 | jnz L(vec_return_begin_16) |
355 | |
356 | movdqu -32(%rdi, %rdx), %xmm0 |
357 | movdqu -32(%rsi, %rdx), %xmm1 |
358 | CMPEQ %xmm0, %xmm1 |
359 | pmovmskb %xmm1, %eax |
360 | incw %ax |
361 | jnz L(vec_return_end_16) |
362 | |
363 | movdqu -16(%rdi, %rdx), %xmm0 |
364 | movdqu -16(%rsi, %rdx), %xmm1 |
365 | CMPEQ %xmm0, %xmm1 |
366 | pmovmskb %xmm1, %eax |
367 | incw %ax |
368 | jnz L(vec_return_end) |
369 | ret |
370 | |
371 | .p2align 4 |
372 | L(128bytesormore): |
373 | cmp $256, %rdx |
374 | ja L(unaligned_loop) |
375 | L(less256bytes): |
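	/* 128 to 256 bytes remain: compare 128 bytes, then dispatch on
	   the tail below.  */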
376 | movdqu (%rdi), %xmm1 |
377 | CMPEQ (%rsi), %xmm1 |
378 | pmovmskb %xmm1, %eax |
379 | incw %ax |
380 | jnz L(vec_return_begin) |
381 | |
382 | movdqu 16(%rdi), %xmm1 |
383 | CMPEQ 16(%rsi), %xmm1 |
384 | pmovmskb %xmm1, %eax |
385 | incw %ax |
386 | jnz L(vec_return_begin_16) |
387 | |
388 | movdqu 32(%rdi), %xmm1 |
389 | CMPEQ 32(%rsi), %xmm1 |
390 | pmovmskb %xmm1, %eax |
391 | incw %ax |
392 | jnz L(vec_return_begin_32) |
393 | |
394 | movdqu 48(%rdi), %xmm1 |
395 | CMPEQ 48(%rsi), %xmm1 |
396 | pmovmskb %xmm1, %eax |
397 | incw %ax |
398 | jnz L(vec_return_begin_48) |
399 | |
400 | addq $64, %rdi |
401 | addq $64, %rsi |
402 | |
403 | movdqu (%rdi), %xmm1 |
404 | CMPEQ (%rsi), %xmm1 |
405 | pmovmskb %xmm1, %eax |
406 | incw %ax |
407 | jnz L(vec_return_begin) |
408 | |
409 | movdqu 16(%rdi), %xmm1 |
410 | CMPEQ 16(%rsi), %xmm1 |
411 | pmovmskb %xmm1, %eax |
412 | incw %ax |
413 | jnz L(vec_return_begin_16) |
414 | |
415 | movdqu 32(%rdi), %xmm1 |
416 | CMPEQ 32(%rsi), %xmm1 |
417 | pmovmskb %xmm1, %eax |
418 | incw %ax |
419 | jnz L(vec_return_begin_32) |
420 | |
421 | movdqu 48(%rdi), %xmm1 |
422 | CMPEQ 48(%rsi), %xmm1 |
423 | pmovmskb %xmm1, %eax |
424 | incw %ax |
425 | jnz L(vec_return_begin_48) |
426 | |
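	/* 128 bytes have been compared; advance past them and dispatch on
	   the remaining length (addq $-128 keeps the immediate in a
	   single byte).  */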
427 | addq $-128, %rdx |
428 | subq $-64, %rsi |
429 | subq $-64, %rdi |
430 | |
431 | cmp $64, %rdx |
432 | ja L(less128bytes) |
433 | |
434 | cmp $32, %rdx |
435 | ja L(last_64_bytes) |
436 | |
437 | movdqu -32(%rdi, %rdx), %xmm0 |
438 | movdqu -32(%rsi, %rdx), %xmm1 |
439 | CMPEQ %xmm0, %xmm1 |
440 | pmovmskb %xmm1, %eax |
441 | incw %ax |
442 | jnz L(vec_return_end_16) |
443 | |
444 | movdqu -16(%rdi, %rdx), %xmm0 |
445 | movdqu -16(%rsi, %rdx), %xmm1 |
446 | CMPEQ %xmm0, %xmm1 |
447 | pmovmskb %xmm1, %eax |
448 | incw %ax |
449 | jnz L(vec_return_end) |
450 | ret |
451 | |
452 | .p2align 4 |
453 | L(unaligned_loop): |
454 | # ifdef DATA_CACHE_SIZE_HALF |
455 | mov $DATA_CACHE_SIZE_HALF, %R8_LP |
456 | # else |
457 | mov __x86_data_cache_size_half(%rip), %R8_LP |
458 | # endif |
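	/* %r8 = 3 * (data cache size / 2), i.e. 1.5 times the data cache
	   size.  Larger inputs use the prefetching loop instead.  */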
459 | movq %r8, %r9 |
460 | addq %r8, %r8 |
461 | addq %r9, %r8 |
462 | cmpq %r8, %rdx |
463 | ja L(L2_L3_cache_unaligned) |
464 | sub $64, %rdx |
465 | .p2align 4 |
466 | L(64bytesormore_loop): |
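	/* Compare 64 bytes per iteration; the four compare results are
	   ANDed together so a single branch detects any mismatch.  */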
467 | movdqu (%rdi), %xmm0 |
468 | movdqu 16(%rdi), %xmm1 |
469 | movdqu 32(%rdi), %xmm2 |
470 | movdqu 48(%rdi), %xmm3 |
471 | |
472 | CMPEQ (%rsi), %xmm0 |
473 | CMPEQ 16(%rsi), %xmm1 |
474 | CMPEQ 32(%rsi), %xmm2 |
475 | CMPEQ 48(%rsi), %xmm3 |
476 | |
477 | pand %xmm0, %xmm1 |
478 | pand %xmm2, %xmm3 |
479 | pand %xmm1, %xmm3 |
480 | |
481 | pmovmskb %xmm3, %eax |
482 | incw %ax |
483 | jnz L(64bytesormore_loop_end) |
484 | |
485 | add $64, %rsi |
486 | add $64, %rdi |
487 | sub $64, %rdx |
488 | ja L(64bytesormore_loop) |
489 | |
490 | .p2align 4,, 6 |
491 | L(loop_tail): |
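	/* At most 64 bytes remain (%rdx holds the remaining length minus
	   64).  Step both pointers so the final, possibly overlapping,
	   64-byte block ends exactly at the end of the buffers.  */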
492 | addq %rdx, %rdi |
493 | movdqu (%rdi), %xmm0 |
494 | movdqu 16(%rdi), %xmm1 |
495 | movdqu 32(%rdi), %xmm2 |
496 | movdqu 48(%rdi), %xmm3 |
497 | |
498 | addq %rdx, %rsi |
499 | movdqu (%rsi), %xmm4 |
500 | movdqu 16(%rsi), %xmm5 |
501 | movdqu 32(%rsi), %xmm6 |
502 | movdqu 48(%rsi), %xmm7 |
503 | |
504 | CMPEQ %xmm4, %xmm0 |
505 | CMPEQ %xmm5, %xmm1 |
506 | CMPEQ %xmm6, %xmm2 |
507 | CMPEQ %xmm7, %xmm3 |
508 | |
509 | pand %xmm0, %xmm1 |
510 | pand %xmm2, %xmm3 |
511 | pand %xmm1, %xmm3 |
512 | |
513 | pmovmskb %xmm3, %eax |
514 | incw %ax |
515 | jnz L(64bytesormore_loop_end) |
516 | ret |
517 | |
518 | L(L2_L3_cache_unaligned): |
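	/* Inputs above ~1.5x the data cache size: same 64-byte loop, but
	   prefetch 0x1c0 (448) bytes ahead with prefetchnta to limit
	   cache pollution.  */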
519 | subq $64, %rdx |
520 | .p2align 4 |
521 | L(L2_L3_unaligned_128bytes_loop): |
522 | prefetchnta 0x1c0(%rdi) |
523 | prefetchnta 0x1c0(%rsi) |
524 | |
525 | movdqu (%rdi), %xmm0 |
526 | movdqu 16(%rdi), %xmm1 |
527 | movdqu 32(%rdi), %xmm2 |
528 | movdqu 48(%rdi), %xmm3 |
529 | |
530 | CMPEQ (%rsi), %xmm0 |
531 | CMPEQ 16(%rsi), %xmm1 |
532 | CMPEQ 32(%rsi), %xmm2 |
533 | CMPEQ 48(%rsi), %xmm3 |
534 | |
535 | pand %xmm0, %xmm1 |
536 | pand %xmm2, %xmm3 |
537 | pand %xmm1, %xmm3 |
538 | |
539 | pmovmskb %xmm3, %eax |
540 | incw %ax |
541 | jnz L(64bytesormore_loop_end) |
542 | |
543 | add $64, %rsi |
544 | add $64, %rdi |
545 | sub $64, %rdx |
546 | ja L(L2_L3_unaligned_128bytes_loop) |
547 | jmp L(loop_tail) |
548 | |
549 | |
	/* Both buffers are now 16-byte aligned.  This path uses aligned
	   loads (movdqa) for machines that are penalized by unaligned
	   accesses.  */
552 | .p2align 4 |
553 | L(2aligned): |
554 | cmp $128, %rdx |
555 | ja L(128bytesormorein2aligned) |
556 | L(less128bytesin2aligned): |
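	/* Mirrors L(less128bytes) but with aligned loads.  */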
557 | movdqa (%rdi), %xmm1 |
558 | CMPEQ (%rsi), %xmm1 |
559 | pmovmskb %xmm1, %eax |
560 | incw %ax |
561 | jnz L(vec_return_begin) |
562 | |
563 | movdqa 16(%rdi), %xmm1 |
564 | CMPEQ 16(%rsi), %xmm1 |
565 | pmovmskb %xmm1, %eax |
566 | incw %ax |
567 | jnz L(vec_return_begin_16) |
568 | |
569 | movdqa 32(%rdi), %xmm1 |
570 | CMPEQ 32(%rsi), %xmm1 |
571 | pmovmskb %xmm1, %eax |
572 | incw %ax |
573 | jnz L(vec_return_begin_32) |
574 | |
575 | movdqa 48(%rdi), %xmm1 |
576 | CMPEQ 48(%rsi), %xmm1 |
577 | pmovmskb %xmm1, %eax |
578 | incw %ax |
579 | jnz L(vec_return_begin_48) |
580 | |
581 | cmp $96, %rdx |
582 | jb L(32_to_64_bytes) |
583 | |
584 | addq $64, %rdi |
585 | addq $64, %rsi |
586 | subq $64, %rdx |
587 | |
588 | .p2align 4,, 6 |
589 | L(aligned_last_64_bytes): |
590 | movdqa (%rdi), %xmm1 |
591 | CMPEQ (%rsi), %xmm1 |
592 | pmovmskb %xmm1, %eax |
593 | incw %ax |
594 | jnz L(vec_return_begin) |
595 | |
596 | movdqa 16(%rdi), %xmm1 |
597 | CMPEQ 16(%rsi), %xmm1 |
598 | pmovmskb %xmm1, %eax |
599 | incw %ax |
600 | jnz L(vec_return_begin_16) |
601 | |
602 | movdqu -32(%rdi, %rdx), %xmm0 |
603 | movdqu -32(%rsi, %rdx), %xmm1 |
604 | CMPEQ %xmm0, %xmm1 |
605 | pmovmskb %xmm1, %eax |
606 | incw %ax |
607 | jnz L(vec_return_end_16) |
608 | |
609 | movdqu -16(%rdi, %rdx), %xmm0 |
610 | movdqu -16(%rsi, %rdx), %xmm1 |
611 | CMPEQ %xmm0, %xmm1 |
612 | pmovmskb %xmm1, %eax |
613 | incw %ax |
614 | jnz L(vec_return_end) |
615 | ret |
616 | |
617 | .p2align 4 |
618 | L(128bytesormorein2aligned): |
619 | cmp $256, %rdx |
620 | ja L(aligned_loop) |
L(less256bytesin2aligned):
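	/* Mirrors L(less256bytes) but with aligned loads.  */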
622 | movdqa (%rdi), %xmm1 |
623 | CMPEQ (%rsi), %xmm1 |
624 | pmovmskb %xmm1, %eax |
625 | incw %ax |
626 | jnz L(vec_return_begin) |
627 | |
628 | movdqa 16(%rdi), %xmm1 |
629 | CMPEQ 16(%rsi), %xmm1 |
630 | pmovmskb %xmm1, %eax |
631 | incw %ax |
632 | jnz L(vec_return_begin_16) |
633 | |
634 | movdqa 32(%rdi), %xmm1 |
635 | CMPEQ 32(%rsi), %xmm1 |
636 | pmovmskb %xmm1, %eax |
637 | incw %ax |
638 | jnz L(vec_return_begin_32) |
639 | |
640 | movdqa 48(%rdi), %xmm1 |
641 | CMPEQ 48(%rsi), %xmm1 |
642 | pmovmskb %xmm1, %eax |
643 | incw %ax |
644 | jnz L(vec_return_begin_48) |
645 | |
646 | addq $64, %rdi |
647 | addq $64, %rsi |
648 | |
649 | movdqa (%rdi), %xmm1 |
650 | CMPEQ (%rsi), %xmm1 |
651 | pmovmskb %xmm1, %eax |
652 | incw %ax |
653 | jnz L(vec_return_begin) |
654 | |
655 | movdqa 16(%rdi), %xmm1 |
656 | CMPEQ 16(%rsi), %xmm1 |
657 | pmovmskb %xmm1, %eax |
658 | incw %ax |
659 | jnz L(vec_return_begin_16) |
660 | |
661 | movdqa 32(%rdi), %xmm1 |
662 | CMPEQ 32(%rsi), %xmm1 |
663 | pmovmskb %xmm1, %eax |
664 | incw %ax |
665 | jnz L(vec_return_begin_32) |
666 | |
667 | movdqa 48(%rdi), %xmm1 |
668 | CMPEQ 48(%rsi), %xmm1 |
669 | pmovmskb %xmm1, %eax |
670 | incw %ax |
671 | jnz L(vec_return_begin_48) |
672 | |
673 | addq $-128, %rdx |
674 | subq $-64, %rsi |
675 | subq $-64, %rdi |
676 | |
677 | cmp $64, %rdx |
678 | ja L(less128bytesin2aligned) |
679 | |
680 | cmp $32, %rdx |
681 | ja L(aligned_last_64_bytes) |
682 | |
683 | movdqu -32(%rdi, %rdx), %xmm0 |
684 | movdqu -32(%rsi, %rdx), %xmm1 |
685 | CMPEQ %xmm0, %xmm1 |
686 | pmovmskb %xmm1, %eax |
687 | incw %ax |
688 | jnz L(vec_return_end_16) |
689 | |
690 | movdqu -16(%rdi, %rdx), %xmm0 |
691 | movdqu -16(%rsi, %rdx), %xmm1 |
692 | CMPEQ %xmm0, %xmm1 |
693 | pmovmskb %xmm1, %eax |
694 | incw %ax |
695 | jnz L(vec_return_end) |
696 | ret |
697 | |
698 | .p2align 4 |
699 | L(aligned_loop): |
700 | # ifdef DATA_CACHE_SIZE_HALF |
701 | mov $DATA_CACHE_SIZE_HALF, %R8_LP |
702 | # else |
703 | mov __x86_data_cache_size_half(%rip), %R8_LP |
704 | # endif |
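	/* Same 1.5x data-cache-size threshold as the unaligned path.  */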
705 | movq %r8, %r9 |
706 | addq %r8, %r8 |
707 | addq %r9, %r8 |
708 | cmpq %r8, %rdx |
709 | ja L(L2_L3_cache_aligned) |
710 | |
711 | sub $64, %rdx |
712 | .p2align 4 |
713 | L(64bytesormore_loopin2aligned): |
714 | movdqa (%rdi), %xmm0 |
715 | movdqa 16(%rdi), %xmm1 |
716 | movdqa 32(%rdi), %xmm2 |
717 | movdqa 48(%rdi), %xmm3 |
718 | |
719 | CMPEQ (%rsi), %xmm0 |
720 | CMPEQ 16(%rsi), %xmm1 |
721 | CMPEQ 32(%rsi), %xmm2 |
722 | CMPEQ 48(%rsi), %xmm3 |
723 | |
724 | pand %xmm0, %xmm1 |
725 | pand %xmm2, %xmm3 |
726 | pand %xmm1, %xmm3 |
727 | |
728 | pmovmskb %xmm3, %eax |
729 | incw %ax |
730 | jnz L(64bytesormore_loop_end) |
731 | add $64, %rsi |
732 | add $64, %rdi |
733 | sub $64, %rdx |
734 | ja L(64bytesormore_loopin2aligned) |
735 | jmp L(loop_tail) |
736 | |
737 | L(L2_L3_cache_aligned): |
738 | subq $64, %rdx |
739 | .p2align 4 |
740 | L(L2_L3_aligned_128bytes_loop): |
741 | prefetchnta 0x1c0(%rdi) |
742 | prefetchnta 0x1c0(%rsi) |
743 | movdqa (%rdi), %xmm0 |
744 | movdqa 16(%rdi), %xmm1 |
745 | movdqa 32(%rdi), %xmm2 |
746 | movdqa 48(%rdi), %xmm3 |
747 | |
748 | CMPEQ (%rsi), %xmm0 |
749 | CMPEQ 16(%rsi), %xmm1 |
750 | CMPEQ 32(%rsi), %xmm2 |
751 | CMPEQ 48(%rsi), %xmm3 |
752 | |
753 | pand %xmm0, %xmm1 |
754 | pand %xmm2, %xmm3 |
755 | pand %xmm1, %xmm3 |
756 | |
757 | pmovmskb %xmm3, %eax |
758 | incw %ax |
759 | jnz L(64bytesormore_loop_end) |
760 | |
761 | addq $64, %rsi |
762 | addq $64, %rdi |
763 | subq $64, %rdx |
764 | ja L(L2_L3_aligned_128bytes_loop) |
765 | jmp L(loop_tail) |
766 | |
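	/* A difference was found in the last 64-byte block; %eax holds
	   the combined compare mask plus one.  Test each 16-byte chunk in
	   turn: a fully matching chunk produces zero after the incw or
	   notw-and-shift step and falls through, otherwise %rcx ends up
	   with its lowest set bit at the offset of the first differing
	   byte (biased by the shift or the final addq $48), which
	   L(loop_end_ret) recovers with bsf.  */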
767 | .p2align 4 |
768 | L(64bytesormore_loop_end): |
769 | pmovmskb %xmm0, %ecx |
770 | incw %cx |
771 | jnz L(loop_end_ret) |
772 | |
773 | pmovmskb %xmm1, %ecx |
774 | notw %cx |
775 | sall $16, %ecx |
776 | jnz L(loop_end_ret) |
777 | |
778 | pmovmskb %xmm2, %ecx |
779 | notw %cx |
780 | shlq $32, %rcx |
781 | jnz L(loop_end_ret) |
782 | |
783 | addq $48, %rdi |
784 | addq $48, %rsi |
785 | movq %rax, %rcx |
786 | |
787 | .p2align 4,, 6 |
788 | L(loop_end_ret): |
789 | bsfq %rcx, %rcx |
790 | # ifdef USE_AS_WMEMCMP |
791 | movl (%rdi, %rcx), %eax |
792 | xorl %edx, %edx |
793 | cmpl (%rsi, %rcx), %eax |
794 | setg %dl |
795 | leal -1(%rdx, %rdx), %eax |
796 | # else |
797 | movzbl (%rdi, %rcx), %eax |
798 | movzbl (%rsi, %rcx), %ecx |
799 | subl %ecx, %eax |
800 | # endif |
801 | ret |
802 | END (MEMCMP) |
803 | #endif |
804 | |