1 | /* memcmp with SSSE3, wmemcmp with SSSE3 |
2 | Copyright (C) 2011-2021 Free Software Foundation, Inc. |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #if IS_IN (libc) |
21 | |
22 | # include <sysdep.h> |
23 | |
24 | # ifndef MEMCMP |
25 | # define MEMCMP __memcmp_ssse3 |
26 | # endif |
27 | |
28 | /* Warning! |
29 | wmemcmp has to use SIGNED comparison for elements. |
   memcmp has to use UNSIGNED comparison for elements.
31 | */ |
32 | |
33 | atom_text_section |
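/* Entry: %rdi = s1, %rsi = s2, %rdx = length (bytes for memcmp, wide
   characters for wmemcmp, converted to bytes below).  */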
34 | ENTRY (MEMCMP) |
35 | # ifdef USE_AS_WMEMCMP |
36 | shl $2, %RDX_LP |
37 | test %RDX_LP, %RDX_LP |
38 | jz L(equal) |
39 | # elif defined __ILP32__ |
40 | /* Clear the upper 32 bits. */ |
41 | mov %edx, %edx |
42 | # endif |
43 | mov %rdx, %rcx |
44 | mov %rdi, %rdx |
	cmp	$48, %rcx
	jae	L(48bytesormore)	/* LEN >= 48.  */
47 | |
48 | add %rcx, %rsi |
49 | add %rcx, %rdi |
50 | jmp L(less48bytes) |
51 | |
52 | .p2align 4 |
/* RCX >= 48.  Compare the first 16 bytes, then align %rdi down to a
   16-byte boundary and dispatch on the resulting misalignment of
   %rsi.  */
54 | L(48bytesormore): |
55 | movdqu (%rdi), %xmm3 |
56 | movdqu (%rsi), %xmm0 |
57 | pcmpeqb %xmm0, %xmm3 |
58 | pmovmskb %xmm3, %edx |
59 | lea 16(%rdi), %rdi |
60 | lea 16(%rsi), %rsi |
61 | sub $0xffff, %edx |
62 | jnz L(less16bytes) |
63 | mov %edi, %edx |
64 | and $0xf, %edx |
65 | xor %rdx, %rdi |
66 | sub %rdx, %rsi |
67 | add %rdx, %rcx |
68 | mov %esi, %edx |
69 | and $0xf, %edx |
70 | jz L(shr_0) |
71 | xor %rdx, %rsi |
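	/* %rsi has been aligned down to a 16-byte boundary and %edx
	   still holds its original misalignment (1..15; only the
	   multiples of 4 are handled for wmemcmp).  Dispatch to
	   L(shr_N), which rebuilds the unaligned source data with
	   palignr $N.  */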
72 | |
73 | # ifndef USE_AS_WMEMCMP |
74 | cmp $8, %edx |
75 | jae L(next_unaligned_table) |
76 | cmp $0, %edx |
77 | je L(shr_0) |
78 | cmp $1, %edx |
79 | je L(shr_1) |
80 | cmp $2, %edx |
81 | je L(shr_2) |
82 | cmp $3, %edx |
83 | je L(shr_3) |
84 | cmp $4, %edx |
85 | je L(shr_4) |
86 | cmp $5, %edx |
87 | je L(shr_5) |
88 | cmp $6, %edx |
89 | je L(shr_6) |
90 | jmp L(shr_7) |
91 | |
92 | .p2align 2 |
93 | L(next_unaligned_table): |
94 | cmp $8, %edx |
95 | je L(shr_8) |
96 | cmp $9, %edx |
97 | je L(shr_9) |
98 | cmp $10, %edx |
99 | je L(shr_10) |
100 | cmp $11, %edx |
101 | je L(shr_11) |
102 | cmp $12, %edx |
103 | je L(shr_12) |
104 | cmp $13, %edx |
105 | je L(shr_13) |
106 | cmp $14, %edx |
107 | je L(shr_14) |
108 | jmp L(shr_15) |
109 | # else |
110 | cmp $0, %edx |
111 | je L(shr_0) |
112 | cmp $4, %edx |
113 | je L(shr_4) |
114 | cmp $8, %edx |
115 | je L(shr_8) |
116 | jmp L(shr_12) |
117 | # endif |
118 | |
119 | .p2align 4 |
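/* Both %rdi and %rsi are now 16-byte aligned, so the source needs no
   realignment; %eax is cleared because L(exit) adds %rax to %rsi to
   undo the realignment shift, which is zero here.  */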
120 | L(shr_0): |
121 | cmp $80, %rcx |
122 | lea -48(%rcx), %rcx |
123 | jae L(shr_0_gobble) |
124 | xor %eax, %eax |
125 | movdqa (%rsi), %xmm1 |
126 | pcmpeqb (%rdi), %xmm1 |
127 | movdqa 16(%rsi), %xmm2 |
128 | pcmpeqb 16(%rdi), %xmm2 |
129 | pand %xmm1, %xmm2 |
130 | pmovmskb %xmm2, %edx |
131 | lea 32(%rdi), %rdi |
132 | lea 32(%rsi), %rsi |
133 | sub $0xffff, %edx |
134 | jnz L(exit) |
135 | add %rcx, %rsi |
136 | add %rcx, %rdi |
137 | jmp L(less48bytes) |
138 | |
139 | .p2align 4 |
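/* Unrolled main loop, entered when %rcx >= 80: each iteration checks
   32 bytes while the next 32 bytes are already loaded and compared.
   The borrow from the %rcx decrement is folded into the mismatch test
   with sbb, so a single branch exits the loop on either a mismatch or
   an exhausted count.  */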
140 | L(shr_0_gobble): |
141 | movdqa (%rsi), %xmm0 |
142 | xor %eax, %eax |
143 | pcmpeqb (%rdi), %xmm0 |
144 | sub $32, %rcx |
145 | movdqa 16(%rsi), %xmm2 |
146 | pcmpeqb 16(%rdi), %xmm2 |
147 | L(shr_0_gobble_loop): |
148 | pand %xmm0, %xmm2 |
149 | sub $32, %rcx |
150 | pmovmskb %xmm2, %edx |
151 | movdqa %xmm0, %xmm1 |
152 | movdqa 32(%rsi), %xmm0 |
153 | movdqa 48(%rsi), %xmm2 |
154 | sbb $0xffff, %edx |
155 | pcmpeqb 32(%rdi), %xmm0 |
156 | pcmpeqb 48(%rdi), %xmm2 |
157 | lea 32(%rdi), %rdi |
158 | lea 32(%rsi), %rsi |
159 | jz L(shr_0_gobble_loop) |
160 | |
161 | pand %xmm0, %xmm2 |
162 | cmp $0, %rcx |
163 | jge L(next) |
164 | inc %edx |
165 | add $32, %rcx |
166 | L(next): |
167 | test %edx, %edx |
168 | jnz L(exit) |
169 | |
170 | pmovmskb %xmm2, %edx |
171 | movdqa %xmm0, %xmm1 |
172 | lea 32(%rdi), %rdi |
173 | lea 32(%rsi), %rsi |
174 | sub $0xffff, %edx |
175 | jnz L(exit) |
176 | add %rcx, %rsi |
177 | add %rcx, %rdi |
178 | jmp L(less48bytes) |
179 | |
180 | # ifndef USE_AS_WMEMCMP |
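/* Each L(shr_N) block handles a source that is N bytes past a 16-byte
   boundary: pairs of aligned loads from %rsi are recombined with
   palignr $N to recreate the unaligned data, which is then compared
   against aligned loads from %rdi, 32 bytes at a time.  The matching
   L(shr_N_gobble) variant is the unrolled loop used for long lengths.  */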
181 | |
182 | .p2align 4 |
183 | L(shr_1): |
184 | cmp $80, %rcx |
185 | lea -48(%rcx), %rcx |
186 | mov %edx, %eax |
187 | jae L(shr_1_gobble) |
188 | |
189 | movdqa 16(%rsi), %xmm1 |
190 | movdqa %xmm1, %xmm2 |
191 | palignr $1, (%rsi), %xmm1 |
192 | pcmpeqb (%rdi), %xmm1 |
193 | |
194 | movdqa 32(%rsi), %xmm3 |
195 | palignr $1, %xmm2, %xmm3 |
196 | pcmpeqb 16(%rdi), %xmm3 |
197 | |
198 | pand %xmm1, %xmm3 |
199 | pmovmskb %xmm3, %edx |
200 | lea 32(%rdi), %rdi |
201 | lea 32(%rsi), %rsi |
202 | sub $0xffff, %edx |
203 | jnz L(exit) |
204 | add $1, %rsi |
205 | add %rcx, %rsi |
206 | add %rcx, %rdi |
207 | jmp L(less48bytes) |
208 | |
209 | .p2align 4 |
210 | L(shr_1_gobble): |
211 | sub $32, %rcx |
212 | movdqa 16(%rsi), %xmm0 |
213 | palignr $1, (%rsi), %xmm0 |
214 | pcmpeqb (%rdi), %xmm0 |
215 | |
216 | movdqa 32(%rsi), %xmm3 |
217 | palignr $1, 16(%rsi), %xmm3 |
218 | pcmpeqb 16(%rdi), %xmm3 |
219 | |
220 | L(shr_1_gobble_loop): |
221 | pand %xmm0, %xmm3 |
222 | sub $32, %rcx |
223 | pmovmskb %xmm3, %edx |
224 | movdqa %xmm0, %xmm1 |
225 | |
226 | movdqa 64(%rsi), %xmm3 |
227 | palignr $1, 48(%rsi), %xmm3 |
228 | sbb $0xffff, %edx |
229 | movdqa 48(%rsi), %xmm0 |
230 | palignr $1, 32(%rsi), %xmm0 |
231 | pcmpeqb 32(%rdi), %xmm0 |
232 | lea 32(%rsi), %rsi |
233 | pcmpeqb 48(%rdi), %xmm3 |
234 | |
235 | lea 32(%rdi), %rdi |
236 | jz L(shr_1_gobble_loop) |
237 | pand %xmm0, %xmm3 |
238 | |
239 | cmp $0, %rcx |
240 | jge L(shr_1_gobble_next) |
241 | inc %edx |
242 | add $32, %rcx |
243 | L(shr_1_gobble_next): |
244 | test %edx, %edx |
245 | jnz L(exit) |
246 | |
247 | pmovmskb %xmm3, %edx |
248 | movdqa %xmm0, %xmm1 |
249 | lea 32(%rdi), %rdi |
250 | lea 32(%rsi), %rsi |
251 | sub $0xffff, %edx |
252 | jnz L(exit) |
253 | |
254 | lea 1(%rsi), %rsi |
255 | add %rcx, %rsi |
256 | add %rcx, %rdi |
257 | jmp L(less48bytes) |
258 | |
259 | |
260 | .p2align 4 |
261 | L(shr_2): |
262 | cmp $80, %rcx |
263 | lea -48(%rcx), %rcx |
264 | mov %edx, %eax |
265 | jae L(shr_2_gobble) |
266 | |
267 | movdqa 16(%rsi), %xmm1 |
268 | movdqa %xmm1, %xmm2 |
269 | palignr $2, (%rsi), %xmm1 |
270 | pcmpeqb (%rdi), %xmm1 |
271 | |
272 | movdqa 32(%rsi), %xmm3 |
273 | palignr $2, %xmm2, %xmm3 |
274 | pcmpeqb 16(%rdi), %xmm3 |
275 | |
276 | pand %xmm1, %xmm3 |
277 | pmovmskb %xmm3, %edx |
278 | lea 32(%rdi), %rdi |
279 | lea 32(%rsi), %rsi |
280 | sub $0xffff, %edx |
281 | jnz L(exit) |
282 | add $2, %rsi |
283 | add %rcx, %rsi |
284 | add %rcx, %rdi |
285 | jmp L(less48bytes) |
286 | |
287 | .p2align 4 |
288 | L(shr_2_gobble): |
289 | sub $32, %rcx |
290 | movdqa 16(%rsi), %xmm0 |
291 | palignr $2, (%rsi), %xmm0 |
292 | pcmpeqb (%rdi), %xmm0 |
293 | |
294 | movdqa 32(%rsi), %xmm3 |
295 | palignr $2, 16(%rsi), %xmm3 |
296 | pcmpeqb 16(%rdi), %xmm3 |
297 | |
298 | L(shr_2_gobble_loop): |
299 | pand %xmm0, %xmm3 |
300 | sub $32, %rcx |
301 | pmovmskb %xmm3, %edx |
302 | movdqa %xmm0, %xmm1 |
303 | |
304 | movdqa 64(%rsi), %xmm3 |
305 | palignr $2, 48(%rsi), %xmm3 |
306 | sbb $0xffff, %edx |
307 | movdqa 48(%rsi), %xmm0 |
308 | palignr $2, 32(%rsi), %xmm0 |
309 | pcmpeqb 32(%rdi), %xmm0 |
310 | lea 32(%rsi), %rsi |
311 | pcmpeqb 48(%rdi), %xmm3 |
312 | |
313 | lea 32(%rdi), %rdi |
314 | jz L(shr_2_gobble_loop) |
315 | pand %xmm0, %xmm3 |
316 | |
317 | cmp $0, %rcx |
318 | jge L(shr_2_gobble_next) |
319 | inc %edx |
320 | add $32, %rcx |
321 | L(shr_2_gobble_next): |
322 | test %edx, %edx |
323 | jnz L(exit) |
324 | |
325 | pmovmskb %xmm3, %edx |
326 | movdqa %xmm0, %xmm1 |
327 | lea 32(%rdi), %rdi |
328 | lea 32(%rsi), %rsi |
329 | sub $0xffff, %edx |
330 | jnz L(exit) |
331 | |
332 | lea 2(%rsi), %rsi |
333 | add %rcx, %rsi |
334 | add %rcx, %rdi |
335 | jmp L(less48bytes) |
336 | |
337 | .p2align 4 |
338 | L(shr_3): |
339 | cmp $80, %rcx |
340 | lea -48(%rcx), %rcx |
341 | mov %edx, %eax |
342 | jae L(shr_3_gobble) |
343 | |
344 | movdqa 16(%rsi), %xmm1 |
345 | movdqa %xmm1, %xmm2 |
346 | palignr $3, (%rsi), %xmm1 |
347 | pcmpeqb (%rdi), %xmm1 |
348 | |
349 | movdqa 32(%rsi), %xmm3 |
350 | palignr $3, %xmm2, %xmm3 |
351 | pcmpeqb 16(%rdi), %xmm3 |
352 | |
353 | pand %xmm1, %xmm3 |
354 | pmovmskb %xmm3, %edx |
355 | lea 32(%rdi), %rdi |
356 | lea 32(%rsi), %rsi |
357 | sub $0xffff, %edx |
358 | jnz L(exit) |
359 | add $3, %rsi |
360 | add %rcx, %rsi |
361 | add %rcx, %rdi |
362 | jmp L(less48bytes) |
363 | |
364 | .p2align 4 |
365 | L(shr_3_gobble): |
366 | sub $32, %rcx |
367 | movdqa 16(%rsi), %xmm0 |
368 | palignr $3, (%rsi), %xmm0 |
369 | pcmpeqb (%rdi), %xmm0 |
370 | |
371 | movdqa 32(%rsi), %xmm3 |
372 | palignr $3, 16(%rsi), %xmm3 |
373 | pcmpeqb 16(%rdi), %xmm3 |
374 | |
375 | L(shr_3_gobble_loop): |
376 | pand %xmm0, %xmm3 |
377 | sub $32, %rcx |
378 | pmovmskb %xmm3, %edx |
379 | movdqa %xmm0, %xmm1 |
380 | |
381 | movdqa 64(%rsi), %xmm3 |
382 | palignr $3, 48(%rsi), %xmm3 |
383 | sbb $0xffff, %edx |
384 | movdqa 48(%rsi), %xmm0 |
385 | palignr $3, 32(%rsi), %xmm0 |
386 | pcmpeqb 32(%rdi), %xmm0 |
387 | lea 32(%rsi), %rsi |
388 | pcmpeqb 48(%rdi), %xmm3 |
389 | |
390 | lea 32(%rdi), %rdi |
391 | jz L(shr_3_gobble_loop) |
392 | pand %xmm0, %xmm3 |
393 | |
394 | cmp $0, %rcx |
395 | jge L(shr_3_gobble_next) |
396 | inc %edx |
397 | add $32, %rcx |
398 | L(shr_3_gobble_next): |
399 | test %edx, %edx |
400 | jnz L(exit) |
401 | |
402 | pmovmskb %xmm3, %edx |
403 | movdqa %xmm0, %xmm1 |
404 | lea 32(%rdi), %rdi |
405 | lea 32(%rsi), %rsi |
406 | sub $0xffff, %edx |
407 | jnz L(exit) |
408 | |
409 | lea 3(%rsi), %rsi |
410 | add %rcx, %rsi |
411 | add %rcx, %rdi |
412 | jmp L(less48bytes) |
413 | |
414 | # endif |
415 | |
416 | .p2align 4 |
417 | L(shr_4): |
418 | cmp $80, %rcx |
419 | lea -48(%rcx), %rcx |
420 | mov %edx, %eax |
421 | jae L(shr_4_gobble) |
422 | |
423 | movdqa 16(%rsi), %xmm1 |
424 | movdqa %xmm1, %xmm2 |
425 | palignr $4, (%rsi), %xmm1 |
426 | pcmpeqb (%rdi), %xmm1 |
427 | |
428 | movdqa 32(%rsi), %xmm3 |
429 | palignr $4, %xmm2, %xmm3 |
430 | pcmpeqb 16(%rdi), %xmm3 |
431 | |
432 | pand %xmm1, %xmm3 |
433 | pmovmskb %xmm3, %edx |
434 | lea 32(%rdi), %rdi |
435 | lea 32(%rsi), %rsi |
436 | sub $0xffff, %edx |
437 | jnz L(exit) |
438 | add $4, %rsi |
439 | add %rcx, %rsi |
440 | add %rcx, %rdi |
441 | jmp L(less48bytes) |
442 | |
443 | .p2align 4 |
444 | L(shr_4_gobble): |
445 | sub $32, %rcx |
446 | movdqa 16(%rsi), %xmm0 |
447 | palignr $4, (%rsi), %xmm0 |
448 | pcmpeqb (%rdi), %xmm0 |
449 | |
450 | movdqa 32(%rsi), %xmm3 |
451 | palignr $4, 16(%rsi), %xmm3 |
452 | pcmpeqb 16(%rdi), %xmm3 |
453 | |
454 | L(shr_4_gobble_loop): |
455 | pand %xmm0, %xmm3 |
456 | sub $32, %rcx |
457 | pmovmskb %xmm3, %edx |
458 | movdqa %xmm0, %xmm1 |
459 | |
460 | movdqa 64(%rsi), %xmm3 |
461 | palignr $4, 48(%rsi), %xmm3 |
462 | sbb $0xffff, %edx |
463 | movdqa 48(%rsi), %xmm0 |
464 | palignr $4, 32(%rsi), %xmm0 |
465 | pcmpeqb 32(%rdi), %xmm0 |
466 | lea 32(%rsi), %rsi |
467 | pcmpeqb 48(%rdi), %xmm3 |
468 | |
469 | lea 32(%rdi), %rdi |
470 | jz L(shr_4_gobble_loop) |
471 | pand %xmm0, %xmm3 |
472 | |
473 | cmp $0, %rcx |
474 | jge L(shr_4_gobble_next) |
475 | inc %edx |
476 | add $32, %rcx |
477 | L(shr_4_gobble_next): |
478 | test %edx, %edx |
479 | jnz L(exit) |
480 | |
481 | pmovmskb %xmm3, %edx |
482 | movdqa %xmm0, %xmm1 |
483 | lea 32(%rdi), %rdi |
484 | lea 32(%rsi), %rsi |
485 | sub $0xffff, %edx |
486 | jnz L(exit) |
487 | |
488 | lea 4(%rsi), %rsi |
489 | add %rcx, %rsi |
490 | add %rcx, %rdi |
491 | jmp L(less48bytes) |
492 | |
493 | # ifndef USE_AS_WMEMCMP |
494 | |
495 | .p2align 4 |
496 | L(shr_5): |
497 | cmp $80, %rcx |
498 | lea -48(%rcx), %rcx |
499 | mov %edx, %eax |
500 | jae L(shr_5_gobble) |
501 | |
502 | movdqa 16(%rsi), %xmm1 |
503 | movdqa %xmm1, %xmm2 |
504 | palignr $5, (%rsi), %xmm1 |
505 | pcmpeqb (%rdi), %xmm1 |
506 | |
507 | movdqa 32(%rsi), %xmm3 |
508 | palignr $5, %xmm2, %xmm3 |
509 | pcmpeqb 16(%rdi), %xmm3 |
510 | |
511 | pand %xmm1, %xmm3 |
512 | pmovmskb %xmm3, %edx |
513 | lea 32(%rdi), %rdi |
514 | lea 32(%rsi), %rsi |
515 | sub $0xffff, %edx |
516 | jnz L(exit) |
517 | add $5, %rsi |
518 | add %rcx, %rsi |
519 | add %rcx, %rdi |
520 | jmp L(less48bytes) |
521 | |
522 | .p2align 4 |
523 | L(shr_5_gobble): |
524 | sub $32, %rcx |
525 | movdqa 16(%rsi), %xmm0 |
526 | palignr $5, (%rsi), %xmm0 |
527 | pcmpeqb (%rdi), %xmm0 |
528 | |
529 | movdqa 32(%rsi), %xmm3 |
530 | palignr $5, 16(%rsi), %xmm3 |
531 | pcmpeqb 16(%rdi), %xmm3 |
532 | |
533 | L(shr_5_gobble_loop): |
534 | pand %xmm0, %xmm3 |
535 | sub $32, %rcx |
536 | pmovmskb %xmm3, %edx |
537 | movdqa %xmm0, %xmm1 |
538 | |
539 | movdqa 64(%rsi), %xmm3 |
540 | palignr $5, 48(%rsi), %xmm3 |
541 | sbb $0xffff, %edx |
542 | movdqa 48(%rsi), %xmm0 |
543 | palignr $5, 32(%rsi), %xmm0 |
544 | pcmpeqb 32(%rdi), %xmm0 |
545 | lea 32(%rsi), %rsi |
546 | pcmpeqb 48(%rdi), %xmm3 |
547 | |
548 | lea 32(%rdi), %rdi |
549 | jz L(shr_5_gobble_loop) |
550 | pand %xmm0, %xmm3 |
551 | |
552 | cmp $0, %rcx |
553 | jge L(shr_5_gobble_next) |
554 | inc %edx |
555 | add $32, %rcx |
556 | L(shr_5_gobble_next): |
557 | test %edx, %edx |
558 | jnz L(exit) |
559 | |
560 | pmovmskb %xmm3, %edx |
561 | movdqa %xmm0, %xmm1 |
562 | lea 32(%rdi), %rdi |
563 | lea 32(%rsi), %rsi |
564 | sub $0xffff, %edx |
565 | jnz L(exit) |
566 | |
567 | lea 5(%rsi), %rsi |
568 | add %rcx, %rsi |
569 | add %rcx, %rdi |
570 | jmp L(less48bytes) |
571 | |
572 | .p2align 4 |
573 | L(shr_6): |
574 | cmp $80, %rcx |
575 | lea -48(%rcx), %rcx |
576 | mov %edx, %eax |
577 | jae L(shr_6_gobble) |
578 | |
579 | movdqa 16(%rsi), %xmm1 |
580 | movdqa %xmm1, %xmm2 |
581 | palignr $6, (%rsi), %xmm1 |
582 | pcmpeqb (%rdi), %xmm1 |
583 | |
584 | movdqa 32(%rsi), %xmm3 |
585 | palignr $6, %xmm2, %xmm3 |
586 | pcmpeqb 16(%rdi), %xmm3 |
587 | |
588 | pand %xmm1, %xmm3 |
589 | pmovmskb %xmm3, %edx |
590 | lea 32(%rdi), %rdi |
591 | lea 32(%rsi), %rsi |
592 | sub $0xffff, %edx |
593 | jnz L(exit) |
594 | add $6, %rsi |
595 | add %rcx, %rsi |
596 | add %rcx, %rdi |
597 | jmp L(less48bytes) |
598 | |
599 | .p2align 4 |
600 | L(shr_6_gobble): |
601 | sub $32, %rcx |
602 | movdqa 16(%rsi), %xmm0 |
603 | palignr $6, (%rsi), %xmm0 |
604 | pcmpeqb (%rdi), %xmm0 |
605 | |
606 | movdqa 32(%rsi), %xmm3 |
607 | palignr $6, 16(%rsi), %xmm3 |
608 | pcmpeqb 16(%rdi), %xmm3 |
609 | |
610 | L(shr_6_gobble_loop): |
611 | pand %xmm0, %xmm3 |
612 | sub $32, %rcx |
613 | pmovmskb %xmm3, %edx |
614 | movdqa %xmm0, %xmm1 |
615 | |
616 | movdqa 64(%rsi), %xmm3 |
617 | palignr $6, 48(%rsi), %xmm3 |
618 | sbb $0xffff, %edx |
619 | movdqa 48(%rsi), %xmm0 |
620 | palignr $6, 32(%rsi), %xmm0 |
621 | pcmpeqb 32(%rdi), %xmm0 |
622 | lea 32(%rsi), %rsi |
623 | pcmpeqb 48(%rdi), %xmm3 |
624 | |
625 | lea 32(%rdi), %rdi |
626 | jz L(shr_6_gobble_loop) |
627 | pand %xmm0, %xmm3 |
628 | |
629 | cmp $0, %rcx |
630 | jge L(shr_6_gobble_next) |
631 | inc %edx |
632 | add $32, %rcx |
633 | L(shr_6_gobble_next): |
634 | test %edx, %edx |
635 | jnz L(exit) |
636 | |
637 | pmovmskb %xmm3, %edx |
638 | movdqa %xmm0, %xmm1 |
639 | lea 32(%rdi), %rdi |
640 | lea 32(%rsi), %rsi |
641 | sub $0xffff, %edx |
642 | jnz L(exit) |
643 | |
644 | lea 6(%rsi), %rsi |
645 | add %rcx, %rsi |
646 | add %rcx, %rdi |
647 | jmp L(less48bytes) |
648 | |
649 | .p2align 4 |
650 | L(shr_7): |
651 | cmp $80, %rcx |
652 | lea -48(%rcx), %rcx |
653 | mov %edx, %eax |
654 | jae L(shr_7_gobble) |
655 | |
656 | movdqa 16(%rsi), %xmm1 |
657 | movdqa %xmm1, %xmm2 |
658 | palignr $7, (%rsi), %xmm1 |
659 | pcmpeqb (%rdi), %xmm1 |
660 | |
661 | movdqa 32(%rsi), %xmm3 |
662 | palignr $7, %xmm2, %xmm3 |
663 | pcmpeqb 16(%rdi), %xmm3 |
664 | |
665 | pand %xmm1, %xmm3 |
666 | pmovmskb %xmm3, %edx |
667 | lea 32(%rdi), %rdi |
668 | lea 32(%rsi), %rsi |
669 | sub $0xffff, %edx |
670 | jnz L(exit) |
671 | add $7, %rsi |
672 | add %rcx, %rsi |
673 | add %rcx, %rdi |
674 | jmp L(less48bytes) |
675 | |
676 | .p2align 4 |
677 | L(shr_7_gobble): |
678 | sub $32, %rcx |
679 | movdqa 16(%rsi), %xmm0 |
680 | palignr $7, (%rsi), %xmm0 |
681 | pcmpeqb (%rdi), %xmm0 |
682 | |
683 | movdqa 32(%rsi), %xmm3 |
684 | palignr $7, 16(%rsi), %xmm3 |
685 | pcmpeqb 16(%rdi), %xmm3 |
686 | |
687 | L(shr_7_gobble_loop): |
688 | pand %xmm0, %xmm3 |
689 | sub $32, %rcx |
690 | pmovmskb %xmm3, %edx |
691 | movdqa %xmm0, %xmm1 |
692 | |
693 | movdqa 64(%rsi), %xmm3 |
694 | palignr $7, 48(%rsi), %xmm3 |
695 | sbb $0xffff, %edx |
696 | movdqa 48(%rsi), %xmm0 |
697 | palignr $7, 32(%rsi), %xmm0 |
698 | pcmpeqb 32(%rdi), %xmm0 |
699 | lea 32(%rsi), %rsi |
700 | pcmpeqb 48(%rdi), %xmm3 |
701 | |
702 | lea 32(%rdi), %rdi |
703 | jz L(shr_7_gobble_loop) |
704 | pand %xmm0, %xmm3 |
705 | |
706 | cmp $0, %rcx |
707 | jge L(shr_7_gobble_next) |
708 | inc %edx |
709 | add $32, %rcx |
710 | L(shr_7_gobble_next): |
711 | test %edx, %edx |
712 | jnz L(exit) |
713 | |
714 | pmovmskb %xmm3, %edx |
715 | movdqa %xmm0, %xmm1 |
716 | lea 32(%rdi), %rdi |
717 | lea 32(%rsi), %rsi |
718 | sub $0xffff, %edx |
719 | jnz L(exit) |
720 | |
721 | lea 7(%rsi), %rsi |
722 | add %rcx, %rsi |
723 | add %rcx, %rdi |
724 | jmp L(less48bytes) |
725 | |
726 | # endif |
727 | |
728 | .p2align 4 |
729 | L(shr_8): |
730 | cmp $80, %rcx |
731 | lea -48(%rcx), %rcx |
732 | mov %edx, %eax |
733 | jae L(shr_8_gobble) |
734 | |
735 | movdqa 16(%rsi), %xmm1 |
736 | movdqa %xmm1, %xmm2 |
737 | palignr $8, (%rsi), %xmm1 |
738 | pcmpeqb (%rdi), %xmm1 |
739 | |
740 | movdqa 32(%rsi), %xmm3 |
741 | palignr $8, %xmm2, %xmm3 |
742 | pcmpeqb 16(%rdi), %xmm3 |
743 | |
744 | pand %xmm1, %xmm3 |
745 | pmovmskb %xmm3, %edx |
746 | lea 32(%rdi), %rdi |
747 | lea 32(%rsi), %rsi |
748 | sub $0xffff, %edx |
749 | jnz L(exit) |
750 | add $8, %rsi |
751 | add %rcx, %rsi |
752 | add %rcx, %rdi |
753 | jmp L(less48bytes) |
754 | |
755 | .p2align 4 |
756 | L(shr_8_gobble): |
757 | sub $32, %rcx |
758 | movdqa 16(%rsi), %xmm0 |
759 | palignr $8, (%rsi), %xmm0 |
760 | pcmpeqb (%rdi), %xmm0 |
761 | |
762 | movdqa 32(%rsi), %xmm3 |
763 | palignr $8, 16(%rsi), %xmm3 |
764 | pcmpeqb 16(%rdi), %xmm3 |
765 | |
766 | L(shr_8_gobble_loop): |
767 | pand %xmm0, %xmm3 |
768 | sub $32, %rcx |
769 | pmovmskb %xmm3, %edx |
770 | movdqa %xmm0, %xmm1 |
771 | |
772 | movdqa 64(%rsi), %xmm3 |
773 | palignr $8, 48(%rsi), %xmm3 |
774 | sbb $0xffff, %edx |
775 | movdqa 48(%rsi), %xmm0 |
776 | palignr $8, 32(%rsi), %xmm0 |
777 | pcmpeqb 32(%rdi), %xmm0 |
778 | lea 32(%rsi), %rsi |
779 | pcmpeqb 48(%rdi), %xmm3 |
780 | |
781 | lea 32(%rdi), %rdi |
782 | jz L(shr_8_gobble_loop) |
783 | pand %xmm0, %xmm3 |
784 | |
785 | cmp $0, %rcx |
786 | jge L(shr_8_gobble_next) |
787 | inc %edx |
788 | add $32, %rcx |
789 | L(shr_8_gobble_next): |
790 | test %edx, %edx |
791 | jnz L(exit) |
792 | |
793 | pmovmskb %xmm3, %edx |
794 | movdqa %xmm0, %xmm1 |
795 | lea 32(%rdi), %rdi |
796 | lea 32(%rsi), %rsi |
797 | sub $0xffff, %edx |
798 | jnz L(exit) |
799 | |
800 | lea 8(%rsi), %rsi |
801 | add %rcx, %rsi |
802 | add %rcx, %rdi |
803 | jmp L(less48bytes) |
804 | |
805 | # ifndef USE_AS_WMEMCMP |
806 | |
807 | .p2align 4 |
808 | L(shr_9): |
809 | cmp $80, %rcx |
810 | lea -48(%rcx), %rcx |
811 | mov %edx, %eax |
812 | jae L(shr_9_gobble) |
813 | |
814 | movdqa 16(%rsi), %xmm1 |
815 | movdqa %xmm1, %xmm2 |
816 | palignr $9, (%rsi), %xmm1 |
817 | pcmpeqb (%rdi), %xmm1 |
818 | |
819 | movdqa 32(%rsi), %xmm3 |
820 | palignr $9, %xmm2, %xmm3 |
821 | pcmpeqb 16(%rdi), %xmm3 |
822 | |
823 | pand %xmm1, %xmm3 |
824 | pmovmskb %xmm3, %edx |
825 | lea 32(%rdi), %rdi |
826 | lea 32(%rsi), %rsi |
827 | sub $0xffff, %edx |
828 | jnz L(exit) |
829 | add $9, %rsi |
830 | add %rcx, %rsi |
831 | add %rcx, %rdi |
832 | jmp L(less48bytes) |
833 | |
834 | .p2align 4 |
835 | L(shr_9_gobble): |
836 | sub $32, %rcx |
837 | movdqa 16(%rsi), %xmm0 |
838 | palignr $9, (%rsi), %xmm0 |
839 | pcmpeqb (%rdi), %xmm0 |
840 | |
841 | movdqa 32(%rsi), %xmm3 |
842 | palignr $9, 16(%rsi), %xmm3 |
843 | pcmpeqb 16(%rdi), %xmm3 |
844 | |
845 | L(shr_9_gobble_loop): |
846 | pand %xmm0, %xmm3 |
847 | sub $32, %rcx |
848 | pmovmskb %xmm3, %edx |
849 | movdqa %xmm0, %xmm1 |
850 | |
851 | movdqa 64(%rsi), %xmm3 |
852 | palignr $9, 48(%rsi), %xmm3 |
853 | sbb $0xffff, %edx |
854 | movdqa 48(%rsi), %xmm0 |
855 | palignr $9, 32(%rsi), %xmm0 |
856 | pcmpeqb 32(%rdi), %xmm0 |
857 | lea 32(%rsi), %rsi |
858 | pcmpeqb 48(%rdi), %xmm3 |
859 | |
860 | lea 32(%rdi), %rdi |
861 | jz L(shr_9_gobble_loop) |
862 | pand %xmm0, %xmm3 |
863 | |
864 | cmp $0, %rcx |
865 | jge L(shr_9_gobble_next) |
866 | inc %edx |
867 | add $32, %rcx |
868 | L(shr_9_gobble_next): |
869 | test %edx, %edx |
870 | jnz L(exit) |
871 | |
872 | pmovmskb %xmm3, %edx |
873 | movdqa %xmm0, %xmm1 |
874 | lea 32(%rdi), %rdi |
875 | lea 32(%rsi), %rsi |
876 | sub $0xffff, %edx |
877 | jnz L(exit) |
878 | |
879 | lea 9(%rsi), %rsi |
880 | add %rcx, %rsi |
881 | add %rcx, %rdi |
882 | jmp L(less48bytes) |
883 | |
884 | .p2align 4 |
885 | L(shr_10): |
886 | cmp $80, %rcx |
887 | lea -48(%rcx), %rcx |
888 | mov %edx, %eax |
889 | jae L(shr_10_gobble) |
890 | |
891 | movdqa 16(%rsi), %xmm1 |
892 | movdqa %xmm1, %xmm2 |
893 | palignr $10, (%rsi), %xmm1 |
894 | pcmpeqb (%rdi), %xmm1 |
895 | |
896 | movdqa 32(%rsi), %xmm3 |
897 | palignr $10, %xmm2, %xmm3 |
898 | pcmpeqb 16(%rdi), %xmm3 |
899 | |
900 | pand %xmm1, %xmm3 |
901 | pmovmskb %xmm3, %edx |
902 | lea 32(%rdi), %rdi |
903 | lea 32(%rsi), %rsi |
904 | sub $0xffff, %edx |
905 | jnz L(exit) |
906 | add $10, %rsi |
907 | add %rcx, %rsi |
908 | add %rcx, %rdi |
909 | jmp L(less48bytes) |
910 | |
911 | .p2align 4 |
912 | L(shr_10_gobble): |
913 | sub $32, %rcx |
914 | movdqa 16(%rsi), %xmm0 |
915 | palignr $10, (%rsi), %xmm0 |
916 | pcmpeqb (%rdi), %xmm0 |
917 | |
918 | movdqa 32(%rsi), %xmm3 |
919 | palignr $10, 16(%rsi), %xmm3 |
920 | pcmpeqb 16(%rdi), %xmm3 |
921 | |
922 | L(shr_10_gobble_loop): |
923 | pand %xmm0, %xmm3 |
924 | sub $32, %rcx |
925 | pmovmskb %xmm3, %edx |
926 | movdqa %xmm0, %xmm1 |
927 | |
928 | movdqa 64(%rsi), %xmm3 |
929 | palignr $10, 48(%rsi), %xmm3 |
930 | sbb $0xffff, %edx |
931 | movdqa 48(%rsi), %xmm0 |
932 | palignr $10, 32(%rsi), %xmm0 |
933 | pcmpeqb 32(%rdi), %xmm0 |
934 | lea 32(%rsi), %rsi |
935 | pcmpeqb 48(%rdi), %xmm3 |
936 | |
937 | lea 32(%rdi), %rdi |
938 | jz L(shr_10_gobble_loop) |
939 | pand %xmm0, %xmm3 |
940 | |
941 | cmp $0, %rcx |
942 | jge L(shr_10_gobble_next) |
943 | inc %edx |
944 | add $32, %rcx |
945 | L(shr_10_gobble_next): |
946 | test %edx, %edx |
947 | jnz L(exit) |
948 | |
949 | pmovmskb %xmm3, %edx |
950 | movdqa %xmm0, %xmm1 |
951 | lea 32(%rdi), %rdi |
952 | lea 32(%rsi), %rsi |
953 | sub $0xffff, %edx |
954 | jnz L(exit) |
955 | |
956 | lea 10(%rsi), %rsi |
957 | add %rcx, %rsi |
958 | add %rcx, %rdi |
959 | jmp L(less48bytes) |
960 | |
961 | .p2align 4 |
962 | L(shr_11): |
963 | cmp $80, %rcx |
964 | lea -48(%rcx), %rcx |
965 | mov %edx, %eax |
966 | jae L(shr_11_gobble) |
967 | |
968 | movdqa 16(%rsi), %xmm1 |
969 | movdqa %xmm1, %xmm2 |
970 | palignr $11, (%rsi), %xmm1 |
971 | pcmpeqb (%rdi), %xmm1 |
972 | |
973 | movdqa 32(%rsi), %xmm3 |
974 | palignr $11, %xmm2, %xmm3 |
975 | pcmpeqb 16(%rdi), %xmm3 |
976 | |
977 | pand %xmm1, %xmm3 |
978 | pmovmskb %xmm3, %edx |
979 | lea 32(%rdi), %rdi |
980 | lea 32(%rsi), %rsi |
981 | sub $0xffff, %edx |
982 | jnz L(exit) |
983 | add $11, %rsi |
984 | add %rcx, %rsi |
985 | add %rcx, %rdi |
986 | jmp L(less48bytes) |
987 | |
988 | .p2align 4 |
989 | L(shr_11_gobble): |
990 | sub $32, %rcx |
991 | movdqa 16(%rsi), %xmm0 |
992 | palignr $11, (%rsi), %xmm0 |
993 | pcmpeqb (%rdi), %xmm0 |
994 | |
995 | movdqa 32(%rsi), %xmm3 |
996 | palignr $11, 16(%rsi), %xmm3 |
997 | pcmpeqb 16(%rdi), %xmm3 |
998 | |
999 | L(shr_11_gobble_loop): |
1000 | pand %xmm0, %xmm3 |
1001 | sub $32, %rcx |
1002 | pmovmskb %xmm3, %edx |
1003 | movdqa %xmm0, %xmm1 |
1004 | |
1005 | movdqa 64(%rsi), %xmm3 |
1006 | palignr $11, 48(%rsi), %xmm3 |
1007 | sbb $0xffff, %edx |
1008 | movdqa 48(%rsi), %xmm0 |
1009 | palignr $11, 32(%rsi), %xmm0 |
1010 | pcmpeqb 32(%rdi), %xmm0 |
1011 | lea 32(%rsi), %rsi |
1012 | pcmpeqb 48(%rdi), %xmm3 |
1013 | |
1014 | lea 32(%rdi), %rdi |
1015 | jz L(shr_11_gobble_loop) |
1016 | pand %xmm0, %xmm3 |
1017 | |
1018 | cmp $0, %rcx |
1019 | jge L(shr_11_gobble_next) |
1020 | inc %edx |
1021 | add $32, %rcx |
1022 | L(shr_11_gobble_next): |
1023 | test %edx, %edx |
1024 | jnz L(exit) |
1025 | |
1026 | pmovmskb %xmm3, %edx |
1027 | movdqa %xmm0, %xmm1 |
1028 | lea 32(%rdi), %rdi |
1029 | lea 32(%rsi), %rsi |
1030 | sub $0xffff, %edx |
1031 | jnz L(exit) |
1032 | |
1033 | lea 11(%rsi), %rsi |
1034 | add %rcx, %rsi |
1035 | add %rcx, %rdi |
1036 | jmp L(less48bytes) |
1037 | |
1038 | # endif |
1039 | |
1040 | .p2align 4 |
1041 | L(shr_12): |
1042 | cmp $80, %rcx |
1043 | lea -48(%rcx), %rcx |
1044 | mov %edx, %eax |
1045 | jae L(shr_12_gobble) |
1046 | |
1047 | movdqa 16(%rsi), %xmm1 |
1048 | movdqa %xmm1, %xmm2 |
1049 | palignr $12, (%rsi), %xmm1 |
1050 | pcmpeqb (%rdi), %xmm1 |
1051 | |
1052 | movdqa 32(%rsi), %xmm3 |
1053 | palignr $12, %xmm2, %xmm3 |
1054 | pcmpeqb 16(%rdi), %xmm3 |
1055 | |
1056 | pand %xmm1, %xmm3 |
1057 | pmovmskb %xmm3, %edx |
1058 | lea 32(%rdi), %rdi |
1059 | lea 32(%rsi), %rsi |
1060 | sub $0xffff, %edx |
1061 | jnz L(exit) |
1062 | add $12, %rsi |
1063 | add %rcx, %rsi |
1064 | add %rcx, %rdi |
1065 | jmp L(less48bytes) |
1066 | |
1067 | .p2align 4 |
1068 | L(shr_12_gobble): |
1069 | sub $32, %rcx |
1070 | movdqa 16(%rsi), %xmm0 |
1071 | palignr $12, (%rsi), %xmm0 |
1072 | pcmpeqb (%rdi), %xmm0 |
1073 | |
1074 | movdqa 32(%rsi), %xmm3 |
1075 | palignr $12, 16(%rsi), %xmm3 |
1076 | pcmpeqb 16(%rdi), %xmm3 |
1077 | |
1078 | L(shr_12_gobble_loop): |
1079 | pand %xmm0, %xmm3 |
1080 | sub $32, %rcx |
1081 | pmovmskb %xmm3, %edx |
1082 | movdqa %xmm0, %xmm1 |
1083 | |
1084 | movdqa 64(%rsi), %xmm3 |
1085 | palignr $12, 48(%rsi), %xmm3 |
1086 | sbb $0xffff, %edx |
1087 | movdqa 48(%rsi), %xmm0 |
1088 | palignr $12, 32(%rsi), %xmm0 |
1089 | pcmpeqb 32(%rdi), %xmm0 |
1090 | lea 32(%rsi), %rsi |
1091 | pcmpeqb 48(%rdi), %xmm3 |
1092 | |
1093 | lea 32(%rdi), %rdi |
1094 | jz L(shr_12_gobble_loop) |
1095 | pand %xmm0, %xmm3 |
1096 | |
1097 | cmp $0, %rcx |
1098 | jge L(shr_12_gobble_next) |
1099 | inc %edx |
1100 | add $32, %rcx |
1101 | L(shr_12_gobble_next): |
1102 | test %edx, %edx |
1103 | jnz L(exit) |
1104 | |
1105 | pmovmskb %xmm3, %edx |
1106 | movdqa %xmm0, %xmm1 |
1107 | lea 32(%rdi), %rdi |
1108 | lea 32(%rsi), %rsi |
1109 | sub $0xffff, %edx |
1110 | jnz L(exit) |
1111 | |
1112 | lea 12(%rsi), %rsi |
1113 | add %rcx, %rsi |
1114 | add %rcx, %rdi |
1115 | jmp L(less48bytes) |
1116 | |
1117 | # ifndef USE_AS_WMEMCMP |
1118 | |
1119 | .p2align 4 |
1120 | L(shr_13): |
1121 | cmp $80, %rcx |
1122 | lea -48(%rcx), %rcx |
1123 | mov %edx, %eax |
1124 | jae L(shr_13_gobble) |
1125 | |
1126 | movdqa 16(%rsi), %xmm1 |
1127 | movdqa %xmm1, %xmm2 |
1128 | palignr $13, (%rsi), %xmm1 |
1129 | pcmpeqb (%rdi), %xmm1 |
1130 | |
1131 | movdqa 32(%rsi), %xmm3 |
1132 | palignr $13, %xmm2, %xmm3 |
1133 | pcmpeqb 16(%rdi), %xmm3 |
1134 | |
1135 | pand %xmm1, %xmm3 |
1136 | pmovmskb %xmm3, %edx |
1137 | lea 32(%rdi), %rdi |
1138 | lea 32(%rsi), %rsi |
1139 | sub $0xffff, %edx |
1140 | jnz L(exit) |
1141 | add $13, %rsi |
1142 | add %rcx, %rsi |
1143 | add %rcx, %rdi |
1144 | jmp L(less48bytes) |
1145 | |
1146 | .p2align 4 |
1147 | L(shr_13_gobble): |
1148 | sub $32, %rcx |
1149 | movdqa 16(%rsi), %xmm0 |
1150 | palignr $13, (%rsi), %xmm0 |
1151 | pcmpeqb (%rdi), %xmm0 |
1152 | |
1153 | movdqa 32(%rsi), %xmm3 |
1154 | palignr $13, 16(%rsi), %xmm3 |
1155 | pcmpeqb 16(%rdi), %xmm3 |
1156 | |
1157 | L(shr_13_gobble_loop): |
1158 | pand %xmm0, %xmm3 |
1159 | sub $32, %rcx |
1160 | pmovmskb %xmm3, %edx |
1161 | movdqa %xmm0, %xmm1 |
1162 | |
1163 | movdqa 64(%rsi), %xmm3 |
1164 | palignr $13, 48(%rsi), %xmm3 |
1165 | sbb $0xffff, %edx |
1166 | movdqa 48(%rsi), %xmm0 |
1167 | palignr $13, 32(%rsi), %xmm0 |
1168 | pcmpeqb 32(%rdi), %xmm0 |
1169 | lea 32(%rsi), %rsi |
1170 | pcmpeqb 48(%rdi), %xmm3 |
1171 | |
1172 | lea 32(%rdi), %rdi |
1173 | jz L(shr_13_gobble_loop) |
1174 | pand %xmm0, %xmm3 |
1175 | |
1176 | cmp $0, %rcx |
1177 | jge L(shr_13_gobble_next) |
1178 | inc %edx |
1179 | add $32, %rcx |
1180 | L(shr_13_gobble_next): |
1181 | test %edx, %edx |
1182 | jnz L(exit) |
1183 | |
1184 | pmovmskb %xmm3, %edx |
1185 | movdqa %xmm0, %xmm1 |
1186 | lea 32(%rdi), %rdi |
1187 | lea 32(%rsi), %rsi |
1188 | sub $0xffff, %edx |
1189 | jnz L(exit) |
1190 | |
1191 | lea 13(%rsi), %rsi |
1192 | add %rcx, %rsi |
1193 | add %rcx, %rdi |
1194 | jmp L(less48bytes) |
1195 | |
1196 | .p2align 4 |
1197 | L(shr_14): |
1198 | cmp $80, %rcx |
1199 | lea -48(%rcx), %rcx |
1200 | mov %edx, %eax |
1201 | jae L(shr_14_gobble) |
1202 | |
1203 | movdqa 16(%rsi), %xmm1 |
1204 | movdqa %xmm1, %xmm2 |
1205 | palignr $14, (%rsi), %xmm1 |
1206 | pcmpeqb (%rdi), %xmm1 |
1207 | |
1208 | movdqa 32(%rsi), %xmm3 |
1209 | palignr $14, %xmm2, %xmm3 |
1210 | pcmpeqb 16(%rdi), %xmm3 |
1211 | |
1212 | pand %xmm1, %xmm3 |
1213 | pmovmskb %xmm3, %edx |
1214 | lea 32(%rdi), %rdi |
1215 | lea 32(%rsi), %rsi |
1216 | sub $0xffff, %edx |
1217 | jnz L(exit) |
1218 | add $14, %rsi |
1219 | add %rcx, %rsi |
1220 | add %rcx, %rdi |
1221 | jmp L(less48bytes) |
1222 | |
1223 | .p2align 4 |
1224 | L(shr_14_gobble): |
1225 | sub $32, %rcx |
1226 | movdqa 16(%rsi), %xmm0 |
1227 | palignr $14, (%rsi), %xmm0 |
1228 | pcmpeqb (%rdi), %xmm0 |
1229 | |
1230 | movdqa 32(%rsi), %xmm3 |
1231 | palignr $14, 16(%rsi), %xmm3 |
1232 | pcmpeqb 16(%rdi), %xmm3 |
1233 | |
1234 | L(shr_14_gobble_loop): |
1235 | pand %xmm0, %xmm3 |
1236 | sub $32, %rcx |
1237 | pmovmskb %xmm3, %edx |
1238 | movdqa %xmm0, %xmm1 |
1239 | |
1240 | movdqa 64(%rsi), %xmm3 |
1241 | palignr $14, 48(%rsi), %xmm3 |
1242 | sbb $0xffff, %edx |
1243 | movdqa 48(%rsi), %xmm0 |
1244 | palignr $14, 32(%rsi), %xmm0 |
1245 | pcmpeqb 32(%rdi), %xmm0 |
1246 | lea 32(%rsi), %rsi |
1247 | pcmpeqb 48(%rdi), %xmm3 |
1248 | |
1249 | lea 32(%rdi), %rdi |
1250 | jz L(shr_14_gobble_loop) |
1251 | pand %xmm0, %xmm3 |
1252 | |
1253 | cmp $0, %rcx |
1254 | jge L(shr_14_gobble_next) |
1255 | inc %edx |
1256 | add $32, %rcx |
1257 | L(shr_14_gobble_next): |
1258 | test %edx, %edx |
1259 | jnz L(exit) |
1260 | |
1261 | pmovmskb %xmm3, %edx |
1262 | movdqa %xmm0, %xmm1 |
1263 | lea 32(%rdi), %rdi |
1264 | lea 32(%rsi), %rsi |
1265 | sub $0xffff, %edx |
1266 | jnz L(exit) |
1267 | |
1268 | lea 14(%rsi), %rsi |
1269 | add %rcx, %rsi |
1270 | add %rcx, %rdi |
1271 | jmp L(less48bytes) |
1272 | |
1273 | .p2align 4 |
1274 | L(shr_15): |
1275 | cmp $80, %rcx |
1276 | lea -48(%rcx), %rcx |
1277 | mov %edx, %eax |
1278 | jae L(shr_15_gobble) |
1279 | |
1280 | movdqa 16(%rsi), %xmm1 |
1281 | movdqa %xmm1, %xmm2 |
1282 | palignr $15, (%rsi), %xmm1 |
1283 | pcmpeqb (%rdi), %xmm1 |
1284 | |
1285 | movdqa 32(%rsi), %xmm3 |
1286 | palignr $15, %xmm2, %xmm3 |
1287 | pcmpeqb 16(%rdi), %xmm3 |
1288 | |
1289 | pand %xmm1, %xmm3 |
1290 | pmovmskb %xmm3, %edx |
1291 | lea 32(%rdi), %rdi |
1292 | lea 32(%rsi), %rsi |
1293 | sub $0xffff, %edx |
1294 | jnz L(exit) |
1295 | add $15, %rsi |
1296 | add %rcx, %rsi |
1297 | add %rcx, %rdi |
1298 | jmp L(less48bytes) |
1299 | |
1300 | .p2align 4 |
1301 | L(shr_15_gobble): |
1302 | sub $32, %rcx |
1303 | movdqa 16(%rsi), %xmm0 |
1304 | palignr $15, (%rsi), %xmm0 |
1305 | pcmpeqb (%rdi), %xmm0 |
1306 | |
1307 | movdqa 32(%rsi), %xmm3 |
1308 | palignr $15, 16(%rsi), %xmm3 |
1309 | pcmpeqb 16(%rdi), %xmm3 |
1310 | |
1311 | L(shr_15_gobble_loop): |
1312 | pand %xmm0, %xmm3 |
1313 | sub $32, %rcx |
1314 | pmovmskb %xmm3, %edx |
1315 | movdqa %xmm0, %xmm1 |
1316 | |
1317 | movdqa 64(%rsi), %xmm3 |
1318 | palignr $15, 48(%rsi), %xmm3 |
1319 | sbb $0xffff, %edx |
1320 | movdqa 48(%rsi), %xmm0 |
1321 | palignr $15, 32(%rsi), %xmm0 |
1322 | pcmpeqb 32(%rdi), %xmm0 |
1323 | lea 32(%rsi), %rsi |
1324 | pcmpeqb 48(%rdi), %xmm3 |
1325 | |
1326 | lea 32(%rdi), %rdi |
1327 | jz L(shr_15_gobble_loop) |
1328 | pand %xmm0, %xmm3 |
1329 | |
1330 | cmp $0, %rcx |
1331 | jge L(shr_15_gobble_next) |
1332 | inc %edx |
1333 | add $32, %rcx |
1334 | L(shr_15_gobble_next): |
1335 | test %edx, %edx |
1336 | jnz L(exit) |
1337 | |
1338 | pmovmskb %xmm3, %edx |
1339 | movdqa %xmm0, %xmm1 |
1340 | lea 32(%rdi), %rdi |
1341 | lea 32(%rsi), %rsi |
1342 | sub $0xffff, %edx |
1343 | jnz L(exit) |
1344 | |
1345 | lea 15(%rsi), %rsi |
1346 | add %rcx, %rsi |
1347 | add %rcx, %rdi |
1348 | jmp L(less48bytes) |
1349 | # endif |
1350 | .p2align 4 |
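/* A difference was detected in the pair of 16-byte blocks ending at
   the current %rdi/%rsi: %xmm1 holds the pcmpeqb result of the first
   block and %edx the combined mask of both.  Decide which block
   differs, add %rax back to %rsi to undo the source realignment, then
   locate the exact byte (or wide character) below.  */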
1351 | L(exit): |
1352 | pmovmskb %xmm1, %r8d |
1353 | sub $0xffff, %r8d |
1354 | jz L(first16bytes) |
1355 | lea -16(%rsi), %rsi |
1356 | lea -16(%rdi), %rdi |
1357 | mov %r8d, %edx |
1358 | L(first16bytes): |
1359 | add %rax, %rsi |
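/* The difference lies in the 16 bytes ending at %rdi/%rsi.  The lowest
   set bit of %dx marks the first differing byte; the code below maps
   it to a byte (memcmp) or 4-byte element (wmemcmp) at a negative
   offset from the current pointers.  */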
1360 | L(less16bytes): |
1361 | # ifndef USE_AS_WMEMCMP |
1362 | test %dl, %dl |
1363 | jz L(next_24_bytes) |
1364 | |
1365 | test $0x01, %dl |
1366 | jnz L(Byte16) |
1367 | |
1368 | test $0x02, %dl |
1369 | jnz L(Byte17) |
1370 | |
1371 | test $0x04, %dl |
1372 | jnz L(Byte18) |
1373 | |
1374 | test $0x08, %dl |
1375 | jnz L(Byte19) |
1376 | |
1377 | test $0x10, %dl |
1378 | jnz L(Byte20) |
1379 | |
1380 | test $0x20, %dl |
1381 | jnz L(Byte21) |
1382 | |
1383 | test $0x40, %dl |
1384 | jnz L(Byte22) |
1385 | |
1386 | movzbl -9(%rdi), %eax |
1387 | movzbl -9(%rsi), %edx |
1388 | sub %edx, %eax |
1389 | ret |
1390 | |
1391 | .p2align 4 |
1392 | L(Byte16): |
1393 | movzbl -16(%rdi), %eax |
1394 | movzbl -16(%rsi), %edx |
1395 | sub %edx, %eax |
1396 | ret |
1397 | |
1398 | .p2align 4 |
1399 | L(Byte17): |
1400 | movzbl -15(%rdi), %eax |
1401 | movzbl -15(%rsi), %edx |
1402 | sub %edx, %eax |
1403 | ret |
1404 | |
1405 | .p2align 4 |
1406 | L(Byte18): |
1407 | movzbl -14(%rdi), %eax |
1408 | movzbl -14(%rsi), %edx |
1409 | sub %edx, %eax |
1410 | ret |
1411 | |
1412 | .p2align 4 |
1413 | L(Byte19): |
1414 | movzbl -13(%rdi), %eax |
1415 | movzbl -13(%rsi), %edx |
1416 | sub %edx, %eax |
1417 | ret |
1418 | |
1419 | .p2align 4 |
1420 | L(Byte20): |
1421 | movzbl -12(%rdi), %eax |
1422 | movzbl -12(%rsi), %edx |
1423 | sub %edx, %eax |
1424 | ret |
1425 | |
1426 | .p2align 4 |
1427 | L(Byte21): |
1428 | movzbl -11(%rdi), %eax |
1429 | movzbl -11(%rsi), %edx |
1430 | sub %edx, %eax |
1431 | ret |
1432 | |
1433 | .p2align 4 |
1434 | L(Byte22): |
1435 | movzbl -10(%rdi), %eax |
1436 | movzbl -10(%rsi), %edx |
1437 | sub %edx, %eax |
1438 | ret |
1439 | |
1440 | .p2align 4 |
1441 | L(next_24_bytes): |
1442 | lea 8(%rdi), %rdi |
1443 | lea 8(%rsi), %rsi |
1444 | test $0x01, %dh |
1445 | jnz L(Byte16) |
1446 | |
1447 | test $0x02, %dh |
1448 | jnz L(Byte17) |
1449 | |
1450 | test $0x04, %dh |
1451 | jnz L(Byte18) |
1452 | |
1453 | test $0x08, %dh |
1454 | jnz L(Byte19) |
1455 | |
1456 | test $0x10, %dh |
1457 | jnz L(Byte20) |
1458 | |
1459 | test $0x20, %dh |
1460 | jnz L(Byte21) |
1461 | |
1462 | test $0x40, %dh |
1463 | jnz L(Byte22) |
1464 | |
1465 | movzbl -9(%rdi), %eax |
1466 | movzbl -9(%rsi), %edx |
1467 | sub %edx, %eax |
1468 | ret |
1469 | # else |
	/* wmemcmp: locate the differing 4-byte element within the 16
	   bytes ending at %rdi/%rsi.  */
1471 | xor %eax, %eax |
1472 | test %dl, %dl |
1473 | jz L(next_two_double_words) |
1474 | and $15, %dl |
1475 | jz L(second_double_word) |
1476 | mov -16(%rdi), %eax |
1477 | cmp -16(%rsi), %eax |
1478 | jne L(find_diff) |
1479 | ret |
1480 | |
1481 | .p2align 4 |
1482 | L(second_double_word): |
1483 | mov -12(%rdi), %eax |
1484 | cmp -12(%rsi), %eax |
1485 | jne L(find_diff) |
1486 | ret |
1487 | |
1488 | .p2align 4 |
1489 | L(next_two_double_words): |
1490 | and $15, %dh |
1491 | jz L(fourth_double_word) |
1492 | mov -8(%rdi), %eax |
1493 | cmp -8(%rsi), %eax |
1494 | jne L(find_diff) |
1495 | ret |
1496 | |
1497 | .p2align 4 |
1498 | L(fourth_double_word): |
1499 | mov -4(%rdi), %eax |
1500 | cmp -4(%rsi), %eax |
1501 | jne L(find_diff) |
1502 | ret |
1503 | # endif |
1504 | |
1505 | .p2align 4 |
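/* Compare the remaining %ecx (< 48) bytes.  %rdi and %rsi point just
   past the end of the buffers, so every load below uses a negative
   offset, and the chains fall through in 4-byte steps.  */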
1506 | L(less48bytes): |
1507 | cmp $8, %ecx |
1508 | jae L(more8bytes) |
1509 | cmp $0, %ecx |
1510 | je L(0bytes) |
1511 | # ifndef USE_AS_WMEMCMP |
1512 | cmp $1, %ecx |
1513 | je L(1bytes) |
1514 | cmp $2, %ecx |
1515 | je L(2bytes) |
1516 | cmp $3, %ecx |
1517 | je L(3bytes) |
1518 | cmp $4, %ecx |
1519 | je L(4bytes) |
1520 | cmp $5, %ecx |
1521 | je L(5bytes) |
1522 | cmp $6, %ecx |
1523 | je L(6bytes) |
1524 | jmp L(7bytes) |
1525 | # else |
1526 | jmp L(4bytes) |
1527 | # endif |
1528 | |
1529 | .p2align 4 |
1530 | L(more8bytes): |
1531 | cmp $16, %ecx |
1532 | jae L(more16bytes) |
1533 | cmp $8, %ecx |
1534 | je L(8bytes) |
1535 | # ifndef USE_AS_WMEMCMP |
1536 | cmp $9, %ecx |
1537 | je L(9bytes) |
1538 | cmp $10, %ecx |
1539 | je L(10bytes) |
1540 | cmp $11, %ecx |
1541 | je L(11bytes) |
1542 | cmp $12, %ecx |
1543 | je L(12bytes) |
1544 | cmp $13, %ecx |
1545 | je L(13bytes) |
1546 | cmp $14, %ecx |
1547 | je L(14bytes) |
1548 | jmp L(15bytes) |
1549 | # else |
1550 | jmp L(12bytes) |
1551 | # endif |
1552 | |
1553 | .p2align 4 |
1554 | L(more16bytes): |
1555 | cmp $24, %ecx |
1556 | jae L(more24bytes) |
1557 | cmp $16, %ecx |
1558 | je L(16bytes) |
1559 | # ifndef USE_AS_WMEMCMP |
1560 | cmp $17, %ecx |
1561 | je L(17bytes) |
1562 | cmp $18, %ecx |
1563 | je L(18bytes) |
1564 | cmp $19, %ecx |
1565 | je L(19bytes) |
1566 | cmp $20, %ecx |
1567 | je L(20bytes) |
1568 | cmp $21, %ecx |
1569 | je L(21bytes) |
1570 | cmp $22, %ecx |
1571 | je L(22bytes) |
1572 | jmp L(23bytes) |
1573 | # else |
1574 | jmp L(20bytes) |
1575 | # endif |
1576 | |
1577 | .p2align 4 |
1578 | L(more24bytes): |
1579 | cmp $32, %ecx |
1580 | jae L(more32bytes) |
1581 | cmp $24, %ecx |
1582 | je L(24bytes) |
1583 | # ifndef USE_AS_WMEMCMP |
1584 | cmp $25, %ecx |
1585 | je L(25bytes) |
1586 | cmp $26, %ecx |
1587 | je L(26bytes) |
1588 | cmp $27, %ecx |
1589 | je L(27bytes) |
1590 | cmp $28, %ecx |
1591 | je L(28bytes) |
1592 | cmp $29, %ecx |
1593 | je L(29bytes) |
1594 | cmp $30, %ecx |
1595 | je L(30bytes) |
1596 | jmp L(31bytes) |
1597 | # else |
1598 | jmp L(28bytes) |
1599 | # endif |
1600 | |
1601 | .p2align 4 |
1602 | L(more32bytes): |
1603 | cmp $40, %ecx |
1604 | jae L(more40bytes) |
1605 | cmp $32, %ecx |
1606 | je L(32bytes) |
1607 | # ifndef USE_AS_WMEMCMP |
1608 | cmp $33, %ecx |
1609 | je L(33bytes) |
1610 | cmp $34, %ecx |
1611 | je L(34bytes) |
1612 | cmp $35, %ecx |
1613 | je L(35bytes) |
1614 | cmp $36, %ecx |
1615 | je L(36bytes) |
1616 | cmp $37, %ecx |
1617 | je L(37bytes) |
1618 | cmp $38, %ecx |
1619 | je L(38bytes) |
1620 | jmp L(39bytes) |
1621 | # else |
1622 | jmp L(36bytes) |
1623 | # endif |
1624 | |
1625 | .p2align 4 |
1626 | L(more40bytes): |
1627 | cmp $40, %ecx |
1628 | je L(40bytes) |
1629 | # ifndef USE_AS_WMEMCMP |
1630 | cmp $41, %ecx |
1631 | je L(41bytes) |
1632 | cmp $42, %ecx |
1633 | je L(42bytes) |
1634 | cmp $43, %ecx |
1635 | je L(43bytes) |
1636 | cmp $44, %ecx |
1637 | je L(44bytes) |
1638 | cmp $45, %ecx |
1639 | je L(45bytes) |
1640 | cmp $46, %ecx |
1641 | je L(46bytes) |
1642 | jmp L(47bytes) |
1643 | |
1644 | .p2align 4 |
1645 | L(44bytes): |
1646 | movl -44(%rdi), %eax |
1647 | movl -44(%rsi), %ecx |
1648 | cmp %ecx, %eax |
1649 | jne L(find_diff) |
1650 | L(40bytes): |
1651 | movl -40(%rdi), %eax |
1652 | movl -40(%rsi), %ecx |
1653 | cmp %ecx, %eax |
1654 | jne L(find_diff) |
1655 | L(36bytes): |
1656 | movl -36(%rdi), %eax |
1657 | movl -36(%rsi), %ecx |
1658 | cmp %ecx, %eax |
1659 | jne L(find_diff) |
1660 | L(32bytes): |
1661 | movl -32(%rdi), %eax |
1662 | movl -32(%rsi), %ecx |
1663 | cmp %ecx, %eax |
1664 | jne L(find_diff) |
1665 | L(28bytes): |
1666 | movl -28(%rdi), %eax |
1667 | movl -28(%rsi), %ecx |
1668 | cmp %ecx, %eax |
1669 | jne L(find_diff) |
1670 | L(24bytes): |
1671 | movl -24(%rdi), %eax |
1672 | movl -24(%rsi), %ecx |
1673 | cmp %ecx, %eax |
1674 | jne L(find_diff) |
1675 | L(20bytes): |
1676 | movl -20(%rdi), %eax |
1677 | movl -20(%rsi), %ecx |
1678 | cmp %ecx, %eax |
1679 | jne L(find_diff) |
1680 | L(16bytes): |
1681 | movl -16(%rdi), %eax |
1682 | movl -16(%rsi), %ecx |
1683 | cmp %ecx, %eax |
1684 | jne L(find_diff) |
1685 | L(12bytes): |
1686 | movl -12(%rdi), %eax |
1687 | movl -12(%rsi), %ecx |
1688 | cmp %ecx, %eax |
1689 | jne L(find_diff) |
1690 | L(8bytes): |
1691 | movl -8(%rdi), %eax |
1692 | movl -8(%rsi), %ecx |
1693 | cmp %ecx, %eax |
1694 | jne L(find_diff) |
1695 | L(4bytes): |
1696 | movl -4(%rdi), %eax |
1697 | movl -4(%rsi), %ecx |
1698 | cmp %ecx, %eax |
1699 | jne L(find_diff) |
1700 | L(0bytes): |
1701 | xor %eax, %eax |
1702 | ret |
1703 | # else |
1704 | .p2align 4 |
1705 | L(44bytes): |
1706 | movl -44(%rdi), %eax |
1707 | cmp -44(%rsi), %eax |
1708 | jne L(find_diff) |
1709 | L(40bytes): |
1710 | movl -40(%rdi), %eax |
1711 | cmp -40(%rsi), %eax |
1712 | jne L(find_diff) |
1713 | L(36bytes): |
1714 | movl -36(%rdi), %eax |
1715 | cmp -36(%rsi), %eax |
1716 | jne L(find_diff) |
1717 | L(32bytes): |
1718 | movl -32(%rdi), %eax |
1719 | cmp -32(%rsi), %eax |
1720 | jne L(find_diff) |
1721 | L(28bytes): |
1722 | movl -28(%rdi), %eax |
1723 | cmp -28(%rsi), %eax |
1724 | jne L(find_diff) |
1725 | L(24bytes): |
1726 | movl -24(%rdi), %eax |
1727 | cmp -24(%rsi), %eax |
1728 | jne L(find_diff) |
1729 | L(20bytes): |
1730 | movl -20(%rdi), %eax |
1731 | cmp -20(%rsi), %eax |
1732 | jne L(find_diff) |
1733 | L(16bytes): |
1734 | movl -16(%rdi), %eax |
1735 | cmp -16(%rsi), %eax |
1736 | jne L(find_diff) |
1737 | L(12bytes): |
1738 | movl -12(%rdi), %eax |
1739 | cmp -12(%rsi), %eax |
1740 | jne L(find_diff) |
1741 | L(8bytes): |
1742 | movl -8(%rdi), %eax |
1743 | cmp -8(%rsi), %eax |
1744 | jne L(find_diff) |
1745 | L(4bytes): |
1746 | movl -4(%rdi), %eax |
1747 | cmp -4(%rsi), %eax |
1748 | jne L(find_diff) |
1749 | L(0bytes): |
1750 | xor %eax, %eax |
1751 | ret |
1752 | # endif |
1753 | |
1754 | # ifndef USE_AS_WMEMCMP |
1755 | .p2align 4 |
1756 | L(45bytes): |
1757 | movl -45(%rdi), %eax |
1758 | movl -45(%rsi), %ecx |
1759 | cmp %ecx, %eax |
1760 | jne L(find_diff) |
1761 | L(41bytes): |
1762 | movl -41(%rdi), %eax |
1763 | movl -41(%rsi), %ecx |
1764 | cmp %ecx, %eax |
1765 | jne L(find_diff) |
1766 | L(37bytes): |
1767 | movl -37(%rdi), %eax |
1768 | movl -37(%rsi), %ecx |
1769 | cmp %ecx, %eax |
1770 | jne L(find_diff) |
1771 | L(33bytes): |
1772 | movl -33(%rdi), %eax |
1773 | movl -33(%rsi), %ecx |
1774 | cmp %ecx, %eax |
1775 | jne L(find_diff) |
1776 | L(29bytes): |
1777 | movl -29(%rdi), %eax |
1778 | movl -29(%rsi), %ecx |
1779 | cmp %ecx, %eax |
1780 | jne L(find_diff) |
1781 | L(25bytes): |
1782 | movl -25(%rdi), %eax |
1783 | movl -25(%rsi), %ecx |
1784 | cmp %ecx, %eax |
1785 | jne L(find_diff) |
1786 | L(21bytes): |
1787 | movl -21(%rdi), %eax |
1788 | movl -21(%rsi), %ecx |
1789 | cmp %ecx, %eax |
1790 | jne L(find_diff) |
1791 | L(17bytes): |
1792 | movl -17(%rdi), %eax |
1793 | movl -17(%rsi), %ecx |
1794 | cmp %ecx, %eax |
1795 | jne L(find_diff) |
1796 | L(13bytes): |
1797 | movl -13(%rdi), %eax |
1798 | movl -13(%rsi), %ecx |
1799 | cmp %ecx, %eax |
1800 | jne L(find_diff) |
1801 | L(9bytes): |
1802 | movl -9(%rdi), %eax |
1803 | movl -9(%rsi), %ecx |
1804 | cmp %ecx, %eax |
1805 | jne L(find_diff) |
1806 | L(5bytes): |
1807 | movl -5(%rdi), %eax |
1808 | movl -5(%rsi), %ecx |
1809 | cmp %ecx, %eax |
1810 | jne L(find_diff) |
1811 | L(1bytes): |
1812 | movzbl -1(%rdi), %eax |
1813 | cmpb -1(%rsi), %al |
1814 | jne L(set) |
1815 | xor %eax, %eax |
1816 | ret |
1817 | |
1818 | .p2align 4 |
1819 | L(46bytes): |
1820 | movl -46(%rdi), %eax |
1821 | movl -46(%rsi), %ecx |
1822 | cmp %ecx, %eax |
1823 | jne L(find_diff) |
1824 | L(42bytes): |
1825 | movl -42(%rdi), %eax |
1826 | movl -42(%rsi), %ecx |
1827 | cmp %ecx, %eax |
1828 | jne L(find_diff) |
1829 | L(38bytes): |
1830 | movl -38(%rdi), %eax |
1831 | movl -38(%rsi), %ecx |
1832 | cmp %ecx, %eax |
1833 | jne L(find_diff) |
1834 | L(34bytes): |
1835 | movl -34(%rdi), %eax |
1836 | movl -34(%rsi), %ecx |
1837 | cmp %ecx, %eax |
1838 | jne L(find_diff) |
1839 | L(30bytes): |
1840 | movl -30(%rdi), %eax |
1841 | movl -30(%rsi), %ecx |
1842 | cmp %ecx, %eax |
1843 | jne L(find_diff) |
1844 | L(26bytes): |
1845 | movl -26(%rdi), %eax |
1846 | movl -26(%rsi), %ecx |
1847 | cmp %ecx, %eax |
1848 | jne L(find_diff) |
1849 | L(22bytes): |
1850 | movl -22(%rdi), %eax |
1851 | movl -22(%rsi), %ecx |
1852 | cmp %ecx, %eax |
1853 | jne L(find_diff) |
1854 | L(18bytes): |
1855 | movl -18(%rdi), %eax |
1856 | movl -18(%rsi), %ecx |
1857 | cmp %ecx, %eax |
1858 | jne L(find_diff) |
1859 | L(14bytes): |
1860 | movl -14(%rdi), %eax |
1861 | movl -14(%rsi), %ecx |
1862 | cmp %ecx, %eax |
1863 | jne L(find_diff) |
1864 | L(10bytes): |
1865 | movl -10(%rdi), %eax |
1866 | movl -10(%rsi), %ecx |
1867 | cmp %ecx, %eax |
1868 | jne L(find_diff) |
1869 | L(6bytes): |
1870 | movl -6(%rdi), %eax |
1871 | movl -6(%rsi), %ecx |
1872 | cmp %ecx, %eax |
1873 | jne L(find_diff) |
1874 | L(2bytes): |
1875 | movzwl -2(%rdi), %eax |
1876 | movzwl -2(%rsi), %ecx |
1877 | cmpb %cl, %al |
1878 | jne L(set) |
1879 | cmp %ecx, %eax |
1880 | jne L(set) |
1881 | xor %eax, %eax |
1882 | ret |
1883 | |
1884 | .p2align 4 |
1885 | L(47bytes): |
1886 | movl -47(%rdi), %eax |
1887 | movl -47(%rsi), %ecx |
1888 | cmp %ecx, %eax |
1889 | jne L(find_diff) |
1890 | L(43bytes): |
1891 | movl -43(%rdi), %eax |
1892 | movl -43(%rsi), %ecx |
1893 | cmp %ecx, %eax |
1894 | jne L(find_diff) |
1895 | L(39bytes): |
1896 | movl -39(%rdi), %eax |
1897 | movl -39(%rsi), %ecx |
1898 | cmp %ecx, %eax |
1899 | jne L(find_diff) |
1900 | L(35bytes): |
1901 | movl -35(%rdi), %eax |
1902 | movl -35(%rsi), %ecx |
1903 | cmp %ecx, %eax |
1904 | jne L(find_diff) |
1905 | L(31bytes): |
1906 | movl -31(%rdi), %eax |
1907 | movl -31(%rsi), %ecx |
1908 | cmp %ecx, %eax |
1909 | jne L(find_diff) |
1910 | L(27bytes): |
1911 | movl -27(%rdi), %eax |
1912 | movl -27(%rsi), %ecx |
1913 | cmp %ecx, %eax |
1914 | jne L(find_diff) |
1915 | L(23bytes): |
1916 | movl -23(%rdi), %eax |
1917 | movl -23(%rsi), %ecx |
1918 | cmp %ecx, %eax |
1919 | jne L(find_diff) |
1920 | L(19bytes): |
1921 | movl -19(%rdi), %eax |
1922 | movl -19(%rsi), %ecx |
1923 | cmp %ecx, %eax |
1924 | jne L(find_diff) |
1925 | L(15bytes): |
1926 | movl -15(%rdi), %eax |
1927 | movl -15(%rsi), %ecx |
1928 | cmp %ecx, %eax |
1929 | jne L(find_diff) |
1930 | L(11bytes): |
1931 | movl -11(%rdi), %eax |
1932 | movl -11(%rsi), %ecx |
1933 | cmp %ecx, %eax |
1934 | jne L(find_diff) |
1935 | L(7bytes): |
1936 | movl -7(%rdi), %eax |
1937 | movl -7(%rsi), %ecx |
1938 | cmp %ecx, %eax |
1939 | jne L(find_diff) |
1940 | L(3bytes): |
1941 | movzwl -3(%rdi), %eax |
1942 | movzwl -3(%rsi), %ecx |
1943 | cmpb %cl, %al |
1944 | jne L(set) |
1945 | cmp %ecx, %eax |
1946 | jne L(set) |
1947 | movzbl -1(%rdi), %eax |
1948 | cmpb -1(%rsi), %al |
1949 | jne L(set) |
1950 | xor %eax, %eax |
1951 | ret |
1952 | |
1953 | .p2align 4 |
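/* %eax and %ecx hold two differing 32-bit words loaded from %rdi and
   %rsi.  Scan them byte by byte in memory (little-endian) order; the
   first byte that differs fixes the sign, which the sbb sequence at
   L(set) turns into -1 or 1.  */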
1954 | L(find_diff): |
1955 | cmpb %cl, %al |
1956 | jne L(set) |
1957 | cmpw %cx, %ax |
1958 | jne L(set) |
1959 | shr $16, %eax |
1960 | shr $16, %ecx |
1961 | cmpb %cl, %al |
1962 | jne L(set) |
1963 | |
/* We get here only if we already know there is a
   difference.  */
1966 | |
1967 | cmp %ecx, %eax |
1968 | L(set): |
1969 | sbb %eax, %eax |
1970 | sbb $-1, %eax |
1971 | ret |
1972 | # else |
1973 | |
1974 | /* for wmemcmp */ |
1975 | .p2align 4 |
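/* The flags of the 4-byte cmp that branched here are still live and
   reflect a signed comparison of the differing wide characters: return
   1 if the element from %rdi is greater, otherwise -1.  */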
1976 | L(find_diff): |
1977 | mov $1, %eax |
1978 | jg L(find_diff_bigger) |
1979 | neg %eax |
1980 | ret |
1981 | |
1982 | .p2align 4 |
1983 | L(find_diff_bigger): |
1984 | ret |
1985 | # endif |
1986 | |
1987 | .p2align 4 |
1988 | L(equal): |
1989 | xor %eax, %eax |
1990 | ret |
1991 | |
1992 | END (MEMCMP) |
1993 | #endif |
1994 | |