1 | /* memcmp/wmemcmp optimized with AVX2. |
2 | Copyright (C) 2017-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | /* memcmp/wmemcmp is implemented as: |
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
25 | 2. For size from 2 to 7 bytes on page cross, load as big endian |
26 | with movbe and bswap to avoid branches. |
27 | 3. Use xmm vector compare when size >= 4 bytes for memcmp or |
28 | size >= 8 bytes for wmemcmp. |
   4. Optimistically compare up to the first 4 * VEC_SIZE bytes one
      VEC at a time to check for early mismatches.  Only do this if
      it is guaranteed the work is not wasted.
32 | 5. If size is 8 * VEC_SIZE or less, unroll the loop. |
33 | 6. Compare 4 * VEC_SIZE at a time with the aligned first memory |
34 | area. |
35 | 7. Use 2 vector compares when size is 2 * VEC_SIZE or less. |
36 | 8. Use 4 vector compares when size is 4 * VEC_SIZE or less. |
37 | 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ |
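
/* Note on the mask trick used throughout: VPCMPEQ + vpmovmskb yields a
   mask with a 1 bit for every byte that compared equal.  `incl' (or
   `subl $0xffff' for xmm masks) turns an all-ones mask into zero and
   otherwise leaves the lowest set bit at the first mismatching byte, so
   `tzcnt' recovers the mismatch offset.  In rough C terms the value
   computed at the return labels is then:
	memcmp:   (unsigned char) s1[idx] - (unsigned char) s2[idx]
	wmemcmp:  ((int *) s1)[idx / 4] > ((int *) s2)[idx / 4] ? 1 : -1
   (the wmemcmp case is only reached when the two ints differ).  */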
38 | |
39 | |
40 | # include <sysdep.h> |
41 | |
42 | # ifndef MEMCMP |
43 | # define MEMCMP __memcmp_avx2_movbe |
44 | # endif |
45 | |
46 | # ifdef USE_AS_WMEMCMP |
47 | # define CHAR_SIZE 4 |
48 | # define VPCMPEQ vpcmpeqd |
49 | # else |
50 | # define CHAR_SIZE 1 |
51 | # define VPCMPEQ vpcmpeqb |
52 | # endif |
53 | |
54 | # ifndef VZEROUPPER |
55 | # define VZEROUPPER vzeroupper |
56 | # endif |
57 | |
58 | # ifndef SECTION |
59 | # define SECTION(p) p##.avx |
60 | # endif |
61 | |
62 | # define VEC_SIZE 32 |
63 | # define PAGE_SIZE 4096 |
64 | |
65 | /* Warning! |
66 | wmemcmp has to use SIGNED comparison for elements. |
   memcmp has to use UNSIGNED comparison for elements.
68 | */ |
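
/* The return paths below therefore use two different idioms: the
   wmemcmp paths do a signed compare and materialize the result with
   `setg'/`lea' as roughly 2 * (a > b) - 1, i.e. 1 or -1, while the
   memcmp paths subtract the two zero-extended bytes, which gives the
   required unsigned ordering directly.  */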
69 | |
70 | .section SECTION(.text),"ax" ,@progbits |
71 | ENTRY (MEMCMP) |
72 | # ifdef USE_AS_WMEMCMP |
73 | shl $2, %RDX_LP |
74 | # elif defined __ILP32__ |
75 | /* Clear the upper 32 bits. */ |
76 | movl %edx, %edx |
77 | # endif |
78 | cmp $VEC_SIZE, %RDX_LP |
79 | jb L(less_vec) |
80 | |
81 | /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ |
82 | vmovdqu (%rsi), %ymm1 |
83 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
84 | vpmovmskb %ymm1, %eax |
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
88 | incl %eax |
89 | jnz L(return_vec_0) |
90 | |
91 | cmpq $(VEC_SIZE * 2), %rdx |
92 | jbe L(last_1x_vec) |
93 | |
94 | /* Check second VEC no matter what. */ |
95 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
96 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
97 | vpmovmskb %ymm2, %eax |
	/* If all bytes in the VEC were equal, eax will be all 1s so incl
	   will overflow and set the zero flag.  */
100 | incl %eax |
101 | jnz L(return_vec_1) |
102 | |
103 | /* Less than 4 * VEC. */ |
104 | cmpq $(VEC_SIZE * 4), %rdx |
105 | jbe L(last_2x_vec) |
106 | |
107 | /* Check third and fourth VEC no matter what. */ |
108 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
109 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
110 | vpmovmskb %ymm3, %eax |
111 | incl %eax |
112 | jnz L(return_vec_2) |
113 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
114 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
115 | vpmovmskb %ymm4, %ecx |
116 | incl %ecx |
117 | jnz L(return_vec_3) |
118 | |
119 | /* Go to 4x VEC loop. */ |
120 | cmpq $(VEC_SIZE * 8), %rdx |
121 | ja L(more_8x_vec) |
122 | |
123 | /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any |
124 | branches. */ |
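	/* This is done by rebasing rdi/rsi to (end - 4 * VEC_SIZE) and
	   comparing the last four VEC.  The loads may overlap bytes that
	   were already verified above; rechecking equal bytes is harmless
	   and avoids branching on the exact size.  */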
125 | |
126 | /* Load first two VEC from s2 before adjusting addresses. */ |
127 | vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1 |
128 | vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2 |
129 | leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi |
130 | leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi |
131 | |
	/* Wait to load from s1 until the addresses have been adjusted, so
	   that the compares below use simple addressing modes and the
	   micro-fused load + compare is not un-laminated.  */
134 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
135 | VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2 |
136 | |
137 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
138 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
139 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
140 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
141 | |
	/* Reduce VEC1 - VEC4 into one mask so a single test covers all
	   four.  */
143 | vpand %ymm1, %ymm2, %ymm5 |
144 | vpand %ymm3, %ymm4, %ymm6 |
145 | vpand %ymm5, %ymm6, %ymm7 |
146 | vpmovmskb %ymm7, %ecx |
147 | incl %ecx |
148 | jnz L(return_vec_0_1_2_3) |
149 | /* NB: eax must be zero to reach here. */ |
150 | VZEROUPPER_RETURN |
151 | |
152 | .p2align 4 |
153 | L(return_vec_0): |
154 | tzcntl %eax, %eax |
155 | # ifdef USE_AS_WMEMCMP |
156 | movl (%rdi, %rax), %ecx |
157 | xorl %edx, %edx |
158 | cmpl (%rsi, %rax), %ecx |
159 | /* NB: no partial register stall here because xorl zero idiom |
160 | above. */ |
161 | setg %dl |
162 | leal -1(%rdx, %rdx), %eax |
163 | # else |
164 | movzbl (%rsi, %rax), %ecx |
165 | movzbl (%rdi, %rax), %eax |
166 | subl %ecx, %eax |
167 | # endif |
168 | L(return_vzeroupper): |
169 | ZERO_UPPER_VEC_REGISTERS_RETURN |
170 | |
171 | .p2align 4 |
172 | L(return_vec_1): |
173 | tzcntl %eax, %eax |
174 | # ifdef USE_AS_WMEMCMP |
175 | movl VEC_SIZE(%rdi, %rax), %ecx |
176 | xorl %edx, %edx |
177 | cmpl VEC_SIZE(%rsi, %rax), %ecx |
178 | setg %dl |
179 | leal -1(%rdx, %rdx), %eax |
180 | # else |
181 | movzbl VEC_SIZE(%rsi, %rax), %ecx |
182 | movzbl VEC_SIZE(%rdi, %rax), %eax |
183 | subl %ecx, %eax |
184 | # endif |
185 | VZEROUPPER_RETURN |
186 | |
187 | .p2align 4 |
188 | L(return_vec_2): |
189 | tzcntl %eax, %eax |
190 | # ifdef USE_AS_WMEMCMP |
191 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
192 | xorl %edx, %edx |
193 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
194 | setg %dl |
195 | leal -1(%rdx, %rdx), %eax |
196 | # else |
197 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
198 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
199 | subl %ecx, %eax |
200 | # endif |
201 | VZEROUPPER_RETURN |
202 | |
203 | /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */ |
204 | .p2align 5 |
205 | L(8x_return_vec_0_1_2_3): |
206 | /* Returning from L(more_8x_vec) requires restoring rsi. */ |
207 | addq %rdi, %rsi |
208 | L(return_vec_0_1_2_3): |
209 | vpmovmskb %ymm1, %eax |
210 | incl %eax |
211 | jnz L(return_vec_0) |
212 | |
213 | vpmovmskb %ymm2, %eax |
214 | incl %eax |
215 | jnz L(return_vec_1) |
216 | |
217 | vpmovmskb %ymm3, %eax |
218 | incl %eax |
219 | jnz L(return_vec_2) |
220 | L(return_vec_3): |
221 | tzcntl %ecx, %ecx |
222 | # ifdef USE_AS_WMEMCMP |
223 | movl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
224 | xorl %edx, %edx |
225 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax |
226 | setg %dl |
227 | leal -1(%rdx, %rdx), %eax |
228 | # else |
229 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
230 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx |
231 | subl %ecx, %eax |
232 | # endif |
233 | VZEROUPPER_RETURN |
234 | |
235 | .p2align 4 |
236 | L(more_8x_vec): |
	/* Set rdx to the end of s1 minus 4 * VEC_SIZE.  */
238 | leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx |
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
241 | subq %rdi, %rsi |
242 | /* Align s1 pointer. */ |
243 | andq $-VEC_SIZE, %rdi |
	/* Adjust because the first 4x VEC were already checked.  */
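	/* NB: `subq $-(VEC_SIZE * 4)' rather than `addq $(VEC_SIZE * 4)'
	   because -128 fits in a sign-extended 8-bit immediate while +128
	   does not, giving a shorter encoding.  */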
245 | subq $-(VEC_SIZE * 4), %rdi |
246 | .p2align 4 |
247 | L(loop_4x_vec): |
248 | /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). |
249 | */ |
250 | vmovdqu (%rsi, %rdi), %ymm1 |
251 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
252 | |
253 | vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 |
254 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
255 | |
256 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 |
257 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
258 | |
259 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 |
260 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
261 | |
262 | vpand %ymm1, %ymm2, %ymm5 |
263 | vpand %ymm3, %ymm4, %ymm6 |
264 | vpand %ymm5, %ymm6, %ymm7 |
265 | vpmovmskb %ymm7, %ecx |
266 | incl %ecx |
267 | jnz L(8x_return_vec_0_1_2_3) |
268 | subq $-(VEC_SIZE * 4), %rdi |
269 | /* Check if s1 pointer at end. */ |
270 | cmpq %rdx, %rdi |
271 | jb L(loop_4x_vec) |
272 | |
273 | subq %rdx, %rdi |
274 | /* rdi has 4 * VEC_SIZE - remaining length. */ |
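	/* If edi >= 3 * VEC_SIZE then at most VEC_SIZE bytes remain and
	   only the last VEC needs rechecking; if edi >= 2 * VEC_SIZE at
	   most 2 * VEC_SIZE remain.  Otherwise recheck the last 4 VEC.
	   The rechecks may overlap bytes the loop already compared, which
	   is harmless.  */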
275 | cmpl $(VEC_SIZE * 3), %edi |
276 | jae L(8x_last_1x_vec) |
277 | /* Load regardless of branch. */ |
278 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 |
279 | cmpl $(VEC_SIZE * 2), %edi |
280 | jae L(8x_last_2x_vec) |
281 | |
282 | /* Check last 4 VEC. */ |
283 | vmovdqu (%rsi, %rdx), %ymm1 |
284 | VPCMPEQ (%rdx), %ymm1, %ymm1 |
285 | |
286 | vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2 |
287 | VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2 |
288 | |
289 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 |
290 | |
291 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 |
292 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 |
293 | |
294 | vpand %ymm1, %ymm2, %ymm5 |
295 | vpand %ymm3, %ymm4, %ymm6 |
296 | vpand %ymm5, %ymm6, %ymm7 |
297 | vpmovmskb %ymm7, %ecx |
298 | /* Restore s1 pointer to rdi. */ |
299 | movq %rdx, %rdi |
300 | incl %ecx |
301 | jnz L(8x_return_vec_0_1_2_3) |
302 | /* NB: eax must be zero to reach here. */ |
303 | VZEROUPPER_RETURN |
304 | |
305 | /* Only entry is from L(more_8x_vec). */ |
306 | .p2align 4 |
307 | L(8x_last_2x_vec): |
	/* Check second to last VEC.  rdx stores the pointer to the end of
	   s1 minus 4 * VEC_SIZE, and ymm3 has already been loaded with the
	   second to last VEC from s2.  */
311 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 |
312 | vpmovmskb %ymm3, %eax |
313 | incl %eax |
314 | jnz L(8x_return_vec_2) |
315 | /* Check last VEC. */ |
316 | .p2align 4 |
317 | L(8x_last_1x_vec): |
318 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 |
319 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 |
320 | vpmovmskb %ymm4, %eax |
321 | incl %eax |
322 | jnz L(8x_return_vec_3) |
323 | VZEROUPPER_RETURN |
324 | |
325 | .p2align 4 |
326 | L(last_2x_vec): |
327 | /* Check second to last VEC. */ |
328 | vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 |
329 | VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 |
330 | vpmovmskb %ymm1, %eax |
331 | incl %eax |
332 | jnz L(return_vec_1_end) |
333 | /* Check last VEC. */ |
334 | L(last_1x_vec): |
335 | vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 |
336 | VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 |
337 | vpmovmskb %ymm1, %eax |
338 | incl %eax |
339 | jnz L(return_vec_0_end) |
340 | VZEROUPPER_RETURN |
341 | |
342 | .p2align 4 |
343 | L(8x_return_vec_2): |
344 | subq $VEC_SIZE, %rdx |
345 | L(8x_return_vec_3): |
346 | tzcntl %eax, %eax |
347 | addq %rdx, %rax |
348 | # ifdef USE_AS_WMEMCMP |
349 | movl (VEC_SIZE * 3)(%rax), %ecx |
350 | xorl %edx, %edx |
351 | cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx |
352 | setg %dl |
353 | leal -1(%rdx, %rdx), %eax |
354 | # else |
355 | movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx |
356 | movzbl (VEC_SIZE * 3)(%rax), %eax |
357 | subl %ecx, %eax |
358 | # endif |
359 | VZEROUPPER_RETURN |
360 | |
361 | .p2align 4 |
362 | L(return_vec_1_end): |
363 | tzcntl %eax, %eax |
364 | addl %edx, %eax |
365 | # ifdef USE_AS_WMEMCMP |
366 | movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx |
367 | xorl %edx, %edx |
368 | cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx |
369 | setg %dl |
370 | leal -1(%rdx, %rdx), %eax |
371 | # else |
372 | movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx |
373 | movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax |
374 | subl %ecx, %eax |
375 | # endif |
376 | VZEROUPPER_RETURN |
377 | |
378 | .p2align 4 |
379 | L(return_vec_0_end): |
380 | tzcntl %eax, %eax |
381 | addl %edx, %eax |
382 | # ifdef USE_AS_WMEMCMP |
383 | movl -VEC_SIZE(%rdi, %rax), %ecx |
384 | xorl %edx, %edx |
385 | cmpl -VEC_SIZE(%rsi, %rax), %ecx |
386 | setg %dl |
387 | leal -1(%rdx, %rdx), %eax |
388 | # else |
389 | movzbl -VEC_SIZE(%rsi, %rax), %ecx |
390 | movzbl -VEC_SIZE(%rdi, %rax), %eax |
391 | subl %ecx, %eax |
392 | # endif |
393 | VZEROUPPER_RETURN |
394 | |
395 | .p2align 4 |
396 | L(less_vec): |
397 | /* Check if one or less CHAR. This is necessary for size = 0 but |
398 | is also faster for size = CHAR_SIZE. */ |
399 | cmpl $CHAR_SIZE, %edx |
400 | jbe L(one_or_less) |
401 | |
402 | /* Check if loading one VEC from either s1 or s2 could cause a |
403 | page cross. This can have false positives but is by far the |
404 | fastest method. */ |
405 | movl %edi, %eax |
406 | orl %esi, %eax |
407 | andl $(PAGE_SIZE - 1), %eax |
408 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
409 | jg L(page_cross_less_vec) |
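	/* No false negatives are possible here: (a | b) >= max (a, b) on
	   the low 12 bits, so whenever a full VEC load from either pointer
	   would cross its page the branch is taken.  A false positive only
	   costs a trip through the slower size-based dispatch below.  */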
410 | |
411 | /* No page cross possible. */ |
412 | vmovdqu (%rsi), %ymm2 |
413 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
414 | vpmovmskb %ymm2, %eax |
415 | incl %eax |
416 | /* Result will be zero if s1 and s2 match. Otherwise first set |
417 | bit will be first mismatch. */ |
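	/* bzhi clears the mask bits at positions >= size, so a difference
	   past the end of the buffers cannot trigger the branch.  When the
	   branch is taken, L(return_vec_0) still uses eax, whose lowest set
	   bit marks the first mismatch and is then guaranteed to lie below
	   size.  */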
418 | bzhil %edx, %eax, %edx |
419 | jnz L(return_vec_0) |
420 | xorl %eax, %eax |
421 | VZEROUPPER_RETURN |
422 | |
423 | .p2align 4 |
424 | L(page_cross_less_vec): |
425 | /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 |
426 | bytes. */ |
427 | cmpl $16, %edx |
428 | jae L(between_16_31) |
429 | # ifndef USE_AS_WMEMCMP |
430 | cmpl $8, %edx |
431 | jae L(between_8_15) |
432 | cmpl $4, %edx |
433 | jae L(between_4_7) |
434 | |
435 | /* Load as big endian to avoid branches. */ |
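	/* In effect both inputs are packed into the 24-bit big-endian
	   value (byte0 << 16) | (byte1 << 8) | byte[size - 1] (for
	   size == 2 the last byte simply repeats byte 1), so a single
	   subtraction yields a correctly signed result.  */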
436 | movzwl (%rdi), %eax |
437 | movzwl (%rsi), %ecx |
438 | shll $8, %eax |
439 | shll $8, %ecx |
440 | bswap %eax |
441 | bswap %ecx |
442 | movzbl -1(%rdi, %rdx), %edi |
443 | movzbl -1(%rsi, %rdx), %esi |
444 | orl %edi, %eax |
445 | orl %esi, %ecx |
	/* Subtraction is okay because the upper 8 bits of both values are
	   zero, so the 32-bit difference has the correct sign.  */
447 | subl %ecx, %eax |
448 | /* No ymm register was touched. */ |
449 | ret |
450 | |
451 | .p2align 4 |
452 | L(one_or_less): |
453 | jb L(zero) |
454 | movzbl (%rsi), %ecx |
455 | movzbl (%rdi), %eax |
456 | subl %ecx, %eax |
457 | /* No ymm register was touched. */ |
458 | ret |
459 | |
460 | .p2align 4 |
461 | L(between_8_15): |
462 | # endif |
463 | /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ |
464 | vmovq (%rdi), %xmm1 |
465 | vmovq (%rsi), %xmm2 |
466 | VPCMPEQ %xmm1, %xmm2, %xmm2 |
467 | vpmovmskb %xmm2, %eax |
468 | subl $0xffff, %eax |
469 | jnz L(return_vec_0) |
470 | /* Use overlapping loads to avoid branches. */ |
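	/* I.e. compare bytes [0, 8) and then bytes [size - 8, size); for
	   sizes below 16 the ranges overlap, but the overlapping bytes
	   are already known to be equal from the first compare.  */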
471 | leaq -8(%rdi, %rdx), %rdi |
472 | leaq -8(%rsi, %rdx), %rsi |
473 | vmovq (%rdi), %xmm1 |
474 | vmovq (%rsi), %xmm2 |
475 | VPCMPEQ %xmm1, %xmm2, %xmm2 |
476 | vpmovmskb %xmm2, %eax |
477 | subl $0xffff, %eax |
478 | jnz L(return_vec_0) |
479 | /* No ymm register was touched. */ |
480 | ret |
481 | |
482 | .p2align 4 |
483 | L(zero): |
484 | xorl %eax, %eax |
485 | ret |
486 | |
487 | .p2align 4 |
488 | L(between_16_31): |
489 | /* From 16 to 31 bytes. No branch when size == 16. */ |
490 | vmovdqu (%rsi), %xmm2 |
491 | VPCMPEQ (%rdi), %xmm2, %xmm2 |
492 | vpmovmskb %xmm2, %eax |
493 | subl $0xffff, %eax |
494 | jnz L(return_vec_0) |
495 | |
496 | /* Use overlapping loads to avoid branches. */ |
497 | |
498 | vmovdqu -16(%rsi, %rdx), %xmm2 |
499 | leaq -16(%rdi, %rdx), %rdi |
500 | leaq -16(%rsi, %rdx), %rsi |
501 | VPCMPEQ (%rdi), %xmm2, %xmm2 |
502 | vpmovmskb %xmm2, %eax |
503 | subl $0xffff, %eax |
504 | jnz L(return_vec_0) |
505 | /* No ymm register was touched. */ |
506 | ret |
507 | |
508 | # ifdef USE_AS_WMEMCMP |
509 | .p2align 4 |
510 | L(one_or_less): |
511 | jb L(zero) |
512 | movl (%rdi), %ecx |
513 | xorl %edx, %edx |
514 | cmpl (%rsi), %ecx |
515 | je L(zero) |
516 | setg %dl |
517 | leal -1(%rdx, %rdx), %eax |
518 | /* No ymm register was touched. */ |
519 | ret |
520 | # else |
521 | |
522 | .p2align 4 |
523 | L(between_4_7): |
524 | /* Load as big endian with overlapping movbe to avoid branches. |
525 | */ |
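	/* Each input becomes a 64-bit big-endian value: the first four
	   bytes in the high half and the last four (overlapping for
	   size < 8) in the low half, so an unsigned 64-bit compare gives
	   the lexicographic order.  After the `subq', CF is set iff the
	   s1 value is below the s2 value, so `sbbl %eax, %eax' yields -1
	   or 0 and `orl $1' maps that to -1 or 1; the `jz' handles
	   equality.  */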
526 | movbe (%rdi), %eax |
527 | movbe (%rsi), %ecx |
528 | shlq $32, %rax |
529 | shlq $32, %rcx |
530 | movbe -4(%rdi, %rdx), %edi |
531 | movbe -4(%rsi, %rdx), %esi |
532 | orq %rdi, %rax |
533 | orq %rsi, %rcx |
534 | subq %rcx, %rax |
535 | jz L(zero_4_7) |
536 | sbbl %eax, %eax |
537 | orl $1, %eax |
538 | L(zero_4_7): |
539 | /* No ymm register was touched. */ |
540 | ret |
541 | # endif |
542 | |
543 | END (MEMCMP) |
544 | #endif |
545 | |