1 | /* memcmp/wmemcmp optimized with AVX2. |
2 | Copyright (C) 2017-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (3) |
22 | |
23 | /* memcmp/wmemcmp is implemented as: |
   1. Use ymm vector compares when possible (see the intrinsics
      sketch below).  The only case where vector compares are not
      possible is when size < VEC_SIZE and loading from either s1
      or s2 would cause a page cross.
27 | 2. For size from 2 to 7 bytes on page cross, load as big endian |
28 | with movbe and bswap to avoid branches. |
29 | 3. Use xmm vector compare when size >= 4 bytes for memcmp or |
30 | size >= 8 bytes for wmemcmp. |
   4. Optimistically compare up to the first 4 * VEC_SIZE one VEC at
      a time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
34 | 5. If size is 8 * VEC_SIZE or less, unroll the loop. |
35 | 6. Compare 4 * VEC_SIZE at a time with the aligned first memory |
36 | area. |
37 | 7. Use 2 vector compares when size is 2 * VEC_SIZE or less. |
38 | 8. Use 4 vector compares when size is 4 * VEC_SIZE or less. |
39 | 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ |
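
/* The basic building block used by every step above is a single
   32-byte ymm compare.  A rough C intrinsics sketch of one such
   compare (byte flavour, illustrative only; cmp_one_vec is a
   hypothetical name, and the real code below keeps the mask in a
   register and fuses the inc/tzcnt with the return-value
   computation):

       #include <immintrin.h>
       #include <stdint.h>

       static int
       cmp_one_vec (const unsigned char *s1, const unsigned char *s2)
       {
         __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
         __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
         // 0xff in every byte position that compared equal.
         __m256i eq = _mm256_cmpeq_epi8 (a, b);
         uint32_t mask = (uint32_t) _mm256_movemask_epi8 (eq);
         // All 32 bytes equal: mask is all ones and wraps to zero.
         if (mask + 1 == 0)
           return 0;
         // Adding one clears the run of equal bytes, so the lowest
         // set bit of mask + 1 is the first mismatch.
         unsigned int i = __builtin_ctz (mask + 1);
         return s1[i] - s2[i];
       }  */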
40 | |
41 | |
42 | # include <sysdep.h> |
43 | |
44 | # ifndef MEMCMP |
45 | # define MEMCMP __memcmp_avx2_movbe |
46 | # endif |
47 | |
48 | # ifdef USE_AS_WMEMCMP |
49 | # define CHAR_SIZE 4 |
50 | # define VPCMPEQ vpcmpeqd |
51 | # else |
52 | # define CHAR_SIZE 1 |
53 | # define VPCMPEQ vpcmpeqb |
54 | # endif |
55 | |
56 | # ifndef VZEROUPPER |
57 | # define VZEROUPPER vzeroupper |
58 | # endif |
59 | |
60 | # ifndef SECTION |
61 | # define SECTION(p) p##.avx |
62 | # endif |
63 | |
64 | # define VEC_SIZE 32 |
65 | # define PAGE_SIZE 4096 |
66 | |
/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
71 | |
        .section SECTION(.text),"ax",@progbits
73 | ENTRY (MEMCMP) |
74 | # ifdef USE_AS_WMEMCMP |
75 | shl $2, %RDX_LP |
76 | # elif defined __ILP32__ |
77 | /* Clear the upper 32 bits. */ |
78 | movl %edx, %edx |
79 | # endif |
80 | cmp $VEC_SIZE, %RDX_LP |
81 | jb L(less_vec) |
82 | |
83 | /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ |
84 | vmovdqu (%rsi), %ymm1 |
85 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
86 | vpmovmskb %ymm1, %eax |
        /* NB: eax must be the destination register if going to
           L(return_vec_[0,2]).  For L(return_vec_3) the destination
           register must be ecx.  */
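        /* vpmovmskb sets a bit for every byte that compared equal, so
           a full match gives 0xffffffff and incl wraps it to zero
           (setting ZF); otherwise tzcnt of the incremented value is
           the index of the first mismatching byte.  */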
90 | incl %eax |
91 | jnz L(return_vec_0) |
92 | |
93 | cmpq $(VEC_SIZE * 2), %rdx |
94 | jbe L(last_1x_vec) |
95 | |
96 | /* Check second VEC no matter what. */ |
97 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
98 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
99 | vpmovmskb %ymm2, %eax |
        /* If all bytes in the VEC were equal, eax will be all 1s so
           incl will overflow and set the zero flag.  */
102 | incl %eax |
103 | jnz L(return_vec_1) |
104 | |
105 | /* Less than 4 * VEC. */ |
106 | cmpq $(VEC_SIZE * 4), %rdx |
107 | jbe L(last_2x_vec) |
108 | |
109 | /* Check third and fourth VEC no matter what. */ |
110 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
111 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
112 | vpmovmskb %ymm3, %eax |
113 | incl %eax |
114 | jnz L(return_vec_2) |
115 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
116 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
117 | vpmovmskb %ymm4, %ecx |
118 | incl %ecx |
119 | jnz L(return_vec_3) |
120 | |
121 | /* Go to 4x VEC loop. */ |
122 | cmpq $(VEC_SIZE * 8), %rdx |
123 | ja L(more_8x_vec) |
124 | |
125 | /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any |
126 | branches. */ |
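        /* All four loads are taken relative to the end of the buffers,
           so for any size in (4 * VEC_SIZE, 8 * VEC_SIZE] they cover
           the tail, possibly overlapping bytes already checked.  */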
127 | |
128 | /* Load first two VEC from s2 before adjusting addresses. */ |
129 | vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1 |
130 | vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2 |
131 | leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi |
132 | leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi |
133 | |
        /* Wait to load from s1 until the addresses have been adjusted,
           so the compares can use simple addressing modes; an indexed
           address would cause un-lamination of the micro-fused
           load + compare.  */
136 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
137 | VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2 |
138 | |
139 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
140 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
141 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
142 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
143 | |
        /* Reduce the four compare results (ymm1 - ymm4).  */
145 | vpand %ymm1, %ymm2, %ymm5 |
146 | vpand %ymm3, %ymm4, %ymm6 |
147 | vpand %ymm5, %ymm6, %ymm7 |
148 | vpmovmskb %ymm7, %ecx |
149 | incl %ecx |
150 | jnz L(return_vec_0_1_2_3) |
151 | /* NB: eax must be zero to reach here. */ |
152 | VZEROUPPER_RETURN |
153 | |
154 | .p2align 4 |
155 | L(return_vec_0): |
156 | tzcntl %eax, %eax |
157 | # ifdef USE_AS_WMEMCMP |
158 | movl (%rdi, %rax), %ecx |
159 | xorl %edx, %edx |
160 | cmpl (%rsi, %rax), %ecx |
        /* NB: no partial register stall here because of the xorl zero
           idiom above.  */
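        /* setg/lea idiom: dl = (s1 element > s2 element), so
           2 * dl - 1 yields 1 when s1 is greater and -1 when it is
           less (the elements cannot be equal here).  */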
163 | setg %dl |
164 | leal -1(%rdx, %rdx), %eax |
165 | # else |
166 | movzbl (%rsi, %rax), %ecx |
167 | movzbl (%rdi, %rax), %eax |
168 | subl %ecx, %eax |
169 | # endif |
170 | L(return_vzeroupper): |
171 | ZERO_UPPER_VEC_REGISTERS_RETURN |
172 | |
173 | .p2align 4 |
174 | L(return_vec_1): |
175 | tzcntl %eax, %eax |
176 | # ifdef USE_AS_WMEMCMP |
177 | movl VEC_SIZE(%rdi, %rax), %ecx |
178 | xorl %edx, %edx |
179 | cmpl VEC_SIZE(%rsi, %rax), %ecx |
180 | setg %dl |
181 | leal -1(%rdx, %rdx), %eax |
182 | # else |
183 | movzbl VEC_SIZE(%rsi, %rax), %ecx |
184 | movzbl VEC_SIZE(%rdi, %rax), %eax |
185 | subl %ecx, %eax |
186 | # endif |
187 | VZEROUPPER_RETURN |
188 | |
189 | .p2align 4 |
190 | L(return_vec_2): |
191 | tzcntl %eax, %eax |
192 | # ifdef USE_AS_WMEMCMP |
193 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
194 | xorl %edx, %edx |
195 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
196 | setg %dl |
197 | leal -1(%rdx, %rdx), %eax |
198 | # else |
199 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
200 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
201 | subl %ecx, %eax |
202 | # endif |
203 | VZEROUPPER_RETURN |
204 | |
205 | /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */ |
206 | .p2align 5 |
207 | L(8x_return_vec_0_1_2_3): |
208 | /* Returning from L(more_8x_vec) requires restoring rsi. */ |
209 | addq %rdi, %rsi |
210 | L(return_vec_0_1_2_3): |
211 | vpmovmskb %ymm1, %eax |
212 | incl %eax |
213 | jnz L(return_vec_0) |
214 | |
215 | vpmovmskb %ymm2, %eax |
216 | incl %eax |
217 | jnz L(return_vec_1) |
218 | |
219 | vpmovmskb %ymm3, %eax |
220 | incl %eax |
221 | jnz L(return_vec_2) |
222 | L(return_vec_3): |
223 | tzcntl %ecx, %ecx |
224 | # ifdef USE_AS_WMEMCMP |
225 | movl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
226 | xorl %edx, %edx |
227 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax |
228 | setg %dl |
229 | leal -1(%rdx, %rdx), %eax |
230 | # else |
231 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
232 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx |
233 | subl %ecx, %eax |
234 | # endif |
235 | VZEROUPPER_RETURN |
236 | |
237 | .p2align 4 |
238 | L(more_8x_vec): |
239 | /* Set end of s1 in rdx. */ |
240 | leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx |
        /* rsi stores s2 - s1.  This allows the loop to update only one
           pointer.  */
243 | subq %rdi, %rsi |
244 | /* Align s1 pointer. */ |
245 | andq $-VEC_SIZE, %rdi |
        /* Adjust because the first 4x VEC were already checked.  */
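        /* NB: sub of -(VEC_SIZE * 4) rather than add of (VEC_SIZE * 4):
           -128 fits in a sign-extended 8-bit immediate while 128 does
           not, giving a smaller encoding.  */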
247 | subq $-(VEC_SIZE * 4), %rdi |
248 | .p2align 4 |
249 | L(loop_4x_vec): |
250 | /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). |
251 | */ |
252 | vmovdqu (%rsi, %rdi), %ymm1 |
253 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
254 | |
255 | vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 |
256 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
257 | |
258 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 |
259 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
260 | |
261 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 |
262 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
263 | |
264 | vpand %ymm1, %ymm2, %ymm5 |
265 | vpand %ymm3, %ymm4, %ymm6 |
266 | vpand %ymm5, %ymm6, %ymm7 |
267 | vpmovmskb %ymm7, %ecx |
268 | incl %ecx |
269 | jnz L(8x_return_vec_0_1_2_3) |
270 | subq $-(VEC_SIZE * 4), %rdi |
271 | /* Check if s1 pointer at end. */ |
272 | cmpq %rdx, %rdi |
273 | jb L(loop_4x_vec) |
274 | |
275 | subq %rdx, %rdi |
276 | /* rdi has 4 * VEC_SIZE - remaining length. */ |
277 | cmpl $(VEC_SIZE * 3), %edi |
278 | jae L(8x_last_1x_vec) |
279 | /* Load regardless of branch. */ |
280 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 |
281 | cmpl $(VEC_SIZE * 2), %edi |
282 | jae L(8x_last_2x_vec) |
283 | |
284 | /* Check last 4 VEC. */ |
285 | vmovdqu (%rsi, %rdx), %ymm1 |
286 | VPCMPEQ (%rdx), %ymm1, %ymm1 |
287 | |
288 | vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2 |
289 | VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2 |
290 | |
291 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 |
292 | |
293 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 |
294 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 |
295 | |
296 | vpand %ymm1, %ymm2, %ymm5 |
297 | vpand %ymm3, %ymm4, %ymm6 |
298 | vpand %ymm5, %ymm6, %ymm7 |
299 | vpmovmskb %ymm7, %ecx |
300 | /* Restore s1 pointer to rdi. */ |
301 | movq %rdx, %rdi |
302 | incl %ecx |
303 | jnz L(8x_return_vec_0_1_2_3) |
304 | /* NB: eax must be zero to reach here. */ |
305 | VZEROUPPER_RETURN |
306 | |
307 | /* Only entry is from L(more_8x_vec). */ |
308 | .p2align 4 |
309 | L(8x_last_2x_vec): |
        /* Check the second to last VEC.  rdx stores the end pointer of
           s1 and ymm3 has already been loaded with the second to last
           VEC from s2.  */
313 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 |
314 | vpmovmskb %ymm3, %eax |
315 | incl %eax |
316 | jnz L(8x_return_vec_2) |
317 | /* Check last VEC. */ |
318 | .p2align 4 |
319 | L(8x_last_1x_vec): |
320 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 |
321 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 |
322 | vpmovmskb %ymm4, %eax |
323 | incl %eax |
324 | jnz L(8x_return_vec_3) |
325 | VZEROUPPER_RETURN |
326 | |
327 | .p2align 4 |
328 | L(last_2x_vec): |
329 | /* Check second to last VEC. */ |
330 | vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 |
331 | VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 |
332 | vpmovmskb %ymm1, %eax |
333 | incl %eax |
334 | jnz L(return_vec_1_end) |
335 | /* Check last VEC. */ |
336 | L(last_1x_vec): |
337 | vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 |
338 | VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 |
339 | vpmovmskb %ymm1, %eax |
340 | incl %eax |
341 | jnz L(return_vec_0_end) |
342 | VZEROUPPER_RETURN |
343 | |
344 | .p2align 4 |
345 | L(8x_return_vec_2): |
346 | subq $VEC_SIZE, %rdx |
347 | L(8x_return_vec_3): |
348 | tzcntl %eax, %eax |
349 | addq %rdx, %rax |
350 | # ifdef USE_AS_WMEMCMP |
351 | movl (VEC_SIZE * 3)(%rax), %ecx |
352 | xorl %edx, %edx |
353 | cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx |
354 | setg %dl |
355 | leal -1(%rdx, %rdx), %eax |
356 | # else |
357 | movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx |
358 | movzbl (VEC_SIZE * 3)(%rax), %eax |
359 | subl %ecx, %eax |
360 | # endif |
361 | VZEROUPPER_RETURN |
362 | |
363 | .p2align 4 |
364 | L(return_vec_1_end): |
365 | tzcntl %eax, %eax |
366 | addl %edx, %eax |
367 | # ifdef USE_AS_WMEMCMP |
368 | movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx |
369 | xorl %edx, %edx |
370 | cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx |
371 | setg %dl |
372 | leal -1(%rdx, %rdx), %eax |
373 | # else |
374 | movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx |
375 | movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax |
376 | subl %ecx, %eax |
377 | # endif |
378 | VZEROUPPER_RETURN |
379 | |
380 | .p2align 4 |
381 | L(return_vec_0_end): |
382 | tzcntl %eax, %eax |
383 | addl %edx, %eax |
384 | # ifdef USE_AS_WMEMCMP |
385 | movl -VEC_SIZE(%rdi, %rax), %ecx |
386 | xorl %edx, %edx |
387 | cmpl -VEC_SIZE(%rsi, %rax), %ecx |
388 | setg %dl |
389 | leal -1(%rdx, %rdx), %eax |
390 | # else |
391 | movzbl -VEC_SIZE(%rsi, %rax), %ecx |
392 | movzbl -VEC_SIZE(%rdi, %rax), %eax |
393 | subl %ecx, %eax |
394 | # endif |
395 | VZEROUPPER_RETURN |
396 | |
397 | .p2align 4 |
398 | L(less_vec): |
        /* Check if the size is one CHAR or less.  This is necessary
           for size = 0 but is also faster for size = CHAR_SIZE.  */
401 | cmpl $CHAR_SIZE, %edx |
402 | jbe L(one_or_less) |
403 | |
404 | /* Check if loading one VEC from either s1 or s2 could cause a |
405 | page cross. This can have false positives but is by far the |
406 | fastest method. */ |
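        /* The page offset of (s1 | s2) is at least as large as either
           individual offset, so there are no false negatives; it may
           overestimate, which causes the false positives mentioned
           above.  */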
407 | movl %edi, %eax |
408 | orl %esi, %eax |
409 | andl $(PAGE_SIZE - 1), %eax |
410 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
411 | jg L(page_cross_less_vec) |
412 | |
413 | /* No page cross possible. */ |
414 | vmovdqu (%rsi), %ymm2 |
415 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
416 | vpmovmskb %ymm2, %eax |
417 | incl %eax |
418 | /* Result will be zero if s1 and s2 match. Otherwise first set |
419 | bit will be first mismatch. */ |
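        /* bzhi zeroes the bits at positions >= length (edx), discarding
           compare results for bytes past the end of the buffers; the
           flags are set from the masked value, so the branch is only
           taken on an in-bounds mismatch.  */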
420 | bzhil %edx, %eax, %edx |
421 | jnz L(return_vec_0) |
422 | xorl %eax, %eax |
423 | VZEROUPPER_RETURN |
424 | |
425 | .p2align 4 |
426 | L(page_cross_less_vec): |
        /* If USE_AS_WMEMCMP the size can only be 0, 4, 8, 12, 16, 20,
           24, or 28 bytes.  */
429 | cmpl $16, %edx |
430 | jae L(between_16_31) |
431 | # ifndef USE_AS_WMEMCMP |
432 | cmpl $8, %edx |
433 | jae L(between_8_15) |
434 | /* Fall through for [4, 7]. */ |
435 | cmpl $4, %edx |
436 | jb L(between_2_3) |
437 | |
438 | movbe (%rdi), %eax |
439 | movbe (%rsi), %ecx |
440 | shlq $32, %rax |
441 | shlq $32, %rcx |
442 | movbe -4(%rdi, %rdx), %edi |
443 | movbe -4(%rsi, %rdx), %esi |
444 | orq %rdi, %rax |
445 | orq %rsi, %rcx |
446 | subq %rcx, %rax |
447 | /* Fast path for return zero. */ |
448 | jnz L(ret_nonzero) |
449 | /* No ymm register was touched. */ |
450 | ret |
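
        /* The movbe/shift/or sequence above corresponds roughly to the
           following C (sketch only, assuming little-endian x86 and
           4 <= n <= 7; cmp_4_to_7 is just an illustrative name):

               #include <stddef.h>
               #include <stdint.h>
               #include <string.h>

               static int
               cmp_4_to_7 (const unsigned char *s1,
                           const unsigned char *s2, size_t n)
               {
                 uint32_t a0, a1, b0, b1;
                 memcpy (&a0, s1, 4);
                 memcpy (&a1, s1 + n - 4, 4);
                 memcpy (&b0, s2, 4);
                 memcpy (&b1, s2 + n - 4, 4);
                 // Byte-swap so memory order becomes numeric order,
                 // then splice the possibly overlapping halves.
                 uint64_t a = ((uint64_t) __builtin_bswap32 (a0) << 32)
                              | __builtin_bswap32 (a1);
                 uint64_t b = ((uint64_t) __builtin_bswap32 (b0) << 32)
                              | __builtin_bswap32 (b1);
                 if (a == b)
                   return 0;
                 return a > b ? 1 : -1;
               }  */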
451 | |
452 | .p2align 4 |
453 | L(one_or_less): |
454 | jb L(zero) |
455 | movzbl (%rsi), %ecx |
456 | movzbl (%rdi), %eax |
457 | subl %ecx, %eax |
458 | /* No ymm register was touched. */ |
459 | ret |
460 | |
461 | .p2align 4,, 5 |
462 | L(ret_nonzero): |
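        /* The borrow (CF) from the 64-bit subtraction at the jump site
           is still live across the jnz: sbb materializes it as 0 or -1
           and the or with 1 turns that into 1 (s1 > s2) or -1
           (s1 < s2).  */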
463 | sbbl %eax, %eax |
464 | orl $1, %eax |
465 | /* No ymm register was touched. */ |
466 | ret |
467 | |
468 | .p2align 4,, 2 |
469 | L(zero): |
470 | xorl %eax, %eax |
471 | /* No ymm register was touched. */ |
472 | ret |
473 | |
474 | .p2align 4 |
475 | L(between_8_15): |
476 | movbe (%rdi), %rax |
477 | movbe (%rsi), %rcx |
478 | subq %rcx, %rax |
479 | jnz L(ret_nonzero) |
480 | movbe -8(%rdi, %rdx), %rax |
481 | movbe -8(%rsi, %rdx), %rcx |
482 | subq %rcx, %rax |
483 | /* Fast path for return zero. */ |
484 | jnz L(ret_nonzero) |
485 | /* No ymm register was touched. */ |
486 | ret |
487 | # else |
        /* If USE_AS_WMEMCMP, fall through into the 8-15 byte case.  */
489 | vmovq (%rdi), %xmm1 |
490 | vmovq (%rsi), %xmm2 |
491 | VPCMPEQ %xmm1, %xmm2, %xmm2 |
492 | vpmovmskb %xmm2, %eax |
493 | subl $0xffff, %eax |
494 | jnz L(return_vec_0) |
495 | /* Use overlapping loads to avoid branches. */ |
496 | leaq -8(%rdi, %rdx), %rdi |
497 | leaq -8(%rsi, %rdx), %rsi |
498 | vmovq (%rdi), %xmm1 |
499 | vmovq (%rsi), %xmm2 |
500 | VPCMPEQ %xmm1, %xmm2, %xmm2 |
501 | vpmovmskb %xmm2, %eax |
502 | subl $0xffff, %eax |
503 | /* Fast path for return zero. */ |
504 | jnz L(return_vec_0) |
505 | /* No ymm register was touched. */ |
506 | ret |
507 | # endif |
508 | |
509 | .p2align 4,, 10 |
510 | L(between_16_31): |
511 | /* From 16 to 31 bytes. No branch when size == 16. */ |
512 | vmovdqu (%rsi), %xmm2 |
513 | VPCMPEQ (%rdi), %xmm2, %xmm2 |
514 | vpmovmskb %xmm2, %eax |
515 | subl $0xffff, %eax |
516 | jnz L(return_vec_0) |
517 | |
518 | /* Use overlapping loads to avoid branches. */ |
519 | |
520 | vmovdqu -16(%rsi, %rdx), %xmm2 |
521 | leaq -16(%rdi, %rdx), %rdi |
522 | leaq -16(%rsi, %rdx), %rsi |
523 | VPCMPEQ (%rdi), %xmm2, %xmm2 |
524 | vpmovmskb %xmm2, %eax |
525 | subl $0xffff, %eax |
526 | /* Fast path for return zero. */ |
527 | jnz L(return_vec_0) |
528 | /* No ymm register was touched. */ |
529 | ret |
530 | |
531 | # ifdef USE_AS_WMEMCMP |
532 | .p2align 4,, 2 |
533 | L(zero): |
534 | xorl %eax, %eax |
535 | ret |
536 | |
537 | .p2align 4 |
538 | L(one_or_less): |
539 | jb L(zero) |
540 | movl (%rdi), %ecx |
541 | xorl %edx, %edx |
542 | cmpl (%rsi), %ecx |
543 | je L(zero) |
544 | setg %dl |
545 | leal -1(%rdx, %rdx), %eax |
546 | /* No ymm register was touched. */ |
547 | ret |
548 | # else |
549 | |
550 | .p2align 4 |
551 | L(between_2_3): |
552 | /* Load as big endian to avoid branches. */ |
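        /* After the bswap the two loaded bytes sit in bits [31:16]; the
           shr moves them down one bit so that, once the last byte is
           or-ed into bits [7:0], bit 31 is clear and the signed
           subtraction below cannot overflow.  */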
553 | movzwl (%rdi), %eax |
554 | movzwl (%rsi), %ecx |
555 | bswap %eax |
556 | bswap %ecx |
557 | shrl %eax |
558 | shrl %ecx |
559 | movzbl -1(%rdi, %rdx), %edi |
560 | movzbl -1(%rsi, %rdx), %esi |
561 | orl %edi, %eax |
562 | orl %esi, %ecx |
563 | /* Subtraction is okay because the upper bit is zero. */ |
564 | subl %ecx, %eax |
565 | /* No ymm register was touched. */ |
566 | ret |
567 | # endif |
568 | |
569 | END (MEMCMP) |
570 | #endif |
571 | |