/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC
      characters one vector at a time to check for early mismatches.
      Only do this if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
   A rough C-level sketch of this dispatch is given below.  */

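/* A rough C-level sketch of the dispatch above.  The helper names are
   hypothetical and do not correspond to labels in this file;
   CHAR_PER_VEC matches the macro defined further down.

       int
       memcmp_sketch (const void *s1, const void *s2, size_t n)
       {
         if (n < CHAR_PER_VEC)
           return cmp_less_vec (s1, s2, n);
         if (n <= 2 * CHAR_PER_VEC)
           return cmp_up_to_2x_vec (s1, s2, n);
         if (n <= 4 * CHAR_PER_VEC)
           return cmp_up_to_4x_vec (s1, s2, n);
         if (n <= 8 * CHAR_PER_VEC)
           return cmp_up_to_8x_vec (s1, s2, n);
         return loop_4x_vec (s1, s2, n);
       }  */
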
# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP        __memcmp_evex_movbe
# endif

# define VMOVU          vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE     4
#  define VPCMP         vpcmpd
# else
#  define CHAR_SIZE     1
#  define VPCMP         vpcmpub
# endif

# define VEC_SIZE       32
# define PAGE_SIZE      4096
# define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)

# define XMM0           xmm16
# define XMM1           xmm17
# define XMM2           xmm18
# define YMM0           ymm16
# define YMM1           ymm17
# define YMM2           ymm18
# define YMM3           ymm19
# define YMM4           ymm20
# define YMM5           ymm21
# define YMM6           ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */

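/* A concrete example of why the distinction matters (illustrative
   sketch): comparing the single bytes 0x01 and 0xff, memcmp must treat
   0xff as 255 and return a negative value, while wmemcmp comparing the
   wchar_t values 1 and -1 (0xffffffff on x86-64) must return a
   positive value.  In C terms:

       unsigned char a = 0x01, b = 0xff;
       int r_memcmp = (int) a - (int) b;       negative
       int x = 1, y = -1;
       int r_wmemcmp = (x > y) - (x < y);      positive

   Hence vpcmpub (unsigned bytes) for memcmp and vpcmpd (signed dwords)
   for wmemcmp.  */
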
        .section .text.evex,"ax",@progbits
ENTRY (MEMCMP)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl    %edx, %edx
# endif
        cmp     $CHAR_PER_VEC, %RDX_LP
        jb      L(less_vec)

        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU   (%rsi), %YMM1
        /* Use compare not equals to directly check for mismatch.  */
        VPCMP   $4, (%rdi), %YMM1, %k1
        kmovd   %k1, %eax
        /* NB: eax must be destination register if going to
           L(return_vec_[0,2]).  For L(return_vec_3) the destination
           register must be ecx.  */
        testl   %eax, %eax
        jnz     L(return_vec_0)

        cmpq    $(CHAR_PER_VEC * 2), %rdx
        jbe     L(last_1x_vec)

        /* Check second VEC no matter what.  */
        VMOVU   VEC_SIZE(%rsi), %YMM2
        VPCMP   $4, VEC_SIZE(%rdi), %YMM2, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_1)

        /* Less than 4 * VEC.  */
        cmpq    $(CHAR_PER_VEC * 4), %rdx
        jbe     L(last_2x_vec)

        /* Check third and fourth VEC no matter what.  */
        VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
        VPCMP   $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_2)

        VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
        VPCMP   $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
        kmovd   %k1, %ecx
        testl   %ecx, %ecx
        jnz     L(return_vec_3)

        /* Zero YMM0.  The 4x VEC reduction is done with vpxor +
           vpternlog, so a compare against zero is needed to turn the
           accumulated result into a mask.  */
        vpxorq  %XMM0, %XMM0, %XMM0
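
        /* How the reduction works (illustrative note): vpternlogd
           computes an arbitrary three-input boolean function selected
           by the immediate.  Immediate 0xfe is the three-input OR
           (A | B | C); immediate 0xde is (A ^ C) | B, which xors the
           destination with the memory operand while oring in the
           running accumulator.  A scalar C analogue of the 4x VEC
           reduction used below and in L(loop_4x_vec):

               acc  = a0 ^ b0;
               acc |= a1 ^ b1;
               acc |= a2 ^ b2;            vpternlogd $0xfe
               acc |= a3 ^ b3;            vpternlogd $0xde
               mismatch = (acc != 0);     VPCMP $4 against YMM0

           so a single compare/kmov/test decides all four vectors.  */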

        /* Go to 4x VEC loop.  */
        cmpq    $(CHAR_PER_VEC * 8), %rdx
        ja      L(more_8x_vec)

        /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
           branches.  */

        /* Load first two VEC from s2 before adjusting addresses.  */
        VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
        VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
        leaq    -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
        leaq    -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

        /* Wait to load from s1 until the addresses have been adjusted;
           the complex address mode needed before the adjustment would
           cause un-lamination of the micro-fused load + op.  */

        /* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
           will have some 1s.  */
        vpxorq  (%rdi), %YMM1, %YMM1
        vpxorq  (VEC_SIZE)(%rdi), %YMM2, %YMM2

        VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
        vpxorq  (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
        /* Or together YMM1, YMM2, and YMM3 into YMM3.  */
        vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

        VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
        /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
           oring with YMM3.  Result is stored in YMM4.  */
        vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
        /* Compare YMM4 with 0.  If there are any 1s, s1 and s2 don't
           match.  */
        VPCMP   $4, %YMM4, %YMM0, %k1
        kmovd   %k1, %ecx
        testl   %ecx, %ecx
        jnz     L(return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        ret

        /* NB: aligning 32 here allows for the rest of the jump targets
           to be tuned for 32 byte alignment.  Most importantly, this
           ensures the L(more_8x_vec) loop is 32 byte aligned.  */
        .p2align 5
L(less_vec):
        /* Check if one or less CHAR.  This is necessary for size = 0
           but is also faster for size = CHAR_SIZE.  */
        cmpl    $1, %edx
        jbe     L(one_or_less)

        /* Check if loading one VEC from either s1 or s2 could cause a
           page cross.  This can have false positives but is by far the
           fastest method.  */
        movl    %edi, %eax
        orl     %esi, %eax
        andl    $(PAGE_SIZE - 1), %eax
        cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
        jg      L(page_cross_less_vec)
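
        /* The check above is a conservative heuristic (illustrative
           note): the two addresses are or'd before masking, so the test
           may report a possible page cross that would not actually
           happen, but it never misses a real one.  A C sketch:

               unsigned int off = ((uintptr_t) s1 | (uintptr_t) s2)
                                  & (PAGE_SIZE - 1);
               if (off > PAGE_SIZE - VEC_SIZE)
                 goto page_cross_less_vec;

           off is at least as large as either individual page offset,
           so a spurious hit only costs taking the slower path.  */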

        /* No page cross possible.  */
        VMOVU   (%rsi), %YMM2
        VPCMP   $4, (%rdi), %YMM2, %k1
        kmovd   %k1, %eax
        /* Mask off compare bits that are past the end of the buffer.
           The in-bounds mismatch mask is left in eax and bzhil sets ZF
           accordingly.  */
        bzhil   %edx, %eax, %eax
        jnz     L(return_vec_0)
        ret
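
        /* C analogue of the bzhi masking above (illustrative; the shift
           is well defined because this path is only reached with
           1 < size < CHAR_PER_VEC):

               unsigned int mask = eax & ((1u << n) - 1);
               if (mask != 0)
                 goto return_vec_0;

           i.e. only mismatches within the first n characters count.  */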

        .p2align 4
L(return_vec_0):
        tzcntl  %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl    (%rdi, %rax, CHAR_SIZE), %ecx
        xorl    %edx, %edx
        cmpl    (%rsi, %rax, CHAR_SIZE), %ecx
        /* NB: no partial register stall here because of the xorl zero
           idiom above.  */
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        movzbl  (%rsi, %rax), %ecx
        movzbl  (%rdi, %rax), %eax
        subl    %ecx, %eax
# endif
        ret
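
        /* Why the wmemcmp path above does not simply subtract
           (illustrative note): the difference of two 32-bit wchar_t
           values can overflow, so its sign is not reliable.  Instead
           the result is built branchlessly:

               edx = (a > b);           setg %dl  (signed compare)
               eax = 2 * edx - 1;       leal -1(%rdx, %rdx), %eax

           which yields 1 when a > b and -1 when a < b; the values
           cannot be equal here because tzcnt selected a mismatching
           character.  The byte path can subtract directly since
           zero-extended bytes never overflow an int.  */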

        /* NB: No p2align necessary.  Alignment % 16 is naturally 1
           which is good enough for a target not in a loop.  */
L(return_vec_1):
        tzcntl  %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
        xorl    %edx, %edx
        cmpl    VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        movzbl  VEC_SIZE(%rsi, %rax), %ecx
        movzbl  VEC_SIZE(%rdi, %rax), %eax
        subl    %ecx, %eax
# endif
        ret

        /* NB: No p2align necessary.  Alignment % 16 is naturally 2
           which is good enough for a target not in a loop.  */
L(return_vec_2):
        tzcntl  %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
        xorl    %edx, %edx
        cmpl    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        movzbl  (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl  (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl    %ecx, %eax
# endif
        ret

        .p2align 4
L(8x_return_vec_0_1_2_3):
        /* Returning from L(more_8x_vec) requires restoring rsi.  */
        addq    %rdi, %rsi
L(return_vec_0_1_2_3):
        VPCMP   $4, %YMM1, %YMM0, %k0
        kmovd   %k0, %eax
        testl   %eax, %eax
        jnz     L(return_vec_0)

        VPCMP   $4, %YMM2, %YMM0, %k0
        kmovd   %k0, %eax
        testl   %eax, %eax
        jnz     L(return_vec_1)

        VPCMP   $4, %YMM3, %YMM0, %k0
        kmovd   %k0, %eax
        testl   %eax, %eax
        jnz     L(return_vec_2)
L(return_vec_3):
        tzcntl  %ecx, %ecx
# ifdef USE_AS_WMEMCMP
        movl    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
        xorl    %edx, %edx
        cmpl    (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %ecx
        subl    %ecx, %eax
# endif
        ret

        .p2align 4
L(more_8x_vec):
        /* Set end of s1 in rdx.  */
        leaq    -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
        /* rsi stores s2 - s1.  This allows the loop to update only one
           pointer.  */
        subq    %rdi, %rsi
        /* Align s1 pointer.  */
        andq    $-VEC_SIZE, %rdi
        /* Adjust because the first 4x VEC were checked already.  */
        subq    $-(VEC_SIZE * 4), %rdi
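
        /* A C sketch of the loop set-up above (illustrative;
           ALIGN_DOWN and compare_4_vec are hypothetical):

               char *end = s1 + n * CHAR_SIZE - 4 * VEC_SIZE;
               ptrdiff_t diff = s2 - s1;
               s1 = ALIGN_DOWN (s1, VEC_SIZE) + 4 * VEC_SIZE;
               do
                 {
                   compare_4_vec (s1, s1 + diff);
                   s1 += 4 * VEC_SIZE;
                 }
               while (s1 < end);

           Using subq with -(VEC_SIZE * 4) rather than addq with
           VEC_SIZE * 4 lets the constant be encoded as the
           sign-extended 8-bit immediate -128.  */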
        .p2align 4
L(loop_4x_vec):
        VMOVU   (%rsi, %rdi), %YMM1
        vpxorq  (%rdi), %YMM1, %YMM1

        VMOVU   VEC_SIZE(%rsi, %rdi), %YMM2
        vpxorq  VEC_SIZE(%rdi), %YMM2, %YMM2

        VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
        vpxorq  (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
        vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

        VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
        vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
        VPCMP   $4, %YMM4, %YMM0, %k1
        kmovd   %k1, %ecx
        testl   %ecx, %ecx
        jnz     L(8x_return_vec_0_1_2_3)
        subq    $-(VEC_SIZE * 4), %rdi
        cmpq    %rdx, %rdi
        jb      L(loop_4x_vec)

        subq    %rdx, %rdi
        /* rdi has 4 * VEC_SIZE - remaining length.  */
        cmpl    $(VEC_SIZE * 3), %edi
        jae     L(8x_last_1x_vec)
        /* Load regardless of branch.  */
        VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
        cmpl    $(VEC_SIZE * 2), %edi
        jae     L(8x_last_2x_vec)

        VMOVU   (%rsi, %rdx), %YMM1
        vpxorq  (%rdx), %YMM1, %YMM1

        VMOVU   VEC_SIZE(%rsi, %rdx), %YMM2
        vpxorq  VEC_SIZE(%rdx), %YMM2, %YMM2

        vpxorq  (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
        vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

        VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
        vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
        VPCMP   $4, %YMM4, %YMM0, %k1
        kmovd   %k1, %ecx
        /* Restore s1 pointer to rdi.  */
        movq    %rdx, %rdi
        testl   %ecx, %ecx
        jnz     L(8x_return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        ret

        /* Only entry is from L(more_8x_vec).  */
        .p2align 4
L(8x_last_2x_vec):
        VPCMP   $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(8x_return_vec_2)
        /* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
        VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
        VPCMP   $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(8x_return_vec_3)
        ret

        .p2align 4
L(last_2x_vec):
        /* Check second to last VEC.  */
        VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
        VPCMP   $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_1_end)

        /* Check last VEC.  */
        .p2align 4
L(last_1x_vec):
        VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
        VPCMP   $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_0_end)
        ret

        .p2align 4
L(8x_return_vec_2):
        subq    $VEC_SIZE, %rdx
L(8x_return_vec_3):
        tzcntl  %eax, %eax
# ifdef USE_AS_WMEMCMP
        leaq    (%rdx, %rax, CHAR_SIZE), %rax
        movl    (VEC_SIZE * 3)(%rax), %ecx
        xorl    %edx, %edx
        cmpl    (VEC_SIZE * 3)(%rsi, %rax), %ecx
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        addq    %rdx, %rax
        movzbl  (VEC_SIZE * 3)(%rsi, %rax), %ecx
        movzbl  (VEC_SIZE * 3)(%rax), %eax
        subl    %ecx, %eax
# endif
        ret

        .p2align 4
L(return_vec_0_end):
        tzcntl  %eax, %eax
        addl    %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl    -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
        xorl    %edx, %edx
        cmpl    -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        movzbl  -VEC_SIZE(%rsi, %rax), %ecx
        movzbl  -VEC_SIZE(%rdi, %rax), %eax
        subl    %ecx, %eax
# endif
        ret

        .p2align 4
L(return_vec_1_end):
        tzcntl  %eax, %eax
        addl    %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl    -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
        xorl    %edx, %edx
        cmpl    -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
# else
        movzbl  -(VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl  -(VEC_SIZE * 2)(%rdi, %rax), %eax
        subl    %ecx, %eax
# endif
        ret

        .p2align 4
L(page_cross_less_vec):
        /* If USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
           bytes.  */
        cmpl    $(16 / CHAR_SIZE), %edx
        jae     L(between_16_31)
# ifndef USE_AS_WMEMCMP
        cmpl    $8, %edx
        jae     L(between_8_15)
        cmpl    $4, %edx
        jae     L(between_4_7)
L(between_2_3):
        /* Load as big endian to avoid branches.  */
        movzwl  (%rdi), %eax
        movzwl  (%rsi), %ecx
        shll    $8, %eax
        shll    $8, %ecx
        bswap   %eax
        bswap   %ecx
        movzbl  -1(%rdi, %rdx), %edi
        movzbl  -1(%rsi, %rdx), %esi
        orl     %edi, %eax
        orl     %esi, %ecx
        /* Subtraction is okay because the upper 8 bits are zero.  */
        subl    %ecx, %eax
        ret
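
        /* A C sketch of the branchless 2-3 byte compare above
           (illustrative; s1/s2 taken as unsigned char pointers):

               int r1 = (s1[0] << 16) | (s1[1] << 8) | s1[n - 1];
               int r2 = (s2[0] << 16) | (s2[1] << 8) | s2[n - 1];
               return r1 - r2;

           The bytes are packed most-significant-first, so the sign of
           the subtraction matches lexicographic order, and the top byte
           of each value is zero so the subtraction cannot wrap.  */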
        .p2align 4
L(one_or_less):
        jb      L(zero)
        movzbl  (%rsi), %ecx
        movzbl  (%rdi), %eax
        subl    %ecx, %eax
        ret

        .p2align 4
L(between_8_15):
# endif
        /* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
        vmovq   (%rdi), %XMM1
        vmovq   (%rsi), %XMM2
        VPCMP   $4, %XMM1, %XMM2, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_0)
        /* Use overlapping loads to avoid branches.  */
        leaq    -8(%rdi, %rdx, CHAR_SIZE), %rdi
        leaq    -8(%rsi, %rdx, CHAR_SIZE), %rsi
        vmovq   (%rdi), %XMM1
        vmovq   (%rsi), %XMM2
        VPCMP   $4, %XMM1, %XMM2, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_0)
        ret
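
        /* How the overlapping loads cover 8 to 15 bytes without a
           branch (illustrative; cmp8 is a hypothetical helper comparing
           8 bytes): the first vmovq pair covers bytes [0, 8) and the
           second covers bytes [len - 8, len), so together they check
           every byte, possibly re-checking some in the middle.

               int r = cmp8 (s1, s2);
               if (r != 0)
                 return r;
               return cmp8 (s1 + len - 8, s2 + len - 8);

           For wmemcmp the same two windows cover the 2-3 wchar_t sizes
           that fall through to here.  */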

        .p2align 4
L(zero):
        xorl    %eax, %eax
        ret

        .p2align 4
L(between_16_31):
        /* From 16 to 31 bytes.  No branch when size == 16.  */
        VMOVU   (%rsi), %XMM2
        VPCMP   $4, (%rdi), %XMM2, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_0)

        /* Use overlapping loads to avoid branches.  */

        VMOVU   -16(%rsi, %rdx, CHAR_SIZE), %XMM2
        leaq    -16(%rdi, %rdx, CHAR_SIZE), %rdi
        leaq    -16(%rsi, %rdx, CHAR_SIZE), %rsi
        VPCMP   $4, (%rdi), %XMM2, %k1
        kmovd   %k1, %eax
        testl   %eax, %eax
        jnz     L(return_vec_0)
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(one_or_less):
        jb      L(zero)
        movl    (%rdi), %ecx
        xorl    %edx, %edx
        cmpl    (%rsi), %ecx
        je      L(zero)
        setg    %dl
        leal    -1(%rdx, %rdx), %eax
        ret
# else

        .p2align 4
L(between_4_7):
        /* Load as big endian with overlapping movbe to avoid
           branches.  */
        movbe   (%rdi), %eax
        movbe   (%rsi), %ecx
        shlq    $32, %rax
        shlq    $32, %rcx
        movbe   -4(%rdi, %rdx), %edi
        movbe   -4(%rsi, %rdx), %esi
        orq     %rdi, %rax
        orq     %rsi, %rcx
        subq    %rcx, %rax
        jz      L(zero_4_7)
        sbbl    %eax, %eax
        orl     $1, %eax
L(zero_4_7):
        ret
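
        /* A C sketch of the branchless 4-7 byte compare above
           (illustrative; load_be32 is a hypothetical big-endian 32-bit
           load):

               uint64_t r1 = ((uint64_t) load_be32 (s1) << 32)
                             | load_be32 (s1 + n - 4);
               uint64_t r2 = ((uint64_t) load_be32 (s2) << 32)
                             | load_be32 (s2 + n - 4);
               if (r1 == r2)
                 return 0;
               return r1 < r2 ? -1 : 1;

           The two 4-byte windows overlap for n < 8 and cover all n
           bytes; the sbbl/orl pair turns the borrow from the 64-bit
           subtraction into -1 or 1.  */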
# endif

END (MEMCMP)
#endif