/* memcmp with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no separate V2
   implementation, so this file must also be built for ISA V2
   builds.  */
#if ISA_SHOULD_BUILD (2)

#include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ	pcmpeqd
#  define CHAR_SIZE	4
#  define SIZE_OFFSET	(0)
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_SIZE	1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET	(0)
#  define CHECK_CMP(x, y)	subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET	(CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y)	cmpl x, y
# endif

# define VEC_SIZE	16
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP	memcmp
# endif

	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches in the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older
	   hardware (pre-SnB).  */
	movl	$0xffff, %ecx
# endif
	cmpq	$CHAR_PER_VEC, %rdx
	ja	L(more_1x_vec)
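	/* Lengths of at most CHAR_PER_VEC characters are handled
	   inline below; everything longer goes through
	   L(more_1x_vec).  */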

# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
	decl	%edx
	jle	L(cmp_0_1)

	movq	(%rsi), %xmm0
	movq	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)

	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_end_0_adj)
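	/* The two overlapping 8-byte (2-wchar) loads above cover the
	   whole n = [2, 4] range.  Rough C sketch of the idea
	   (illustrative only, not part of the build; assumes
	   uint32_t *s1, *s2 and 2 <= n <= 4):

	       uint64_t a0, b0, a1, b1;
	       memcpy (&a0, s1, 8);          memcpy (&b0, s2, 8);
	       memcpy (&a1, s1 + n - 2, 8);  memcpy (&b1, s2 + n - 2, 8);
	       int mismatch = (a0 != b0) || (a1 != b1);  */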
# else
	cmpl	$8, %edx
	ja	L(cmp_9_16)

	cmpl	$4, %edx
	jb	L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
	movl	(%rsi), %eax
	subl	(%rdi), %eax

	movl	-4(%rsi, %rdx), %esi
	subl	-4(%rdi, %rdx), %esi

	orl	%esi, %eax
	ret
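	/* For memcmpeq only zero/non-zero matters, so the differences
	   of the two overlapping 4-byte loads can simply be OR-ed;
	   the result is non-zero iff the buffers differ.  Rough C
	   sketch (illustrative only; assumes const unsigned char
	   *s1, *s2 and 4 <= n <= 8):

	       uint32_t a0, b0, a1, b1;
	       memcpy (&a0, s1, 4);          memcpy (&b0, s2, 4);
	       memcpy (&a1, s1 + n - 4, 4);  memcpy (&b1, s2 + n - 4, 4);
	       return (a0 - b0) | (a1 - b1);  */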
#  else
	/* Combine the low and high 4-byte loads into a single 8-byte
	   comparison.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	-4(%rdi, %rdx), %eax
	shlq	$32, %rcx
	shlq	$32, %rax
	movl	(%rsi), %esi
	movl	(%rdi), %edi
	orq	%rsi, %rcx
	orq	%rdi, %rax
	/* Only compute the proper return value if not equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
	ret
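	/* rax/rcx above are (tail 4 bytes << 32) | head 4 bytes for
	   each buffer, so a single 64-bit compare covers the whole
	   [4, 8]-byte range; a mismatch is resolved by the bswap trick
	   in L(ret_nonzero).  Rough C sketch of the combine
	   (illustrative only; assumes const unsigned char *s1, *s2 and
	   4 <= n <= 8):

	       uint32_t h1, t1, h2, t2;
	       memcpy (&h1, s1, 4);  memcpy (&t1, s1 + n - 4, 4);
	       memcpy (&h2, s2, 4);  memcpy (&t2, s2 + n - 4, 4);
	       uint64_t a = ((uint64_t) t1 << 32) | h1;
	       uint64_t b = ((uint64_t) t2 << 32) | h2;
	       if (a == b)
	         return 0;  */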
#  endif

	.p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
	movq	(%rsi), %rax
	subq	(%rdi), %rax

	movq	-8(%rsi, %rdx), %rcx
	subq	-8(%rdi, %rdx), %rcx
	orq	%rcx, %rax
	/* Convert the 64-bit result into a 32-bit boolean (we should
	   have made the ABI return long).  */
	setnz	%cl
	movzbl	%cl, %eax
#  else
	movq	(%rsi), %rcx
	movq	(%rdi), %rax
	/* Only compute the proper return value if not equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)

	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute the proper return value if not equal.  */
	cmpq	%rcx, %rax
	jnz	L(ret_nonzero)
	xorl	%eax, %eax
#  endif
# endif
	ret
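	/* L(cmp_9_16) above uses the same overlapping-load idea: two
	   8-byte loads, the second ending at the last byte, cover the
	   whole [9, 16]-byte range (they overlap for n < 16).  */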

	.p2align 4,, 8
L(cmp_0_1):
	/* Flags are still set by the earlier comparison against 1.  */
	jne	L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(cmp_0_0)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
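	/* `setg %dl; leal -1(%rdx, %rdx)` maps the signed comparison
	   to the required return value without a branch:
	   eax = 2 * (*s1 > *s2) - 1, i.e. 1 if greater, otherwise -1
	   (the equal case was already handled above).  The same idiom
	   is used by all of the wmemcmp return paths below.  */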
# else
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
# endif
	ret

	/* Fits in the alignment padding bytes.  */
L(cmp_0_0):
	xorl	%eax, %eax
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movl	(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to compute the proper return value without a
	   branch.  */
	bswapq	%rcx
	bswapq	%rax
	subq	%rcx, %rax
	sbbl	%eax, %eax
	orl	$1, %eax
	ret
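	/* After bswap the first (lowest-addressed) byte is the most
	   significant, so an unsigned 64-bit compare gives the
	   lexicographic order; `sub; sbb; or $1` then turns it into
	   -1/1 without a branch.  Rough C continuation of the sketch
	   above (illustrative only; a holds the bytes from s1, b
	   those from s2):

	       a = __builtin_bswap64 (a);
	       b = __builtin_bswap64 (b);
	       return a < b ? -1 : 1;  */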
#  endif

	.p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving
	   the bytes here doesn't change the number of fetch
	   blocks.  */
	cmpl	$1, %edx
	jbe	L(cmp_0_1)
#  else
	/* We need the smaller code size here to avoid taking an extra
	   fetch block.  */
	decl	%edx
	jle	L(cmp_0_1)
#  endif
	movzwl	(%rsi), %ecx
	movzwl	(%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
	subl	%ecx, %eax

	movzbl	-1(%rsi, %rdx), %esi
	movzbl	-1(%rdi, %rdx), %edi
	subl	%edi, %esi
	orl	%esi, %eax
#  else
	bswapl	%ecx
	bswapl	%eax

	/* Implicit right shift by one.  We just need to displace the
	   sign bits so the subtraction below cannot overflow.  */
	shrl	%ecx
	shrl	%eax

	/* Accept a partial register stall here.  It saves code size,
	   which stops L(cmp_0_3) from bleeding into the next fetch
	   block, and saves an ALU op.  */
	movb	(%rsi, %rdx), %cl
	movzbl	(%rdi, %rdx), %edi
	orl	%edi, %eax
	subl	%ecx, %eax
#  endif
	ret
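	/* In the memcmp path above each operand ends up as
	   (first byte << 23) | (second byte << 15) | (last byte),
	   with the three bytes in disjoint bit fields ordered by
	   significance, so a single subl yields a correctly signed
	   result for n in [2, 3].  */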
# endif

	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches in the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older
	   hardware (pre-SnB).  */
	movl	$0xffff, %ecx
# endif
	movups	(%rsi), %xmm0
	movups	(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq	$(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq	$(CHAR_PER_VEC * 2), %rdx
# endif
	ja	L(more_2x_vec)

	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw %ax`, as the machines this code runs on are
	   liable to incur a partial register stall.  */
	jnz	L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq.  They will always be
	   hot in the icache and get a short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret
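	/* For sizes in (CHAR_PER_VEC, CHAR_PER_VEC * 2] the code above
	   compares the first vector and a (possibly overlapping)
	   vector ending at the last character, so no scalar tail is
	   needed.  L(last_2x_vec) below applies the same idea to the
	   final two vectors.  */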

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl	$3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl	%eax, %eax
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
	ret
#  endif
# else
# endif

	.p2align 5
L(more_2x_vec):
	movups	(VEC_SIZE * 1)(%rsi), %xmm0
	movups	(VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_vec_start_1)

	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe	L(last_2x_vec)

	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja	L(more_8x_vec)

	/* Do the comparisons for [65, 96] and [97, 128] 2x VEC at a
	   time.  This can hurt performance if the non-zero return is
	   in [65, 80] or [97, 112], but helps otherwise.  Generally
	   the zero-return (all-equal) case is hotter.  */
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz	L(ret_nonzero_vec_start_2_3)

	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)

	movups	(VEC_SIZE * 4)(%rsi), %xmm0
	movups	(VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 5)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz	L(last_2x_vec)
	ret
# else
	jnz	L(ret_nonzero_vec_start_4_5)
# endif
	.p2align 4
L(last_2x_vec):
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  They will always be
	   hot in the icache and get a short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz	L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* The high 16 bits of eax are guaranteed to be all ones.
	   Rotate them in so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax
	/* Partial register stall.  */

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal	(%rax, %rdx, CHAR_SIZE), %eax
	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	addl	%edx, %eax
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
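	/* The combined mask in L(ret_nonzero_vec_end_1) works because
	   after `subl %ecx, %eax` the high 16 bits of eax are all ones
	   whenever there is a mismatch: rotating them down and xor-ing
	   with the xmm1 equality mask puts the xmm1 mismatch bits in
	   bits [0, 15], while bits [16, 31] hold the combined equality
	   mask plus one, whose lowest set bit marks the first mismatch
	   of the second vector once the first fully matched.  A single
	   bsf therefore finds the first mismatching byte across both
	   vectors.  */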

	.p2align 4
L(ret_nonzero_vec_start_4_5):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax
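	/* Merging the two equality masks and adding 1 lets the carry
	   run through the all-ones low half when the first vector
	   matched completely, so the lowest set bit of eax always
	   marks the first mismatching byte of the pair.  The same
	   trick is used by L(ret_nonzero_vec_start_2_3) and within
	   L(ret_nonzero_loop).  */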
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(more_8x_vec):
	subq	%rdi, %rsi
	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq	$(VEC_SIZE * -1), %rdi
	addq	%rdi, %rsi
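	/* rsi temporarily holds only the rsi - rdi delta while rdi is
	   rounded down to a VEC_SIZE boundary, so both pointers end up
	   adjusted by the same amount and only rdi has to be checked
	   against the loop bound; rdx now points VEC_SIZE * 6 before
	   the end of the s1 buffer.  The loop below then compares four
	   vectors per iteration at offsets 2..5 * VEC_SIZE.  Rough C
	   sketch of the pointer adjustment (illustrative only; assumes
	   const unsigned char *s1, *s2):

	       uintptr_t delta = (uintptr_t) s2 - (uintptr_t) s1;
	       s1 = (const unsigned char *) ((uintptr_t) s1
					     & ~(uintptr_t) 15);
	       s2 = s1 + delta;  */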
	.p2align 4
L(loop_4x):
	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1

	movups	(VEC_SIZE * 4)(%rsi), %xmm2
	movups	(VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3

	pand	%xmm0, %xmm1
	pand	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl	%ecx, %eax
	jnz	L(ret_nonzero_loop)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi
	cmpq	%rdi, %rdx
	ja	L(loop_4x)
	/* Get remaining length in edx.  */
	subl	%edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	shrl	$2, %edx
# endif
	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe	L(last_2x_vec)


	movups	(VEC_SIZE * 2)(%rsi), %xmm0
	movups	(VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ	%xmm0, %xmm1
	movups	(VEC_SIZE * 3)(%rsi), %xmm2
	movups	(VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ	%xmm2, %xmm3
	pand	%xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz	L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall	$16, %eax
	leal	1(%rax, %rdx), %eax

	bsfl	%eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_loop):
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall	$(VEC_SIZE * 1), %edx
	leal	1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* The high 16 bits of eax are guaranteed to be all ones.
	   Rotate them in so we can do `or + not` with just `xor`.  */
	rorl	$16, %eax
	xorl	%ecx, %eax

	salq	$32, %rax
	orq	%rdx, %rax

	bsfq	%rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
#  else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
#  endif
	ret
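	/* The 64-bit mask assembled above has the first two loop
	   vectors in bits [0, 31] (using the same carry trick as
	   L(ret_nonzero_vec_start_2_3)) and the last two in bits
	   [32, 63] (using the ror/xor trick from
	   L(ret_nonzero_vec_end_1)), so a single bsfq yields the byte
	   offset of the first mismatch relative to VEC_SIZE * 2.  */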
# endif
END(MEMCMP)
#endif