1 | /* memcmp with SSE2. |
2 | Copyright (C) 2017-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | |
20 | #include <isa-level.h> |
21 | |
/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no separate V2
   implementation, so this file must also be built for ISA V2
   builds.  */
24 | #if ISA_SHOULD_BUILD (2) |
25 | |
# include <sysdep.h>
27 | |
28 | # ifndef MEMCMP |
29 | # define MEMCMP __memcmp_sse2 |
30 | # endif |
31 | |
32 | # ifdef USE_AS_WMEMCMP |
33 | # define PCMPEQ pcmpeqd |
34 | # define CHAR_SIZE 4 |
35 | # define SIZE_OFFSET (0) |
36 | # else |
37 | # define PCMPEQ pcmpeqb |
38 | # define CHAR_SIZE 1 |
39 | # endif |
40 | |
41 | # ifdef USE_AS_MEMCMPEQ |
42 | # define SIZE_OFFSET (0) |
43 | # define CHECK_CMP(x, y) subl x, y |
44 | # else |
45 | # ifndef SIZE_OFFSET |
46 | # define SIZE_OFFSET (CHAR_PER_VEC * 2) |
47 | # endif |
48 | # define CHECK_CMP(x, y) cmpl x, y |
49 | # endif |
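/* For plain byte memcmp, rdx is biased down by CHAR_PER_VEC * 2 once
   more than one vector is being compared, and SIZE_OFFSET compensates
   in the end-anchored addressing and length checks.  CHECK_CMP uses
   `sub` for memcmpeq so the mask difference doubles as the return
   value, and a flags-only `cmp` otherwise so the mask in eax stays
   live for the mismatch handlers.  */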
50 | |
51 | # define VEC_SIZE 16 |
52 | # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
53 | |
57 | |
58 | .text |
59 | ENTRY(MEMCMP) |
60 | # ifdef __ILP32__ |
61 | /* Clear the upper 32 bits. */ |
62 | movl %edx, %edx |
63 | # endif |
64 | # ifdef USE_AS_WMEMCMP |
	/* Use 0xffff to test for mismatches in the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre-SnB).  */
69 | movl $0xffff, %ecx |
70 | # endif |
71 | cmpq $CHAR_PER_VEC, %rdx |
72 | ja L(more_1x_vec) |
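	/* n <= CHAR_PER_VEC: handled below with narrow, end-anchored
	   loads that never read outside the input.  */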
73 | |
74 | # ifdef USE_AS_WMEMCMP |
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
77 | decl %edx |
78 | jle L(cmp_0_1) |
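	/* n is in [2, 4] wchars: compare the first two and the last two
	   wchars with (possibly overlapping) 8-byte loads.  */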
79 | |
80 | movq (%rsi), %xmm0 |
81 | movq (%rdi), %xmm1 |
82 | PCMPEQ %xmm0, %xmm1 |
83 | pmovmskb %xmm1, %eax |
84 | subl %ecx, %eax |
85 | jnz L(ret_nonzero_vec_start_0) |
86 | |
87 | movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 |
88 | movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 |
89 | PCMPEQ %xmm0, %xmm1 |
90 | pmovmskb %xmm1, %eax |
91 | subl %ecx, %eax |
92 | jnz L(ret_nonzero_vec_end_0_adj) |
93 | # else |
94 | cmpl $8, %edx |
95 | ja L(cmp_9_16) |
96 | |
97 | cmpl $4, %edx |
98 | jb L(cmp_0_3) |
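	/* n is in [4, 8]: compare the first and the last 4 bytes; the
	   loads may overlap, which is harmless as both buffers overlap
	   by the same amount.  */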
99 | |
100 | # ifdef USE_AS_MEMCMPEQ |
101 | movl (%rsi), %eax |
102 | subl (%rdi), %eax |
103 | |
104 | movl -4(%rsi, %rdx), %esi |
105 | subl -4(%rdi, %rdx), %esi |
106 | |
107 | orl %esi, %eax |
108 | ret |
109 | # else |
	/* Combine the low and high 4-byte comparisons into a single
	   8-byte compare.  */
111 | movl -4(%rsi, %rdx), %ecx |
112 | movl -4(%rdi, %rdx), %eax |
113 | shlq $32, %rcx |
114 | shlq $32, %rax |
115 | movl (%rsi), %esi |
116 | movl (%rdi), %edi |
117 | orq %rsi, %rcx |
118 | orq %rdi, %rax |
119 | /* Only compute proper return if not-equal. */ |
120 | cmpq %rcx, %rax |
121 | jnz L(ret_nonzero) |
122 | xorl %eax, %eax |
123 | ret |
124 | # endif |
125 | |
126 | .p2align 4,, 10 |
127 | L(cmp_9_16): |
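	/* n is in [9, 16]: compare the first and the last 8 bytes with
	   loads anchored at each end (they overlap when n < 16).  */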
128 | # ifdef USE_AS_MEMCMPEQ |
129 | movq (%rsi), %rax |
130 | subq (%rdi), %rax |
131 | |
132 | movq -8(%rsi, %rdx), %rcx |
133 | subq -8(%rdi, %rdx), %rcx |
134 | orq %rcx, %rax |
	/* Convert the 64-bit result into a 32-bit boolean (we should
	   have made the ABI return long).  */
137 | setnz %cl |
138 | movzbl %cl, %eax |
139 | # else |
140 | movq (%rsi), %rcx |
141 | movq (%rdi), %rax |
142 | /* Only compute proper return if not-equal. */ |
143 | cmpq %rcx, %rax |
144 | jnz L(ret_nonzero) |
145 | |
146 | movq -8(%rsi, %rdx, CHAR_SIZE), %rcx |
147 | movq -8(%rdi, %rdx, CHAR_SIZE), %rax |
148 | /* Only compute proper return if not-equal. */ |
149 | cmpq %rcx, %rax |
150 | jnz L(ret_nonzero) |
151 | xorl %eax, %eax |
152 | # endif |
153 | # endif |
154 | ret |
155 | |
156 | .p2align 4,, 8 |
157 | L(cmp_0_1): |
	/* Flags still set by the earlier length check against 1:
	   n == 0 returns 0 via L(cmp_0_0).  */
159 | jne L(cmp_0_0) |
160 | # ifdef USE_AS_WMEMCMP |
161 | movl (%rdi), %ecx |
162 | xorl %edx, %edx |
163 | cmpl (%rsi), %ecx |
164 | je L(cmp_0_0) |
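	/* Map the signed comparison to wmemcmp's return value:
	   edx = (a > b), so 2 * edx - 1 yields 1 if greater and -1 if
	   less (the equal case returned above).  */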
165 | setg %dl |
166 | leal -1(%rdx, %rdx), %eax |
167 | # else |
168 | movzbl (%rdi), %eax |
169 | movzbl (%rsi), %ecx |
170 | subl %ecx, %eax |
171 | # endif |
172 | ret |
173 | |
	/* Fits in the alignment padding bytes.  */
175 | L(cmp_0_0): |
176 | xorl %eax, %eax |
177 | ret |
178 | |
179 | # ifdef USE_AS_WMEMCMP |
180 | .p2align 4 |
181 | L(ret_nonzero_vec_start_0): |
182 | bsfl %eax, %eax |
183 | movl (%rdi, %rax), %ecx |
184 | xorl %edx, %edx |
185 | cmpl (%rsi, %rax), %ecx |
186 | /* NB: no partial register stall here because xorl zero idiom |
187 | above. */ |
188 | setg %dl |
189 | leal -1(%rdx, %rdx), %eax |
190 | ret |
191 | # else |
192 | |
193 | # ifndef USE_AS_MEMCMPEQ |
194 | .p2align 4,, 14 |
195 | L(ret_nonzero): |
	/* Need to bswap to compute the proper return value without a
	   branch.  */
197 | bswapq %rcx |
198 | bswapq %rax |
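	/* After the bswaps the first differing byte is the most
	   significant differing byte, so the unsigned ordering of
	   rax/rcx matches the memcmp ordering.  The subtraction sets CF
	   iff rax < rcx, `sbb` turns that into -1 or 0, and `or $1`
	   maps it to -1 or 1.  */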
199 | subq %rcx, %rax |
200 | sbbl %eax, %eax |
201 | orl $1, %eax |
202 | ret |
203 | # endif |
204 | |
205 | .p2align 4 |
206 | L(cmp_0_3): |
207 | # ifdef USE_AS_MEMCMPEQ |
	/* No reason to add to the dependency chain on rdx.  Saving the
	   bytes here doesn't change the number of fetch blocks.  */
210 | cmpl $1, %edx |
211 | jbe L(cmp_0_1) |
212 | # else |
	/* We need the smaller code size here to avoid taking an extra
	   fetch block.  */
215 | decl %edx |
216 | jle L(cmp_0_1) |
217 | # endif |
218 | movzwl (%rsi), %ecx |
219 | movzwl (%rdi), %eax |
220 | |
221 | # ifdef USE_AS_MEMCMPEQ |
222 | subl %ecx, %eax |
223 | |
224 | movzbl -1(%rsi, %rdx), %esi |
225 | movzbl -1(%rdi, %rdx), %edi |
226 | subl %edi, %esi |
227 | orl %esi, %eax |
228 | # else |
229 | bswapl %ecx |
230 | bswapl %eax |
231 | |
	/* Shift both values right by one so their top bit is clear;
	   the combined 32-bit subtraction below then cannot wrap and
	   its sign gives the correct ordering.  */
234 | shrl %ecx |
235 | shrl %eax |
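	/* eax/ecx now hold bytes 0 and 1 in big-endian order in bits
	   [15, 30], leaving the low byte free for the last byte merged
	   in below, so one 32-bit subtraction orders all three bytes.  */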
236 | |
	/* Eat a partial register stall here.  It saves enough code to
	   stop L(cmp_0_3) from bleeding into the next fetch block and
	   saves an ALU op.  */
240 | movb (%rsi, %rdx), %cl |
241 | movzbl (%rdi, %rdx), %edi |
242 | orl %edi, %eax |
243 | subl %ecx, %eax |
244 | # endif |
245 | ret |
246 | # endif |
247 | |
248 | .p2align 5 |
249 | L(more_1x_vec): |
250 | # ifndef USE_AS_WMEMCMP |
	/* Use 0xffff to test for mismatches in the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre-SnB).  */
255 | movl $0xffff, %ecx |
256 | # endif |
257 | movups (%rsi), %xmm0 |
258 | movups (%rdi), %xmm1 |
259 | PCMPEQ %xmm0, %xmm1 |
260 | pmovmskb %xmm1, %eax |
261 | subl %ecx, %eax |
262 | jnz L(ret_nonzero_vec_start_0) |
263 | # if SIZE_OFFSET == 0 |
264 | cmpq $(CHAR_PER_VEC * 2), %rdx |
265 | # else |
266 | /* Offset rdx. Saves just enough code size to keep the |
267 | L(last_2x_vec) case and the non-zero return in a single |
268 | cache line. */ |
269 | subq $(CHAR_PER_VEC * 2), %rdx |
270 | # endif |
271 | ja L(more_2x_vec) |
272 | |
273 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 |
274 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 |
275 | PCMPEQ %xmm0, %xmm1 |
276 | pmovmskb %xmm1, %eax |
277 | subl %ecx, %eax |
278 | # ifndef USE_AS_MEMCMPEQ |
	/* Don't use `incw ax` as the machines this code runs on are
	   liable to suffer a partial register stall.  */
281 | jnz L(ret_nonzero_vec_end_0) |
282 | # else |
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get a short encoding.  */
285 | L(ret_nonzero_vec_start_1): |
286 | L(ret_nonzero_vec_start_0): |
287 | L(ret_nonzero_vec_end_0): |
288 | # endif |
289 | ret |
290 | |
291 | # ifndef USE_AS_MEMCMPEQ |
292 | # ifdef USE_AS_WMEMCMP |
293 | .p2align 4 |
294 | L(ret_nonzero_vec_end_0_adj): |
295 | addl $3, %edx |
296 | # else |
297 | .p2align 4,, 8 |
298 | # endif |
299 | L(ret_nonzero_vec_end_0): |
300 | bsfl %eax, %eax |
301 | # ifdef USE_AS_WMEMCMP |
302 | leal (%rax, %rdx, CHAR_SIZE), %eax |
303 | movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx |
304 | xorl %edx, %edx |
305 | cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx |
306 | /* NB: no partial register stall here because xorl zero idiom |
307 | above. */ |
308 | setg %dl |
309 | leal -1(%rdx, %rdx), %eax |
310 | # else |
	/* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
	   is negative, the sum is still usable as a 64-bit offset
	   (negative 32-bit values zero-extend to large and often
	   out-of-bounds 64-bit offsets).  Note that `rax` + `rdx` >= 0 is
	   an invariant when `memcmp` is used correctly, but if the input
	   strings `rsi`/`rdi` are concurrently modified while the
	   function runs (a data race) it is possible for `rax` + `rdx` to
	   be negative.  Given that there is virtually no extra cost to
	   using `addq` instead of `addl`, we may as well protect the
	   data-race case.  */
321 | addq %rdx, %rax |
322 | movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx |
323 | movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax |
324 | subl %ecx, %eax |
325 | # endif |
326 | ret |
327 | # ifndef USE_AS_WMEMCMP |
328 | .p2align 4,, 10 |
329 | L(ret_nonzero_vec_start_0): |
330 | bsfl %eax, %eax |
331 | movzbl (%rsi, %rax), %ecx |
332 | movzbl (%rdi, %rax), %eax |
333 | subl %ecx, %eax |
334 | ret |
335 | # endif |
336 | # else |
337 | # endif |
338 | |
339 | .p2align 5 |
340 | L(more_2x_vec): |
341 | movups (VEC_SIZE * 1)(%rsi), %xmm0 |
342 | movups (VEC_SIZE * 1)(%rdi), %xmm1 |
343 | PCMPEQ %xmm0, %xmm1 |
344 | pmovmskb %xmm1, %eax |
345 | subl %ecx, %eax |
346 | jnz L(ret_nonzero_vec_start_1) |
347 | |
348 | cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx |
349 | jbe L(last_2x_vec) |
350 | |
351 | cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx |
352 | ja L(more_8x_vec) |
353 | |
	/* Do the comparisons for [65, 96] and [97, 128] 2x VEC at a
	   time.  This can hurt performance if the non-zero return is in
	   [65, 80] or [97, 112], but helps otherwise.  Generally the
	   zero-return case is hotter.  */
358 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
359 | movups (VEC_SIZE * 2)(%rdi), %xmm1 |
360 | PCMPEQ %xmm0, %xmm1 |
361 | movups (VEC_SIZE * 3)(%rsi), %xmm2 |
362 | movups (VEC_SIZE * 3)(%rdi), %xmm3 |
363 | PCMPEQ %xmm2, %xmm3 |
364 | pand %xmm1, %xmm3 |
365 | |
366 | pmovmskb %xmm3, %eax |
367 | CHECK_CMP (%ecx, %eax) |
368 | jnz L(ret_nonzero_vec_start_2_3) |
369 | |
370 | cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx |
371 | jbe L(last_2x_vec) |
372 | |
373 | movups (VEC_SIZE * 4)(%rsi), %xmm0 |
374 | movups (VEC_SIZE * 4)(%rdi), %xmm1 |
375 | PCMPEQ %xmm0, %xmm1 |
376 | movups (VEC_SIZE * 5)(%rsi), %xmm2 |
377 | movups (VEC_SIZE * 5)(%rdi), %xmm3 |
378 | PCMPEQ %xmm2, %xmm3 |
379 | pand %xmm1, %xmm3 |
380 | |
381 | pmovmskb %xmm3, %eax |
382 | CHECK_CMP (%ecx, %eax) |
383 | # ifdef USE_AS_MEMCMPEQ |
384 | jz L(last_2x_vec) |
385 | ret |
386 | # else |
387 | jnz L(ret_nonzero_vec_start_4_5) |
388 | # endif |
389 | .p2align 4 |
390 | L(last_2x_vec): |
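	/* Compare the last 2x VEC with end-anchored loads; they may
	   overlap vectors that were already checked.  */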
391 | movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 |
392 | movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 |
393 | PCMPEQ %xmm0, %xmm1 |
394 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 |
395 | movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 |
396 | PCMPEQ %xmm2, %xmm3 |
397 | pand %xmm1, %xmm3 |
398 | pmovmskb %xmm3, %eax |
399 | subl %ecx, %eax |
400 | # ifdef USE_AS_MEMCMPEQ |
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get a short encoding.  */
403 | L(ret_nonzero_vec_start_2_3): |
404 | L(ret_nonzero_vec_start_4_5): |
405 | ret |
406 | # else |
407 | jnz L(ret_nonzero_vec_end_1) |
408 | ret |
409 | |
410 | .p2align 4,, 8 |
411 | L(ret_nonzero_vec_end_1): |
412 | pmovmskb %xmm1, %ecx |
	/* High 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
415 | rorl $16, %eax |
416 | xorl %ecx, %eax |
417 | /* Partial register stall. */ |
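	/* eax now has the VEC -2 mismatch bits in its low half and, in
	   its high half, the combined mask plus 1 as a fallback that
	   points into VEC -1 when VEC -2 matched completely.  */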
418 | |
419 | bsfl %eax, %eax |
420 | # ifdef USE_AS_WMEMCMP |
421 | leal (%rax, %rdx, CHAR_SIZE), %eax |
422 | movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx |
423 | xorl %edx, %edx |
424 | cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx |
425 | /* NB: no partial register stall here because xorl zero idiom |
426 | above. */ |
427 | setg %dl |
428 | leal -1(%rdx, %rdx), %eax |
429 | # else |
430 | addl %edx, %eax |
431 | movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx |
432 | movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax |
433 | subl %ecx, %eax |
434 | # endif |
435 | ret |
436 | |
437 | .p2align 4 |
438 | L(ret_nonzero_vec_start_4_5): |
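	/* eax still holds the combined VEC 4/5 equality mask.  Shift it
	   up and add the VEC 4 mask plus 1: the +1 turns the lowest
	   clear bit (first mismatching byte) into the lowest set bit,
	   carrying into the VEC 5 half if VEC 4 matched completely.  */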
439 | pmovmskb %xmm1, %edx |
440 | sall $16, %eax |
441 | leal 1(%rax, %rdx), %eax |
442 | bsfl %eax, %eax |
443 | # ifdef USE_AS_WMEMCMP |
444 | movl (VEC_SIZE * 4)(%rdi, %rax), %ecx |
445 | xorl %edx, %edx |
446 | cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx |
447 | /* NB: no partial register stall here because xorl zero idiom |
448 | above. */ |
449 | setg %dl |
450 | leal -1(%rdx, %rdx), %eax |
451 | # else |
452 | movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx |
453 | movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax |
454 | subl %ecx, %eax |
455 | # endif |
456 | ret |
457 | |
458 | .p2align 4,, 8 |
459 | L(ret_nonzero_vec_start_1): |
460 | bsfl %eax, %eax |
461 | # ifdef USE_AS_WMEMCMP |
462 | movl (VEC_SIZE * 1)(%rdi, %rax), %ecx |
463 | xorl %edx, %edx |
464 | cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx |
465 | /* NB: no partial register stall here because xorl zero idiom |
466 | above. */ |
467 | setg %dl |
468 | leal -1(%rdx, %rdx), %eax |
469 | # else |
470 | movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx |
471 | movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax |
472 | subl %ecx, %eax |
473 | # endif |
474 | ret |
475 | # endif |
476 | |
477 | .p2align 4 |
478 | L(more_8x_vec): |
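	/* Apply to rsi the same adjustment that aligning rdi down to
	   VEC_SIZE applies to rdi (rsi itself may stay unaligned, hence
	   the movups loads).  rdx becomes the address VEC_SIZE * 6
	   bytes before the end of the rdi buffer and serves as the exit
	   bound for the 4x loop below.  */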
479 | subq %rdi, %rsi |
480 | leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx |
481 | andq $(VEC_SIZE * -1), %rdi |
482 | addq %rdi, %rsi |
483 | .p2align 4 |
484 | L(loop_4x): |
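	/* Compare 4x VEC (VEC_SIZE * 4 bytes) per iteration at offsets
	   [2, 5] * VEC_SIZE from the aligned rdi.  */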
485 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
486 | movups (VEC_SIZE * 3)(%rsi), %xmm1 |
487 | |
488 | PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 |
489 | PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1 |
490 | |
491 | movups (VEC_SIZE * 4)(%rsi), %xmm2 |
492 | movups (VEC_SIZE * 5)(%rsi), %xmm3 |
493 | |
494 | PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 |
495 | PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 |
496 | |
497 | pand %xmm0, %xmm1 |
498 | pand %xmm2, %xmm3 |
499 | pand %xmm1, %xmm3 |
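	/* xmm3 now has a zero element in any lane where one of the four
	   compares mismatched, so the mask below is 0xffff only if all
	   4x VEC matched.  */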
500 | |
501 | pmovmskb %xmm3, %eax |
502 | subl %ecx, %eax |
503 | jnz L(ret_nonzero_loop) |
504 | |
505 | addq $(VEC_SIZE * 4), %rdi |
506 | addq $(VEC_SIZE * 4), %rsi |
507 | cmpq %rdi, %rdx |
508 | ja L(loop_4x) |
509 | /* Get remaining length in edx. */ |
510 | subl %edi, %edx |
511 | /* Restore offset so we can reuse L(last_2x_vec). */ |
512 | addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx |
513 | # ifdef USE_AS_WMEMCMP |
514 | shrl $2, %edx |
515 | # endif |
516 | cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx |
517 | jbe L(last_2x_vec) |
518 | |
519 | |
520 | movups (VEC_SIZE * 2)(%rsi), %xmm0 |
521 | movups (VEC_SIZE * 2)(%rdi), %xmm1 |
522 | PCMPEQ %xmm0, %xmm1 |
523 | movups (VEC_SIZE * 3)(%rsi), %xmm2 |
524 | movups (VEC_SIZE * 3)(%rdi), %xmm3 |
525 | PCMPEQ %xmm2, %xmm3 |
526 | pand %xmm1, %xmm3 |
527 | |
528 | pmovmskb %xmm3, %eax |
529 | CHECK_CMP (%ecx, %eax) |
530 | jz L(last_2x_vec) |
531 | # ifdef USE_AS_MEMCMPEQ |
532 | L(ret_nonzero_loop): |
533 | ret |
534 | # else |
535 | |
536 | .p2align 4 |
537 | L(ret_nonzero_vec_start_2_3): |
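	/* Same equality-mask plus 1 trick as
	   L(ret_nonzero_vec_start_4_5), here for VEC 2 and VEC 3.  */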
538 | pmovmskb %xmm1, %edx |
539 | sall $16, %eax |
540 | leal 1(%rax, %rdx), %eax |
541 | |
542 | bsfl %eax, %eax |
543 | # ifdef USE_AS_WMEMCMP |
544 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
545 | xorl %edx, %edx |
546 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
547 | /* NB: no partial register stall here because xorl zero idiom |
548 | above. */ |
549 | setg %dl |
550 | leal -1(%rdx, %rdx), %eax |
551 | # else |
552 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
553 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
554 | subl %ecx, %eax |
555 | # endif |
556 | ret |
557 | |
558 | .p2align 4 |
559 | L(ret_nonzero_loop): |
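	/* Rebuild one 64-bit mask whose lowest set bit gives the byte
	   offset of the first mismatch from (VEC_SIZE * 2)(%rdi): bits
	   [0, 31] cover VEC 2/3 (equality mask plus 1 turns the lowest
	   clear bit into the lowest set bit), bits [32, 47] cover VEC 4,
	   and bits [48, 63] fall back to the combined mask for VEC 5.  */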
560 | pmovmskb %xmm0, %ecx |
561 | pmovmskb %xmm1, %edx |
562 | sall $(VEC_SIZE * 1), %edx |
563 | leal 1(%rcx, %rdx), %edx |
564 | pmovmskb %xmm2, %ecx |
	/* High 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
567 | rorl $16, %eax |
568 | xorl %ecx, %eax |
569 | |
570 | salq $32, %rax |
571 | orq %rdx, %rax |
572 | |
573 | bsfq %rax, %rax |
574 | # ifdef USE_AS_WMEMCMP |
575 | movl (VEC_SIZE * 2)(%rdi, %rax), %ecx |
576 | xorl %edx, %edx |
577 | cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
578 | /* NB: no partial register stall here because xorl zero idiom |
579 | above. */ |
580 | setg %dl |
581 | leal -1(%rdx, %rdx), %eax |
582 | # else |
583 | movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx |
584 | movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax |
585 | subl %ecx, %eax |
586 | # endif |
587 | ret |
588 | # endif |
589 | END(MEMCMP) |
590 | #endif |
591 | |