/* memcmp with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define SIZE_OFFSET (0)
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y) cmpl x, y
# endif

# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP memcmp
# endif
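/* Strategy: lengths of at most CHAR_PER_VEC chars are handled with
   overlapping scalar or single-xmm loads, lengths up to 8x VEC with
   unrolled 2x-VEC checks plus an end-anchored L(last_2x_vec) tail,
   and anything larger with the aligned 4x-VEC loop in L(loop_4x).  */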

	.text
ENTRY(MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre-SnB).  */
	movl $0xffff, %ecx
# endif
	cmpq $CHAR_PER_VEC, %rdx
	ja L(more_1x_vec)

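	/* Fall through: n is in [0, CHAR_PER_VEC].  Handled below with
	   small-size branches and overlapping loads rather than a
	   char-by-char loop.  */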
# ifdef USE_AS_WMEMCMP
	/* Saves a byte of code by keeping the fall-through path for
	   n = [2, 4] in the initial cache line.  */
	decl %edx
	jle L(cmp_0_1)

	movq (%rsi), %xmm0
	movq (%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_0)

	movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
	movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_end_0_adj)
# else
	cmpl $8, %edx
	ja L(cmp_9_16)

	cmpl $4, %edx
	jb L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
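	/* For memcmpeq only equality matters, so subtract the first and
	   last (possibly overlapping) 4 bytes and OR the differences; any
	   nonzero bit means a mismatch.  */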
	movl (%rsi), %eax
	subl (%rdi), %eax

	movl -4(%rsi, %rdx), %esi
	subl -4(%rdi, %rdx), %esi

	orl %esi, %eax
	ret
#  else
	/* Combine the low and high 4-byte comparisons.  */
	movl -4(%rsi, %rdx), %ecx
	movl -4(%rdi, %rdx), %eax
	shlq $32, %rcx
	shlq $32, %rax
	movl (%rsi), %esi
	movl (%rdi), %edi
	orq %rsi, %rcx
	orq %rdi, %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)
	xorl %eax, %eax
	ret
#  endif

	.p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
	movq (%rsi), %rax
	subq (%rdi), %rax

	movq -8(%rsi, %rdx), %rcx
	subq -8(%rdi, %rdx), %rcx
	orq %rcx, %rax
	/* Convert the 64-bit result to a 32-bit boolean (we should have
	   made the ABI return long).  */
	setnz %cl
	movzbl %cl, %eax
#  else
	movq (%rsi), %rcx
	movq (%rdi), %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)

	movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
	movq -8(%rdi, %rdx, CHAR_SIZE), %rax
	/* Only compute proper return if not-equal.  */
	cmpq %rcx, %rax
	jnz L(ret_nonzero)
	xorl %eax, %eax
#  endif
# endif
	ret

	.p2align 4,, 8
L(cmp_0_1):
	/* Flags set by the earlier comparison against 1.  */
	jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
	movl (%rdi), %ecx
	xorl %edx, %edx
	cmpl (%rsi), %ecx
	je L(cmp_0_0)
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	subl %ecx, %eax
# endif
	ret

	/* Fits in aligning bytes.  */
L(cmp_0_0):
	xorl %eax, %eax
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_start_0):
	bsfl %eax, %eax
	movl (%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
	ret
# else

#  ifndef USE_AS_MEMCMPEQ
	.p2align 4,, 14
L(ret_nonzero):
	/* Need to bswap to get a proper return value without a branch.  */
	bswapq %rcx
	bswapq %rax
	subq %rcx, %rax
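	/* CF is now set iff the s1 qword is lexicographically below the
	   s2 qword (the bswaps gave big-endian order), so sbbl
	   materializes -1 in that case and 0 otherwise; or-ing in 1 maps
	   0 to 1 and leaves -1 alone.  The values are known to differ
	   here.  */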
	sbbl %eax, %eax
	orl $1, %eax
	ret
#  endif

	.p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
	/* No reason to add to the dependency chain on rdx.  Saving a few
	   bytes here doesn't change the number of fetch blocks.  */
	cmpl $1, %edx
	jbe L(cmp_0_1)
#  else
	/* We need the code-size savings here to prevent taking an extra
	   fetch block.  */
	decl %edx
	jle L(cmp_0_1)
#  endif
	movzwl (%rsi), %ecx
	movzwl (%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
	subl %ecx, %eax

	movzbl -1(%rsi, %rdx), %esi
	movzbl -1(%rdi, %rdx), %edi
	subl %edi, %esi
	orl %esi, %eax
#  else
	bswapl %ecx
	bswapl %eax

	/* Implicit right shift by one.  We just need to displace the
	   sign bits.  */
	shrl %ecx
	shrl %eax

	/* Eat a partial register stall here.  Saves code by stopping
	   L(cmp_0_3) from bleeding into the next fetch block and saves
	   an ALU op.  */
	movb (%rsi, %rdx), %cl
	movzbl (%rdi, %rdx), %edi
	orl %edi, %eax
	subl %ecx, %eax
#  endif
	ret
# endif

	.p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
	/* Use 0xffff to test for mismatches on the pmovmskb bitmask.
	   Store it in ecx for code size.  This is preferable to using
	   `incw` as it avoids partial register stalls on older hardware
	   (pre-SnB).  */
	movl $0xffff, %ecx
# endif
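	/* n > CHAR_PER_VEC here, so the first VEC can be checked
	   unconditionally.  */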
	movups (%rsi), %xmm0
	movups (%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
	cmpq $(CHAR_PER_VEC * 2), %rdx
# else
	/* Offset rdx.  Saves just enough code size to keep the
	   L(last_2x_vec) case and the non-zero return in a single
	   cache line.  */
	subq $(CHAR_PER_VEC * 2), %rdx
# endif
	ja L(more_2x_vec)

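	/* n is in (CHAR_PER_VEC, CHAR_PER_VEC * 2]: finish with one
	   end-anchored VEC load that may overlap the VEC just checked.  */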
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
	/* Don't use `incw ax` as the machines this code runs on are
	   liable to have a partial register stall.  */
	jnz L(ret_nonzero_vec_end_0)
# else
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get a short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
	ret

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
	.p2align 4
L(ret_nonzero_vec_end_0_adj):
	addl $3, %edx
#  else
	.p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	addl %edx, %eax
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret
#  ifndef USE_AS_WMEMCMP
	.p2align 4,, 10
L(ret_nonzero_vec_start_0):
	bsfl %eax, %eax
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
	ret
#  endif
# else
# endif

	.p2align 5
L(more_2x_vec):
	movups (VEC_SIZE * 1)(%rsi), %xmm0
	movups (VEC_SIZE * 1)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_vec_start_1)

	cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
	jbe L(last_2x_vec)

	cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
	ja L(more_8x_vec)

	/* Do the comparisons for [65, 96] and [97, 128] 2x VEC at a
	   time.  This can harm performance if there is a non-zero return
	   in [65, 80] or [97, 112] but helps performance otherwise.
	   Generally a zero return is hotter.  */
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jnz L(ret_nonzero_vec_start_2_3)

	cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
	jbe L(last_2x_vec)

	movups (VEC_SIZE * 4)(%rsi), %xmm0
	movups (VEC_SIZE * 4)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 5)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
	jz L(last_2x_vec)
	ret
# else
	jnz L(ret_nonzero_vec_start_4_5)
# endif
	.p2align 4
L(last_2x_vec):
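	/* Compare the final 2x VEC with end-anchored loads.  These may
	   overlap chars that were already checked, which is harmless.  */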
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
	movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
	movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3
	pmovmskb %xmm3, %eax
	subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
	/* Various return targets for memcmpeq.  Will always be hot in
	   the Icache and get a short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
	ret
# else
	jnz L(ret_nonzero_vec_end_1)
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_end_1):
	pmovmskb %xmm1, %ecx
	/* The high 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
	rorl $16, %eax
	xorl %ecx, %eax
	/* Partial register stall.  */

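	/* eax now encodes the mismatches so that its lowest set bit is
	   the position of the first difference relative to the
	   (VEC_SIZE * -2) end-anchored loads; bsfl extracts that byte
	   offset.  */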
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	leal (%rax, %rdx, CHAR_SIZE), %eax
	movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	addl %edx, %eax
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_vec_start_4_5):
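	/* Combine the VEC 4 mask in xmm1 with the AND-combined VEC 4/5
	   mask already in eax so that the lowest set bit of eax marks
	   the first mismatch; the bsfl below then gives its byte offset
	   relative to (VEC_SIZE * 4)(%rdi).  */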
	pmovmskb %xmm1, %edx
	sall $16, %eax
	leal 1(%rax, %rdx), %eax
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret

	.p2align 4,, 8
L(ret_nonzero_vec_start_1):
	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret
# endif

	.p2align 4
L(more_8x_vec):
	subq %rdi, %rsi
	leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
	andq $(VEC_SIZE * -1), %rdi
	addq %rdi, %rsi
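	/* rdi is now aligned down to VEC_SIZE and rsi keeps its original
	   displacement from rdi.  rdx points 6x VEC before the end of
	   s1, so each loop iteration can check 4x VEC (at offsets 2..5
	   VEC) and the loop exits with at most 4x VEC left for the
	   tail.  */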
	.p2align 4
L(loop_4x):
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 3)(%rsi), %xmm1

	PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
	PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

	movups (VEC_SIZE * 4)(%rsi), %xmm2
	movups (VEC_SIZE * 5)(%rsi), %xmm3

	PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
	PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

	pand %xmm0, %xmm1
	pand %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	subl %ecx, %eax
	jnz L(ret_nonzero_loop)

	addq $(VEC_SIZE * 4), %rdi
	addq $(VEC_SIZE * 4), %rsi
	cmpq %rdi, %rdx
	ja L(loop_4x)
	/* Get remaining length in edx.  */
	subl %edi, %edx
	/* Restore offset so we can reuse L(last_2x_vec).  */
	addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
	shrl $2, %edx
# endif
	cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
	jbe L(last_2x_vec)

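	/* More than 2x VEC of unchecked chars remain: compare the next
	   2x VEC here, then fall into L(last_2x_vec) for the (possibly
	   overlapping) final 2x VEC.  */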
	movups (VEC_SIZE * 2)(%rsi), %xmm0
	movups (VEC_SIZE * 2)(%rdi), %xmm1
	PCMPEQ %xmm0, %xmm1
	movups (VEC_SIZE * 3)(%rsi), %xmm2
	movups (VEC_SIZE * 3)(%rdi), %xmm3
	PCMPEQ %xmm2, %xmm3
	pand %xmm1, %xmm3

	pmovmskb %xmm3, %eax
	CHECK_CMP (%ecx, %eax)
	jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
	ret
# else

	.p2align 4
L(ret_nonzero_vec_start_2_3):
	pmovmskb %xmm1, %edx
	sall $16, %eax
	leal 1(%rax, %rdx), %eax

	bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret

	.p2align 4
L(ret_nonzero_loop):
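	/* Combine the four per-VEC compare masks into a 64-bit value
	   whose lowest set bit marks the first mismatch, so bsfq gives
	   the byte offset of the first difference relative to
	   (VEC_SIZE * 2)(%rdi).  */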
	pmovmskb %xmm0, %ecx
	pmovmskb %xmm1, %edx
	sall $(VEC_SIZE * 1), %edx
	leal 1(%rcx, %rdx), %edx
	pmovmskb %xmm2, %ecx
	/* The high 16 bits of eax are guaranteed to be all ones.  Rotate
	   them in so we can do `or + not` with just `xor`.  */
	rorl $16, %eax
	xorl %ecx, %eax

	salq $32, %rax
	orq %rdx, %rax

	bsfq %rax, %rax
#  ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
#  else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
#  endif
	ret
# endif
END(MEMCMP)
#endif