/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
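
/* For concreteness, with VEC_SIZE = 32 the thresholds above work out as
   follows (derived from the constants defined below, not stated
   elsewhere in this file):
     - memcmp (CHAR_PER_VEC = 32 bytes): sizes below 32 take the
       L(less_vec) path, 32..64 use at most 2 vector compares, 65..128
       use at most 4, 129..256 use the unrolled 8-compare path, and
       larger sizes enter the aligned 4 * VEC_SIZE loop.
     - wmemcmp (CHAR_PER_VEC = 8 wide chars): the same thresholds apply
       counted in 4-byte wide characters, i.e. 8..16, 17..32, 33..64
       and above.  */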

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE	4
#  define VPCMP		vpcmpd
# else
#  define CHAR_SIZE	1
#  define VPCMP		vpcmpub
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
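
/* For example, comparing the single element 0xffffffff against
   0x00000001: wmemcmp must treat the elements as wchar_t (a signed
   32-bit int here), so the result is negative (-1 < 1), while memcmp
   compares the same four bytes as unsigned chars and returns a
   positive value (0xff > 0x01).  Hence VPCMP is vpcmpd for wmemcmp but
   vpcmpub for memcmp.  */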

	.section .text.evex,"ax",@progbits
ENTRY (MEMCMP)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4, (%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Zero YMM0.  The 4x VEC reduction is done with vpxor + vptern,
	   so a compare against zero is needed to get the mismatch
	   mask.  */
	vpxorq	%XMM0, %XMM0, %XMM0

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted;
	   the complex address mode would cause unlamination of
	   micro-fused uops.  */

	/* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
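	/* NB: the vpternlogd immediate is an 8-bit truth table over the
	   bit triple (dest, second source, first source), where the
	   first source is the operand written right after the immediate
	   in AT&T syntax.  0xfe is 1 for every row except all-zeros,
	   i.e. the 3-way OR used above; 0xde below computes
	   (dest ^ first source) | second source, folding the s1/s2 xor
	   and the running OR into one instruction.  */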

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
	   ORing with YMM3.  The result is stored in YMM4.  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
	/* Compare YMM4 with 0.  If there are any 1s, s1 and s2 don't
	   match.  */
	VPCMP	$4, %YMM4, %YMM0, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* NB: aligning to 32 here allows for the rest of the jump
	   targets to be tuned for 32-byte alignment.  Most importantly,
	   this ensures the L(more_8x_vec) loop is 32-byte aligned.  */
	.p2align 5
L(less_vec):
	/* Check if one or less CHAR.  This is necessary for size = 0
	   but is also faster for size = CHAR_SIZE.  */
	cmpl	$1, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
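	/* NB: every bit of (edi | esi) is at least the corresponding
	   bit of each input, so the masked OR below is >= each
	   pointer's own page offset.  If the check passes, a VEC_SIZE
	   load from either pointer cannot cross a page; a false
	   positive merely falls back to the slower small-size paths.  */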
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	VMOVU	(%rsi), %YMM2
	VPCMP	$4, (%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	/* Zero out, in eax, any mismatch bits at or beyond index edx
	   (the size in CHARs); bzhil also sets ZF for the branch
	   below.  */
	bzhil	%edx, %eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl
	   zero-idiom above.  */
	setg	%dl
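	/* dl is 1 if the s1 element is greater, else 0, so the lea
	   below computes 2 * rdx - 1, i.e. exactly 1 or -1.  Equality
	   is impossible here since a mismatch bit was found.  */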
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* NB: No p2align necessary.  Alignment % 16 is naturally 1
	   which is good enough for a target not in a loop.  */
L(return_vec_1):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	/* NB: No p2align necessary.  Alignment % 16 is naturally 2
	   which is good enough for a target not in a loop.  */
L(return_vec_2):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(8x_return_vec_0_1_2_3):
	/* Returning from L(more_8x_vec) requires restoring rsi.  */
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPCMP	$4, %YMM1, %YMM0, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPCMP	$4, %YMM2, %YMM0, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPCMP	$4, %YMM3, %YMM0, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	tzcntl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
	VPCMP	$4, %YMM4, %YMM0, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
	VPCMP	$4, %YMM4, %YMM0, %k1
	kmovd	%k1, %ecx
	/* Restore s1 pointer to rdi.  */
	movq	%rdx, %rdi
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4
L(8x_last_2x_vec):
	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	.p2align 4
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret

	.p2align 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(return_vec_1_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(page_cross_less_vec):
	/* If USE_AS_WMEMCMP the size can only be 0, 4, 8, 12, 16, 20,
	   24 or 28 bytes.  */
	cmpl	$(16 / CHAR_SIZE), %edx
	jae	L(between_16_31)
# ifndef USE_AS_WMEMCMP
	cmpl	$8, %edx
	jae	L(between_8_15)
	cmpl	$4, %edx
	jae	L(between_4_7)
L(between_2_3):
	/* Load as big endian to avoid branches.  */
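	/* The sequence below builds (s[0] << 16) | (s[1] << 8) | s[n-1]
	   in each of eax/ecx (for size 2 the last byte is s[1], loaded
	   twice), so a single subtraction produces a result with the
	   correct memcmp sign.  */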
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movzbl	-1(%rdi, %rdx), %edi
	movzbl	-1(%rsi, %rdx), %esi
	orl	%edi, %eax
	orl	%esi, %ecx
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	ret
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	ret

	.p2align 4
L(between_8_15):
# endif
	/* If USE_AS_WMEMCMP, fall through into the 8-15 byte case.  */
	vmovq	(%rdi), %XMM1
	vmovq	(%rsi), %XMM2
	VPCMP	$4, %XMM1, %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq	(%rdi), %XMM1
	vmovq	(%rsi), %XMM2
	VPCMP	$4, %XMM1, %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	VMOVU	(%rsi), %XMM2
	VPCMP	$4, (%rdi), %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	/* Use overlapping loads to avoid branches.  */

	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
	VPCMP	$4, (%rdi), %XMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movl	(%rdi), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi), %ecx
	je	L(zero)
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
	ret
# else

	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid
	   branches.  */
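	/* Each of rax/rcx ends up holding ((first 4 bytes as big
	   endian) << 32) | (last 4 bytes as big endian).  The overlap
	   for sizes 4-7 is harmless: a byte appearing in both halves is
	   simply compared the same way twice, so an unsigned 64-bit
	   comparison of the two values orders exactly like memcmp.
	   subq sets ZF for the equal case, and sbbl/orl turn the borrow
	   into -1 or 1.  */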
	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	jz	L(zero_4_7)
	sbbl	%eax, %eax
	orl	$1, %eax
L(zero_4_7):
	ret
# endif

END (MEMCMP)
#endif