/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where a
      vector compare is not possible is when size < CHAR_PER_VEC and
      loading from either s1 or s2 would cause a page cross.
   2. For sizes from 2 to 7 bytes on a page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compares when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC chars one
      VEC at a time, checking for early mismatches.  Only do this if it
      is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
   (A rough C sketch of this size dispatch follows this comment.)

When possible the implementation tries to optimize for the frontend in
the following ways:
Throughput:
   1. All code sections that fit are able to run optimally out of the
      LSD (loop stream detector).
   2. All code sections that fit are able to run optimally out of the
      DSB (decoded stream buffer).
   3. Basic blocks are contained in the minimum number of fetch blocks
      necessary.

Latency:
   1. Logically connected basic blocks are put in the same cache line.
   2. Logically connected basic blocks that do not fit in the same
      cache line are put in adjacent lines.  This benefits from L2
      spatial prefetching and L1 next-line prefetching.  */
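
/* Illustrative only: a rough C sketch of the size dispatch above.
   vec_cmp () and vec_cmp_masked () are hypothetical helpers standing
   in for one VEC-wide compare; the real code works on mask registers
   and returns straight from the compare sequences.

     int r;
     if (n <= CHAR_PER_VEC)                  // masked single-VEC path
       return vec_cmp_masked (s1, s2, n);
     if ((r = vec_cmp (s1, s2)) != 0)        // VEC 0
       return r;
     if (n <= 2 * CHAR_PER_VEC)              // last VEC, may overlap VEC 0
       return vec_cmp (s1 + n - CHAR_PER_VEC, s2 + n - CHAR_PER_VEC);
     // Larger sizes eagerly check more leading VECs, then finish with
     // either the (possibly overlapping) trailing VECs or the aligned
     // 4x-VEC loop.
 */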

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_evex_movbe
# endif

# define VMOVU vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK vmovdqu32
#  define CHAR_SIZE 4
#  define VPCMP vpcmpd
#  define VPTEST vptestmd
# else
#  define VMOVU_MASK vmovdqu8
#  define CHAR_SIZE 1
#  define VPCMP vpcmpub
#  define VPTEST vptestmb
# endif


# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# define XMM0 xmm16
# define XMM1 xmm17
# define XMM2 xmm18
# define YMM0 ymm16
# define YMM1 ymm17
# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */

	.section .text.evex,"ax",@progbits
/* Cache align the memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, CHAR_PER_VEC] as it is the hottest.  */
	ja L(more_1x_vec)

	/* Create a mask for the CHARs we want to compare.  This allows us
	   to avoid having to include page cross logic.  */
	movl $-1, %ecx
	bzhil %edx, %ecx, %ecx
	kmovd %ecx, %k2

	/* Safe to load a full ymm with the mask.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP $4,(%rdi), %YMM2, %k1{%k2}
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(return_vec_0)
	ret
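
	/* Illustrative only: given 0 <= n <= CHAR_PER_VEC (guaranteed by
	   the branch above), the bzhi above builds, in C terms,

	     mask = (n < 32) ? (1u << n) - 1 : ~0u;

	   Using k2 as both the load mask and the compare mask makes this
	   short-size path safe without page-cross checks, since masked-off
	   CHARs are never loaded or compared.  */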

	.p2align 4
L(return_vec_0):
	tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl (%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (%rsi, %rax), %ecx
	movzbl (%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
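
	/* Illustrative only: for wmemcmp the setg/lea pair above maps the
	   signed comparison to +/-1 branchlessly.  In C terms:

	     return 2 * (int) (a > b) - 1;   // 1 if a > b, else -1

	   This is only reached when a != b, so -1 correctly covers the
	   a < b case.  The byte variant simply subtracts the two
	   zero-extended bytes.  */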


	.p2align 4
L(more_1x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU (%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP $4,(%rdi), %YMM1, %k1
	kmovd %k1, %eax
	/* NB: eax must be the destination register if going to
	   L(return_vec_[0,2]).  For L(return_vec_3) the destination
	   register must be ecx.  */
	testl %eax, %eax
	jnz L(return_vec_0)

	cmpq $(CHAR_PER_VEC * 2), %rdx
	jbe L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU VEC_SIZE(%rsi), %YMM2
	VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq $(CHAR_PER_VEC * 4), %rdx
	jbe L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd %k1, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_3)

	/* Go to the 4x VEC loop.  */
	cmpq $(CHAR_PER_VEC * 8), %rdx
	ja L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
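
	/* Illustrative only: sizes in (4 * CHAR_PER_VEC, 8 * CHAR_PER_VEC]
	   are handled by re-basing both pointers to the end of the buffers
	   and comparing the last 4 VECs, which may partially overlap the 4
	   VECs already checked above.  Roughly, in C terms:

	     s1 += n * CHAR_SIZE - 4 * VEC_SIZE;   // the leaq pair below
	     s2 += n * CHAR_SIZE - 4 * VEC_SIZE;
	     // then xor/accumulate the 4 VECs at the new base

	   Re-checking a few overlapping bytes is cheaper than branching on
	   the exact remainder.  */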

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses are adjusted, to avoid
	   unlamination of micro-fusion with complex addressing modes.  */

	/* The vpxor result will be all 0s if s1 and s2 are equal.
	   Otherwise it will have some 1s.  */
	vpxorq (%rdi), %YMM1, %YMM1
	vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while ORing
	   with YMM1.  Result is stored in YMM4.  */
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
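
	/* Illustrative only: the vpternlog immediates can be read off
	   their truth tables with A = dest, B = src1, C = src2 (bit
	   (A * 4 + B * 2 + C) of the immediate gives the output):

	     0xde = 0b11011110  ->  (A ^ C) | B   // xor with memory, OR in YMM1
	     0xfe = 0b11111110  ->   A | B | C    // accumulate all differences

	   so YMM4 is zero iff all four VEC-wide xors are zero.  */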

	/* Test YMM4 against itself.  Store any CHAR mismatches in k1.  */
	VPTEST %YMM4, %YMM4, %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	kmovd %k1, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret


	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
	addq %rdi, %rsi
L(return_vec_0_1_2_3):
	VPTEST %YMM1, %YMM1, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(return_vec_0)

	VPTEST %YMM2, %YMM2, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(return_vec_1)

	VPTEST %YMM3, %YMM3, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(return_vec_2)
L(return_vec_3):
	/* bsf saves 1 byte over tzcnt.  This keeps L(return_vec_3) in one
	   fetch block and the entire L(*return_vec_0_1_2_3) in one cache
	   line.  */
	bsfl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl %ecx, %eax
# endif
	ret


	.p2align 4
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl VEC_SIZE(%rsi, %rax), %ecx
	movzbl VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set rdx to the end of s1 minus 4 * VEC_SIZE.  */
	leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1.  This allows the loop to update only one
	   pointer.  */
	subq %rdi, %rsi
	/* Align the s1 pointer.  */
	andq $-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were checked already.  */
	subq $-(VEC_SIZE * 4), %rdi
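
	/* Illustrative only: a rough C view of the loop below, which walks
	   only s1 (rdi) and rederives s2 from the constant difference kept
	   in rsi:

	     ptrdiff_t diff = s2 - s1;              // rsi after the subq
	     s1 = ALIGN_DOWN (s1, VEC_SIZE) + 4 * VEC_SIZE;
	     while (s1 < s1_last)                   // s1_last = end - 4 * VEC_SIZE (rdx)
	       {
	         // OR together the xors of 4 VEC pairs at s1 and s1 + diff;
	         // jump to the return paths on any nonzero bit.
	         s1 += 4 * VEC_SIZE;
	       }

	   Aligning s1 (not s2) keeps the s1 loads aligned; the s2 loads may
	   be unaligned, which VMOVU tolerates.  */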

	.p2align 4
L(loop_4x_vec):
	VMOVU (%rsi, %rdi), %YMM1
	vpxorq (%rdi), %YMM1, %YMM1
	VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
	VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST %YMM4, %YMM4, %k1
	kmovd %k1, %ecx
	testl %ecx, %ecx
	jnz L(8x_return_vec_0_1_2_3)
	subq $-(VEC_SIZE * 4), %rdi
	cmpq %rdx, %rdi
	jb L(loop_4x_vec)

	subq %rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl $(VEC_SIZE * 3), %edi
	jae L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl $(VEC_SIZE * 2), %edi
	jae L(8x_last_2x_vec)

	vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

	VMOVU (%rsi, %rdx), %YMM1
	vpxorq (%rdx), %YMM1, %YMM1

	VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST %YMM4, %YMM4, %k1
	kmovd %k1, %ecx
	testl %ecx, %ecx
	jnz L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 10
L(8x_last_2x_vec):
	VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in the fetch block) but
	   not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it.  It also saves
	   code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq (%rdx, %rax, CHAR_SIZE), %rax
	movl (VEC_SIZE * 3)(%rax), %ecx
	xorl %edx, %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	addq %rdx, %rax
	movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl (VEC_SIZE * 3)(%rax), %eax
	subl %ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd %k1, %eax
	testl %eax, %eax
	jnz L(return_vec_0_end)
	ret


	/* Don't align.  Takes two fetch blocks either way and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_1_end):
	/* Use bsf to save code size.  This is necessary to have
	   L(one_or_less) fit in the aligning bytes between.  */
	bsfl %eax, %eax
	addl %edx, %eax
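	/* eax now holds the length (in CHARs) plus the mismatch index, so
	   the end-relative addressing below reaches the mismatching CHAR
	   at base + (n + i) * CHAR_SIZE - 2 * VEC_SIZE.  */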
# ifdef USE_AS_WMEMCMP
	movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret

	/* Don't align.  Takes two fetch blocks either way and aligning
	   would cause code to spill into another cache line.  */
L(return_vec_0_end):
	tzcntl %eax, %eax
	addl %edx, %eax
# ifdef USE_AS_WMEMCMP
	movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl %edx, %edx
	cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg %dl
	leal -1(%rdx, %rdx), %eax
# else
	movzbl -VEC_SIZE(%rsi, %rax), %ecx
	movzbl -VEC_SIZE(%rdi, %rax), %eax
	subl %ecx, %eax
# endif
	ret
	/* 1 byte until the next cache line.  */

END (MEMCMP)
#endif