/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC one VEC
      at a time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
   (A short summary of the resulting size dispatch follows this
   comment.)

When possible the implementation tries to optimize for the frontend in
the following ways:
Throughput:
    1. All code sections that fit are able to run optimally out of the
       LSD.
    2. All code sections that fit are able to run optimally out of the
       DSB.
    3. Basic blocks are contained in the minimum number of fetch
       blocks necessary.

Latency:
    1. Logically connected basic blocks are put in the same cache
       line.
    2. Logically connected basic blocks that do not fit in the same
       cache line are put in adjacent lines.  This can enable
       beneficial L2 spatial prefetching and L1 next-line
       prefetching.  */
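
/* Informal summary of the size dispatch used below (derived from the
   entry points in this file; CHAR_PER_VEC == VEC_SIZE / CHAR_SIZE):

        size <= 1 * CHAR_PER_VEC   masked single-VEC compare (entry
                                   fall-through)
        size <= 2 * CHAR_PER_VEC   1 VEC, then L(last_1x_vec)
        size <= 4 * CHAR_PER_VEC   2 VEC, then L(last_2x_vec)
        size <= 8 * CHAR_PER_VEC   4 VEC, then a branchless
                                   end-anchored 4x VEC tail
        otherwise                  4 VEC, then L(loop_4x_vec) over the
                                   aligned s1 plus an end-anchored
                                   tail.  */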

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_evex_movbe
# endif

# define VMOVU vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK vmovdqu32
#  define CHAR_SIZE 4
#  define VPCMP vpcmpd
#  define VPTEST vptestmd
# else
#  define VMOVU_MASK vmovdqu8
#  define CHAR_SIZE 1
#  define VPCMP vpcmpub
#  define VPTEST vptestmb
# endif
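
/* The immediate 4 used with VPCMP below selects the "not equal"
   predicate, so set bits in the resulting mask mark mismatching
   CHARs.  */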


# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# define XMM0 xmm16
# define XMM1 xmm17
# define XMM2 xmm18
# define YMM0 ymm16
# define YMM1 ymm17
# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements (wchar_t is a
   signed int on this target).
   memcmp has to use UNSIGNED comparison for elements (they compare
   as unsigned char).  */

        .section .text.evex,"ax",@progbits
/* Cache align memcmp entry.  This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        cmp $CHAR_PER_VEC, %RDX_LP
        /* Fall through for [0, VEC_SIZE] as it is the hottest.  */
        ja L(more_1x_vec)

        /* Create mask for the CHARs we want to compare.  This allows
           us to avoid having to include page cross logic.  */
        movl $-1, %ecx
        bzhil %edx, %ecx, %ecx
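        /* ecx now has only its low edx bits set ((1 << edx) - 1; all
           bits when edx equals the operand width), so k2 will select
           exactly the first edx CHARs.  */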
        kmovd %ecx, %k2

        /* Safe to load full ymm with mask.  */
        VMOVU_MASK (%rsi), %YMM2{%k2}
        VPCMP $4,(%rdi), %YMM2, %k1{%k2}
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_vec_0)
        ret

        .p2align 4
L(return_vec_0):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl (%rdi, %rax, CHAR_SIZE), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax, CHAR_SIZE), %ecx
        /* NB: no partial register stall here because of the xorl zero
           idiom above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
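        /* eax is now 2 * (s1[i] > s2[i]) - 1: 1 if the mismatching
           wchar in s1 is greater, -1 otherwise (signed compare; see
           the warning above).  */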
# else
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        ret


        .p2align 4
L(more_1x_vec):
        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %YMM1
        /* Use compare not equals to directly check for mismatch.  */
        VPCMP $4,(%rdi), %YMM1, %k1
        kmovd %k1, %eax
        /* NB: eax must be the destination register if going to
           L(return_vec_[0,2]).  For L(return_vec_3) the destination
           register must be ecx.  */
        testl %eax, %eax
        jnz L(return_vec_0)

        cmpq $(CHAR_PER_VEC * 2), %rdx
        jbe L(last_1x_vec)

        /* Check second VEC no matter what.  */
        VMOVU VEC_SIZE(%rsi), %YMM2
        VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_vec_1)

        /* Less than 4 * VEC.  */
        cmpq $(CHAR_PER_VEC * 4), %rdx
        jbe L(last_2x_vec)

        /* Check third and fourth VEC no matter what.  */
        VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
        VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_vec_2)

        VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
        VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
        kmovd %k1, %ecx
        testl %ecx, %ecx
        jnz L(return_vec_3)

        /* Go to 4x VEC loop.  */
        cmpq $(CHAR_PER_VEC * 8), %rdx
        ja L(more_8x_vec)

        /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without
           any branches.  */
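        /* The four end-anchored loads below may overlap the four VECs
           already checked above; that is harmless since those CHARs
           are known to match.  */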

        /* Load first two VEC from s2 before adjusting addresses.  */
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
        leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
        leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

        /* Wait to load from s1 until the addresses have been
           adjusted, due to unlamination of micro-fusion with complex
           address modes.  */

        /* vpxor will be all 0s if s1 and s2 are equal.  Otherwise it
           will have some 1s.  */
        vpxorq (%rdi), %YMM1, %YMM1
        vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2

        VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
        vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

        VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
        /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
           oring with YMM1.  Result is stored in YMM4.  */
        vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

        /* Or together YMM2, YMM3, and YMM4 into YMM4.  */
        vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
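        /* NB: the vpternlog immediate is the 8-bit truth table indexed
           by (dst, src1, src2): 0xde computes (dst ^ src2) | src1 and
           0xfe computes dst | src1 | src2.  */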

        /* Test YMM4 against itself.  Store any CHAR mismatches in
           k1.  */
        VPTEST %YMM4, %YMM4, %k1
        /* k1 must go to ecx for L(return_vec_0_1_2_3).  */
        kmovd %k1, %ecx
        testl %ecx, %ecx
        jnz L(return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        ret


        .p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
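        /* Entered from the end-anchored tail after the 4x loop: rdx
           points VEC_SIZE * 4 before the end of s1 and the YMM1-YMM4
           xor results were computed relative to it, so use it as the
           s1 base.  */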
        movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
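        /* rsi still holds s2 - s1; make it an absolute pointer into
           s2 matching rdi.  */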
        addq %rdi, %rsi
L(return_vec_0_1_2_3):
        VPTEST %YMM1, %YMM1, %k0
        kmovd %k0, %eax
        testl %eax, %eax
        jnz L(return_vec_0)

        VPTEST %YMM2, %YMM2, %k0
        kmovd %k0, %eax
        testl %eax, %eax
        jnz L(return_vec_1)

        VPTEST %YMM3, %YMM3, %k0
        kmovd %k0, %eax
        testl %eax, %eax
        jnz L(return_vec_2)
L(return_vec_3):
        /* bsf saves 1 byte from tzcnt.  This keeps L(return_vec_3) in
           one fetch block and the entire L(*return_vec_0_1_2_3) in
           one cache line.  */
        bsfl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
        xorl %edx, %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
        subl %ecx, %eax
# endif
        ret


        .p2align 4
L(return_vec_1):
        /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in
           one fetch block.  */
        bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
        xorl %edx, %edx
        cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl VEC_SIZE(%rsi, %rax), %ecx
        movzbl VEC_SIZE(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        ret

        .p2align 4,, 10
L(return_vec_2):
        /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in
           one fetch block.  */
        bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        ret

        .p2align 4
L(more_8x_vec):
        /* Set end of s1 in rdx.  */
        leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
        /* rsi stores s2 - s1.  This allows the loop to update only
           one pointer.  */
        subq %rdi, %rsi
        /* Align s1 pointer.  */
        andq $-VEC_SIZE, %rdi
        /* Adjust because the first 4x VEC were already checked.  */
        subq $-(VEC_SIZE * 4), %rdi
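        /* NB: subq with a negative immediate is used rather than addq
           because -(VEC_SIZE * 4) fits in a sign-extended imm8 while
           VEC_SIZE * 4 (128) would need an imm32, saving code
           size.  */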

        .p2align 4
L(loop_4x_vec):
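        /* Compare 4 VECs per iteration: xor each s1/s2 pair, fold the
           four results together with vpternlog, and test once.  */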
        VMOVU (%rsi, %rdi), %YMM1
        vpxorq (%rdi), %YMM1, %YMM1
        VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
        vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
        VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
        vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
        VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
        vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
        vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
        VPTEST %YMM4, %YMM4, %k1
        kmovd %k1, %ecx
        testl %ecx, %ecx
        jnz L(8x_return_vec_0_1_2_3)
        subq $-(VEC_SIZE * 4), %rdi
        cmpq %rdx, %rdi
        jb L(loop_4x_vec)

        subq %rdx, %rdi
        /* rdi has 4 * VEC_SIZE - remaining length.  */
        cmpl $(VEC_SIZE * 3), %edi
        jae L(8x_last_1x_vec)
        /* Load regardless of branch.  */
        VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
        cmpl $(VEC_SIZE * 2), %edi
        jae L(8x_last_2x_vec)

        vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

        VMOVU (%rsi, %rdx), %YMM1
        vpxorq (%rdx), %YMM1, %YMM1

        VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
        vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
        VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
        vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
        vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
        VPTEST %YMM4, %YMM4, %k1
        kmovd %k1, %ecx
        testl %ecx, %ecx
        jnz L(8x_end_return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        ret

        /* Only entry is from L(more_8x_vec).  */
        .p2align 4,, 10
L(8x_last_2x_vec):
        VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(8x_return_vec_2)
        /* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
        VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
        VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(8x_return_vec_3)
        ret

        /* Not ideally aligned (at offset +9 bytes in the fetch block),
           but not aligning keeps it in the same cache line as
           L(8x_last_1x/2x_vec), so it is likely worth it.  It also
           saves code size.  */
        .p2align 4,, 4
L(8x_return_vec_2):
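        /* Back rdx up by one VEC so the shared code below, which
           indexes at (VEC_SIZE * 3), addresses VEC 2 instead of
           VEC 3.  */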
        subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
        bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
        leaq (%rdx, %rax, CHAR_SIZE), %rax
        movl (VEC_SIZE * 3)(%rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        addq %rdx, %rax
        movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 3)(%rax), %eax
        subl %ecx, %eax
# endif
        ret

        .p2align 4,, 10
L(last_2x_vec):
        /* Check second to last VEC.  */
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
        VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_vec_1_end)

        /* Check last VEC.  */
        .p2align 4
L(last_1x_vec):
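        /* This end-anchored load may overlap VECs that were already
           checked; that is harmless since those CHARs are known to
           match.  */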
        VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
        VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
        kmovd %k1, %eax
        testl %eax, %eax
        jnz L(return_vec_0_end)
        ret


        /* Don't align.  Takes 2 fetch blocks either way and aligning
           will cause code to spill into another cache line.  */
L(return_vec_1_end):
        /* Use bsf to save code size.  This is necessary to have
           L(one_or_less) fit in the aligning bytes between.  */
        bsfl %eax, %eax
        addl %edx, %eax
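        /* rdx still holds the length in CHARs, so rax now indexes the
           mismatch relative to the same end-anchored base used for
           the load above.  */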
# ifdef USE_AS_WMEMCMP
        movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
        xorl %edx, %edx
        cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        ret

        /* Don't align.  Takes 2 fetch blocks either way and aligning
           will cause code to spill into another cache line.  */
L(return_vec_0_end):
        tzcntl %eax, %eax
        addl %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
        xorl %edx, %edx
        cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl -VEC_SIZE(%rsi, %rax), %ecx
        movzbl -VEC_SIZE(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        ret
        /* 1 byte until the next cache line.  */

END (MEMCMP)
#endif