/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < CHAR_PER_VEC
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * CHAR_PER_VEC chars
      one VEC at a time to check for early mismatches. Only do this
      if it is guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.

   When possible the implementation tries to optimize for the frontend
   in the following ways:
   Throughput:
      1. All code sections that fit are able to run optimally out of the
         LSD.
      2. All code sections that fit are able to run optimally out of the
         DSB.
      3. Basic blocks are contained in the minimum number of fetch
         blocks necessary.

   Latency:
      1. Logically connected basic blocks are put in the same
         cache-line.
      2. Logically connected basic blocks that do not fit in the same
         cache-line are put in adjacent lines. This can benefit from
         L2 spatial prefetching and L1 next-line prefetching.

   A rough C sketch of the size dispatch described above follows this
   comment.  */
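
/* Illustrative only, not part of the implementation: a rough C sketch
   of the size dispatch above, written for the byte (memcmp) case with
   VEC_SIZE == 32.  memcmp_sketch and cmp_vec are hypothetical names;
   cmp_vec stands in for a single (possibly masked) 32-byte vector
   compare, and its scalar loop is only there to keep the sketch
   self-contained.

	#include <stddef.h>

	static int
	cmp_vec (const unsigned char *s1, const unsigned char *s2, size_t n)
	{
	  for (size_t i = 0; i < n; i++)
	    if (s1[i] != s2[i])
	      return s1[i] - s2[i];
	  return 0;
	}

	int
	memcmp_sketch (const unsigned char *s1, const unsigned char *s2,
		       size_t n)
	{
	  int r;
	  if (n <= 32)		// [0, VEC]: single masked compare.
	    return cmp_vec (s1, s2, n);
	  if ((r = cmp_vec (s1, s2, 32)) != 0)		// VEC 0, always.
	    return r;
	  if (n <= 64)		// (VEC, 2 * VEC]: overlapping last VEC.
	    return cmp_vec (s1 + n - 32, s2 + n - 32, 32);
	  if ((r = cmp_vec (s1 + 32, s2 + 32, 32)) != 0)	// VEC 1.
	    return r;
	  if (n <= 128)		// (2 * VEC, 4 * VEC]: last two VECs.
	    {
	      if ((r = cmp_vec (s1 + n - 64, s2 + n - 64, 32)) != 0)
		return r;
	      return cmp_vec (s1 + n - 32, s2 + n - 32, 32);
	    }
	  if ((r = cmp_vec (s1 + 64, s2 + 64, 32)) != 0)	// VEC 2.
	    return r;
	  if ((r = cmp_vec (s1 + 96, s2 + 96, 32)) != 0)	// VEC 3.
	    return r;
	  // n > 4 * VEC: either the branchless overlapping 4-VEC tail
	  // (n <= 8 * VEC) or the aligned 4x loop; both are sketched
	  // further down.  Fall back to the scalar stand-in here.
	  return cmp_vec (s1 + 128, s2 + 128, n - 128);
	}
   */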

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_evex_movbe
# endif

# define VMOVU		vmovdqu64

# ifdef USE_AS_WMEMCMP
#  define VMOVU_MASK	vmovdqu32
#  define CHAR_SIZE	4
#  define VPCMP		vpcmpd
#  define VPTEST	vptestmd
# else
#  define VMOVU_MASK	vmovdqu8
#  define CHAR_SIZE	1
#  define VPCMP		vpcmpub
#  define VPTEST	vptestmb
# endif


# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# define XMM0		xmm16
# define XMM1		xmm17
# define XMM2		xmm18
# define YMM0		ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/

	.section .text.evex,"ax",@progbits
/* Cache align memcmp entry. This allows for much more thorough
   frontend optimization.  */
ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$CHAR_PER_VEC, %RDX_LP
	/* Fall through for [0, VEC_SIZE] as it is the hottest.  */
	ja	L(more_1x_vec)

	/* Create mask for CHAR's we want to compare. This allows us to
	   avoid having to include page cross logic.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k2

	/* Safe to load full ymm with mask.  */
	VMOVU_MASK (%rsi), %YMM2{%k2}
	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)
	ret
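
/* Illustrative only: a scalar C model of the masked sub-vector compare
   above for the byte case (CHAR_PER_VEC == 32).  The mask registers
   k2/k1 are modeled as 32-bit integers; masked_cmp_sketch is a
   hypothetical name, not part of this file.

	#include <stdint.h>

	// Returns a bitmask of mismatching byte positions within the
	// first len (<= 32) bytes, 0 if they are all equal.
	static uint32_t
	masked_cmp_sketch (const unsigned char *s1, const unsigned char *s2,
			   unsigned int len)
	{
	  // movl $-1, %ecx; bzhil %edx, %ecx, %ecx; kmovd %ecx, %k2
	  uint32_t k2 = len >= 32 ? 0xffffffffu : ((1u << len) - 1);
	  uint32_t k1 = 0;
	  // Masked load + VPCMP $4 (not-equal) under mask k2: bytes past
	  // len are never touched, so no page-cross handling is needed.
	  for (unsigned int i = 0; i < 32; i++)
	    if (((k2 >> i) & 1) && s1[i] != s2[i])
	      k1 |= (uint32_t) 1 << i;
	  return k1;	// kmovd %k1, %eax; testl %eax, %eax
	}
   */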

	.p2align 4
L(return_vec_0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
	/* NB: no partial register stall here because of the xorl zero
	   idiom above.  */
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(%rsi, %rax), %ecx
	movzbl	(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
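
/* Illustrative only: the branchless -1/+1 computation used on the
   wmemcmp path above (cmpl/setg/leal).  It is reached only when the
   two elements differ, so 0 is never produced; wmemcmp_sign_sketch is
   a hypothetical name.

	#include <stdint.h>

	static int
	wmemcmp_sign_sketch (int32_t a, int32_t b)	// signed compare
	{
	  int g = a > b;	// cmpl + setg %dl
	  return 2 * g - 1;	// leal -1(%rdx, %rdx), %eax -> +1 or -1
	}
   */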


	.p2align 4
L(more_1x_vec):
	/* From VEC to 2 * VEC. No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %YMM1
	/* Use compare not equals to directly check for mismatch.  */
	VPCMP	$4,(%rdi), %YMM1, %k1
	kmovd	%k1, %eax
	/* NB: eax must be destination register if going to
	   L(return_vec_[0,2]). For L(return_vec_3) destination register
	   must be ecx.  */
	testl	%eax, %eax
	jnz	L(return_vec_0)

	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	VMOVU	VEC_SIZE(%rsi), %YMM2
	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	/* Less than 4 * VEC.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_3)

	/* Go to 4x VEC loop.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Load first two VEC from s2 before adjusting addresses.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi

	/* Wait to load from s1 until the addresses have been adjusted,
	   due to unlamination of micro-fusion with complex address
	   modes.  */

	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
	   will have some 1s.  */
	vpxorq	(%rdi), %YMM1, %YMM1
	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2

	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3

	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
	/* Ternary logic to XOR (VEC_SIZE * 3)(%rdi) with YMM4 while
	   ORing with YMM1. Result is stored in YMM4.  */
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4

	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4

	/* Test YMM4 against itself. Store any CHAR mismatches in k1.  */
	VPTEST	%YMM4, %YMM4, %k1
	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret
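
/* Illustrative only: the bitwise combine performed by the vpxorq /
   vpternlogd sequence above, shown on a single 32-bit lane (the
   operations are bitwise, so the element width does not matter).
   a1..a4 are lanes loaded from s1, b1..b4 the corresponding lanes
   from s2; combine_sketch is a hypothetical name.

	#include <stdint.h>

	static uint32_t
	combine_sketch (uint32_t a1, uint32_t b1, uint32_t a2, uint32_t b2,
			uint32_t a3, uint32_t b3, uint32_t a4, uint32_t b4)
	{
	  uint32_t v1 = a1 ^ b1;		// vpxorq
	  uint32_t v2 = a2 ^ b2;		// vpxorq
	  uint32_t v3 = a3 ^ b3;		// vpxorq
	  uint32_t v4 = (b4 ^ a4) | v1;		// vpternlogd $0xde
	  v4 = v4 | v2 | v3;			// vpternlogd $0xfe
	  return v4;	// 0 only if all four pairs match in this lane;
			// VPTEST then turns nonzero lanes into k1 bits.
	}
   */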


	.p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
	movq	%rdx, %rdi
L(8x_return_vec_0_1_2_3):
	addq	%rdi, %rsi
L(return_vec_0_1_2_3):
	VPTEST	%YMM1, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0)

	VPTEST	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1)

	VPTEST	%YMM3, %YMM3, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(return_vec_2)
L(return_vec_3):
	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
	   fetch block and the entire L(*return_vec_0_1_2_3) in one cache
	   line.  */
	bsfl	%ecx, %ecx
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	subl	%ecx, %eax
# endif
	ret


	.p2align 4
L(return_vec_1):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	VEC_SIZE(%rsi, %rax), %ecx
	movzbl	VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(return_vec_2):
	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
	   fetch block.  */
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
	/* rsi stores s2 - s1. This allows the loop to update only one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
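
/* Illustrative only: a C sketch of the pointer setup above and the
   4x-VEC loop below, byte case, VEC_SIZE == 32.  cmp_4vec and
   loop_sketch are hypothetical names; cmp_4vec stands in for the
   vpxorq/vpternlogd/VPTEST combine used in the loop body, and its
   scalar form is only there to keep the sketch self-contained.

	#include <stdint.h>
	#include <stddef.h>

	static int
	cmp_4vec (const unsigned char *a, const unsigned char *b)
	{
	  for (size_t i = 0; i < 4 * 32; i++)
	    if (a[i] != b[i])
	      return 1;
	  return 0;
	}

	static int
	loop_sketch (const unsigned char *s1, const unsigned char *s2,
		     size_t n)	// n > 8 * 32
	{
	  const unsigned char *s1_end = s1 + n - 4 * 32;	// %rdx
	  ptrdiff_t diff = s2 - s1;				// %rsi
	  // Align s1 down to VEC_SIZE, then skip the four VECs that
	  // were already checked on the way here.
	  s1 = (const unsigned char *)
	       (((uintptr_t) s1 & ~(uintptr_t) 31) + 4 * 32);	// %rdi
	  do
	    {
	      if (cmp_4vec (s1, s1 + diff))	// L(loop_4x_vec) body
		return 1;			// mismatch found
	      s1 += 4 * 32;
	    }
	  while (s1 < s1_end);
	  // Tail: the real code dispatches on the remaining length (see
	  // the next sketch); comparing the last 4 VECs covers it all,
	  // at the cost of re-checking some already-compared bytes.
	  return cmp_4vec (s1_end, s1_end + diff);
	}
   */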

	.p2align 4
L(loop_4x_vec):
	VMOVU	(%rsi, %rdi), %YMM1
	vpxorq	(%rdi), %YMM1, %YMM1
	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_return_vec_0_1_2_3)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)

	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3

	VMOVU	(%rsi, %rdx), %YMM1
	vpxorq	(%rdx), %YMM1, %YMM1

	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
	VPTEST	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
	testl	%ecx, %ecx
	jnz	L(8x_end_return_vec_0_1_2_3)
	/* NB: eax must be zero to reach here.  */
	ret
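
/* Illustrative only: the remaining-length dispatch performed after the
   loop above, byte case, VEC_SIZE == 32.  s1_end mirrors %rdx
   (s1 + n - 4 * VEC_SIZE), s1_cur mirrors %rdi on loop exit and diff
   mirrors %rsi (s2 - s1); any_diff and loop_tail_sketch are
   hypothetical names, with any_diff standing in for one or more
   vector compares.

	#include <stddef.h>

	static int
	any_diff (const unsigned char *a, const unsigned char *b, size_t len)
	{
	  for (size_t i = 0; i < len; i++)
	    if (a[i] != b[i])
	      return 1;
	  return 0;
	}

	static int
	loop_tail_sketch (const unsigned char *s1_cur,
			  const unsigned char *s1_end, ptrdiff_t diff)
	{
	  size_t done = (size_t) (s1_cur - s1_end);	// subq %rdx, %rdi
	  if (done >= 3 * 32)	// <= 1 VEC left: check only the last VEC.
	    return any_diff (s1_end + 3 * 32, s1_end + 3 * 32 + diff, 32);
	  if (done >= 2 * 32)	// <= 2 VECs left: check the last two VECs.
	    return any_diff (s1_end + 2 * 32, s1_end + 2 * 32 + diff, 2 * 32);
	  // Otherwise compare the last four (overlapping) VECs
	  // branchlessly, as in the 4 * VEC < n <= 8 * VEC case above.
	  return any_diff (s1_end, s1_end + diff, 4 * 32);
	}
   */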

	/* Only entry is from L(more_8x_vec).  */
	.p2align 4,, 10
L(8x_last_2x_vec):
	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_2)
	/* Naturally aligned to 16 bytes.  */
L(8x_last_1x_vec):
	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(8x_return_vec_3)
	ret

	/* Not ideally aligned (at offset +9 bytes in fetch block), but
	   not aligning keeps it in the same cache line as
	   L(8x_last_1x/2x_vec), so it is likely worth it. It also saves
	   code size.  */
	.p2align 4,, 4
L(8x_return_vec_2):
	subq	$VEC_SIZE, %rdx
L(8x_return_vec_3):
	bsfl	%eax, %eax
# ifdef USE_AS_WMEMCMP
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
	movl	(VEC_SIZE * 3)(%rax), %ecx
	xorl	%edx, %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	addq	%rdx, %rax
	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
	movzbl	(VEC_SIZE * 3)(%rax), %eax
	subl	%ecx, %eax
# endif
	ret

	.p2align 4,, 10
L(last_2x_vec):
	/* Check second to last VEC.  */
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_1_end)

	/* Check last VEC.  */
	.p2align 4
L(last_1x_vec):
	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(return_vec_0_end)
	ret


	/* Don't align. It takes 2 fetch blocks either way, and aligning
	   would cause the code to spill into another cache line.  */
L(return_vec_1_end):
	/* Use bsf to save code size; it is 1 byte shorter than tzcnt.  */
	bsfl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
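
/* Illustrative only: how the mismatch offset is recovered for the
   end-relative vector above, byte case, reached only when
   2 * VEC_SIZE < n <= 4 * VEC_SIZE.  The vector was loaded from
   s + n - 2 * VEC_SIZE, so the differing byte sits at index
   n + bsf (mask) - 2 * VEC_SIZE; the code folds n into the index
   (addl %edx, %eax) and keeps -2 * VEC_SIZE as the addressing-mode
   displacement.  L(return_vec_0_end) below works the same way with a
   -VEC_SIZE displacement.  __builtin_ctz is a GCC/Clang stand-in for
   bsf; end_vec_result_sketch is a hypothetical name.

	#include <stdint.h>
	#include <stddef.h>

	static int
	end_vec_result_sketch (const unsigned char *s1,
			       const unsigned char *s2, size_t n,
			       uint32_t mask)	// mask != 0
	{
	  unsigned int idx = (unsigned int) n + __builtin_ctz (mask);
	  return s1[idx - 2 * 32] - s2[idx - 2 * 32];
	}
   */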

	/* Don't align. It takes 2 fetch blocks either way, and aligning
	   would cause the code to spill into another cache line.  */
L(return_vec_0_end):
	tzcntl	%eax, %eax
	addl	%edx, %eax
# ifdef USE_AS_WMEMCMP
	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
	xorl	%edx, %edx
	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
	setg	%dl
	leal	-1(%rdx, %rdx), %eax
# else
	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
	movzbl	-VEC_SIZE(%rdi, %rax), %eax
	subl	%ecx, %eax
# endif
	ret
	/* 1-byte until next cache line.  */

END (MEMCMP)
#endif