1 | /* __memcmpeq optimized with AVX2. |
2 | Copyright (C) 2017-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | /* __memcmpeq is implemented as: |
22 | 1. Use ymm vector compares when possible. The only case where |
23 | vector compares is not possible for when size < VEC_SIZE |
24 | and loading from either s1 or s2 would cause a page cross. |
25 | 2. Use xmm vector compare when size >= 8 bytes. |
26 | 3. Optimistically compare up to first 4 * VEC_SIZE one at a |
27 | to check for early mismatches. Only do this if its guranteed the |
28 | work is not wasted. |
29 | 4. If size is 8 * VEC_SIZE or less, unroll the loop. |
30 | 5. Compare 4 * VEC_SIZE at a time with the aligned first memory |
31 | area. |
32 | 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. |
33 | 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. |
34 | 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ |
35 | |
36 | # include <sysdep.h> |
37 | |
38 | # ifndef MEMCMPEQ |
39 | # define MEMCMPEQ __memcmpeq_avx2 |
40 | # endif |
41 | |
42 | # define VPCMPEQ vpcmpeqb |
43 | |
44 | # ifndef VZEROUPPER |
45 | # define VZEROUPPER vzeroupper |
46 | # endif |
47 | |
48 | # ifndef SECTION |
49 | # define SECTION(p) p##.avx |
50 | # endif |
51 | |
52 | # define VEC_SIZE 32 |
53 | # define PAGE_SIZE 4096 |
54 | |
55 | .section SECTION(.text), "ax" , @progbits |
56 | ENTRY_P2ALIGN (MEMCMPEQ, 6) |
57 | # ifdef __ILP32__ |
58 | /* Clear the upper 32 bits. */ |
59 | movl %edx, %edx |
60 | # endif |
61 | cmp $VEC_SIZE, %RDX_LP |
62 | jb L(less_vec) |
63 | |
64 | /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ |
65 | vmovdqu (%rsi), %ymm1 |
66 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
67 | vpmovmskb %ymm1, %eax |
68 | incl %eax |
69 | jnz L(return_neq0) |
70 | cmpq $(VEC_SIZE * 2), %rdx |
71 | jbe L(last_1x_vec) |
72 | |
73 | /* Check second VEC no matter what. */ |
74 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
75 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
76 | vpmovmskb %ymm2, %eax |
77 | /* If all 4 VEC where equal eax will be all 1s so incl will overflow |
78 | and set zero flag. */ |
79 | incl %eax |
80 | jnz L(return_neq0) |
81 | |
82 | /* Less than 4 * VEC. */ |
83 | cmpq $(VEC_SIZE * 4), %rdx |
84 | jbe L(last_2x_vec) |
85 | |
86 | /* Check third and fourth VEC no matter what. */ |
87 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
88 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
89 | vpmovmskb %ymm3, %eax |
90 | incl %eax |
91 | jnz L(return_neq0) |
92 | |
93 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
94 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
95 | vpmovmskb %ymm4, %eax |
96 | incl %eax |
97 | jnz L(return_neq0) |
98 | |
99 | /* Go to 4x VEC loop. */ |
100 | cmpq $(VEC_SIZE * 8), %rdx |
101 | ja L(more_8x_vec) |
102 | |
103 | /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any |
104 | branches. */ |
105 | |
106 | /* Adjust rsi and rdi to avoid indexed address mode. This end up |
107 | saving a 16 bytes of code, prevents unlamination, and bottlenecks in |
108 | the AGU. */ |
109 | addq %rdx, %rsi |
110 | vmovdqu -(VEC_SIZE * 4)(%rsi), %ymm1 |
111 | vmovdqu -(VEC_SIZE * 3)(%rsi), %ymm2 |
112 | addq %rdx, %rdi |
113 | |
114 | VPCMPEQ -(VEC_SIZE * 4)(%rdi), %ymm1, %ymm1 |
115 | VPCMPEQ -(VEC_SIZE * 3)(%rdi), %ymm2, %ymm2 |
116 | |
117 | vmovdqu -(VEC_SIZE * 2)(%rsi), %ymm3 |
118 | VPCMPEQ -(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
119 | vmovdqu -VEC_SIZE(%rsi), %ymm4 |
120 | VPCMPEQ -VEC_SIZE(%rdi), %ymm4, %ymm4 |
121 | |
122 | /* Reduce VEC0 - VEC4. */ |
123 | vpand %ymm1, %ymm2, %ymm2 |
124 | vpand %ymm3, %ymm4, %ymm4 |
125 | vpand %ymm2, %ymm4, %ymm4 |
126 | vpmovmskb %ymm4, %eax |
127 | incl %eax |
128 | L(return_neq0): |
129 | L(return_vzeroupper): |
130 | ZERO_UPPER_VEC_REGISTERS_RETURN |
131 | |
132 | /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32 byte |
133 | aligned. */ |
134 | .p2align 5 |
135 | L(less_vec): |
136 | /* Check if one or less char. This is necessary for size = 0 but is |
137 | also faster for size = 1. */ |
138 | cmpl $1, %edx |
139 | jbe L(one_or_less) |
140 | |
141 | /* Check if loading one VEC from either s1 or s2 could cause a page |
142 | cross. This can have false positives but is by far the fastest |
143 | method. */ |
144 | movl %edi, %eax |
145 | orl %esi, %eax |
146 | andl $(PAGE_SIZE - 1), %eax |
147 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
148 | jg L(page_cross_less_vec) |
149 | |
150 | /* No page cross possible. */ |
151 | vmovdqu (%rsi), %ymm2 |
152 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
153 | vpmovmskb %ymm2, %eax |
154 | incl %eax |
155 | /* Result will be zero if s1 and s2 match. Otherwise first set bit |
156 | will be first mismatch. */ |
157 | bzhil %edx, %eax, %eax |
158 | VZEROUPPER_RETURN |
159 | |
160 | /* Relatively cold but placing close to L(less_vec) for 2 byte jump |
161 | encoding. */ |
162 | .p2align 4 |
163 | L(one_or_less): |
164 | jb L(zero) |
165 | movzbl (%rsi), %ecx |
166 | movzbl (%rdi), %eax |
167 | subl %ecx, %eax |
168 | /* No ymm register was touched. */ |
169 | ret |
170 | /* Within the same 16 byte block is L(one_or_less). */ |
171 | L(zero): |
172 | xorl %eax, %eax |
173 | ret |
174 | |
175 | .p2align 4 |
176 | L(last_1x_vec): |
177 | vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 |
178 | VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 |
179 | vpmovmskb %ymm1, %eax |
180 | incl %eax |
181 | VZEROUPPER_RETURN |
182 | |
183 | .p2align 4 |
184 | L(last_2x_vec): |
185 | vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 |
186 | VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 |
187 | vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm2 |
188 | VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm2, %ymm2 |
189 | vpand %ymm1, %ymm2, %ymm2 |
190 | vpmovmskb %ymm2, %eax |
191 | incl %eax |
192 | VZEROUPPER_RETURN |
193 | |
194 | .p2align 4 |
195 | L(more_8x_vec): |
196 | /* Set end of s1 in rdx. */ |
197 | leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx |
198 | /* rsi stores s2 - s1. This allows loop to only update one pointer. |
199 | */ |
200 | subq %rdi, %rsi |
201 | /* Align s1 pointer. */ |
202 | andq $-VEC_SIZE, %rdi |
203 | /* Adjust because first 4x vec where check already. */ |
204 | subq $-(VEC_SIZE * 4), %rdi |
205 | .p2align 4 |
206 | L(loop_4x_vec): |
207 | /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). */ |
208 | vmovdqu (%rsi, %rdi), %ymm1 |
209 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
210 | |
211 | vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 |
212 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
213 | |
214 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 |
215 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
216 | |
217 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 |
218 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
219 | |
220 | vpand %ymm1, %ymm2, %ymm2 |
221 | vpand %ymm3, %ymm4, %ymm4 |
222 | vpand %ymm2, %ymm4, %ymm4 |
223 | vpmovmskb %ymm4, %eax |
224 | incl %eax |
225 | jnz L(return_neq1) |
226 | subq $-(VEC_SIZE * 4), %rdi |
227 | /* Check if s1 pointer at end. */ |
228 | cmpq %rdx, %rdi |
229 | jb L(loop_4x_vec) |
230 | |
231 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 |
232 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 |
233 | subq %rdx, %rdi |
234 | /* rdi has 4 * VEC_SIZE - remaining length. */ |
235 | cmpl $(VEC_SIZE * 3), %edi |
236 | jae L(8x_last_1x_vec) |
237 | /* Load regardless of branch. */ |
238 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 |
239 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 |
240 | cmpl $(VEC_SIZE * 2), %edi |
241 | jae L(8x_last_2x_vec) |
242 | /* Check last 4 VEC. */ |
243 | vmovdqu VEC_SIZE(%rsi, %rdx), %ymm1 |
244 | VPCMPEQ VEC_SIZE(%rdx), %ymm1, %ymm1 |
245 | |
246 | vmovdqu (%rsi, %rdx), %ymm2 |
247 | VPCMPEQ (%rdx), %ymm2, %ymm2 |
248 | |
249 | vpand %ymm3, %ymm4, %ymm4 |
250 | vpand %ymm1, %ymm2, %ymm3 |
251 | L(8x_last_2x_vec): |
252 | vpand %ymm3, %ymm4, %ymm4 |
253 | L(8x_last_1x_vec): |
254 | vpmovmskb %ymm4, %eax |
255 | /* Restore s1 pointer to rdi. */ |
256 | incl %eax |
257 | L(return_neq1): |
258 | VZEROUPPER_RETURN |
259 | |
260 | /* Relatively cold case as page cross are unexpected. */ |
261 | .p2align 4 |
262 | L(page_cross_less_vec): |
263 | cmpl $16, %edx |
264 | jae L(between_16_31) |
265 | cmpl $8, %edx |
266 | ja L(between_9_15) |
267 | cmpl $4, %edx |
268 | jb L(between_2_3) |
269 | /* From 4 to 8 bytes. No branch when size == 4. */ |
270 | movl (%rdi), %eax |
271 | subl (%rsi), %eax |
272 | movl -4(%rdi, %rdx), %ecx |
273 | movl -4(%rsi, %rdx), %edi |
274 | subl %edi, %ecx |
275 | orl %ecx, %eax |
276 | ret |
277 | |
278 | .p2align 4,, 8 |
279 | L(between_16_31): |
280 | /* From 16 to 31 bytes. No branch when size == 16. */ |
281 | |
282 | /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe. |
283 | */ |
284 | vmovdqu (%rsi), %xmm1 |
285 | vpcmpeqb (%rdi), %xmm1, %xmm1 |
286 | vmovdqu -16(%rsi, %rdx), %xmm2 |
287 | vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2 |
288 | vpand %xmm1, %xmm2, %xmm2 |
289 | vpmovmskb %xmm2, %eax |
290 | notw %ax |
291 | /* No ymm register was touched. */ |
292 | ret |
293 | |
294 | .p2align 4,, 8 |
295 | L(between_9_15): |
296 | /* From 9 to 15 bytes. */ |
297 | movq (%rdi), %rax |
298 | subq (%rsi), %rax |
299 | movq -8(%rdi, %rdx), %rcx |
300 | movq -8(%rsi, %rdx), %rdi |
301 | subq %rdi, %rcx |
302 | orq %rcx, %rax |
303 | /* edx is guranteed to be a non-zero int. */ |
304 | cmovnz %edx, %eax |
305 | ret |
306 | |
307 | /* Don't align. This is cold and aligning here will cause code |
308 | to spill into next cache line. */ |
309 | L(between_2_3): |
310 | /* From 2 to 3 bytes. No branch when size == 2. */ |
311 | movzwl (%rdi), %eax |
312 | movzwl (%rsi), %ecx |
313 | subl %ecx, %eax |
314 | movzbl -1(%rdi, %rdx), %ecx |
315 | /* All machines that support evex will insert a "merging uop" |
316 | avoiding any serious partial register stalls. */ |
317 | subb -1(%rsi, %rdx), %cl |
318 | orl %ecx, %eax |
319 | /* No ymm register was touched. */ |
320 | ret |
321 | |
322 | /* 2 Bytes from next cache line. */ |
323 | END (MEMCMPEQ) |
324 | #endif |
325 | |