/* __memcmpeq optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible. The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compares for the 16 to 31 byte page cross case.
   3. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches. Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
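
/* For reference, the core building block used throughout this file,
   written as a rough C intrinsics sketch (illustration only; the
   helper name is invented and not part of glibc):

     #include <immintrin.h>

     // Returns 0 iff the 32 bytes at s1 and s2 are identical.
     static inline unsigned int
     cmp_1x_vec (const void *s1, const void *s2)
     {
       __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
       // Bit i of eqmask is set iff byte i of a equals byte i of b.
       unsigned int eqmask
         = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b));
       // All 32 bytes equal gives 0xffffffff, which incl wraps to 0.
       return eqmask + 1;
     }  */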

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_avx2
# endif

# define VPCMPEQ	vpcmpeqb

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_neq0)
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check the second VEC no matter what.  */
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If all 32 bytes of this VEC compare equal, eax will be all
	   1s, so incl will overflow to zero and set the zero flag.  */
	incl	%eax
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check the third and fourth VEC no matter what.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_neq0)

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(return_neq0)

	/* Go to the 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */
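
	/* Rough C sketch of the branchless idea (illustration only; the
	   helper name is invented and not part of glibc): the last
	   4 * VEC_SIZE bytes are compared relative to the end of the
	   buffers, so every size in (4 * VEC_SIZE, 8 * VEC_SIZE] is
	   covered by possibly overlapping loads, and the four compare
	   results are AND-reduced into one mask.

	     #include <immintrin.h>
	     #include <stddef.h>

	     static unsigned int
	     cmp_last_4x_vec (const unsigned char *s1,
	                      const unsigned char *s2, size_t len)
	     {
	       const unsigned char *e1 = s1 + len - 4 * 32;
	       const unsigned char *e2 = s2 + len - 4 * 32;
	       __m256i all_eq = _mm256_set1_epi8 (-1);
	       for (int i = 0; i < 4; i++)
	         {
	           __m256i a
	             = _mm256_loadu_si256 ((const __m256i *) (e1 + 32 * i));
	           __m256i b
	             = _mm256_loadu_si256 ((const __m256i *) (e2 + 32 * i));
	           all_eq = _mm256_and_si256 (all_eq, _mm256_cmpeq_epi8 (a, b));
	         }
	       // All bytes equal: the mask is 0xffffffff, which wraps to 0.
	       return (unsigned int) _mm256_movemask_epi8 (all_eq) + 1;
	     }  */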

	/* Adjust rsi and rdi to avoid indexed address modes.  This ends
	   up saving 16 bytes of code, prevents micro-op unlamination,
	   and avoids bottlenecks in the AGU.  */
	addq	%rdx, %rsi
	vmovdqu	-(VEC_SIZE * 4)(%rsi), %ymm1
	vmovdqu	-(VEC_SIZE * 3)(%rsi), %ymm2
	addq	%rdx, %rdi

	VPCMPEQ	-(VEC_SIZE * 4)(%rdi), %ymm1, %ymm1
	VPCMPEQ	-(VEC_SIZE * 3)(%rdi), %ymm2, %ymm2

	vmovdqu	-(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu	-VEC_SIZE(%rsi), %ymm4
	VPCMPEQ	-VEC_SIZE(%rdi), %ymm4, %ymm4

	/* Reduce VEC1 - VEC4.  */
	vpand	%ymm1, %ymm2, %ymm2
	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
L(return_neq0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* NB: p2align 5 here ensures that L(loop_4x_vec) is also 32 byte
	   aligned.  */
	.p2align 5
L(less_vec):
	/* Check if the size is one byte or less.  This is necessary for
	   size = 0 but is also faster for size = 1.  */
	cmpl	$1, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a
	   page cross.  This can have false positives but is by far the
	   fastest method.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	/* Result will be zero if s1 and s2 match.  Otherwise the first
	   set bit will be the first mismatch.  bzhi discards the mask
	   bits for bytes at or beyond the requested size.  */
	bzhil	%edx, %eax, %eax
	VZEROUPPER_RETURN
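
	/* Rough C sketch of this L(less_vec) fast path (illustration
	   only; the helper name is invented and not part of glibc).
	   The over-read of up to VEC_SIZE bytes never affects the
	   result because bzhi discards the extra mask bits.

	     #include <immintrin.h>

	     // Returns 0 iff the first len (2 <= len < 32) bytes match.
	     // Assumes the page-cross check above already passed.
	     static unsigned int
	     cmp_less_vec (const void *s1, const void *s2, unsigned int len)
	     {
	       __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
	       __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
	       unsigned int eqmask
	         = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b));
	       // Adding 1 clears the run of low set bits (the matching
	       // prefix) and sets the bit of the first mismatch, if any;
	       // bzhi then keeps only the bits inside the buffer.
	       return _bzhi_u32 (eqmask + 1, len);
	     }

	   The page-cross guard above is, in C terms, roughly
	   (((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
	   > (PAGE_SIZE - VEC_SIZE).  */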

	/* Relatively cold but placed close to L(less_vec) for the 2 byte
	   jump encoding.  */
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret
	/* Within the same 16 byte block as L(one_or_less).  */
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_1x_vec):
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm2
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm2, %ymm2
	vpand	%ymm1, %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set the end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align the s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
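
	/* For reference, a rough C rendering of the setup above and the
	   loop below (illustration only; the helper name is invented and
	   not part of glibc).  The key point is that only one pointer is
	   advanced per iteration.

	     #include <immintrin.h>
	     #include <stddef.h>
	     #include <stdint.h>

	     // Assumes len > 8 * 32 so at least one iteration runs.
	     static unsigned int
	     cmp_loop_4x_vec (const unsigned char *s1,
	                      const unsigned char *s2, size_t len)
	     {
	       const unsigned char *end = s1 + len - 4 * 32;
	       // s2 is only reachable through this constant offset.
	       ptrdiff_t off = (ptrdiff_t) ((uintptr_t) s2 - (uintptr_t) s1);
	       // Align the s1 cursor and skip the 4 VECs already checked.
	       const unsigned char *p = (const unsigned char *)
	         ((uintptr_t) s1 & ~(uintptr_t) 31) + 4 * 32;
	       do
	         {
	           __m256i all_eq = _mm256_set1_epi8 (-1);
	           for (int i = 0; i < 4; i++)
	             {
	               __m256i a
	                 = _mm256_loadu_si256 ((const __m256i *) (p + 32 * i));
	               __m256i b = _mm256_loadu_si256
	                 ((const __m256i *) (p + off + 32 * i));
	               all_eq
	                 = _mm256_and_si256 (all_eq, _mm256_cmpeq_epi8 (a, b));
	             }
	           if (((unsigned int) _mm256_movemask_epi8 (all_eq)) + 1 != 0)
	             return 1;
	           p += 4 * 32;
	         }
	       while (p < end);
	       // The end-relative tail handling below is omitted here.
	       return 0;
	     }  */
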
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1 so get the correct address by adding s1 (in
	   rdi).  */
	vmovdqu	(%rsi, %rdi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm2
	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(return_neq1)
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check if the s1 pointer is at the end.  */
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)
	/* Check the last 4 VEC.  */
	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm1
	VPCMPEQ	VEC_SIZE(%rdx), %ymm1, %ymm1

	vmovdqu	(%rsi, %rdx), %ymm2
	VPCMPEQ	(%rdx), %ymm2, %ymm2

	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm1, %ymm2, %ymm3
L(8x_last_2x_vec):
	vpand	%ymm3, %ymm4, %ymm4
L(8x_last_1x_vec):
	vpmovmskb %ymm4, %eax
	/* incl sets the zero flag iff all compared bytes were equal.  */
	incl	%eax
L(return_neq1):
	VZEROUPPER_RETURN

	/* Relatively cold case as page crosses are unexpected.  */
	.p2align 4
L(page_cross_less_vec):
	cmpl	$16, %edx
	jae	L(between_16_31)
	cmpl	$8, %edx
	ja	L(between_9_15)
	cmpl	$4, %edx
	jb	L(between_2_3)
	/* From 4 to 8 bytes.  No branch when size == 4.  */
	movl	(%rdi), %eax
	subl	(%rsi), %eax
	movl	-4(%rdi, %rdx), %ecx
	movl	-4(%rsi, %rdx), %edi
	subl	%edi, %ecx
	orl	%ecx, %eax
	ret
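
	/* Rough C equivalent of the 4-to-8 byte case above (illustration
	   only; the helper name is invented): two possibly overlapping
	   4-byte loads, one from the head and one from the tail, cover
	   every length in [4, 8] without a branch.

	     #include <string.h>

	     static unsigned int
	     cmp_4_to_8 (const unsigned char *s1, const unsigned char *s2,
	                 unsigned int len)
	     {
	       unsigned int a0, b0, a1, b1;
	       memcpy (&a0, s1, 4);
	       memcpy (&b0, s2, 4);
	       memcpy (&a1, s1 + len - 4, 4);
	       memcpy (&b1, s2 + len - 4, 4);
	       // Nonzero iff either the head or the tail words differ.
	       return (a0 - b0) | (a1 - b1);
	     }  */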

	.p2align 4,, 8
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */

	/* Safe to use xmm[0, 15] as no vzeroupper is needed, so this
	   path is RTM safe.  */
	vmovdqu	(%rsi), %xmm1
	vpcmpeqb (%rdi), %xmm1, %xmm1
	vmovdqu	-16(%rsi, %rdx), %xmm2
	vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
	vpand	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	notw	%ax
	/* No ymm register was touched.  */
	ret
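
	/* Rough C sketch of the 16-to-31 byte case above (illustration
	   only; the helper name is invented).  The movemask of an xmm
	   compare is only 16 bits wide, so a 16-bit NOT is enough to
	   turn "all bytes equal" into 0.

	     #include <immintrin.h>

	     static unsigned int
	     cmp_16_to_31 (const unsigned char *s1, const unsigned char *s2,
	                   unsigned int len)
	     {
	       __m128i a0 = _mm_loadu_si128 ((const __m128i *) s1);
	       __m128i b0 = _mm_loadu_si128 ((const __m128i *) s2);
	       __m128i a1 = _mm_loadu_si128 ((const __m128i *) (s1 + len - 16));
	       __m128i b1 = _mm_loadu_si128 ((const __m128i *) (s2 + len - 16));
	       __m128i eq = _mm_and_si128 (_mm_cmpeq_epi8 (a0, b0),
	                                   _mm_cmpeq_epi8 (a1, b1));
	       // 0xffff (all bytes equal in both halves) becomes 0.
	       return (~(unsigned int) _mm_movemask_epi8 (eq)) & 0xffff;
	     }  */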

	.p2align 4,, 8
L(between_9_15):
	/* From 9 to 15 bytes.  */
	movq	(%rdi), %rax
	subq	(%rsi), %rax
	movq	-8(%rdi, %rdx), %rcx
	movq	-8(%rsi, %rdx), %rdi
	subq	%rdi, %rcx
	orq	%rcx, %rax
	/* edx is guaranteed to be a non-zero int.  */
	cmovnz	%edx, %eax
	ret
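
	/* Rough C equivalent of the 9-to-15 byte case above (illustration
	   only; the helper name is invented).  The two 8-byte differences
	   are ORed into a 64-bit value; a nonzero result whose low 32
	   bits happen to be zero must not be truncated to int, which is
	   why the code above cmovs the (always nonzero) length into the
	   return register instead.

	     #include <stdint.h>
	     #include <string.h>

	     static unsigned int
	     cmp_9_to_15 (const unsigned char *s1, const unsigned char *s2,
	                  unsigned int len)
	     {
	       uint64_t a0, b0, a1, b1;
	       memcpy (&a0, s1, 8);
	       memcpy (&b0, s2, 8);
	       memcpy (&a1, s1 + len - 8, 8);
	       memcpy (&b1, s2 + len - 8, 8);
	       uint64_t diff = (a0 - b0) | (a1 - b1);
	       // Return the nonzero length rather than a truncated diff.
	       return diff != 0 ? len : 0;
	     }  */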

	/* Don't align.  This is cold and aligning here will cause code
	   to spill into the next cache line.  */
L(between_2_3):
	/* From 2 to 3 bytes.  No branch when size == 2.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	subl	%ecx, %eax
	movzbl	-1(%rdi, %rdx), %ecx
	/* All machines that support EVEX will insert a "merging uop",
	   avoiding any serious partial register stalls.  */
	subb	-1(%rsi, %rdx), %cl
	orl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	/* 2 bytes from the next cache line.  */
END (MEMCMPEQ)
#endif