/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
      to avoid branches.
   2. Use overlapping compare to avoid branch.
   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
      bytes for wmemcmp.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
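
/* A rough pseudo-code sketch of the dispatch below (illustration only,
   not part of the build; the step names are the local labels used in
   this file):

       if (len < VEC_SIZE)        goto less_vec;       scalar/xmm paths
       compare VEC 0;
       if (len <= 2 * VEC_SIZE)   goto last_vec;       overlapping last VEC
       if (len > 8 * VEC_SIZE)    goto more_8x_vec;    align, 4-VEC loop
       if (len < 4 * VEC_SIZE)    goto last_4x_vec;    up to 4 compares
       compare VEC 0..3, then the overlapping last 4 VECs;

   Overlapping compares may test some bytes twice, which is harmless and
   cheaper than branching on the exact remainder.  */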

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
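
/* VEC_MASK is 0xffffffff: vpmovmskb of a 32-byte vector yields one bit
   per byte.  After VPCMPEQ, equal bytes produce set bits, so the mask
   equals VEC_MASK exactly when a whole vector matched.  Subtracting
   VEC_MASK is the same as adding 1 modulo 2^32: the result is zero on a
   full match, and otherwise its lowest set bit marks the first
   mismatching byte, which is what tzcnt extracts in L(first_vec).  */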

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
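
/* Example: comparing the byte 0xff against 0x01, memcmp must report
   "greater" (255 > 1 as unsigned char), while wmemcmp comparing the
   32-bit elements 0xffffffff and 0x00000001 must report "less"
   (-1 < 1 as signed wchar_t).  Hence L(wmemcmp_return) below uses the
   signed setl, while the byte paths subtract zero-extended bytes.  */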

	.section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl	$2, %rdx
# endif
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_vec)

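	/* Set all bits of ymm0: comparing a register with itself is equal
	   in every lane.  The all-ones ymm0 is used with vptest below to
	   test whether a combined compare result is all ones.  */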
	VPCMPEQ	%ymm0, %ymm0, %ymm0
	/* More than 2 * VEC.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)

	/* From 4 * VEC to 8 * VEC, inclusively.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm5

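	/* With ymm0 all ones, vptest sets CF iff ymm5 is all ones, i.e.
	   iff every lane of the four compares matched.  jnc therefore
	   branches only when a difference was found.  */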
	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)

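	/* Point both buffers at their last 4 * VEC_SIZE bytes and redo
	   four compares there.  The two 4-VEC blocks may overlap; bytes
	   in the overlap are simply compared twice, which avoids any
	   branch on the exact remainder.  */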
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpand	%ymm2, %ymm1, %ymm5

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpand	%ymm3, %ymm5, %ymm5

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand	%ymm4, %ymm5, %ymm5

	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)
	xorl	%eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

L(last_vec):
	/* Use overlapping loads to avoid branches.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)
	VZEROUPPER
	ret

	.p2align 4
L(first_vec):
	/* A byte (or a 32-bit element for wmemcmp) differs within the 16
	   or 32 bytes just compared.  */
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(%rdi, %rcx), %edx
	cmpl	(%rsi, %rcx), %edx
L(wmemcmp_return):
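	/* The elements are known to differ here.  setl/negl/orl turns the
	   signed comparison above into -1 when the first buffer is less
	   and +1 otherwise, without a branch.  */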
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(4):
	xorl	%eax, %eax
	movl	(%rdi), %edx
	cmpl	(%rsi), %edx
	jne	L(wmemcmp_return)
	ret
# else
	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid branches.  */
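	/* The first four bytes go into the high half of a 64-bit value and
	   the last four bytes (overlapping the first load, since the size
	   is below 8) into the low half, each byte-swapped by movbe.  With
	   big-endian packing, unsigned 64-bit order matches memcmp's
	   byte-wise order.  */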
	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	je	L(exit)
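	/* CF holds the borrow of the 64-bit subtract above: sbb of a
	   register with itself gives 0 or -1, and or-ing in 1 turns that
	   into +1 or -1.  */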
	sbbl	%eax, %eax
	orl	$1, %eax
	ret

	.p2align 4
L(exit):
	ret

	.p2align 4
L(between_2_3):
	/* Load as big endian to avoid branches.  */
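	/* The shift and bswap place the first byte in bits 23:16 and the
	   second in bits 15:8; the (possibly overlapping) last byte lands
	   in bits 7:0 and the top 8 bits stay zero, so a single 32-bit
	   subtract orders the buffers like memcmp.  */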
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movb	-1(%rdi, %rdx), %al
	movb	-1(%rsi, %rdx), %cl
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	ret

	.p2align 4
L(1):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
	ret
# endif

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(less_vec):
# ifdef USE_AS_WMEMCMP
	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
	cmpb	$4, %dl
	je	L(4)
	jb	L(zero)
# else
	cmpb	$1, %dl
	je	L(1)
	jb	L(zero)
	cmpb	$4, %dl
	jb	L(between_2_3)
	cmpb	$8, %dl
	jb	L(between_4_7)
# endif
	cmpb	$16, %dl
	jae	L(between_16_31)
	/* It is between 8 and 15 bytes.  */
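	/* vmovq zeroes the upper 8 bytes of the xmm register, so the upper
	   half of both operands always compares equal; the 16-bit mask from
	   vpmovmskb is therefore 0xffff exactly when the 8 loaded bytes
	   match.  */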
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx), %rdi
	leaq	-8(%rsi, %rdx), %rsi
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)

	/* Use overlapping loads to avoid branches.  */
	leaq	-16(%rdi, %rdx), %rdi
	leaq	-16(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)
	ret

	.p2align 4
L(more_8x_vec):
	/* More than 8 * VEC.  Check the first VEC.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	/* Align the first memory area for aligned loads in the loop.
	   Compute how much the first memory area is misaligned.  */
	movq	%rdi, %rcx
	andl	$(VEC_SIZE - 1), %ecx
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %rcx
	/* Adjust the second memory area.  */
	subq	%rcx, %rsi
	/* Adjust the first memory area which should be aligned now.  */
	subq	%rcx, %rdi
	/* Adjust length.  */
	addq	%rcx, %rdx
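	/* %rcx is negative: -(VEC_SIZE - misalignment).  Subtracting it
	   rounds %rdi up to the next VEC_SIZE boundary and moves %rsi and
	   the length by the same amount.  The bytes skipped this way were
	   already covered by the first-VEC check above.  */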

L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpand	%ymm2, %ymm1, %ymm5

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpand	%ymm3, %ymm5, %ymm5

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand	%ymm4, %ymm5, %ymm5

	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi

	subq	$(VEC_SIZE * 4), %rdx
	cmpq	$(VEC_SIZE * 4), %rdx
	jae	L(loop_4x_vec)

	/* Less than 4 * VEC.  */
	cmpq	$VEC_SIZE, %rdx
	jbe	L(last_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_2x_vec)

L(last_4x_vec):
	/* From 2 * VEC to 4 * VEC.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	/* Use overlapping loads to avoid branches.  */
	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
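	/* ymm1-ymm4 still hold the four compare results.  Find the first
	   vector with a cleared lane and report the first differing byte
	   (or 32-bit element) inside it.  */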
	vpmovmskb %ymm1, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	subl	$VEC_MASK, %eax
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
	jmp	L(wmemcmp_return)
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rcx), %edx
	cmpl	VEC_SIZE(%rsi, %rcx), %edx
	jmp	L(wmemcmp_return)
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	jmp	L(wmemcmp_return)
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret
END (MEMCMP)
#endif