1 | /* memcmp/wmemcmp optimized with AVX2. |
2 | Copyright (C) 2017-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | /* memcmp/wmemcmp is implemented as: |
   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
      to avoid branches (see the C sketch after this comment).
24 | 2. Use overlapping compare to avoid branch. |
25 | 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 |
26 | bytes for wmemcmp. |
27 | 4. If size is 8 * VEC_SIZE or less, unroll the loop. |
28 | 5. Compare 4 * VEC_SIZE at a time with the aligned first memory |
29 | area. |
30 | 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. |
31 | 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. |
32 | 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ |
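/* As an illustrative sketch only (not assembled here), steps 1-2 for a
   4..7 byte memcmp roughly correspond to the following C, where s1, s2
   and n are memcmp's arguments and load_be32 is a hypothetical helper
   standing in for the movbe loads in L(between_4_7):

       uint64_t a = ((uint64_t) load_be32 (s1) << 32) | load_be32 (s1 + n - 4);
       uint64_t b = ((uint64_t) load_be32 (s2) << 32) | load_be32 (s2 + n - 4);
       if (a == b)
	 return 0;
       return a > b ? 1 : -1;

   The two loads overlap when n < 8, but the duplicated bytes are equal
   on both sides, so the result still matches memcmp. */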
33 | |
34 | # include <sysdep.h> |
35 | |
36 | # ifndef MEMCMP |
37 | # define MEMCMP __memcmp_avx2_movbe |
38 | # endif |
39 | |
40 | # ifdef USE_AS_WMEMCMP |
41 | # define VPCMPEQ vpcmpeqd |
42 | # else |
43 | # define VPCMPEQ vpcmpeqb |
44 | # endif |
45 | |
46 | # ifndef VZEROUPPER |
47 | # define VZEROUPPER vzeroupper |
48 | # endif |
49 | |
50 | # define VEC_SIZE 32 |
51 | # define VEC_MASK ((1 << VEC_SIZE) - 1) |
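/* A full vpmovmskb match yields VEC_MASK, so `subl $VEC_MASK' leaves zero
   iff every byte compared equal.  Otherwise the result is mask + 1, whose
   lowest set bit (found with tzcnt in L(first_vec)) is the offset of the
   first mismatching byte; for wmemcmp it is 4-aligned, since vpcmpeqd
   clears all four mask bits of a differing int. */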
52 | |
53 | /* Warning! |
54 | wmemcmp has to use SIGNED comparison for elements. |
   memcmp has to use UNSIGNED comparison for elements.
56 | */ |
57 | |
	.section .text.avx,"ax",@progbits
59 | ENTRY (MEMCMP) |
60 | # ifdef USE_AS_WMEMCMP |
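	/* Scale the wchar_t count in RDX to a byte count. */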
61 | shl $2, %RDX_LP |
62 | # elif defined __ILP32__ |
63 | /* Clear the upper 32 bits. */ |
64 | movl %edx, %edx |
65 | # endif |
66 | cmp $VEC_SIZE, %RDX_LP |
67 | jb L(less_vec) |
68 | |
69 | /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ |
70 | vmovdqu (%rsi), %ymm2 |
71 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
72 | vpmovmskb %ymm2, %eax |
73 | subl $VEC_MASK, %eax |
74 | jnz L(first_vec) |
75 | |
76 | cmpq $(VEC_SIZE * 2), %rdx |
77 | jbe L(last_vec) |
78 | |
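	/* Set %ymm0 to all ones; it is the mask operand for the vptest
	   checks below. */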
79 | VPCMPEQ %ymm0, %ymm0, %ymm0 |
80 | /* More than 2 * VEC. */ |
81 | cmpq $(VEC_SIZE * 8), %rdx |
82 | ja L(more_8x_vec) |
83 | cmpq $(VEC_SIZE * 4), %rdx |
84 | jb L(last_4x_vec) |
85 | |
	/* From 4 * VEC to 8 * VEC, inclusive. */
87 | vmovdqu (%rsi), %ymm1 |
88 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
89 | |
90 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
91 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
92 | |
93 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
94 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
95 | |
96 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
97 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
98 | |
99 | vpand %ymm1, %ymm2, %ymm5 |
100 | vpand %ymm3, %ymm4, %ymm6 |
101 | vpand %ymm5, %ymm6, %ymm5 |
102 | |
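	/* vptest sets CF only when %ymm5 (the AND of all four compare
	   results) is all ones; branch out if any element differed. */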
103 | vptest %ymm0, %ymm5 |
104 | jnc L(4x_vec_end) |
105 | |
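	/* The first 4 * VEC matched.  Compare the last 4 * VEC of the
	   range using overlapping loads. */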
106 | leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi |
107 | leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi |
108 | vmovdqu (%rsi), %ymm1 |
109 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
110 | |
111 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
112 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
113 | vpand %ymm2, %ymm1, %ymm5 |
114 | |
115 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
116 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
117 | vpand %ymm3, %ymm5, %ymm5 |
118 | |
119 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
120 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
121 | vpand %ymm4, %ymm5, %ymm5 |
122 | |
123 | vptest %ymm0, %ymm5 |
124 | jnc L(4x_vec_end) |
125 | xorl %eax, %eax |
126 | VZEROUPPER |
127 | ret |
128 | |
129 | .p2align 4 |
130 | L(last_2x_vec): |
131 | /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ |
132 | vmovdqu (%rsi), %ymm2 |
133 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
134 | vpmovmskb %ymm2, %eax |
135 | subl $VEC_MASK, %eax |
136 | jnz L(first_vec) |
137 | |
138 | L(last_vec): |
139 | /* Use overlapping loads to avoid branches. */ |
140 | leaq -VEC_SIZE(%rdi, %rdx), %rdi |
141 | leaq -VEC_SIZE(%rsi, %rdx), %rsi |
142 | vmovdqu (%rsi), %ymm2 |
143 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
144 | vpmovmskb %ymm2, %eax |
145 | subl $VEC_MASK, %eax |
146 | jnz L(first_vec) |
147 | VZEROUPPER |
148 | ret |
149 | |
150 | .p2align 4 |
151 | L(first_vec): |
	/* A byte or int32 differs somewhere within the compared 16 or 32 bytes. */
153 | tzcntl %eax, %ecx |
154 | # ifdef USE_AS_WMEMCMP |
155 | xorl %eax, %eax |
156 | movl (%rdi, %rcx), %edx |
157 | cmpl (%rsi, %rcx), %edx |
158 | L(wmemcmp_return): |
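	/* Map the signed dword comparison to -1 / 1: setl yields 1 when
	   the value from the first buffer is smaller, negl turns that
	   into -1, and orl $1 turns 0 into 1. */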
159 | setl %al |
160 | negl %eax |
161 | orl $1, %eax |
162 | # else |
163 | movzbl (%rdi, %rcx), %eax |
164 | movzbl (%rsi, %rcx), %edx |
165 | sub %edx, %eax |
166 | # endif |
167 | VZEROUPPER |
168 | ret |
169 | |
170 | # ifdef USE_AS_WMEMCMP |
171 | .p2align 4 |
172 | L(4): |
173 | xorl %eax, %eax |
174 | movl (%rdi), %edx |
175 | cmpl (%rsi), %edx |
176 | jne L(wmemcmp_return) |
177 | ret |
178 | # else |
179 | .p2align 4 |
180 | L(between_4_7): |
181 | /* Load as big endian with overlapping movbe to avoid branches. */ |
182 | movbe (%rdi), %eax |
183 | movbe (%rsi), %ecx |
184 | shlq $32, %rax |
185 | shlq $32, %rcx |
186 | movbe -4(%rdi, %rdx), %edi |
187 | movbe -4(%rsi, %rdx), %esi |
188 | orq %rdi, %rax |
189 | orq %rsi, %rcx |
190 | subq %rcx, %rax |
191 | je L(exit) |
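	/* CF from the subtraction is set iff %rax < %rcx, so sbbl yields
	   -1 or 0 and orl $1 maps that to -1 or 1. */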
192 | sbbl %eax, %eax |
193 | orl $1, %eax |
194 | ret |
195 | |
196 | .p2align 4 |
197 | L(exit): |
198 | ret |
199 | |
200 | .p2align 4 |
201 | L(between_2_3): |
202 | /* Load as big endian to avoid branches. */ |
203 | movzwl (%rdi), %eax |
204 | movzwl (%rsi), %ecx |
205 | shll $8, %eax |
206 | shll $8, %ecx |
207 | bswap %eax |
208 | bswap %ecx |
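	/* The first byte is now in bits 16-23 and the second in bits 8-15;
	   the byte loaded below fills bits 0-7, forming a 3-byte big-endian
	   value even when the loads overlap (size == 2). */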
209 | movb -1(%rdi, %rdx), %al |
210 | movb -1(%rsi, %rdx), %cl |
211 | /* Subtraction is okay because the upper 8 bits are zero. */ |
212 | subl %ecx, %eax |
213 | ret |
214 | |
215 | .p2align 4 |
216 | L(1): |
217 | movzbl (%rdi), %eax |
218 | movzbl (%rsi), %ecx |
219 | subl %ecx, %eax |
220 | ret |
221 | # endif |
222 | |
223 | .p2align 4 |
224 | L(zero): |
225 | xorl %eax, %eax |
226 | ret |
227 | |
228 | .p2align 4 |
229 | L(less_vec): |
230 | # ifdef USE_AS_WMEMCMP |
231 | /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ |
232 | cmpb $4, %dl |
233 | je L(4) |
234 | jb L(zero) |
235 | # else |
236 | cmpb $1, %dl |
237 | je L(1) |
238 | jb L(zero) |
239 | cmpb $4, %dl |
240 | jb L(between_2_3) |
241 | cmpb $8, %dl |
242 | jb L(between_4_7) |
243 | # endif |
244 | cmpb $16, %dl |
245 | jae L(between_16_31) |
246 | /* It is between 8 and 15 bytes. */ |
247 | vmovq (%rdi), %xmm1 |
248 | vmovq (%rsi), %xmm2 |
249 | VPCMPEQ %xmm1, %xmm2, %xmm2 |
250 | vpmovmskb %xmm2, %eax |
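	/* vmovq zero-extends, so the upper halves of the XMM registers
	   always compare equal; vpmovmskb on an XMM source sets only the
	   low 16 bits, making 0xffff the all-match value. */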
251 | subl $0xffff, %eax |
252 | jnz L(first_vec) |
253 | /* Use overlapping loads to avoid branches. */ |
254 | leaq -8(%rdi, %rdx), %rdi |
255 | leaq -8(%rsi, %rdx), %rsi |
256 | vmovq (%rdi), %xmm1 |
257 | vmovq (%rsi), %xmm2 |
258 | VPCMPEQ %xmm1, %xmm2, %xmm2 |
259 | vpmovmskb %xmm2, %eax |
260 | subl $0xffff, %eax |
261 | jnz L(first_vec) |
262 | ret |
263 | |
264 | .p2align 4 |
265 | L(between_16_31): |
266 | /* From 16 to 31 bytes. No branch when size == 16. */ |
267 | vmovdqu (%rsi), %xmm2 |
268 | VPCMPEQ (%rdi), %xmm2, %xmm2 |
269 | vpmovmskb %xmm2, %eax |
270 | subl $0xffff, %eax |
271 | jnz L(first_vec) |
272 | |
273 | /* Use overlapping loads to avoid branches. */ |
274 | leaq -16(%rdi, %rdx), %rdi |
275 | leaq -16(%rsi, %rdx), %rsi |
276 | vmovdqu (%rsi), %xmm2 |
277 | VPCMPEQ (%rdi), %xmm2, %xmm2 |
278 | vpmovmskb %xmm2, %eax |
279 | subl $0xffff, %eax |
280 | jnz L(first_vec) |
281 | ret |
282 | |
283 | .p2align 4 |
284 | L(more_8x_vec): |
285 | /* More than 8 * VEC. Check the first VEC. */ |
286 | vmovdqu (%rsi), %ymm2 |
287 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
288 | vpmovmskb %ymm2, %eax |
289 | subl $VEC_MASK, %eax |
290 | jnz L(first_vec) |
291 | |
292 | /* Align the first memory area for aligned loads in the loop. |
293 | Compute how much the first memory area is misaligned. */ |
294 | movq %rdi, %rcx |
295 | andl $(VEC_SIZE - 1), %ecx |
296 | /* Get the negative of offset for alignment. */ |
297 | subq $VEC_SIZE, %rcx |
298 | /* Adjust the second memory area. */ |
299 | subq %rcx, %rsi |
300 | /* Adjust the first memory area which should be aligned now. */ |
301 | subq %rcx, %rdi |
302 | /* Adjust length. */ |
303 | addq %rcx, %rdx |
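	/* %rdi is now VEC_SIZE-aligned; up to VEC_SIZE - 1 bytes already
	   covered by the first vector check are compared again in the
	   first loop iteration. */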
304 | |
305 | L(loop_4x_vec): |
306 | /* Compare 4 * VEC at a time forward. */ |
307 | vmovdqu (%rsi), %ymm1 |
308 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
309 | |
310 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
311 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
312 | vpand %ymm2, %ymm1, %ymm5 |
313 | |
314 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
315 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
316 | vpand %ymm3, %ymm5, %ymm5 |
317 | |
318 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
319 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
320 | vpand %ymm4, %ymm5, %ymm5 |
321 | |
322 | vptest %ymm0, %ymm5 |
323 | jnc L(4x_vec_end) |
324 | |
325 | addq $(VEC_SIZE * 4), %rdi |
326 | addq $(VEC_SIZE * 4), %rsi |
327 | |
328 | subq $(VEC_SIZE * 4), %rdx |
329 | cmpq $(VEC_SIZE * 4), %rdx |
330 | jae L(loop_4x_vec) |
331 | |
332 | /* Less than 4 * VEC. */ |
333 | cmpq $VEC_SIZE, %rdx |
334 | jbe L(last_vec) |
335 | cmpq $(VEC_SIZE * 2), %rdx |
336 | jbe L(last_2x_vec) |
337 | |
338 | L(last_4x_vec): |
339 | /* From 2 * VEC to 4 * VEC. */ |
340 | vmovdqu (%rsi), %ymm2 |
341 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
342 | vpmovmskb %ymm2, %eax |
343 | subl $VEC_MASK, %eax |
344 | jnz L(first_vec) |
345 | |
346 | addq $VEC_SIZE, %rdi |
347 | addq $VEC_SIZE, %rsi |
348 | vmovdqu (%rsi), %ymm2 |
349 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
350 | vpmovmskb %ymm2, %eax |
351 | subl $VEC_MASK, %eax |
352 | jnz L(first_vec) |
353 | |
354 | /* Use overlapping loads to avoid branches. */ |
355 | leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi |
356 | leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi |
357 | vmovdqu (%rsi), %ymm2 |
358 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
359 | vpmovmskb %ymm2, %eax |
360 | subl $VEC_MASK, %eax |
361 | jnz L(first_vec) |
362 | |
363 | addq $VEC_SIZE, %rdi |
364 | addq $VEC_SIZE, %rsi |
365 | vmovdqu (%rsi), %ymm2 |
366 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
367 | vpmovmskb %ymm2, %eax |
368 | subl $VEC_MASK, %eax |
369 | jnz L(first_vec) |
370 | VZEROUPPER |
371 | ret |
372 | |
373 | .p2align 4 |
374 | L(4x_vec_end): |
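	/* At least one of the four vectors differs; walk them in order
	   to find the first mismatch. */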
375 | vpmovmskb %ymm1, %eax |
376 | subl $VEC_MASK, %eax |
377 | jnz L(first_vec) |
378 | vpmovmskb %ymm2, %eax |
379 | subl $VEC_MASK, %eax |
380 | jnz L(first_vec_x1) |
381 | vpmovmskb %ymm3, %eax |
382 | subl $VEC_MASK, %eax |
383 | jnz L(first_vec_x2) |
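	/* The mismatch must be in the fourth vector. */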
384 | vpmovmskb %ymm4, %eax |
385 | subl $VEC_MASK, %eax |
386 | tzcntl %eax, %ecx |
387 | # ifdef USE_AS_WMEMCMP |
388 | xorl %eax, %eax |
389 | movl (VEC_SIZE * 3)(%rdi, %rcx), %edx |
390 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx |
391 | jmp L(wmemcmp_return) |
392 | # else |
393 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
394 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx |
395 | sub %edx, %eax |
396 | # endif |
397 | VZEROUPPER |
398 | ret |
399 | |
400 | .p2align 4 |
401 | L(first_vec_x1): |
402 | tzcntl %eax, %ecx |
403 | # ifdef USE_AS_WMEMCMP |
404 | xorl %eax, %eax |
405 | movl VEC_SIZE(%rdi, %rcx), %edx |
406 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
407 | jmp L(wmemcmp_return) |
408 | # else |
409 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
410 | movzbl VEC_SIZE(%rsi, %rcx), %edx |
411 | sub %edx, %eax |
412 | # endif |
413 | VZEROUPPER |
414 | ret |
415 | |
416 | .p2align 4 |
417 | L(first_vec_x2): |
418 | tzcntl %eax, %ecx |
419 | # ifdef USE_AS_WMEMCMP |
420 | xorl %eax, %eax |
421 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx |
422 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
423 | jmp L(wmemcmp_return) |
424 | # else |
425 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
426 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
427 | sub %edx, %eax |
428 | # endif |
429 | VZEROUPPER |
430 | ret |
431 | END (MEMCMP) |
432 | #endif |
433 | |