/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
      to avoid branches.
   2. Use overlapping compare to avoid branch.
   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
      bytes for wmemcmp.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
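
/* A rough pseudo-code sketch of the dispatch below (illustration only,
   not part of the build; the step names are the local labels used in
   this file):

       if (len < VEC_SIZE)        goto less_vec;       scalar/xmm paths
       compare VEC 0;
       if (len <= 2 * VEC_SIZE)   goto last_vec;       overlapping last VEC
       if (len > 8 * VEC_SIZE)    goto more_8x_vec;    align, 4-VEC loop
       if (len < 4 * VEC_SIZE)    goto last_4x_vec;    up to 4 compares
       compare VEC 0..3, then the overlapping last 4 VECs;

   Overlapping compares may test some bytes twice, which is harmless and
   cheaper than branching on the exact remainder.  */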

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP	__memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
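
/* VEC_MASK is 0xffffffff: vpmovmskb of a 32-byte vector yields one bit
   per byte.  After VPCMPEQ, equal bytes produce set bits, so the mask
   equals VEC_MASK exactly when a whole vector matched.  Subtracting
   VEC_MASK is the same as adding 1 modulo 2^32: the result is zero on a
   full match, and otherwise its lowest set bit marks the first
   mismatching byte, which is what tzcnt extracts in L(first_vec).  */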

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.  */
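
/* Example: comparing the byte 0xff against 0x01, memcmp must report
   "greater" (255 > 1 as unsigned char), while wmemcmp comparing the
   32-bit elements 0xffffffff and 0x00000001 must report "less"
   (-1 < 1 as signed wchar_t).  Hence L(wmemcmp_return) below uses the
   signed setl, while the byte paths subtract zero-extended bytes.  */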

	.section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl	$2, %rdx
# endif
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_vec)

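	/* Set all bits of ymm0: comparing a register with itself is equal
	   in every lane.  The all-ones ymm0 is used with vptest below to
	   test whether a combined compare result is all ones.  */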
	VPCMPEQ	%ymm0, %ymm0, %ymm0
	/* More than 2 * VEC.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)

	/* From 4 * VEC to 8 * VEC, inclusively.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm5
	vpand	%ymm3, %ymm4, %ymm6
	vpand	%ymm5, %ymm6, %ymm5

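	/* With ymm0 all ones, vptest sets CF iff ymm5 is all ones, i.e.
	   iff every lane of the four compares matched.  jnc therefore
	   branches only when a difference was found.  */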
	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)

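	/* Point both buffers at their last 4 * VEC_SIZE bytes and redo
	   four compares there.  The two 4-VEC blocks may overlap; bytes
	   in the overlap are simply compared twice, which avoids any
	   branch on the exact remainder.  */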
	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpand	%ymm2, %ymm1, %ymm5

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpand	%ymm3, %ymm5, %ymm5

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand	%ymm4, %ymm5, %ymm5

	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)
	xorl	%eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

L(last_vec):
	/* Use overlapping loads to avoid branches.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)
	VZEROUPPER
	ret

	.p2align 4
L(first_vec):
	/* A byte (or a 32-bit element for wmemcmp) differs within the 16
	   or 32 bytes just compared.  */
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(%rdi, %rcx), %edx
	cmpl	(%rsi, %rcx), %edx
L(wmemcmp_return):
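	/* The elements are known to differ here.  setl/negl/orl turns the
	   signed comparison above into -1 when the first buffer is less
	   and +1 otherwise, without a branch.  */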
	setl	%al
	negl	%eax
	orl	$1, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(4):
	xorl	%eax, %eax
	movl	(%rdi), %edx
	cmpl	(%rsi), %edx
	jne	L(wmemcmp_return)
	ret
# else
	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid branches.  */
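	/* The first four bytes go into the high half of a 64-bit value and
	   the last four bytes (overlapping the first load, since the size
	   is below 8) into the low half, each byte-swapped by movbe.  With
	   big-endian packing, unsigned 64-bit order matches memcmp's
	   byte-wise order.  */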
	movbe	(%rdi), %eax
	movbe	(%rsi), %ecx
	shlq	$32, %rax
	shlq	$32, %rcx
	movbe	-4(%rdi, %rdx), %edi
	movbe	-4(%rsi, %rdx), %esi
	orq	%rdi, %rax
	orq	%rsi, %rcx
	subq	%rcx, %rax
	je	L(exit)
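	/* CF holds the borrow of the 64-bit subtract above: sbb of a
	   register with itself gives 0 or -1, and or-ing in 1 turns that
	   into +1 or -1.  */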
	sbbl	%eax, %eax
	orl	$1, %eax
	ret

	.p2align 4
L(exit):
	ret

	.p2align 4
L(between_2_3):
	/* Load as big endian to avoid branches.  */
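	/* The shift and bswap place the first byte in bits 23:16 and the
	   second in bits 15:8; the (possibly overlapping) last byte lands
	   in bits 7:0 and the top 8 bits stay zero, so a single 32-bit
	   subtract orders the buffers like memcmp.  */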
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	shll	$8, %eax
	shll	$8, %ecx
	bswap	%eax
	bswap	%ecx
	movb	-1(%rdi, %rdx), %al
	movb	-1(%rsi, %rdx), %cl
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl	%ecx, %eax
	ret

	.p2align 4
L(1):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	subl	%ecx, %eax
	ret
# endif

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(less_vec):
# ifdef USE_AS_WMEMCMP
	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
	cmpb	$4, %dl
	je	L(4)
	jb	L(zero)
# else
	cmpb	$1, %dl
	je	L(1)
	jb	L(zero)
	cmpb	$4, %dl
	jb	L(between_2_3)
	cmpb	$8, %dl
	jb	L(between_4_7)
# endif
	cmpb	$16, %dl
	jae	L(between_16_31)
	/* It is between 8 and 15 bytes.  */
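	/* vmovq zeroes the upper 8 bytes of the xmm register, so the upper
	   half of both operands always compares equal; the 16-bit mask from
	   vpmovmskb is therefore 0xffff exactly when the 8 loaded bytes
	   match.  */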
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)
	/* Use overlapping loads to avoid branches.  */
	leaq	-8(%rdi, %rdx), %rdi
	leaq	-8(%rsi, %rdx), %rsi
	vmovq	(%rdi), %xmm1
	vmovq	(%rsi), %xmm2
	VPCMPEQ	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)

	/* Use overlapping loads to avoid branches.  */
	leaq	-16(%rdi, %rdx), %rdi
	leaq	-16(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %xmm2
	VPCMPEQ	(%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl	$0xffff, %eax
	jnz	L(first_vec)
	ret

	.p2align 4
L(more_8x_vec):
	/* More than 8 * VEC.  Check the first VEC.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	/* Align the first memory area for aligned loads in the loop.
	   Compute how much the first memory area is misaligned.  */
	movq	%rdi, %rcx
	andl	$(VEC_SIZE - 1), %ecx
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %rcx
	/* Adjust the second memory area.  */
	subq	%rcx, %rsi
	/* Adjust the first memory area which should be aligned now.  */
	subq	%rcx, %rdi
	/* Adjust length.  */
	addq	%rcx, %rdx
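	/* %rcx is negative: -(VEC_SIZE - misalignment).  Subtracting it
	   rounds %rdi up to the next VEC_SIZE boundary and moves %rsi and
	   the length by the same amount.  The bytes skipped this way were
	   already covered by the first-VEC check above.  */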

L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpand	%ymm2, %ymm1, %ymm5

	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpand	%ymm3, %ymm5, %ymm5

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand	%ymm4, %ymm5, %ymm5

	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rsi

	subq	$(VEC_SIZE * 4), %rdx
	cmpq	$(VEC_SIZE * 4), %rdx
	jae	L(loop_4x_vec)

	/* Less than 4 * VEC.  */
	cmpq	$VEC_SIZE, %rdx
	jbe	L(last_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_2x_vec)

L(last_4x_vec):
	/* From 2 * VEC to 4 * VEC.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	/* Use overlapping loads to avoid branches.  */
	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rsi
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
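	/* ymm1-ymm4 still hold the four compare results.  Find the first
	   vector with a cleared lane and report the first differing byte
	   (or 32-bit element) inside it.  */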
	vpmovmskb %ymm1, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	subl	$VEC_MASK, %eax
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
	jmp	L(wmemcmp_return)
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rcx), %edx
	cmpl	VEC_SIZE(%rsi, %rcx), %edx
	jmp	L(wmemcmp_return)
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	jmp	L(wmemcmp_return)
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
	sub	%edx, %eax
# endif
	VZEROUPPER
	ret
END (MEMCMP)
#endif