1 | /* __memcmpeq optimized with AVX2. |
2 | Copyright (C) 2017-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | /* __memcmpeq is implemented as: |
1. Use ymm vector compares when possible. The only case where
vector compares are not possible is when size < VEC_SIZE
and loading from either s1 or s2 would cause a page cross.
2. Use xmm vector compares for sizes from 16 to 31 bytes when a
page cross rules out the ymm path.
3. Optimistically compare up to the first 4 * VEC_SIZE bytes one
VEC at a time to check for early mismatches. Only do this if it
is guaranteed the work is not wasted.
29 | 4. If size is 8 * VEC_SIZE or less, unroll the loop. |
30 | 5. Compare 4 * VEC_SIZE at a time with the aligned first memory |
31 | area. |
32 | 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. |
33 | 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. |
34 | 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ |
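
/* For reference, a rough C sketch of the contract this function
   implements. __memcmpeq_ref is a made-up name used only for
   illustration; the only requirement is that the return value is zero
   iff the first n bytes of s1 and s2 are equal (unlike memcmp, the
   sign of a non-zero result carries no meaning):

     #include <string.h>

     int
     __memcmpeq_ref (const void *s1, const void *s2, size_t n)
     {
       return memcmp (s1, s2, n) != 0;
     }
 */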
35 | |
36 | # include <sysdep.h> |
37 | |
38 | # ifndef MEMCMPEQ |
39 | # define MEMCMPEQ __memcmpeq_avx2 |
40 | # endif |
41 | |
42 | # define VPCMPEQ vpcmpeqb |
43 | |
44 | # ifndef VZEROUPPER |
45 | # define VZEROUPPER vzeroupper |
46 | # endif |
47 | |
48 | # ifndef SECTION |
49 | # define SECTION(p) p##.avx |
50 | # endif |
51 | |
52 | # define VEC_SIZE 32 |
53 | # define PAGE_SIZE 4096 |
54 | |
55 | .section SECTION(.text), "ax" , @progbits |
56 | ENTRY_P2ALIGN (MEMCMPEQ, 6) |
57 | # ifdef __ILP32__ |
58 | /* Clear the upper 32 bits. */ |
59 | movl %edx, %edx |
60 | # endif |
61 | cmp $VEC_SIZE, %RDX_LP |
62 | jb L(less_vec) |
63 | |
64 | /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ |
65 | vmovdqu (%rsi), %ymm1 |
66 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
67 | vpmovmskb %ymm1, %eax |
68 | incl %eax |
69 | jnz L(return_neq0) |
70 | cmpq $(VEC_SIZE * 2), %rdx |
71 | jbe L(last_1x_vec) |
72 | |
73 | /* Check second VEC no matter what. */ |
74 | vmovdqu VEC_SIZE(%rsi), %ymm2 |
75 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
76 | vpmovmskb %ymm2, %eax |
/* If all bytes in the VEC were equal, eax will be all 1s so incl
will overflow and set the zero flag. */
79 | incl %eax |
80 | jnz L(return_neq0) |
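
/* Illustrative intrinsics model of one VEC check (a sketch, not part
   of this file; the function name is made up and <immintrin.h> is
   assumed):

     static unsigned int
     vec_check (const void *s1, const void *s2)
     {
       __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
       /* Bit i of the mask is set iff byte i matched.  */
       unsigned int mask
         = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b));
       /* mask + 1 overflows to zero exactly when all 32 bytes match,
          which is what the incl above computes.  */
       return mask + 1;
     }
 */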
81 | |
82 | /* Less than 4 * VEC. */ |
83 | cmpq $(VEC_SIZE * 4), %rdx |
84 | jbe L(last_2x_vec) |
85 | |
86 | /* Check third and fourth VEC no matter what. */ |
87 | vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 |
88 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
89 | vpmovmskb %ymm3, %eax |
90 | incl %eax |
91 | jnz L(return_neq0) |
92 | |
93 | vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 |
94 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
95 | vpmovmskb %ymm4, %eax |
96 | incl %eax |
97 | jnz L(return_neq0) |
98 | |
99 | /* Go to 4x VEC loop. */ |
100 | cmpq $(VEC_SIZE * 8), %rdx |
101 | ja L(more_8x_vec) |
102 | |
103 | /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any |
104 | branches. */ |
105 | |
/* Adjust rsi and rdi to avoid indexed address modes. This ends up
saving 16 bytes of code, prevents unlamination of micro-fused uops,
and avoids bottlenecking the AGU. */
109 | addq %rdx, %rsi |
110 | vmovdqu -(VEC_SIZE * 4)(%rsi), %ymm1 |
111 | vmovdqu -(VEC_SIZE * 3)(%rsi), %ymm2 |
112 | addq %rdx, %rdi |
113 | |
114 | VPCMPEQ -(VEC_SIZE * 4)(%rdi), %ymm1, %ymm1 |
115 | VPCMPEQ -(VEC_SIZE * 3)(%rdi), %ymm2, %ymm2 |
116 | |
117 | vmovdqu -(VEC_SIZE * 2)(%rsi), %ymm3 |
118 | VPCMPEQ -(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
119 | vmovdqu -VEC_SIZE(%rsi), %ymm4 |
120 | VPCMPEQ -VEC_SIZE(%rdi), %ymm4, %ymm4 |
121 | |
/* Reduce ymm1 - ymm4 into a single mismatch mask. */
123 | vpand %ymm1, %ymm2, %ymm2 |
124 | vpand %ymm3, %ymm4, %ymm4 |
125 | vpand %ymm2, %ymm4, %ymm4 |
126 | vpmovmskb %ymm4, %eax |
127 | incl %eax |
128 | L(return_neq0): |
129 | L(return_vzeroupper): |
130 | ZERO_UPPER_VEC_REGISTERS_RETURN |
131 | |
/* NB: p2align 5 here will ensure that L(loop_4x_vec) is also 32-byte
aligned. */
134 | .p2align 5 |
135 | L(less_vec): |
/* Check if size is one or less. This is necessary for size = 0 but
is also faster for size = 1. */
138 | cmpl $1, %edx |
139 | jbe L(one_or_less) |
140 | |
141 | /* Check if loading one VEC from either s1 or s2 could cause a page |
142 | cross. This can have false positives but is by far the fastest |
143 | method. */ |
144 | movl %edi, %eax |
145 | orl %esi, %eax |
146 | andl $(PAGE_SIZE - 1), %eax |
147 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
148 | jg L(page_cross_less_vec) |
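
/* Rough C model of the check above (names are illustrative, assuming
   <stdint.h>). It is conservative: it may report a cross that does
   not happen, but never misses one for either pointer:

     static int
     may_cross_page (const void *s1, const void *s2)
     {
       uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (4096 - 1);
       return off > 4096 - 32;   /* PAGE_SIZE - VEC_SIZE  */
     }
 */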
149 | |
150 | /* No page cross possible. */ |
151 | vmovdqu (%rsi), %ymm2 |
152 | VPCMPEQ (%rdi), %ymm2, %ymm2 |
153 | vpmovmskb %ymm2, %eax |
154 | incl %eax |
/* Result will be zero if the first rdx bytes of s1 and s2 match.
Otherwise the first set bit marks the first mismatch. */
157 | bzhil %edx, %eax, %eax |
158 | VZEROUPPER_RETURN |
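
/* Rough C model of the check-and-mask above (a sketch; the function
   name is made up and <immintrin.h> is assumed; n is 2..31 on this
   path, so the shift is well defined):

     static unsigned int
     check_first_n (const void *s1, const void *s2, unsigned int n)
     {
       __m256i a = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i b = _mm256_loadu_si256 ((const __m256i *) s2);
       unsigned int mask
         = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (a, b))
           + 1;
       /* bzhil clears bits n and above, keeping only the results for
          the first n bytes.  */
       return mask & ((1u << n) - 1);
     }
 */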
159 | |
/* Relatively cold but placed close to L(less_vec) for a 2-byte jump
encoding. */
162 | .p2align 4 |
163 | L(one_or_less): |
164 | jb L(zero) |
165 | movzbl (%rsi), %ecx |
166 | movzbl (%rdi), %eax |
167 | subl %ecx, %eax |
168 | /* No ymm register was touched. */ |
169 | ret |
/* This lies within the same 16-byte block as L(one_or_less). */
171 | L(zero): |
172 | xorl %eax, %eax |
173 | ret |
174 | |
175 | .p2align 4 |
176 | L(last_1x_vec): |
177 | vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 |
178 | VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 |
179 | vpmovmskb %ymm1, %eax |
180 | incl %eax |
181 | VZEROUPPER_RETURN |
182 | |
183 | .p2align 4 |
184 | L(last_2x_vec): |
185 | vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 |
186 | VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 |
187 | vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm2 |
188 | VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm2, %ymm2 |
189 | vpand %ymm1, %ymm2, %ymm2 |
190 | vpmovmskb %ymm2, %eax |
191 | incl %eax |
192 | VZEROUPPER_RETURN |
193 | |
194 | .p2align 4 |
195 | L(more_8x_vec): |
196 | /* Set end of s1 in rdx. */ |
197 | leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx |
/* rsi stores s2 - s1. This allows the loop to update only one
pointer. */
200 | subq %rdi, %rsi |
201 | /* Align s1 pointer. */ |
202 | andq $-VEC_SIZE, %rdi |
/* Adjust because the first 4x VEC were already checked. */
204 | subq $-(VEC_SIZE * 4), %rdi |
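
/* Rough C sketch of the pointer bookkeeping for the loop below (names
   are illustrative, not part of this file; <stdint.h>/<stddef.h>
   assumed, s1/s2 as const unsigned char *):

     const unsigned char *end = s1 + n - 4 * 32;           /* rdx  */
     ptrdiff_t s2_minus_s1 = s2 - s1;                       /* rsi  */
     s1 = (const unsigned char *)
            ((uintptr_t) s1 & -(uintptr_t) 32) + 4 * 32;    /* rdi  */
     do
       {
         /* Compare 4 * 32 bytes at s1 and s1 + s2_minus_s1; bail out
            on mismatch.  */
         s1 += 4 * 32;
       }
     while (s1 < end);
     /* Then check the final (possibly overlapping) 1 to 4 VECs ending
        at end + 4 * 32.  */
 */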
205 | .p2align 4 |
206 | L(loop_4x_vec): |
207 | /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). */ |
208 | vmovdqu (%rsi, %rdi), %ymm1 |
209 | VPCMPEQ (%rdi), %ymm1, %ymm1 |
210 | |
211 | vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 |
212 | VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 |
213 | |
214 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 |
215 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 |
216 | |
217 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 |
218 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 |
219 | |
220 | vpand %ymm1, %ymm2, %ymm2 |
221 | vpand %ymm3, %ymm4, %ymm4 |
222 | vpand %ymm2, %ymm4, %ymm4 |
223 | vpmovmskb %ymm4, %eax |
224 | incl %eax |
225 | jnz L(return_neq1) |
226 | subq $-(VEC_SIZE * 4), %rdi |
/* Check if the s1 pointer has reached the end. */
228 | cmpq %rdx, %rdi |
229 | jb L(loop_4x_vec) |
230 | |
231 | vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 |
232 | VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 |
233 | subq %rdx, %rdi |
234 | /* rdi has 4 * VEC_SIZE - remaining length. */ |
235 | cmpl $(VEC_SIZE * 3), %edi |
236 | jae L(8x_last_1x_vec) |
237 | /* Load regardless of branch. */ |
238 | vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 |
239 | VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 |
240 | cmpl $(VEC_SIZE * 2), %edi |
241 | jae L(8x_last_2x_vec) |
242 | /* Check last 4 VEC. */ |
243 | vmovdqu VEC_SIZE(%rsi, %rdx), %ymm1 |
244 | VPCMPEQ VEC_SIZE(%rdx), %ymm1, %ymm1 |
245 | |
246 | vmovdqu (%rsi, %rdx), %ymm2 |
247 | VPCMPEQ (%rdx), %ymm2, %ymm2 |
248 | |
249 | vpand %ymm3, %ymm4, %ymm4 |
250 | vpand %ymm1, %ymm2, %ymm3 |
251 | L(8x_last_2x_vec): |
252 | vpand %ymm3, %ymm4, %ymm4 |
253 | L(8x_last_1x_vec): |
254 | vpmovmskb %ymm4, %eax |
/* incl overflows eax to zero exactly when all bytes matched. */
256 | incl %eax |
257 | L(return_neq1): |
258 | VZEROUPPER_RETURN |
259 | |
/* Relatively cold case as page crosses are unexpected. */
261 | .p2align 4 |
262 | L(page_cross_less_vec): |
263 | cmpl $16, %edx |
264 | jae L(between_16_31) |
265 | cmpl $8, %edx |
266 | ja L(between_9_15) |
267 | cmpl $4, %edx |
268 | jb L(between_2_3) |
269 | /* From 4 to 8 bytes. No branch when size == 4. */ |
270 | movl (%rdi), %eax |
271 | subl (%rsi), %eax |
272 | movl -4(%rdi, %rdx), %ecx |
273 | movl -4(%rsi, %rdx), %edi |
274 | subl %edi, %ecx |
275 | orl %ecx, %eax |
276 | ret |
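
/* Rough C sketch of the overlapping-load trick above (names are
   illustrative; <stdint.h> and <string.h> assumed): for n in [4, 8]
   the two 4-byte loads cover every byte, overlapping when n < 8:

     static uint32_t
     cmpeq_4_to_8 (const unsigned char *s1, const unsigned char *s2,
                   size_t n)
     {
       uint32_t a, b, c, d;
       memcpy (&a, s1, 4);          memcpy (&b, s2, 4);
       memcpy (&c, s1 + n - 4, 4);  memcpy (&d, s2 + n - 4, 4);
       return (a - b) | (c - d);    /* Zero iff all n bytes are equal.  */
     }
 */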
277 | |
278 | .p2align 4,, 8 |
279 | L(between_16_31): |
280 | /* From 16 to 31 bytes. No branch when size == 16. */ |
281 | |
/* Safe to use xmm[0, 15] as no vzeroupper is needed, so this is RTM
safe. */
284 | vmovdqu (%rsi), %xmm1 |
285 | vpcmpeqb (%rdi), %xmm1, %xmm1 |
286 | vmovdqu -16(%rsi, %rdx), %xmm2 |
287 | vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2 |
288 | vpand %xmm1, %xmm2, %xmm2 |
289 | vpmovmskb %xmm2, %eax |
290 | notw %ax |
291 | /* No ymm register was touched. */ |
292 | ret |
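
/* Intrinsics sketch of the above (illustrative only; <immintrin.h>
   assumed, s1/s2 as const unsigned char *): two overlapping 16-byte
   compares are ANDed so a single inverted mask decides the result:

     __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
     __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
     __m128i c = _mm_loadu_si128 ((const __m128i *) (s1 + n - 16));
     __m128i d = _mm_loadu_si128 ((const __m128i *) (s2 + n - 16));
     __m128i eq = _mm_and_si128 (_mm_cmpeq_epi8 (a, b),
                                 _mm_cmpeq_epi8 (c, d));
     /* All 16 mask bits set means equal, so the inverted low 16 bits
        are zero exactly in that case (this is what notw computes).  */
     return (unsigned short) ~_mm_movemask_epi8 (eq);
 */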
293 | |
294 | .p2align 4,, 8 |
295 | L(between_9_15): |
296 | /* From 9 to 15 bytes. */ |
297 | movq (%rdi), %rax |
298 | subq (%rsi), %rax |
299 | movq -8(%rdi, %rdx), %rcx |
300 | movq -8(%rsi, %rdx), %rdi |
301 | subq %rdi, %rcx |
302 | orq %rcx, %rax |
/* A mismatch may be visible only in the upper 32 bits of rax, so eax
cannot be returned directly. edx is guaranteed to be a non-zero int
(the size, 9 to 15), so return it instead on mismatch. */
304 | cmovnz %edx, %eax |
305 | ret |
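
/* Rough C sketch (names are illustrative; <stdint.h> and <string.h>
   assumed): two overlapping 8-byte loads cover n in [9, 15]; because
   the difference may be non-zero only in its upper half, the non-zero
   size n is returned instead of the truncated low 32 bits:

     uint64_t a, b, c, d;
     memcpy (&a, s1, 8);          memcpy (&b, s2, 8);
     memcpy (&c, s1 + n - 8, 8);  memcpy (&d, s2 + n - 8, 8);
     return ((a - b) | (c - d)) != 0 ? (int) n : 0;
 */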
306 | |
/* Don't align. This is cold and aligning here will cause code to
spill into the next cache line. */
309 | L(between_2_3): |
310 | /* From 2 to 3 bytes. No branch when size == 2. */ |
311 | movzwl (%rdi), %eax |
312 | movzwl (%rsi), %ecx |
313 | subl %ecx, %eax |
314 | movzbl -1(%rdi, %rdx), %ecx |
315 | /* All machines that support evex will insert a "merging uop" |
316 | avoiding any serious partial register stalls. */ |
317 | subb -1(%rsi, %rdx), %cl |
318 | orl %ecx, %eax |
319 | /* No ymm register was touched. */ |
320 | ret |
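
/* Rough C sketch (names are illustrative; <stdint.h> and <string.h>
   assumed, s1/s2 as const unsigned char *): a 2-byte load plus the
   last byte cover n in [2, 3]:

     uint16_t a, b;
     memcpy (&a, s1, 2);
     memcpy (&b, s2, 2);
     unsigned char tail = (unsigned char) (s1[n - 1] - s2[n - 1]);
     return (a - b) | tail;   /* Zero iff all n bytes are equal.  */
 */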
321 | |
322 | /* 2 Bytes from next cache line. */ |
323 | END (MEMCMPEQ) |
324 | #endif |
325 | |