/* memrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_avx2
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMRCHR)
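	/* memrchr (const void *s, int c, size_t n): %rdi = s, %esi = c,
	   %rdx = n.  Return a pointer to the last byte equal to c within
	   the first n bytes of s, or NULL if there is none.  Check the
	   last VEC_SIZE bytes first, then walk backward through
	   VEC_SIZE-aligned vectors, 4 * VEC_SIZE bytes per loop
	   iteration.  */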
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

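	/* If the length is at most VEC_SIZE bytes, take the short
	   single-vector path.  */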
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP
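	/* %rdi now points VEC_SIZE bytes before the end of the input.  */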

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

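	/* No match in the last vector.  Step back 4 * VEC_SIZE and check
	   whether %rdi is now VEC_SIZE-aligned.  */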
	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
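	/* If the remaining length is at most 4 * VEC_SIZE, handle it in
	   the tail code.  */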
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE bytes, one VEC_SIZE at a time,
	   since the data is only aligned to VEC_SIZE.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE so the loop needs fewer branches.
	   If the data isn't aligned to 4 * VEC_SIZE, some bytes checked
	   above will be checked again; that is harmless.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC_SIZE bytes at a time, moving backward.  Here
	   %rdx is the distance from the start of the input to %rdi.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

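	/* OR the four compare results together; a nonzero mask below
	   means this 4 * VEC_SIZE block contains at least one match.  */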
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match.  Find the highest-addressed vector that
	   contains it; %ymm4 covers the highest addresses.  */
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

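	/* The match is in the lowest vector; the highest set bit in
	   %eax below gives the offset of the last matching byte.  */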
	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(last_4x_vec_or_less):
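	/* Undo the last subtraction: %edx is the number of bytes still
	   to be covered, and they end at (VEC_SIZE * 4)(%rdi).  */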
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
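	/* Discard a match that falls before the start of the input.  */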
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
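	/* Discard a match that falls before the start of the input.  */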
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
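	/* Reject the match if it lies before the start of the input.  */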
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
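	/* Reject the match if it lies before the start of the input.  */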
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(null):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_vec_or_less_aligned):
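	/* The input is VEC_SIZE-aligned and at most VEC_SIZE bytes long;
	   %edx holds the length.  */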
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	$1, %edx
	/* The length in %cl can be as large as VEC_SIZE (32), so use a
	   64-bit shift to build the mask (1 << length) - 1; a 32-bit
	   shift by 32 would be truncated to a shift by 0.  */
	salq	%cl, %rdx
	subq	$1, %rdx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_or_less):
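	/* The total length is at most VEC_SIZE.  Undo the earlier
	   subtraction so %edx holds the original length again.  */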
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

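	/* %ecx is the misalignment of %rdi.  %esi = misalignment + length;
	   if that exceeds VEC_SIZE, the bytes span two aligned vectors.
	   Keep the misalignment in %r8d for the final address.  */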
	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

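	/* Build a mask of the low length bits: (1 << length) - 1.  */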
	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_2x_aligned):
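	/* The bytes span two aligned vectors.  %esi is the number of
	   valid bytes in the higher vector, VEC_SIZE(%rdi).  */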
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

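	/* Build a mask of the valid bytes in the higher vector:
	   (1 << %ecx) - 1.  */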
	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN
END (MEMRCHR)
#endif