1 | /* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. |
2 | Copyright (C) 2017 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef STRLEN |
24 | # define STRLEN __strlen_avx2 |
25 | # endif |
26 | |
27 | # ifdef USE_AS_WCSLEN |
28 | # define VPCMPEQ vpcmpeqd |
29 | # define VPMINU vpminud |
30 | # else |
31 | # define VPCMPEQ vpcmpeqb |
32 | # define VPMINU vpminub |
33 | # endif |
34 | |
35 | # ifndef VZEROUPPER |
36 | # define VZEROUPPER vzeroupper |
37 | # endif |
38 | |
39 | # define VEC_SIZE 32 |
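
/* Algorithm overview: %ymm0 stays all-zero and is compared (VPCMPEQ)
   against the data; vpmovmskb turns each compare result into a bit
   mask and tzcnt on the mask gives the byte offset of the first null
   character. The first vector load handles an unaligned, possibly
   page-crossing start; after that the main loop scans 4 * VEC_SIZE
   bytes per iteration from a 4 * VEC_SIZE aligned address, using
   VPMINU so that one compare covers all four vectors. For strnlen
   and wcsnlen the remaining length is tracked in %rsi and the
   caller's bound (in bytes) is saved in %r8. */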
40 | |
	.section .text.avx,"ax",@progbits
42 | ENTRY (STRLEN) |
43 | # ifdef USE_AS_STRNLEN |
44 | /* Check for zero length. */ |
45 | testq %rsi, %rsi |
46 | jz L(zero) |
47 | # ifdef USE_AS_WCSLEN |
48 | shl $2, %rsi |
49 | # endif |
50 | movq %rsi, %r8 |
51 | # endif |
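	/* %ecx = low pointer bits for the page-cross check, %rdx = the
	   original pointer (subtracted at the end to form the length),
	   %ymm0 = zero, the value every compare below looks for. */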
52 | movl %edi, %ecx |
53 | movq %rdi, %rdx |
54 | vpxor %xmm0, %xmm0, %xmm0 |
55 | |
	/* Check if we may cross a page boundary with one vector load.
	   If the offset within an aligned 2 * VEC_SIZE block is at most
	   VEC_SIZE, the VEC_SIZE load cannot leave that block, so it
	   cannot cross a page either. */
57 | andl $(2 * VEC_SIZE - 1), %ecx |
58 | cmpl $VEC_SIZE, %ecx |
	ja	L(cross_page_boundary)
60 | |
61 | /* Check the first VEC_SIZE bytes. */ |
62 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
63 | vpmovmskb %ymm1, %eax |
64 | testl %eax, %eax |
65 | |
66 | # ifdef USE_AS_STRNLEN |
67 | jnz L(first_vec_x0_check) |
68 | /* Adjust length and check the end of data. */ |
69 | subq $VEC_SIZE, %rsi |
70 | jbe L(max) |
71 | # else |
72 | jnz L(first_vec_x0) |
73 | # endif |
74 | |
75 | /* Align data for aligned loads in the loop. */ |
76 | addq $VEC_SIZE, %rdi |
77 | andl $(VEC_SIZE - 1), %ecx |
78 | andq $-VEC_SIZE, %rdi |
79 | |
80 | # ifdef USE_AS_STRNLEN |
81 | /* Adjust length. */ |
82 | addq %rcx, %rsi |
83 | |
84 | subq $(VEC_SIZE * 4), %rsi |
85 | jbe L(last_4x_vec_or_less) |
86 | # endif |
87 | jmp L(more_4x_vec) |
88 | |
89 | .p2align 4 |
L(cross_page_boundary):
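	/* The first load would cross a VEC_SIZE boundary (and possibly a
	   page). Align %rdi down to VEC_SIZE so the load stays inside one
	   page, then shift out the mask bits for bytes that precede the
	   original pointer. */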
91 | andl $(VEC_SIZE - 1), %ecx |
92 | andq $-VEC_SIZE, %rdi |
93 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
94 | vpmovmskb %ymm1, %eax |
95 | /* Remove the leading bytes. */ |
96 | sarl %cl, %eax |
97 | testl %eax, %eax |
98 | jz L(aligned_more) |
99 | tzcntl %eax, %eax |
100 | # ifdef USE_AS_STRNLEN |
101 | /* Check the end of data. */ |
102 | cmpq %rax, %rsi |
103 | jbe L(max) |
104 | # endif |
105 | addq %rdi, %rax |
106 | addq %rcx, %rax |
107 | subq %rdx, %rax |
108 | # ifdef USE_AS_WCSLEN |
109 | shrq $2, %rax |
110 | # endif |
111 | VZEROUPPER |
112 | ret |
113 | |
114 | .p2align 4 |
115 | L(aligned_more): |
116 | # ifdef USE_AS_STRNLEN |
117 | /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" |
118 | with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" |
119 | to void possible addition overflow. */ |
120 | negq %rcx |
121 | addq $VEC_SIZE, %rcx |
122 | |
123 | /* Check the end of data. */ |
124 | subq %rcx, %rsi |
125 | jbe L(max) |
126 | # endif |
127 | |
128 | addq $VEC_SIZE, %rdi |
129 | |
130 | # ifdef USE_AS_STRNLEN |
131 | subq $(VEC_SIZE * 4), %rsi |
132 | jbe L(last_4x_vec_or_less) |
133 | # endif |
134 | |
135 | L(more_4x_vec): |
	/* Check the first 4 * VEC_SIZE bytes. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE. */
138 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
139 | vpmovmskb %ymm1, %eax |
140 | testl %eax, %eax |
141 | jnz L(first_vec_x0) |
142 | |
143 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
144 | vpmovmskb %ymm1, %eax |
145 | testl %eax, %eax |
146 | jnz L(first_vec_x1) |
147 | |
148 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
149 | vpmovmskb %ymm1, %eax |
150 | testl %eax, %eax |
151 | jnz L(first_vec_x2) |
152 | |
153 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
154 | vpmovmskb %ymm1, %eax |
155 | testl %eax, %eax |
156 | jnz L(first_vec_x3) |
157 | |
158 | addq $(VEC_SIZE * 4), %rdi |
159 | |
160 | # ifdef USE_AS_STRNLEN |
161 | subq $(VEC_SIZE * 4), %rsi |
162 | jbe L(last_4x_vec_or_less) |
163 | # endif |
164 | |
165 | /* Align data to 4 * VEC_SIZE. */ |
166 | movq %rdi, %rcx |
167 | andl $(4 * VEC_SIZE - 1), %ecx |
168 | andq $-(4 * VEC_SIZE), %rdi |
169 | |
170 | # ifdef USE_AS_STRNLEN |
171 | /* Adjust length. */ |
172 | addq %rcx, %rsi |
173 | # endif |
174 | |
175 | .p2align 4 |
176 | L(loop_4x_vec): |
177 | /* Compare 4 * VEC at a time forward. */ |
178 | vmovdqa (%rdi), %ymm1 |
179 | vmovdqa VEC_SIZE(%rdi), %ymm2 |
180 | vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 |
181 | vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 |
182 | VPMINU %ymm1, %ymm2, %ymm5 |
183 | VPMINU %ymm3, %ymm4, %ymm6 |
184 | VPMINU %ymm5, %ymm6, %ymm5 |
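	/* %ymm5 is the element-wise unsigned minimum of all four vectors,
	   so it has a zero element iff at least one of them does. */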
185 | |
186 | VPCMPEQ %ymm5, %ymm0, %ymm5 |
187 | vpmovmskb %ymm5, %eax |
188 | testl %eax, %eax |
189 | jnz L(4x_vec_end) |
190 | |
191 | addq $(VEC_SIZE * 4), %rdi |
192 | |
193 | # ifndef USE_AS_STRNLEN |
194 | jmp L(loop_4x_vec) |
195 | # else |
196 | subq $(VEC_SIZE * 4), %rsi |
197 | ja L(loop_4x_vec) |
198 | |
199 | L(last_4x_vec_or_less): |
200 | /* Less than 4 * VEC and aligned to VEC_SIZE. */ |
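	/* %rsi is the remaining length minus 4 * VEC_SIZE (zero or
	   negative here). After adding 2 * VEC_SIZE it is positive iff
	   more than 2 * VEC_SIZE bytes remain. */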
201 | addl $(VEC_SIZE * 2), %esi |
202 | jle L(last_2x_vec) |
203 | |
204 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
205 | vpmovmskb %ymm1, %eax |
206 | testl %eax, %eax |
207 | jnz L(first_vec_x0) |
208 | |
209 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
210 | vpmovmskb %ymm1, %eax |
211 | testl %eax, %eax |
212 | jnz L(first_vec_x1) |
213 | |
214 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
215 | vpmovmskb %ymm1, %eax |
216 | testl %eax, %eax |
217 | |
218 | jnz L(first_vec_x2_check) |
219 | subl $VEC_SIZE, %esi |
220 | jle L(max) |
221 | |
222 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
223 | vpmovmskb %ymm1, %eax |
224 | testl %eax, %eax |
225 | |
226 | jnz L(first_vec_x3_check) |
227 | movq %r8, %rax |
228 | # ifdef USE_AS_WCSLEN |
229 | shrq $2, %rax |
230 | # endif |
231 | VZEROUPPER |
232 | ret |
233 | |
234 | .p2align 4 |
235 | L(last_2x_vec): |
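	/* At most 2 * VEC_SIZE bytes remain. %esi is the remaining length
	   minus 2 * VEC_SIZE, so restore it first. */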
236 | addl $(VEC_SIZE * 2), %esi |
237 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
238 | vpmovmskb %ymm1, %eax |
239 | testl %eax, %eax |
240 | |
241 | jnz L(first_vec_x0_check) |
242 | subl $VEC_SIZE, %esi |
243 | jle L(max) |
244 | |
245 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
246 | vpmovmskb %ymm1, %eax |
247 | testl %eax, %eax |
248 | jnz L(first_vec_x1_check) |
249 | movq %r8, %rax |
250 | # ifdef USE_AS_WCSLEN |
251 | shrq $2, %rax |
252 | # endif |
253 | VZEROUPPER |
254 | ret |
255 | |
256 | .p2align 4 |
257 | L(first_vec_x0_check): |
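	/* %eax is the compare mask for the vector at %rdi and %rsi the
	   number of bytes left; return the bound if the first null lies
	   at or beyond it. */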
258 | tzcntl %eax, %eax |
259 | /* Check the end of data. */ |
260 | cmpq %rax, %rsi |
261 | jbe L(max) |
262 | addq %rdi, %rax |
263 | subq %rdx, %rax |
264 | # ifdef USE_AS_WCSLEN |
265 | shrq $2, %rax |
266 | # endif |
267 | VZEROUPPER |
268 | ret |
269 | |
270 | .p2align 4 |
271 | L(first_vec_x1_check): |
272 | tzcntl %eax, %eax |
273 | /* Check the end of data. */ |
274 | cmpq %rax, %rsi |
275 | jbe L(max) |
276 | addq $VEC_SIZE, %rax |
277 | addq %rdi, %rax |
278 | subq %rdx, %rax |
279 | # ifdef USE_AS_WCSLEN |
280 | shrq $2, %rax |
281 | # endif |
282 | VZEROUPPER |
283 | ret |
284 | |
285 | .p2align 4 |
286 | L(first_vec_x2_check): |
287 | tzcntl %eax, %eax |
288 | /* Check the end of data. */ |
289 | cmpq %rax, %rsi |
290 | jbe L(max) |
291 | addq $(VEC_SIZE * 2), %rax |
292 | addq %rdi, %rax |
293 | subq %rdx, %rax |
294 | # ifdef USE_AS_WCSLEN |
295 | shrq $2, %rax |
296 | # endif |
297 | VZEROUPPER |
298 | ret |
299 | |
300 | .p2align 4 |
301 | L(first_vec_x3_check): |
302 | tzcntl %eax, %eax |
303 | /* Check the end of data. */ |
304 | cmpq %rax, %rsi |
305 | jbe L(max) |
306 | addq $(VEC_SIZE * 3), %rax |
307 | addq %rdi, %rax |
308 | subq %rdx, %rax |
309 | # ifdef USE_AS_WCSLEN |
310 | shrq $2, %rax |
311 | # endif |
312 | VZEROUPPER |
313 | ret |
314 | |
315 | .p2align 4 |
316 | L(max): |
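	/* No null found within the length limit: return the caller's
	   maximum length, saved in %r8 (in bytes). */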
317 | movq %r8, %rax |
318 | # ifdef USE_AS_WCSLEN |
319 | shrq $2, %rax |
320 | # endif |
321 | VZEROUPPER |
322 | ret |
323 | |
324 | .p2align 4 |
325 | L(zero): |
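	/* strnlen/wcsnlen was called with a zero maximum length. */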
326 | xorl %eax, %eax |
327 | ret |
328 | # endif |
329 | |
330 | .p2align 4 |
331 | L(first_vec_x0): |
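	/* tzcnt gives the byte offset of the first null in the vector at
	   %rdi; add %rdi and subtract the original pointer in %rdx to get
	   the length. */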
332 | tzcntl %eax, %eax |
333 | addq %rdi, %rax |
334 | subq %rdx, %rax |
335 | # ifdef USE_AS_WCSLEN |
336 | shrq $2, %rax |
337 | # endif |
338 | VZEROUPPER |
339 | ret |
340 | |
341 | .p2align 4 |
342 | L(first_vec_x1): |
343 | tzcntl %eax, %eax |
344 | addq $VEC_SIZE, %rax |
345 | addq %rdi, %rax |
346 | subq %rdx, %rax |
347 | # ifdef USE_AS_WCSLEN |
348 | shrq $2, %rax |
349 | # endif |
350 | VZEROUPPER |
351 | ret |
352 | |
353 | .p2align 4 |
354 | L(first_vec_x2): |
355 | tzcntl %eax, %eax |
356 | addq $(VEC_SIZE * 2), %rax |
357 | addq %rdi, %rax |
358 | subq %rdx, %rax |
359 | # ifdef USE_AS_WCSLEN |
360 | shrq $2, %rax |
361 | # endif |
362 | VZEROUPPER |
363 | ret |
364 | |
365 | .p2align 4 |
366 | L(4x_vec_end): |
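	/* At least one of the four vectors loaded in the loop contains a
	   null. Recheck each one in order to locate the first. */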
367 | VPCMPEQ %ymm1, %ymm0, %ymm1 |
368 | vpmovmskb %ymm1, %eax |
369 | testl %eax, %eax |
370 | jnz L(first_vec_x0) |
371 | VPCMPEQ %ymm2, %ymm0, %ymm2 |
372 | vpmovmskb %ymm2, %eax |
373 | testl %eax, %eax |
374 | jnz L(first_vec_x1) |
375 | VPCMPEQ %ymm3, %ymm0, %ymm3 |
376 | vpmovmskb %ymm3, %eax |
377 | testl %eax, %eax |
378 | jnz L(first_vec_x2) |
379 | VPCMPEQ %ymm4, %ymm0, %ymm4 |
380 | vpmovmskb %ymm4, %eax |
381 | testl %eax, %eax |
382 | L(first_vec_x3): |
383 | tzcntl %eax, %eax |
384 | addq $(VEC_SIZE * 3), %rax |
385 | addq %rdi, %rax |
386 | subq %rdx, %rax |
387 | # ifdef USE_AS_WCSLEN |
388 | shrq $2, %rax |
389 | # endif |
390 | VZEROUPPER |
391 | ret |
392 | |
393 | END (STRLEN) |
394 | #endif |
395 | |