strlen-evex-base.S source code [glibc/sysdeps/x86_64/multiarch/strlen-evex-base.S]

1	/* Placeholder function, not used by any processor at the moment.
2	   Copyright (C) 2022-2023 Free Software Foundation, Inc.
3	   This file is part of the GNU C Library.
4
5	   The GNU C Library is free software; you can redistribute it and/or
6	   modify it under the terms of the GNU Lesser General Public
7	   License as published by the Free Software Foundation; either
8	   version 2.1 of the License, or (at your option) any later version.
9
10	   The GNU C Library is distributed in the hope that it will be useful,
11	   but WITHOUT ANY WARRANTY; without even the implied warranty of
12	   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13	   Lesser General Public License for more details.
14
15	   You should have received a copy of the GNU Lesser General Public
16	   License along with the GNU C Library; if not, see
17	   <https://www.gnu.org/licenses/>.  */
18
19	/* UNUSED.  Exists purely as a reference implementation.  */
20
21	#include <isa-level.h>
22
23	#if ISA_SHOULD_BUILD (4)
24
25	# include <sysdep.h>
26
27	# ifdef USE_AS_WCSLEN
	/* Wide-character build: select the dword-element instruction forms
	   and a 4-byte character size.  */
28	# define VPCMPEQ vpcmpeqd
29	# define VPTESTN vptestnmd
30	# define VPMINU vpminud
31	# define CHAR_SIZE 4
32	# else
	/* Byte-string build: select the byte-element instruction forms and a
	   1-byte character size.  */
33	# define VPCMPEQ vpcmpeqb
34	# define VPTESTN vptestnmb
35	# define VPMINU vpminub
36	# define CHAR_SIZE 1
37	# endif
38
	/* Page size used by the page-cross check at the function entry.  */
39	# define PAGE_SIZE 4096
	/* Number of [w]chars in one vector.  VEC_SIZE is not defined in the
	   visible source; presumably the including file provides it.  */
40	# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
41
42	.section SECTION(.text),"ax",@progbits
43	/* Aligning the entry point to 64 bytes provides better performance
44	   for strings that fit in one vector.  */
	/* STRLEN: count the [w]chars that precede the first null [w]char.

	   Build variants selected by the preprocessor:
	     USE_AS_WCSLEN   characters are CHAR_SIZE == 4 bytes wide and the
			     result is in wide characters.
	     USE_AS_STRNLEN  rsi holds a maximum length and the result is
			     capped at that value.

	   In:   rdi = string pointer
		 rsi = maximum length in [w]chars (USE_AS_STRNLEN only)
	   Out:  rax = length in [w]chars
	   Uses: rax, rcx, rdx, VMM(0)-VMM(4), k0-k3 and the flags.

	   STRLEN, SECTION, VEC_SIZE, VMM, VMM_128, KMOV, KORTEST, VMOVA and
	   the VRAX/VRCX/VRDX register names are macros that are not defined
	   in the visible source; presumably the including file supplies
	   them.  */
45	ENTRY_P2ALIGN (STRLEN, `6`)
46	# ifdef USE_AS_STRNLEN
47	/* Check zero length.  */
48	test %RSI_LP, %RSI_LP
49	jz L(ret_max)
50	# ifdef __ILP32__
51	/* Clear the upper 32 bits.  */
52	movl %esi, %esi
53	# endif
54	# endif
55
56	movl %edi, %eax
	/* Zero vector register 0; every VPCMPEQ below compares against it
	   to find null [w]chars.  */
57	vpxorq %VMM_128(`0`), %VMM_128(`0`), %VMM_128(`0`)
	/* Shifting left by 20 moves the low 12 address bits (the offset
	   inside a PAGE_SIZE page) to the top of eax.  If that offset is
	   above PAGE_SIZE - VEC_SIZE, a VEC_SIZE load from rdi would run
	   past the end of the page, so take the page-cross path.  */
58	sall $`20`, %eax
59	cmpl $((PAGE_SIZE - VEC_SIZE) << `20`), %eax
60	ja L(page_cross)
61
62	/* Compare [w]char for null, mask bit will be set for match.  */
63	VPCMPEQ (%rdi), %VMM(`0`), %k0
64	# ifdef USE_AS_STRNLEN
65	KMOV %k0, %VRCX
66	/* Store max length in rax.  */
67	mov %rsi, %rax
68	/* If rcx is 0, rax will have max length.  We can not use VRCX
69	   and VRAX here for evex256 because, upper 32 bits may be
70	   undefined for ecx and eax.  */
71	bsfq %rcx, %rax
	/* rax is now either the index of the first null or, when no null
	   was found, the max length.  A value above CHAR_PER_VEC means the
	   search has to continue past this vector.  */
72	cmp $CHAR_PER_VEC, %rax
73	ja L(align_more)
	/* Return the smaller of rax and the max length in rsi.  */
74	cmpq %rax, %rsi
75	cmovb %esi, %eax
76	# else
77	KMOV %k0, %VRAX
78	test %VRAX, %VRAX
79	jz L(align_more)
	/* Index of the lowest set mask bit = length of the string.  */
80	bsf %VRAX, %VRAX
81	# endif
82	ret
83
84	/* At this point vector max length reached.  */
85	# ifdef USE_AS_STRNLEN
86	.p2align `4`,,`3`
87	L(ret_max):
	/* Return the max length unchanged.  */
88	movq %rsi, %rax
89	ret
90	# endif
91
	/* No null was found in the first vector; continue from the
	   VEC_SIZE-aligned address at or below rdi.  */
92	L(align_more):
93	mov %rdi, %rax
94	/* Align rax to VEC_SIZE.  */
95	andq $-VEC_SIZE, %rax
96	# ifdef USE_AS_STRNLEN
	/* rdx = distance from the aligned address to rdi, in bytes and then
	   (for wcslen) in wide characters.  */
97	movq %rdi, %rdx
98	subq %rax, %rdx
99	# ifdef USE_AS_WCSLEN
100	shr $`2`, %VRDX
101	# endif
102	/* At this point rdx contains [w]chars already compared.  */
	/* rdx = rsi + rdx - CHAR_PER_VEC: the part of the max length that
	   lies beyond the aligned vector containing rdi.  */
103	leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
104	/* At this point rdx contains number of w[char] needs to go.
105	   Now onwards rdx will keep decrementing with each compare.  */
106	# endif
107
108	/* Loop unroll 4 times for 4 vector loop.  */
	/* First of four single-vector checks.  Subtracting -VEC_SIZE adds
	   VEC_SIZE, so afterwards rax addresses the vector just compared.  */
109	VPCMPEQ VEC_SIZE(%rax), %VMM(`0`), %k0
110	subq $-VEC_SIZE, %rax
111	KMOV %k0, %VRCX
112	test %VRCX, %VRCX
113	jnz L(ret_vec_x1)
114
115	# ifdef USE_AS_STRNLEN
	/* Consume one vector of the remaining length; stop at the limit.  */
116	subq $CHAR_PER_VEC, %rdx
117	jbe L(ret_max)
118	# endif
119
	/* Second vector: one VEC_SIZE past rax.  */
120	VPCMPEQ VEC_SIZE(%rax), %VMM(`0`), %k0
121	KMOV %k0, %VRCX
122	test %VRCX, %VRCX
123	jnz L(ret_vec_x2)
124
125	# ifdef USE_AS_STRNLEN
126	subq $CHAR_PER_VEC, %rdx
127	jbe L(ret_max)
128	# endif
129
	/* Third vector: two VEC_SIZE past rax.  */
130	VPCMPEQ (VEC_SIZE * `2`)(%rax), %VMM(`0`), %k0
131	KMOV %k0, %VRCX
132	test %VRCX, %VRCX
133	jnz L(ret_vec_x3)
134
135	# ifdef USE_AS_STRNLEN
136	subq $CHAR_PER_VEC, %rdx
137	jbe L(ret_max)
138	# endif
139
	/* Fourth vector: three VEC_SIZE past rax.  */
140	VPCMPEQ (VEC_SIZE * `3`)(%rax), %VMM(`0`), %k0
141	KMOV %k0, %VRCX
142	test %VRCX, %VRCX
143	jnz L(ret_vec_x4)
144
145	# ifdef USE_AS_STRNLEN
146	subq $CHAR_PER_VEC, %rdx
147	jbe L(ret_max)
148	/* Save pointer before 4 x VEC_SIZE alignment.  */
149	movq %rax, %rcx
150	# endif
151
152	/* Align address to VEC_SIZE * 4 for loop.  */
153	andq $-(VEC_SIZE * `4`), %rax
154
155	# ifdef USE_AS_STRNLEN
	/* rcx = how far rax moved back, in bytes and then (for wcslen) in
	   wide characters.  */
156	subq %rax, %rcx
157	# ifdef USE_AS_WCSLEN
158	shr $`2`, %VRCX
159	# endif
160	/* rcx contains number of [w]char will be recompared due to
161	   alignment fixes.  rdx must be incremented by rcx to offset
162	   alignment adjustment.  */
163	addq %rcx, %rdx
164	/* Need jump as we don't want to add/subtract rdx for first
165	   iteration of 4 x VEC_SIZE aligned loop.  */
	/* NOTE(review): no jump follows in the visible code; execution falls
	   straight into L(loop).  The comment above looks stale - confirm.  */
166	# endif
167
168	.p2align `4`,,`11`
169	L(loop):
170	/* VPMINU and VPCMP combination provide better performance as
171	   compared to alternative combinations.  */
	/* Load the next four vectors, at offsets 4..7 * VEC_SIZE from rax.
	   VMM(2) is the element-wise unsigned minimum of the first pair and
	   VMM(4) of the second pair, so a null in either vector of a pair
	   appears as a zero element in that pair's minimum.  */
172	VMOVA (VEC_SIZE * `4`)(%rax), %VMM(`1`)
173	VPMINU (VEC_SIZE * `5`)(%rax), %VMM(`1`), %VMM(`2`)
174	VMOVA (VEC_SIZE * `6`)(%rax), %VMM(`3`)
175	VPMINU (VEC_SIZE * `7`)(%rax), %VMM(`3`), %VMM(`4`)
176
	/* k0 / k1 get a bit set for every zero element of VMM(2) / VMM(4).  */
177	VPTESTN %VMM(`2`), %VMM(`2`), %k0
178	VPTESTN %VMM(`4`), %VMM(`4`), %k1
179
	/* Advance rax by 4 * VEC_SIZE; it now addresses the first of the
	   four vectors just examined.  */
180	subq $-(VEC_SIZE * `4`), %rax
181	KORTEST %k0, %k1
182
183	# ifndef USE_AS_STRNLEN
	/* Keep looping while neither mask has a bit set.  */
184	jz L(loop)
185	# else
186	jnz L(loopend)
	/* No null in these four vectors: consume them from the remaining
	   length and return the max length once it is used up.  */
187	subq $(CHAR_PER_VEC * `4`), %rdx
188	ja L(loop)
189	mov %rsi, %rax
190	ret
191	# endif
192
	/* A null is somewhere in the four vectors starting at rax; find out
	   which one, leaving that vector's mask in VRCX.  */
193	L(loopend):
194
	/* First vector of the group.  */
195	VPTESTN %VMM(`1`), %VMM(`1`), %k2
196	KMOV %k2, %VRCX
197	test %VRCX, %VRCX
198	jnz L(ret_vec_x1)
199
200	KMOV %k0, %VRCX
201	/* At this point, if k0 is non zero, null char must be in the
202	   second vector.  */
203	test %VRCX, %VRCX
204	jnz L(ret_vec_x2)
205
	/* Third vector of the group.  */
206	VPTESTN %VMM(`3`), %VMM(`3`), %k3
207	KMOV %k3, %VRCX
208	test %VRCX, %VRCX
209	jnz L(ret_vec_x3)
210	/* At this point null [w]char must be in the fourth vector so no
211	   need to check.  */
212	KMOV %k1, %VRCX
213
214	/* Fourth, third, second vector terminating are pretty much
215	   same, implemented this way to avoid branching and reuse code
216	   from pre loop exit condition.  */
	/* Return paths.  On entry VRCX holds the null mask of the vector
	   that contains the terminator, and rax addresses the vector that
	   lies 3, 2, 1 or 0 VEC_SIZE before it (x4, x3, x2, x1).
	   Result = (vector address - rdi), converted to [w]chars, plus the
	   index of the first set mask bit; capped at rsi for strnlen.  */
217	L(ret_vec_x4):
218	bsf %VRCX, %VRCX
219	subq %rdi, %rax
220	# ifdef USE_AS_WCSLEN
	/* Add 3 * VEC_SIZE bytes, convert bytes to wide characters, then add
	   the index within the vector.  */
221	subq $-(VEC_SIZE * `3`), %rax
222	shrq $`2`, %rax
223	addq %rcx, %rax
224	# else
225	leaq (VEC_SIZE * `3`)(%rcx, %rax), %rax
226	# endif
227	# ifdef USE_AS_STRNLEN
	/* rax = rsi when rax >= rsi.  */
228	cmpq %rsi, %rax
229	cmovnb %rsi, %rax
230	# endif
231	ret
232
233	L(ret_vec_x3):
234	bsf %VRCX, %VRCX
235	subq %rdi, %rax
236	# ifdef USE_AS_WCSLEN
	/* Same as above with a 2 * VEC_SIZE byte offset.  */
237	subq $-(VEC_SIZE * `2`), %rax
238	shrq $`2`, %rax
239	addq %rcx, %rax
240	# else
241	leaq (VEC_SIZE * `2`)(%rcx, %rax), %rax
242	# endif
243	# ifdef USE_AS_STRNLEN
244	cmpq %rsi, %rax
245	cmovnb %rsi, %rax
246	# endif
247	ret
248
249	L(ret_vec_x2):
	/* Step rax forward one vector, then share the x1 code.  */
250	subq $-VEC_SIZE, %rax
251	L(ret_vec_x1):
252	bsf %VRCX, %VRCX
253	subq %rdi, %rax
254	# ifdef USE_AS_WCSLEN
255	shrq $`2`, %rax
256	# endif
257	addq %rcx, %rax
258	# ifdef USE_AS_STRNLEN
259	cmpq %rsi, %rax
260	cmovnb %rsi, %rax
261	# endif
262	ret
263
	/* A VEC_SIZE load from rdi would cross a page boundary.  Load the
	   aligned vector that contains rdi instead and discard the mask
	   bits of the [w]chars that precede rdi.  */
264	L(page_cross):
265	mov %rdi, %rax
266	movl %edi, %ecx
267	andl $(VEC_SIZE - `1`), %ecx
268	# ifdef USE_AS_WCSLEN
269	sarl $`2`, %ecx
270	# endif
271	/* ecx contains number of w[char] to be skipped as a result
272	   of address alignment.  */
273	andq $-VEC_SIZE, %rax
274	VPCMPEQ (%rax), %VMM(`0`), %k0
275	KMOV %k0, %VRDX
276	/* Ignore number of character for alignment adjustment.  */
	/* NOTE(review): the branches below consume the flags produced by
	   this shift; confirm that cl can never be 0 on this path, because
	   a shift by zero would presumably leave the flags untouched.  */
277	shr %cl, %VRDX
278	# ifdef USE_AS_STRNLEN
279	jnz L(page_cross_end)
	/* No null before the end of this vector.  eax = CHAR_PER_VEC - ecx
	   is the number of [w]chars from rdi to the end of the vector;
	   continue only if the max length reaches beyond that.  */
280	movl $CHAR_PER_VEC, %eax
281	sub %ecx, %eax
282	cmp %rax, %rsi
283	ja L(align_more)
284	# else
285	jz L(align_more)
286	# endif
287
288	L(page_cross_end):
	/* Index of the first null relative to rdi.  On the strnlen
	   fall-through path VRDX is 0; this presumably relies on bsf
	   leaving rax unchanged in that case, so that the cmov below
	   returns rsi - confirm.  */
289	bsf %VRDX, %VRAX
290	# ifdef USE_AS_STRNLEN
291	cmpq %rsi, %rax
292	cmovnb %esi, %eax
293	# endif
294	ret
295
296	END (STRLEN)
297	#endif
298

Browse the source code of glibc/sysdeps/x86_64/multiarch/strlen-evex-base.S