/* Placeholder function, not used by any processor at the moment.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UNUSED.  Exists purely as a reference implementation.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
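/* Only built at ISA level 4, which provides the AVX-512 extensions
   that the EVEX code below relies on.  */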

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPTESTN	vptestnmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPTESTN	vptestnmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# define XMM0		xmm16
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# if VEC_SIZE == 64
#  define KMOV		kmovq
#  define KORTEST	kortestq
#  define RAX		rax
#  define RCX		rcx
#  define RDX		rdx
#  define SHR		shrq
#  define TEXTSUFFIX	evex512
#  define VMM0		zmm16
#  define VMM1		zmm17
#  define VMM2		zmm18
#  define VMM3		zmm19
#  define VMM4		zmm20
#  define VMOVA		vmovdqa64
# elif VEC_SIZE == 32
/* Currently Unused.  */
#  define KMOV		kmovd
#  define KORTEST	kortestd
#  define RAX		eax
#  define RCX		ecx
#  define RDX		edx
#  define SHR		shrl
#  define TEXTSUFFIX	evex256
#  define VMM0		ymm16
#  define VMM1		ymm17
#  define VMM2		ymm18
#  define VMM3		ymm19
#  define VMM4		ymm20
#  define VMOVA		vmovdqa32
# endif

	.section .text.TEXTSUFFIX, "ax", @progbits
/* Aligning the entry point to 64 bytes provides better performance
   for strings of up to one vector length.  */
ENTRY_P2ALIGN (STRLEN, 6)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(ret_max)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
# endif

	movl	%edi, %eax
	vpxorq	%XMM0, %XMM0, %XMM0
	andl	$(PAGE_SIZE - 1), %eax
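	/* If a full VEC_SIZE load from RDI would cross a page boundary,
	   take the page-cross path so we never read from a page the
	   string may not extend into.  */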
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

	/* Compare [w]chars against null; a mask bit is set for every
	   match.  */
	VPCMP	$0, (%rdi), %VMM0, %k0
	KMOV	%k0, %RAX
	test	%RAX, %RAX
	jz	L(align_more)

	bsf	%RAX, %RAX
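	/* RAX now holds the index of the first null [w]char, which is
	   the string length since the vector was loaded from RDI.  */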
# ifdef USE_AS_STRNLEN
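	/* For strnlen, clamp the result to the caller-supplied maximum
	   length in RSI.  */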
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

	/* The maximum length has been reached without finding a null
	   [w]char, so return it.  */
# ifdef USE_AS_STRNLEN
	.p2align 4,,3
L(ret_max):
	movq	%rsi, %rax
	ret
# endif

L(align_more):
	leaq	VEC_SIZE(%rdi), %rax
	/* Align rax to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax
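	/* rax is now the lowest VEC_SIZE-aligned address above rdi;
	   every [w]char below it was covered by the first compare.  */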
# ifdef USE_AS_STRNLEN
	movq	%rax, %rdx
	subq	%rdi, %rdx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RDX
#  endif
	/* At this point rdx holds the number of [w]chars already
	   compared.  */
	subq	%rsi, %rdx
	jae	L(ret_max)
	negq	%rdx
	/* Now rdx holds the number of [w]chars left to check; it is
	   decremented after each compare from here on.  */
# endif

	/* Check the next four vectors individually before entering the
	   4 x VEC_SIZE aligned loop.  */
	VPCMP	$0, (%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x4)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
	/* Save pointer before 4 x VEC_SIZE alignment.  */
	movq	%rax, %rcx
# endif

	/* Align address to VEC_SIZE * 4 for loop.  */
	andq	$-(VEC_SIZE * 4), %rax
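	/* Rounding down may move rax back by up to VEC_SIZE * 3 bytes;
	   those [w]chars are re-examined by the first loop iteration,
	   and the strnlen count is adjusted below to compensate.  */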

# ifdef USE_AS_STRNLEN
	subq	%rax, %rcx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RCX
#  endif
	/* rcx holds the number of [w]chars that will be recompared due
	   to the alignment fix.  rdx must be incremented by rcx to
	   offset the alignment adjustment.  */
	addq	%rcx, %rdx
	/* Jump to L(loop_entry) so that rdx is not decremented for the
	   first iteration of the 4 x VEC_SIZE aligned loop.  */
	jmp	L(loop_entry)
# endif

	.p2align 4,,11
L(loop):
# ifdef USE_AS_STRNLEN
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_max)
L(loop_entry):
# endif
	/* The VPMINU and VPTESTN combination provides better
	   performance than alternative combinations.  */
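	/* An unsigned minimum is zero in a lane iff at least one of the
	   inputs is zero there, so a single VPTESTN of the VPMINU
	   result covers two source vectors.  */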
	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4

	VPTESTN	%VMM2, %VMM2, %k0
	VPTESTN	%VMM4, %VMM4, %k1
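	/* k0/k1 have a bit set for every zero lane in the first/second
	   vector pair respectively.  KORTEST below ORs them and sets ZF
	   only when no null [w]char was found in any of the four
	   vectors.  */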

	subq	$-(VEC_SIZE * 4), %rax
	KORTEST	%k0, %k1
	jz	L(loop)

	VPTESTN	%VMM1, %VMM1, %k2
	KMOV	%k2, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

	KMOV	%k0, %RCX
	/* At this point, if k0 is non-zero, the null [w]char must be in
	   the second vector.  */
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

	VPTESTN	%VMM3, %VMM3, %k3
	KMOV	%k3, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)
	/* At this point the null [w]char must be in the fourth vector,
	   so there is no need to check.  */
	KMOV	%k1, %RCX

	/* The fourth, third and second vector termination paths are
	   essentially the same; they are implemented this way to avoid
	   branching and to reuse code from the pre-loop exit
	   conditions.  */
L(ret_vec_x4):
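	/* Return (rax - rdi + VEC_SIZE * 3) / CHAR_SIZE plus the index
	   of the first null [w]char within the fourth vector (rcx after
	   the bsf below).  */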
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 3), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x3):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 2), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x2):
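	/* Account for the extra vector, then fall through to the common
	   return path shared with L(ret_vec_x1).  */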
	subq	$-VEC_SIZE, %rax
L(ret_vec_x1):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	addq	%rcx, %rax
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(page_cross):
	movl	%eax, %ecx
# ifdef USE_AS_WCSLEN
	andl	$(VEC_SIZE - 1), %ecx
	sarl	$2, %ecx
# endif
	/* ecx holds the number of [w]chars to be skipped as a result of
	   the address alignment.  */
	xorq	%rdi, %rax
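	/* rax = rdi with the page-offset bits cleared, i.e. the start
	   of rdi's page; the load below reads the last VEC_SIZE bytes
	   of that page, which contain rdi.  */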
	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
	KMOV	%k0, %RAX
	/* Shift out the mask bits that correspond to [w]chars before
	   the start of the string (alignment adjustment).  */
	SHR	%cl, %RAX
	jz	L(align_more)

	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

END (STRLEN)
#endif