/* Placeholder function, not used by any processor at the moment.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UNUSED.  Exists purely as reference implementation.  */

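/* Overview (reference only): the code below implements a vectorized
   [w]cs[n]len:

   1. If an unaligned VEC_SIZE-byte load from the start of the string
      cannot cross a page boundary, compare one vector at the start.
   2. Otherwise (L(page_cross)) compare the last vector of the current
      page and shift off the mask bits that belong to [w]chars before
      the start of the string.
   3. Align to VEC_SIZE, check four further vectors, then align to
      4 * VEC_SIZE and loop over four vectors per iteration, pairing
      them with VPMINU so one VPTESTN covers two vectors.

   A rough C approximation of the control flow, with scalar code
   standing in for the vector compares; the page-cross, strnlen and
   unrolling details are omitted, and the names check_vec,
   strlen_sketch and VEC are purely illustrative (this sketch is not
   part of the build):

	#include <stddef.h>
	#include <stdint.h>

	#define VEC 64	// Stand-in for VEC_SIZE in the evex512 build.

	// Index of the first null in p[0 .. VEC), or VEC if none.
	static size_t
	check_vec (const char *p)
	{
	  for (size_t i = 0; i < VEC; i++)
	    if (p[i] == '\0')
	      return i;
	  return VEC;
	}

	size_t
	strlen_sketch (const char *s)
	{
	  // Head: one unaligned vector.
	  size_t i = check_vec (s);
	  if (i < VEC)
	    return i;
	  // Round up to the next VEC boundary (possibly re-checking a
	  // few bytes) and scan one aligned vector at a time; the real
	  // code additionally unrolls this loop four times.
	  const char *p = (const char *)
	    (((uintptr_t) s + VEC) & -(uintptr_t) VEC);
	  for (;;)
	    {
	      i = check_vec (p);
	      if (i < VEC)
		return (size_t) (p - s) + i;
	      p += VEC;
	    }
	}
   */
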
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPTESTN	vptestnmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPTESTN	vptestnmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# define XMM0		xmm16
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# if VEC_SIZE == 64
#  define KMOV		kmovq
#  define KORTEST	kortestq
#  define RAX		rax
#  define RCX		rcx
#  define RDX		rdx
#  define SHR		shrq
#  define TEXTSUFFIX	evex512
#  define VMM0		zmm16
#  define VMM1		zmm17
#  define VMM2		zmm18
#  define VMM3		zmm19
#  define VMM4		zmm20
#  define VMOVA		vmovdqa64
# elif VEC_SIZE == 32
/* Currently Unused.  */
#  define KMOV		kmovd
#  define KORTEST	kortestd
#  define RAX		eax
#  define RCX		ecx
#  define RDX		edx
#  define SHR		shrl
#  define TEXTSUFFIX	evex256
#  define VMM0		ymm16
#  define VMM1		ymm17
#  define VMM2		ymm18
#  define VMM3		ymm19
#  define VMM4		ymm20
#  define VMOVA		vmovdqa32
# endif
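
/* VEC_SIZE, STRLEN and the USE_AS_* macros are not defined here; they
   are expected to be provided by the file that includes this one
   (e.g. a strlen/strnlen/wcslen/wcsnlen wrapper that selects
   VEC_SIZE 64 or 32).  */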

	.section .text.TEXTSUFFIX, "ax", @progbits
/* Aligning the entry point to a 64-byte boundary provides better
   performance for strings that fit in a single vector.  */
ENTRY_P2ALIGN (STRLEN, 6)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(ret_max)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
# endif

	movl	%edi, %eax
	vpxorq	%XMM0, %XMM0, %XMM0
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

	/* Compare [w]chars against null; the corresponding mask bit is
	   set for every match.  */
	VPCMP	$0, (%rdi), %VMM0, %k0
	KMOV	%k0, %RAX
	test	%RAX, %RAX
	jz	L(align_more)

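	/* The index of the lowest set bit is the [w]char index of the
	   first null, which is also the length: e.g. if bit 3 is set,
	   the null is the fourth [w]char and the length is 3.  */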
	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

	/* At this point the maximum length has been reached.  */
# ifdef USE_AS_STRNLEN
	.p2align 4,,3
L(ret_max):
	movq	%rsi, %rax
	ret
# endif

L(align_more):
	leaq	VEC_SIZE(%rdi), %rax
	/* Align rax to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax
# ifdef USE_AS_STRNLEN
	movq	%rax, %rdx
	subq	%rdi, %rdx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RDX
#  endif
	/* At this point rdx contains the number of [w]chars already
	   compared.  */
	subq	%rsi, %rdx
	jae	L(ret_max)
	negq	%rdx
	/* At this point rdx contains the number of [w]chars left to
	   compare.  From here on rdx is decremented after each vector
	   compare.  */
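	/* For example (VEC_SIZE == 64, CHAR_SIZE == 1): if rdi ends in
	   0x30 and rsi is 100, then rax - rdi is 16 chars already
	   compared, and after the negation rdx is 100 - 16 = 84 chars
	   left.  */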
# endif

	/* Check the next four vectors individually, matching the
	   unrolling of the 4 x VEC_SIZE loop below.  */
	VPCMP	$0, (%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x4)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
	/* Save pointer before 4 x VEC_SIZE alignment.  */
	movq	%rax, %rcx
# endif

	/* Align the address to VEC_SIZE * 4 for the loop.  */
	andq	$-(VEC_SIZE * 4), %rax

# ifdef USE_AS_STRNLEN
	subq	%rax, %rcx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RCX
#  endif
	/* rcx contains the number of [w]chars that will be re-compared
	   because of the alignment; rdx must be increased by rcx to
	   compensate for that adjustment.  */
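	/* For example, with VEC_SIZE == 64 and rax == 0x10c0 before the
	   alignment, rax becomes 0x1000 and 0xc0 bytes are re-scanned;
	   rcx is then 192 for strlen (48 for wcslen), and rdx grows by
	   that amount below.  */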
	addq	%rcx, %rdx
	/* Jump past the first decrement: rdx must not be adjusted on
	   the first iteration of the 4 x VEC_SIZE aligned loop.  */
	jmp	L(loop_entry)
# endif

	.p2align 4,,11
L(loop):
# ifdef USE_AS_STRNLEN
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_max)
L(loop_entry):
# endif
	/* Folding two vectors with VPMINU before the null test performs
	   better than the alternatives: a null [w]char in either source
	   vector leaves a zero in the minimum, so a single VPTESTN
	   covers two vectors.  */
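	/* For instance, with byte elements, if VMM1 is {5, 0, 7, ...}
	   and the vector at (VEC_SIZE * 5)(%rax) is {9, 8, 0, ...},
	   VPMINU leaves {5, 0, 0, ...} in VMM2 and VPTESTN sets bits 1
	   and 2 of k0.  */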
	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4

	VPTESTN	%VMM2, %VMM2, %k0
	VPTESTN	%VMM4, %VMM4, %k1

	subq	$-(VEC_SIZE * 4), %rax
	KORTEST	%k0, %k1
	jz	L(loop)

	VPTESTN	%VMM1, %VMM1, %k2
	KMOV	%k2, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

	KMOV	%k0, %RCX
	/* At this point, if k0 is non-zero, the null [w]char must be in
	   the second vector.  */
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

	VPTESTN	%VMM3, %VMM3, %k3
	KMOV	%k3, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)
	/* At this point the null [w]char must be in the fourth vector,
	   so there is no need to check.  */
	KMOV	%k1, %RCX

	/* The fourth, third and second vector exits are largely the
	   same; they are laid out this way to avoid extra branches and
	   to reuse the exit code of the pre-loop checks.  */
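	/* On each of these exits rax points at the first of the four
	   vectors just examined and rcx holds the [w]char index of the
	   null within the matching vector, so the result is
	   (rax - rdi) + N * VEC_SIZE + rcx in [w]chars; for wcslen the
	   byte count is divided by CHAR_SIZE before rcx is added.  */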
L(ret_vec_x4):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 3), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x3):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 2), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x2):
	subq	$-VEC_SIZE, %rax
L(ret_vec_x1):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	addq	%rcx, %rax
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(page_cross):
	movl	%eax, %ecx
# ifdef USE_AS_WCSLEN
	andl	$(VEC_SIZE - 1), %ecx
	sarl	$2, %ecx
# endif
	/* ecx contains the number of [w]chars to be skipped as a result
	   of the address alignment.  */
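	/* rax still holds the page offset of rdi; XORing it with rdi
	   yields the start of the page, so the compare below reads the
	   last vector of the page and cannot cross into the next
	   page.  */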
	xorq	%rdi, %rax
	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
	KMOV	%k0, %RAX
	/* Discard the mask bits of the [w]chars that precede the start
	   of the string.  */
	SHR	%cl, %RAX
	jz	L(align_more)

	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

END (STRLEN)
#endif