/* Placeholder function, not used by any processor at the moment.
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UNUSED.  Exists purely as a reference implementation.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ vpcmpeqd
#  define VPTESTN vptestnmd
#  define VPMINU vpminud
#  define CHAR_SIZE 4
# else
#  define VPCMPEQ vpcmpeqb
#  define VPTESTN vptestnmb
#  define VPMINU vpminub
#  define CHAR_SIZE 1
# endif

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
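        /* Note: VEC_SIZE, the VMM/VMM_128 register macros, KMOV, KORTEST,
           VMOVA, VRAX/VRCX/VRDX, SECTION and STRLEN are not defined here;
           they are expected to be provided by the including build file
           (for example an evex256 or evex512 strlen wrapper) together
           with the matching vector-register header.  */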

        .section SECTION(.text),"ax",@progbits
        /* Aligning the entry point to 64 bytes provides better performance
           for strings of one vector length.  */
ENTRY_P2ALIGN (STRLEN, 6)
# ifdef USE_AS_STRNLEN
        /* Check zero length.  */
        test %RSI_LP, %RSI_LP
        jz L(ret_max)
#  ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %esi, %esi
#  endif
# endif

        movl %edi, %eax
        vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
        sall $20, %eax
        cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
        ja L(page_cross)
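        /* Illustrative sketch (C, not part of the build): the shift-and-
           compare above is a branch-friendly way of checking whether the
           first VEC_SIZE-byte load would cross a page, roughly:

               if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
                 goto page_cross;

           Shifting the low 12 address bits into the top of eax lets a
           single unsigned compare perform the same test.  */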

        /* Compare each [w]char with null; the corresponding mask bit is
           set for a match.  */
        VPCMPEQ (%rdi), %VMM(0), %k0
# ifdef USE_AS_STRNLEN
        KMOV %k0, %VRCX
        /* Store the max length in rax.  */
        mov %rsi, %rax
        /* If rcx is 0, rax keeps the max length.  We cannot use VRCX and
           VRAX here for evex256 because the upper 32 bits of ecx and eax
           may be undefined.  */
        bsfq %rcx, %rax
        cmp $CHAR_PER_VEC, %rax
        ja L(align_more)
        cmpq %rax, %rsi
        cmovb %esi, %eax
# else
        KMOV %k0, %VRAX
        test %VRAX, %VRAX
        jz L(align_more)
        bsf %VRAX, %VRAX
# endif
        ret
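        /* Illustrative sketch (C, not part of the build) of the entry path
           above for the USE_AS_STRNLEN case, where mask is the VPCMPEQ
           result and tzcnt() stands in for the bsf-based bit scan:

               len = (mask == 0) ? maxlen : tzcnt (mask);  // bsfq keeps rax
               if (len > CHAR_PER_VEC)
                 goto align_more;
               return len < maxlen ? len : maxlen;

           The plain strlen path jumps to align_more when mask is zero and
           otherwise returns tzcnt (mask).  */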

        /* Return path taken once the maximum length has been reached.  */
# ifdef USE_AS_STRNLEN
        .p2align 4,,3
L(ret_max):
        movq %rsi, %rax
        ret
# endif

L(align_more):
        mov %rdi, %rax
        /* Align rax to VEC_SIZE.  */
        andq $-VEC_SIZE, %rax
# ifdef USE_AS_STRNLEN
        movq %rdi, %rdx
        subq %rax, %rdx
#  ifdef USE_AS_WCSLEN
        shr $2, %VRDX
#  endif
        /* At this point rdx contains the number of [w]chars already
           compared.  */
        leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
        /* At this point rdx contains the number of [w]chars left to
           compare.  From here on rdx is decremented with each compare.  */
# endif
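        /* Illustrative sketch (C, not part of the build): with misalign
           being the number of [w]chars between the VEC_SIZE-aligned base
           in rax and the original pointer, the block above computes

               rdx = maxlen + misalign - CHAR_PER_VEC;

           i.e. the part of the maxlen budget that remains from
           rax + VEC_SIZE onwards, which is where the next compare
           starts.  */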

        /* Check the next four vectors individually before entering the
           4 x VEC_SIZE aligned loop.  */
        VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
        subq $-VEC_SIZE, %rax
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x1)

# ifdef USE_AS_STRNLEN
        subq $CHAR_PER_VEC, %rdx
        jbe L(ret_max)
# endif

        VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x2)

# ifdef USE_AS_STRNLEN
        subq $CHAR_PER_VEC, %rdx
        jbe L(ret_max)
# endif

        VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x3)

# ifdef USE_AS_STRNLEN
        subq $CHAR_PER_VEC, %rdx
        jbe L(ret_max)
# endif

        VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
        KMOV %k0, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x4)

# ifdef USE_AS_STRNLEN
        subq $CHAR_PER_VEC, %rdx
        jbe L(ret_max)
        /* Save pointer before 4 x VEC_SIZE alignment.  */
        movq %rax, %rcx
# endif

        /* Align address to VEC_SIZE * 4 for loop.  */
        andq $-(VEC_SIZE * 4), %rax

# ifdef USE_AS_STRNLEN
        subq %rax, %rcx
#  ifdef USE_AS_WCSLEN
        shr $2, %VRCX
#  endif
        /* rcx contains the number of [w]chars that will be recompared due
           to the alignment fix.  rdx must be incremented by rcx to offset
           the alignment adjustment.  */
        addq %rcx, %rdx
        /* rdx is adjusted here, before the loop, so the first iteration of
           the 4 x VEC_SIZE aligned loop needs no extra add/subtract of
           rdx.  */
# endif
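        /* Illustrative note (not part of the build): after the addq above,
           the invariant on entry to the loop below is

               rdx == [w]chars of the maxlen budget left from rax + VEC_SIZE * 4

           which is exactly where the first load of the loop starts; each
           iteration then subtracts CHAR_PER_VEC * 4.  */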

        .p2align 4,,11
L(loop):
        /* The VPMINU and VPTESTN combination performs better than the
           alternative instruction combinations.  */
        VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
        VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
        VMOVA (VEC_SIZE * 6)(%rax), %VMM(3)
        VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)

        VPTESTN %VMM(2), %VMM(2), %k0
        VPTESTN %VMM(4), %VMM(4), %k1

        subq $-(VEC_SIZE * 4), %rax
        KORTEST %k0, %k1

# ifndef USE_AS_STRNLEN
        jz L(loop)
# else
        jnz L(loopend)
        subq $(CHAR_PER_VEC * 4), %rdx
        ja L(loop)
        mov %rsi, %rax
        ret
# endif
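        /* Illustrative sketch (C-like, not part of the build) of one loop
           iteration above; load(), min() and has_zero() are stand-ins for
           the aligned load, the unsigned element-wise VPMINU and the
           VPTESTN/KORTEST zero test:

               v1 = load (p + 4 * VEC_SIZE);
               v2 = min (v1, load (p + 5 * VEC_SIZE));
               v3 = load (p + 6 * VEC_SIZE);
               v4 = min (v3, load (p + 7 * VEC_SIZE));
               p += 4 * VEC_SIZE;
               if (has_zero (v2) || has_zero (v4))
                 goto loopend;

           Taking the minimum first lets a single zero test cover two
           vectors, halving the number of mask checks per iteration.  */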

L(loopend):

        VPTESTN %VMM(1), %VMM(1), %k2
        KMOV %k2, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x1)
        KMOV %k0, %VRCX
        /* At this point, if k0 is non-zero, the null char must be in the
           second vector.  */
        test %VRCX, %VRCX
        jnz L(ret_vec_x2)

        VPTESTN %VMM(3), %VMM(3), %k3
        KMOV %k3, %VRCX
        test %VRCX, %VRCX
        jnz L(ret_vec_x3)
        /* At this point the null [w]char must be in the fourth vector, so
           there is no need to check.  */
        KMOV %k1, %VRCX

        /* The fourth, third and second vector termination paths are
           essentially the same; they are laid out this way to avoid
           branching and to reuse the code of the pre-loop exit
           condition.  */
L(ret_vec_x4):
        bsf %VRCX, %VRCX
        subq %rdi, %rax
# ifdef USE_AS_WCSLEN
        subq $-(VEC_SIZE * 3), %rax
        shrq $2, %rax
        addq %rcx, %rax
# else
        leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
        cmpq %rsi, %rax
        cmovnb %rsi, %rax
# endif
        ret

L(ret_vec_x3):
        bsf %VRCX, %VRCX
        subq %rdi, %rax
# ifdef USE_AS_WCSLEN
        subq $-(VEC_SIZE * 2), %rax
        shrq $2, %rax
        addq %rcx, %rax
# else
        leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
        cmpq %rsi, %rax
        cmovnb %rsi, %rax
# endif
        ret

L(ret_vec_x2):
        subq $-VEC_SIZE, %rax
L(ret_vec_x1):
        bsf %VRCX, %VRCX
        subq %rdi, %rax
# ifdef USE_AS_WCSLEN
        shrq $2, %rax
# endif
        addq %rcx, %rax
# ifdef USE_AS_STRNLEN
        cmpq %rsi, %rax
        cmovnb %rsi, %rax
# endif
        ret
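        /* Illustrative sketch (C, not part of the build) of the return
           paths above: with base being the value of rax at the branch and
           i the index of the matching vector (0 for x1, 1 for x2, ...),
           the computed length is

               len = (base - start) / CHAR_SIZE + i * CHAR_PER_VEC + tzcnt (mask);
               return maxlen < len ? maxlen : len;  // clamp: strnlen only

           The wcslen variants add the vector offset before the shift so
           that the division by CHAR_SIZE is still done on a byte count.  */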

L(page_cross):
        mov %rdi, %rax
        movl %edi, %ecx
        andl $(VEC_SIZE - 1), %ecx
# ifdef USE_AS_WCSLEN
        sarl $2, %ecx
# endif
        /* ecx contains the number of [w]chars to be skipped as a result
           of the address alignment.  */
        andq $-VEC_SIZE, %rax
        VPCMPEQ (%rax), %VMM(0), %k0
        KMOV %k0, %VRDX
        /* Discard the mask bits for the [w]chars before the start of the
           string (the alignment adjustment).  */
        shr %cl, %VRDX
# ifdef USE_AS_STRNLEN
        jnz L(page_cross_end)
        movl $CHAR_PER_VEC, %eax
        sub %ecx, %eax
        cmp %rax, %rsi
        ja L(align_more)
# else
        jz L(align_more)
# endif

L(page_cross_end):
        bsf %VRDX, %VRAX
# ifdef USE_AS_STRNLEN
        cmpq %rsi, %rax
        cmovnb %esi, %eax
# endif
        ret
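        /* Illustrative sketch (C, not part of the build) of the page-cross
           path above; aligned_load() and null_mask() are stand-ins for the
           VEC_SIZE-aligned load and the VPCMPEQ k-mask, and tzcnt()/min()
           abbreviate the bit scan and the clamp:

               skip = ((uintptr_t) s & (VEC_SIZE - 1)) / CHAR_SIZE;
               mask = null_mask (aligned_load (s)) >> skip;
               if (mask != 0)
                 return min (tzcnt (mask), maxlen);  // clamp: strnlen only
               if (maxlen <= CHAR_PER_VEC - skip)    // strnlen only
                 return maxlen;
               goto align_more;

           The shift discards matches coming from [w]chars that sit before
           the start of the string.  */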

END (STRLEN)
#endif