1 | /* strnlen/wcsnlen optimized with 256-bit EVEX instructions. |
2 | Copyright (C) 2022-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | #include <sysdep.h> |
21 | |
22 | #if ISA_SHOULD_BUILD (4) |
23 | |
24 | # ifndef VEC_SIZE |
25 | # include "x86-evex256-vecs.h" |
26 | # endif |
27 | |
28 | |
29 | # ifndef STRNLEN |
30 | # define STRNLEN __strnlen_evex |
31 | # endif |
32 | |
33 | # ifdef USE_AS_WCSLEN |
34 | # define VPCMPEQ vpcmpeqd |
35 | # define VPCMPNEQ vpcmpneqd |
36 | # define VPTESTN vptestnmd |
37 | # define VPTEST vptestmd |
38 | # define VPMINU vpminud |
39 | # define CHAR_SIZE 4 |
40 | |
41 | # else |
42 | # define VPCMPEQ vpcmpeqb |
43 | # define VPCMPNEQ vpcmpneqb |
44 | # define VPTESTN vptestnmb |
45 | # define VPTEST vptestmb |
46 | # define VPMINU vpminub |
47 | # define CHAR_SIZE 1 |
48 | |
49 | # define REG_WIDTH VEC_SIZE |
50 | # endif |
51 | |
52 | # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
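/* With the 256-bit vectors from x86-evex256-vecs.h this is 32 for
   strnlen and 8 for wcsnlen; the EVEX512 variant is expected to
   include this file with VEC_SIZE predefined to 64 (hence the
   ifndef above), giving 64 and 16 respectively. */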
53 | |
54 | # include "reg-macros.h" |
55 | |
56 | # if CHAR_PER_VEC == 32 |
57 | # define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) |
58 | # else |
59 | # define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) |
60 | # endif |
61 | |
62 | |
63 | |
64 | # if CHAR_PER_VEC == 64 |
65 | # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) |
66 | # else |
67 | # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) |
68 | # endif |
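/* Return offset (in chars) used by the fallthrough path after the
   main loop: the 4th loaded VEC when the last two masks cannot be
   combined (CHAR_PER_VEC == 64), otherwise the 3rd. */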
69 | |
70 | |
71 | # define XZERO VMM_128(0) |
72 | # define VZERO VMM(0) |
73 | # define PAGE_SIZE 4096 |
74 | |
	.section SECTION(.text), "ax", @progbits
76 | ENTRY_P2ALIGN (STRNLEN, 6) |
77 | /* Check zero length. */ |
78 | test %RSI_LP, %RSI_LP |
79 | jz L(zero) |
80 | # ifdef __ILP32__ |
81 | /* Clear the upper 32 bits. */ |
82 | movl %esi, %esi |
83 | # endif |
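
	/* The first check below does an unaligned VEC_SIZE-byte load,
	   which is only safe if it cannot cross into an unmapped page.
	   If the page offset of rdi is above PAGE_SIZE - VEC_SIZE,
	   take the cross-page path, which loads from rdi rounded down
	   to VEC_SIZE instead. */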
84 | |
85 | movl %edi, %eax |
86 | vpxorq %XZERO, %XZERO, %XZERO |
87 | andl $(PAGE_SIZE - 1), %eax |
88 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
89 | ja L(cross_page_boundary) |
90 | |
	/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
	   null CHAR. */
93 | VPCMPEQ (%rdi), %VZERO, %k0 |
94 | |
95 | KMOV %k0, %VRCX |
96 | movq %rsi, %rax |
97 | |
98 | /* If src (rcx) is zero, bsf does not change the result. NB: |
99 | Must use 64-bit bsf here so that upper bits of len are not |
100 | cleared. */ |
101 | bsfq %rcx, %rax |
102 | /* If rax > CHAR_PER_VEC then rcx must have been zero (no null |
103 | CHAR) and rsi must be > CHAR_PER_VEC. */ |
104 | cmpq $CHAR_PER_VEC, %rax |
105 | ja L(more_1x_vec) |
	/* Check if the first null is within the length limit; if not
	   (rsi < rax), return maxlen. */
107 | cmpq %rax, %rsi |
108 | cmovb %esi, %eax |
109 | ret |
110 | |
111 | |
112 | # if CHAR_PER_VEC != 32 |
113 | .p2align 4,, 2 |
114 | L(zero): |
115 | L(max_0): |
116 | movl %esi, %eax |
117 | ret |
118 | # endif |
119 | |
	/* The aligned path for strnlen compares the remaining length
	   against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 *
	   CHAR_PER_VEC before entering the loop. */
123 | .p2align 4,, 10 |
124 | L(more_1x_vec): |
125 | L(cross_page_continue): |
	/* Compute the remaining length (in chars) after aligning rdi
	   down to VEC_SIZE. */
127 | # ifdef USE_AS_WCSLEN |
128 | /* Need to compute directly for wcslen as CHAR_SIZE * rsi can |
129 | overflow. */ |
130 | movq %rdi, %rax |
131 | andq $(VEC_SIZE * -1), %rdi |
132 | subq %rdi, %rax |
133 | sarq $2, %rax |
134 | leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax |
135 | # else |
136 | leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax |
137 | andq $(VEC_SIZE * -1), %rdi |
138 | subq %rdi, %rax |
139 | # endif |
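	/* rax now holds the remaining length measured from
	   VEC_SIZE(%rdi): maxlen plus the chars skipped by aligning
	   down, minus the CHAR_PER_VEC covered by the first check.
	   The adjustments below re-base it to whichever VEC was just
	   tested before jumping to L(last_vec_check). */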
140 | |
141 | |
142 | VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 |
143 | |
144 | cmpq $(CHAR_PER_VEC * 2), %rax |
145 | ja L(more_2x_vec) |
146 | |
147 | L(last_2x_vec_or_less): |
148 | KMOV %k0, %VRDX |
149 | test %VRDX, %VRDX |
150 | jnz L(last_vec_check) |
151 | |
152 | /* Check the end of data. */ |
153 | SUB_SHORT (CHAR_PER_VEC, rax) |
154 | jbe L(max_0) |
155 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 |
156 | KMOV %k0, %VRDX |
157 | test %VRDX, %VRDX |
158 | jz L(max_0) |
	/* For the ZMM build this is the best placement for
	   L(last_vec_check). */
160 | .p2align 4,, 8 |
161 | L(last_vec_check): |
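	/* rdx holds the mask of the VEC just tested and rax the chars
	   remaining from that VEC to the length limit. rsi - rax is
	   the VEC's offset from the string start, so rsi + (bsf(rdx)
	   - rax) is the string length; if the null index is >= rax it
	   lies at or past the limit and cmovae returns maxlen. */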
162 | bsf %VRDX, %VRDX |
163 | sub %eax, %edx |
164 | lea (%rsi, %rdx), %eax |
165 | cmovae %esi, %eax |
166 | ret |
167 | |
168 | # if CHAR_PER_VEC == 32 |
169 | .p2align 4,, 2 |
170 | L(zero): |
171 | L(max_0): |
172 | movl %esi, %eax |
173 | ret |
174 | # endif |
175 | |
176 | .p2align 4,, 8 |
177 | L(last_4x_vec_or_less): |
178 | addl $(CHAR_PER_VEC * -4), %eax |
179 | VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 |
180 | subq $(VEC_SIZE * -4), %rdi |
181 | cmpl $(CHAR_PER_VEC * 2), %eax |
182 | jbe L(last_2x_vec_or_less) |
183 | |
184 | .p2align 4,, 6 |
185 | L(more_2x_vec): |
186 | /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without |
187 | rechecking bounds. */ |
188 | |
189 | KMOV %k0, %VRDX |
190 | |
191 | test %VRDX, %VRDX |
192 | jnz L(first_vec_x1) |
193 | |
194 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 |
195 | KMOV %k0, %VRDX |
196 | test %VRDX, %VRDX |
197 | jnz L(first_vec_x2) |
198 | |
199 | cmpq $(CHAR_PER_VEC * 4), %rax |
200 | ja L(more_4x_vec) |
201 | |
202 | |
203 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 |
204 | KMOV %k0, %VRDX |
205 | addl $(CHAR_PER_VEC * -2), %eax |
206 | test %VRDX, %VRDX |
207 | jnz L(last_vec_check) |
208 | |
209 | subl $(CHAR_PER_VEC), %eax |
210 | jbe L(max_1) |
211 | |
212 | VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 |
213 | KMOV %k0, %VRDX |
214 | |
215 | test %VRDX, %VRDX |
216 | jnz L(last_vec_check) |
217 | L(max_1): |
218 | movl %esi, %eax |
219 | ret |
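
	/* L(first_vec_xN) returns the offset of VEC N plus the index
	   of the null within it: rsi - rax gives the offset of VEC 1,
	   and the VEC_SIZE == 32 variants reach the shared tail after
	   first bumping rsi by CHAR_PER_VEC. */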
220 | |
221 | .p2align 4,, 3 |
222 | L(first_vec_x2): |
223 | # if VEC_SIZE == 64 |
	/* If VEC_SIZE == 64 we can fit the logic for the full return
	   label in the spare bytes before the next cache line. */
226 | bsf %VRDX, %VRDX |
227 | sub %eax, %esi |
228 | leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax |
229 | ret |
230 | .p2align 4,, 6 |
231 | # else |
232 | addl $CHAR_PER_VEC, %esi |
233 | # endif |
234 | L(first_vec_x1): |
235 | bsf %VRDX, %VRDX |
236 | sub %eax, %esi |
237 | leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax |
238 | ret |
239 | |
240 | |
241 | .p2align 4,, 6 |
242 | L(first_vec_x4): |
243 | # if VEC_SIZE == 64 |
	/* If VEC_SIZE == 64 we can fit the logic for the full return
	   label in the spare bytes before the next cache line. */
246 | bsf %VRDX, %VRDX |
247 | sub %eax, %esi |
248 | leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax |
249 | ret |
250 | .p2align 4,, 6 |
251 | # else |
252 | addl $CHAR_PER_VEC, %esi |
253 | # endif |
254 | L(first_vec_x3): |
255 | bsf %VRDX, %VRDX |
256 | sub %eax, %esi |
257 | leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax |
258 | ret |
259 | |
260 | .p2align 4,, 5 |
261 | L(more_4x_vec): |
262 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 |
263 | KMOV %k0, %VRDX |
264 | test %VRDX, %VRDX |
265 | jnz L(first_vec_x3) |
266 | |
267 | VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 |
268 | KMOV %k0, %VRDX |
269 | test %VRDX, %VRDX |
270 | jnz L(first_vec_x4) |
271 | |
	/* Check whether at most 4 * CHAR_PER_VEC chars remain past the
	   4x VEC just checked, before aligning for the loop. */
274 | cmpq $(CHAR_PER_VEC * 8), %rax |
275 | jbe L(last_4x_vec_or_less) |
276 | |
277 | |
	/* Recompute the remaining length (in chars) so that it is
	   measured from (VEC_SIZE * 4)(%rdi) once rdi has been aligned
	   down to 4 * VEC_SIZE below. */
279 | # ifdef USE_AS_WCSLEN |
280 | /* Need to compute directly for wcslen as CHAR_SIZE * rsi can |
281 | overflow. */ |
282 | leaq (VEC_SIZE * -3)(%rdi), %rdx |
283 | # else |
284 | leaq (VEC_SIZE * -3)(%rdi, %rax), %rax |
285 | # endif |
286 | |
287 | subq $(VEC_SIZE * -1), %rdi |
288 | |
289 | /* Align data to VEC_SIZE * 4. */ |
290 | # if VEC_SIZE == 64 |
	/* Saves code size. No evex512 processor has partial register
	   stalls. If that changes, this can be replaced with `andq
	   $-(VEC_SIZE * 4), %rdi`. */
294 | xorb %dil, %dil |
295 | # else |
296 | andq $-(VEC_SIZE * 4), %rdi |
297 | # endif |
298 | |
299 | # ifdef USE_AS_WCSLEN |
300 | subq %rdi, %rdx |
301 | sarq $2, %rdx |
302 | addq %rdx, %rax |
303 | # else |
304 | subq %rdi, %rax |
305 | # endif |
306 | /* Compare 4 * VEC at a time forward. */ |
307 | .p2align 4,, 11 |
308 | L(loop_4x_vec): |
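	/* VPMINU folds a pair of VECs: min(a, b) has a zero CHAR at a
	   given position iff a or b does, so a single VPTESTN per pair
	   detects a null anywhere in 2x VEC. */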
309 | VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) |
310 | VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) |
311 | VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) |
312 | VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) |
313 | VPTESTN %VMM(2), %VMM(2), %k0 |
314 | VPTESTN %VMM(4), %VMM(4), %k2 |
315 | subq $-(VEC_SIZE * 4), %rdi |
316 | /* Break if at end of length. */ |
317 | subq $(CHAR_PER_VEC * 4), %rax |
318 | jbe L(loop_len_end) |
319 | |
320 | |
321 | KORTEST %k0, %k2 |
322 | jz L(loop_4x_vec) |
323 | |
324 | |
325 | L(loop_last_4x_vec): |
326 | movq %rsi, %rcx |
327 | subq %rax, %rsi |
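	/* rcx = maxlen. rsi = offset (in chars) of the position just
	   past the 4x VEC loaded this iteration. Each return label
	   below subtracts the right multiple of CHAR_PER_VEC to get
	   the offset of the VEC holding the null, then clamps the
	   result to maxlen with cmovb. */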
328 | VPTESTN %VMM(1), %VMM(1), %k1 |
329 | KMOV %k1, %VRDX |
330 | test %VRDX, %VRDX |
331 | jnz L(last_vec_x0) |
332 | |
333 | KMOV %k0, %VRDX |
334 | test %VRDX, %VRDX |
335 | jnz L(last_vec_x1) |
336 | |
337 | VPTESTN %VMM(3), %VMM(3), %k0 |
338 | |
339 | /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for |
340 | returning last 2x VEC. For VEC_SIZE == 64 we test each VEC |
341 | individually, for VEC_SIZE == 32 we combine them in a single |
342 | 64-bit GPR. */ |
343 | # if CHAR_PER_VEC == 64 |
344 | KMOV %k0, %VRDX |
345 | test %VRDX, %VRDX |
346 | jnz L(last_vec_x2) |
347 | KMOV %k2, %VRDX |
348 | # else |
349 | /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. |
350 | */ |
351 | kmovd %k2, %edx |
352 | kmovd %k0, %eax |
353 | salq $CHAR_PER_VEC, %rdx |
354 | orq %rax, %rdx |
355 | # endif |
356 | |
357 | /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. |
358 | */ |
359 | bsfq %rdx, %rdx |
360 | leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax |
361 | cmpq %rax, %rcx |
362 | cmovb %rcx, %rax |
363 | ret |
364 | |
365 | /* Handle last 4x VEC after loop. All VECs have been loaded. */ |
366 | .p2align 4,, 4 |
367 | L(loop_len_end): |
368 | KORTEST %k0, %k2 |
369 | jnz L(loop_last_4x_vec) |
370 | movq %rsi, %rax |
371 | ret |
372 | |
373 | |
374 | # if CHAR_PER_VEC == 64 |
	/* Since we can't combine the last 2x VEC masks for VEC_SIZE ==
	   64, a separate return label is needed for it. */
377 | .p2align 4,, 8 |
378 | L(last_vec_x2): |
379 | bsf %VRDX, %VRDX |
380 | leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax |
381 | cmpq %rax, %rcx |
382 | cmovb %rcx, %rax |
383 | ret |
384 | # endif |
385 | |
386 | |
387 | .p2align 4,, 10 |
388 | L(last_vec_x1): |
389 | addq $CHAR_PER_VEC, %rsi |
390 | L(last_vec_x0): |
391 | bsf %VRDX, %VRDX |
392 | leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax |
393 | cmpq %rax, %rcx |
394 | cmovb %rcx, %rax |
395 | ret |
396 | |
397 | |
398 | .p2align 4,, 8 |
399 | L(cross_page_boundary): |
400 | /* Align data to VEC_SIZE. */ |
401 | movq %rdi, %rcx |
402 | andq $-VEC_SIZE, %rcx |
403 | VPCMPEQ (%rcx), %VZERO, %k0 |
404 | |
405 | KMOV %k0, %VRCX |
406 | # ifdef USE_AS_WCSLEN |
407 | shrl $2, %eax |
408 | andl $(CHAR_PER_VEC - 1), %eax |
409 | # endif |
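	/* Shift out the mask bits for chars that precede the start of
	   the string so bit 0 corresponds to the first char. For the
	   byte variant shrx itself masks the shift count to the
	   operand width, which equals CHAR_PER_VEC. */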
410 | shrx %VRAX, %VRCX, %VRCX |
411 | |
412 | negl %eax |
413 | andl $(CHAR_PER_VEC - 1), %eax |
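	/* rax = number of chars from the string start to the end of
	   this aligned VEC (CHAR_PER_VEC minus the misalignment in
	   chars). */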
414 | movq %rsi, %rdx |
415 | bsf %VRCX, %VRDX |
416 | cmpq %rax, %rdx |
417 | ja L(cross_page_continue) |
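	/* The first null (or maxlen, if no null was found in this VEC)
	   falls within the chars this VEC covers: return min(null
	   index, maxlen). */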
418 | movl %edx, %eax |
419 | cmpq %rdx, %rsi |
420 | cmovb %esi, %eax |
421 | ret |
422 | END (STRNLEN) |
423 | #endif |
424 | |