/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
# define STRLEN __strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
# endif
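
/* For wcslen/wcsnlen the string elements are 4-byte wchar_t values, so
   the equality (VPCMPEQ) and minimum (VPMINU) operations work on dwords
   instead of bytes.  The masks from vpmovmskb still carry one bit per
   byte, i.e. four bits per wide character, which is why byte counts are
   converted back with "shrq $2" before returning. */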

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# define VEC_SIZE 32

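/* Overview: the string is scanned VEC_SIZE bytes at a time by comparing
   against a zero vector and extracting a bit mask of the matching bytes
   with vpmovmskb; tzcnt on that mask gives the offset of the terminating
   NUL within the vector.  After one unaligned check, the main loop reads
   four aligned vectors per iteration and merges them with an unsigned
   minimum so that a single compare covers 4 * VEC_SIZE bytes.

   A rough, illustrative C sketch of the aligned scan (it assumes S is
   32-byte aligned so the aligned loads cannot fault, and it omits the
   strnlen length limit and the wcslen element width handled below;
   strlen_aligned_sketch is only a name for this sketch):

	#include <immintrin.h>
	#include <stddef.h>

	static size_t
	strlen_aligned_sketch (const char *s)
	{
	  const __m256i zero = _mm256_setzero_si256 ();
	  for (size_t i = 0; ; i += 32)
	    {
	      __m256i v = _mm256_load_si256 ((const __m256i *) (s + i));
	      // VPCMPEQB + VPMOVMSKB: one mask bit per zero byte.
	      unsigned int mask
		= _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
	      if (mask != 0)
		return i + __builtin_ctz (mask);   // tzcnt
	    }
	}
*/
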
	.section .text.avx,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check for zero length. */
	test %RSI_LP, %RSI_LP
	jz L(zero)
# ifdef USE_AS_WCSLEN
	shl $2, %RSI_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits. */
	movl %esi, %esi
# endif
	mov %RSI_LP, %R8_LP
# endif
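	/* %rdx keeps the original string pointer so the length can be
	   computed at the end, %ecx gets the low address bits for the
	   page-cross check, and %ymm0 is the zero vector the string is
	   compared against.  For strnlen, %r8 already holds the length
	   limit in bytes. */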
	movl %edi, %ecx
	movq %rdi, %rdx
	vpxor %xmm0, %xmm0, %xmm0

	/* Check whether one vector load from the unaligned start may
	   cross a page boundary. */
	andl $(2 * VEC_SIZE - 1), %ecx
	cmpl $VEC_SIZE, %ecx
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes. */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

# ifdef USE_AS_STRNLEN
	jnz L(first_vec_x0_check)
	/* Adjust length and check the end of data. */
	subq $VEC_SIZE, %rsi
	jbe L(max)
# else
	jnz L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop. */
	addq $VEC_SIZE, %rdi
	andl $(VEC_SIZE - 1), %ecx
	andq $-VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length. */
	addq %rcx, %rsi

	subq $(VEC_SIZE * 4), %rsi
	jbe L(last_4x_vec_or_less)
# endif
	jmp L(more_4x_vec)

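	/* The first, unaligned load might cross a page boundary.  Load
	   the enclosing VEC_SIZE-aligned vector instead and shift the
	   match mask right by the distance to the original start so
	   that matches before the start are ignored. */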
	.p2align 4
L(cross_page_boundary):
	andl $(VEC_SIZE - 1), %ecx
	andq $-VEC_SIZE, %rdi
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Shift off the mask bits for the bytes before the original
	   start of the string. */
	sarl %cl, %eax
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
# ifdef USE_AS_STRNLEN
	/* Check the end of data. */
	cmpq %rax, %rsi
	jbe L(max)
# endif
	addq %rdi, %rax
	addq %rcx, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
# ifdef USE_AS_STRNLEN
	/* "rcx" is less than VEC_SIZE. Calculate "rsi + rcx - VEC_SIZE"
	   as "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE"
	   to avoid a possible addition overflow. */
	negq %rcx
	addq $VEC_SIZE, %rcx

	/* Check the end of data. */
	subq %rcx, %rsi
	jbe L(max)
# endif

	addq $VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	subq $(VEC_SIZE * 4), %rsi
	jbe L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE bytes, one VEC_SIZE at a time,
	   since data is only aligned to VEC_SIZE. */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	addq $(VEC_SIZE * 4), %rdi

# ifdef USE_AS_STRNLEN
	subq $(VEC_SIZE * 4), %rsi
	jbe L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE. */
	movq %rdi, %rcx
	andl $(4 * VEC_SIZE - 1), %ecx
	andq $-(4 * VEC_SIZE), %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length. */
	addq %rcx, %rsi
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward. */
	vmovdqa (%rdi), %ymm1
	vmovdqa VEC_SIZE(%rdi), %ymm2
	vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
	VPMINU %ymm1, %ymm2, %ymm5
	VPMINU %ymm3, %ymm4, %ymm6
	VPMINU %ymm5, %ymm6, %ymm5

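	/* The unsigned minimum of the four vectors is zero in a byte
	   (or dword element for wcslen) iff at least one of them is
	   zero there, so a single compare against %ymm0 detects a NUL
	   anywhere in the 4 * VEC_SIZE bytes just loaded. */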
	VPCMPEQ %ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %eax
	testl %eax, %eax
	jnz L(4x_vec_end)

	addq $(VEC_SIZE * 4), %rdi

# ifndef USE_AS_STRNLEN
	jmp L(loop_4x_vec)
# else
	subq $(VEC_SIZE * 4), %rsi
	ja L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Fewer than 4 * VEC_SIZE bytes of the limit remain and data is
	   aligned to VEC_SIZE; %rsi holds the remaining limit minus
	   4 * VEC_SIZE. */
	addl $(VEC_SIZE * 2), %esi
	jle L(last_2x_vec)

	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

	jnz L(first_vec_x2_check)
	subl $VEC_SIZE, %esi
	jle L(max)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

	jnz L(first_vec_x3_check)
	movq %r8, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	addl $(VEC_SIZE * 2), %esi
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax

	jnz L(first_vec_x0_check)
	subl $VEC_SIZE, %esi
	jle L(max)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1_check)
	movq %r8, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

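	/* The L(first_vec_xN_check) paths are like L(first_vec_xN) but
	   also honour the strnlen limit: %rsi holds the number of bytes
	   of the limit remaining from the start of the vector that
	   matched, so if the match offset is not below it the result is
	   capped at the saved limit via L(max). */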
	.p2align 4
L(first_vec_x0_check):
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpq %rax, %rsi
	jbe L(max)
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpq %rax, %rsi
	jbe L(max)
	addq $VEC_SIZE, %rax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpq %rax, %rsi
	jbe L(max)
	addq $(VEC_SIZE * 2), %rax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpq %rax, %rsi
	jbe L(max)
	addq $(VEC_SIZE * 3), %rax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(max):
	movq %r8, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	xorl %eax, %eax
	ret
# endif

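	/* A NUL was found in the vector at %rdi (x0) or N vectors after
	   it (xN).  The length is %rdi plus the vector offset plus the
	   match offset, minus the original pointer saved in %rdx; for
	   wcslen the byte count is converted to wide characters. */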
	.p2align 4
L(first_vec_x0):
	tzcntl %eax, %eax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %eax
	addq $VEC_SIZE, %rax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 2), %rax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

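	/* The merged minimum showed a NUL somewhere in the last four
	   vectors.  Re-check %ymm1..%ymm4 in order to find the first
	   vector containing it, then compute the length in the
	   corresponding L(first_vec_xN) path (falling through to
	   L(first_vec_x3) for the last one). */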
	.p2align 4
L(4x_vec_end):
	VPCMPEQ %ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x0)
	VPCMPEQ %ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)
	VPCMPEQ %ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)
	VPCMPEQ %ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
L(first_vec_x3):
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 3), %rax
	addq %rdi, %rax
	subq %rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq $2, %rax
# endif
	VZEROUPPER
	ret

END (STRLEN)
#endif