1 | /* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. |
2 | Copyright (C) 2017 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #if IS_IN (libc) |
20 | |
21 | # include <sysdep.h> |
22 | |
23 | # ifndef STRLEN |
24 | # define STRLEN __strlen_avx2 |
25 | # endif |
26 | |
27 | # ifdef USE_AS_WCSLEN |
28 | # define VPCMPEQ vpcmpeqd |
29 | # define VPMINU vpminud |
30 | # else |
31 | # define VPCMPEQ vpcmpeqb |
32 | # define VPMINU vpminub |
33 | # endif |
34 | |
35 | # ifndef VZEROUPPER |
36 | # define VZEROUPPER vzeroupper |
37 | # endif |
38 | |
39 | # define VEC_SIZE 32 |
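
/* Algorithm overview: %ymm0 stays all-zero and is compared (VPCMPEQ)
   against the data; vpmovmskb turns each compare result into a bit
   mask and tzcnt on the mask gives the byte offset of the first null
   character. The first vector load handles an unaligned, possibly
   page-crossing start; after that the main loop scans 4 * VEC_SIZE
   bytes per iteration from a 4 * VEC_SIZE aligned address, using
   VPMINU so that one compare covers all four vectors. For strnlen
   and wcsnlen the remaining length is tracked in %rsi and the
   caller's bound (in bytes) is saved in %r8. */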
40 | |
	.section .text.avx,"ax",@progbits
42 | ENTRY (STRLEN) |
43 | # ifdef USE_AS_STRNLEN |
44 | /* Check for zero length. */ |
45 | testq %rsi, %rsi |
46 | jz L(zero) |
47 | # ifdef USE_AS_WCSLEN |
48 | shl $2, %rsi |
49 | # endif |
50 | movq %rsi, %r8 |
51 | # endif |
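	/* %ecx = low pointer bits for the page-cross check, %rdx = the
	   original pointer (subtracted at the end to form the length),
	   %ymm0 = zero, the value every compare below looks for. */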
52 | movl %edi, %ecx |
53 | movq %rdi, %rdx |
54 | vpxor %xmm0, %xmm0, %xmm0 |
55 | |
	/* Check if we may cross a page boundary with one vector load.
	   If the offset within an aligned 2 * VEC_SIZE block is at most
	   VEC_SIZE, the VEC_SIZE load cannot leave that block, so it
	   cannot cross a page either. */
57 | andl $(2 * VEC_SIZE - 1), %ecx |
58 | cmpl $VEC_SIZE, %ecx |
	ja	L(cross_page_boundary)
60 | |
61 | /* Check the first VEC_SIZE bytes. */ |
62 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
63 | vpmovmskb %ymm1, %eax |
64 | testl %eax, %eax |
65 | |
66 | # ifdef USE_AS_STRNLEN |
67 | jnz L(first_vec_x0_check) |
68 | /* Adjust length and check the end of data. */ |
69 | subq $VEC_SIZE, %rsi |
70 | jbe L(max) |
71 | # else |
72 | jnz L(first_vec_x0) |
73 | # endif |
74 | |
75 | /* Align data for aligned loads in the loop. */ |
76 | addq $VEC_SIZE, %rdi |
77 | andl $(VEC_SIZE - 1), %ecx |
78 | andq $-VEC_SIZE, %rdi |
79 | |
80 | # ifdef USE_AS_STRNLEN |
81 | /* Adjust length. */ |
82 | addq %rcx, %rsi |
83 | |
84 | subq $(VEC_SIZE * 4), %rsi |
85 | jbe L(last_4x_vec_or_less) |
86 | # endif |
87 | jmp L(more_4x_vec) |
88 | |
89 | .p2align 4 |
L(cross_page_boundary):
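	/* The first load would cross a VEC_SIZE boundary (and possibly a
	   page). Align %rdi down to VEC_SIZE so the load stays inside one
	   page, then shift out the mask bits for bytes that precede the
	   original pointer. */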
91 | andl $(VEC_SIZE - 1), %ecx |
92 | andq $-VEC_SIZE, %rdi |
93 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
94 | vpmovmskb %ymm1, %eax |
95 | /* Remove the leading bytes. */ |
96 | sarl %cl, %eax |
97 | testl %eax, %eax |
98 | jz L(aligned_more) |
99 | tzcntl %eax, %eax |
100 | # ifdef USE_AS_STRNLEN |
101 | /* Check the end of data. */ |
102 | cmpq %rax, %rsi |
103 | jbe L(max) |
104 | # endif |
105 | addq %rdi, %rax |
106 | addq %rcx, %rax |
107 | subq %rdx, %rax |
108 | # ifdef USE_AS_WCSLEN |
109 | shrq $2, %rax |
110 | # endif |
111 | VZEROUPPER |
112 | ret |
113 | |
114 | .p2align 4 |
115 | L(aligned_more): |
116 | # ifdef USE_AS_STRNLEN |
117 | /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" |
118 | with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" |
119 | to void possible addition overflow. */ |
120 | negq %rcx |
121 | addq $VEC_SIZE, %rcx |
122 | |
123 | /* Check the end of data. */ |
124 | subq %rcx, %rsi |
125 | jbe L(max) |
126 | # endif |
127 | |
128 | addq $VEC_SIZE, %rdi |
129 | |
130 | # ifdef USE_AS_STRNLEN |
131 | subq $(VEC_SIZE * 4), %rsi |
132 | jbe L(last_4x_vec_or_less) |
133 | # endif |
134 | |
135 | L(more_4x_vec): |
	/* Check the first 4 * VEC_SIZE bytes. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE. */
138 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
139 | vpmovmskb %ymm1, %eax |
140 | testl %eax, %eax |
141 | jnz L(first_vec_x0) |
142 | |
143 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
144 | vpmovmskb %ymm1, %eax |
145 | testl %eax, %eax |
146 | jnz L(first_vec_x1) |
147 | |
148 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
149 | vpmovmskb %ymm1, %eax |
150 | testl %eax, %eax |
151 | jnz L(first_vec_x2) |
152 | |
153 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
154 | vpmovmskb %ymm1, %eax |
155 | testl %eax, %eax |
156 | jnz L(first_vec_x3) |
157 | |
158 | addq $(VEC_SIZE * 4), %rdi |
159 | |
160 | # ifdef USE_AS_STRNLEN |
161 | subq $(VEC_SIZE * 4), %rsi |
162 | jbe L(last_4x_vec_or_less) |
163 | # endif |
164 | |
165 | /* Align data to 4 * VEC_SIZE. */ |
166 | movq %rdi, %rcx |
167 | andl $(4 * VEC_SIZE - 1), %ecx |
168 | andq $-(4 * VEC_SIZE), %rdi |
169 | |
170 | # ifdef USE_AS_STRNLEN |
171 | /* Adjust length. */ |
172 | addq %rcx, %rsi |
173 | # endif |
174 | |
175 | .p2align 4 |
176 | L(loop_4x_vec): |
177 | /* Compare 4 * VEC at a time forward. */ |
178 | vmovdqa (%rdi), %ymm1 |
179 | vmovdqa VEC_SIZE(%rdi), %ymm2 |
180 | vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 |
181 | vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 |
182 | VPMINU %ymm1, %ymm2, %ymm5 |
183 | VPMINU %ymm3, %ymm4, %ymm6 |
184 | VPMINU %ymm5, %ymm6, %ymm5 |
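	/* %ymm5 is the element-wise unsigned minimum of all four vectors,
	   so it has a zero element iff at least one of them does. */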
185 | |
186 | VPCMPEQ %ymm5, %ymm0, %ymm5 |
187 | vpmovmskb %ymm5, %eax |
188 | testl %eax, %eax |
189 | jnz L(4x_vec_end) |
190 | |
191 | addq $(VEC_SIZE * 4), %rdi |
192 | |
193 | # ifndef USE_AS_STRNLEN |
194 | jmp L(loop_4x_vec) |
195 | # else |
196 | subq $(VEC_SIZE * 4), %rsi |
197 | ja L(loop_4x_vec) |
198 | |
199 | L(last_4x_vec_or_less): |
200 | /* Less than 4 * VEC and aligned to VEC_SIZE. */ |
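	/* %rsi is the remaining length minus 4 * VEC_SIZE (zero or
	   negative here). After adding 2 * VEC_SIZE it is positive iff
	   more than 2 * VEC_SIZE bytes remain. */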
201 | addl $(VEC_SIZE * 2), %esi |
202 | jle L(last_2x_vec) |
203 | |
204 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
205 | vpmovmskb %ymm1, %eax |
206 | testl %eax, %eax |
207 | jnz L(first_vec_x0) |
208 | |
209 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
210 | vpmovmskb %ymm1, %eax |
211 | testl %eax, %eax |
212 | jnz L(first_vec_x1) |
213 | |
214 | VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 |
215 | vpmovmskb %ymm1, %eax |
216 | testl %eax, %eax |
217 | |
218 | jnz L(first_vec_x2_check) |
219 | subl $VEC_SIZE, %esi |
220 | jle L(max) |
221 | |
222 | VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 |
223 | vpmovmskb %ymm1, %eax |
224 | testl %eax, %eax |
225 | |
226 | jnz L(first_vec_x3_check) |
227 | movq %r8, %rax |
228 | # ifdef USE_AS_WCSLEN |
229 | shrq $2, %rax |
230 | # endif |
231 | VZEROUPPER |
232 | ret |
233 | |
234 | .p2align 4 |
235 | L(last_2x_vec): |
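	/* At most 2 * VEC_SIZE bytes remain. %esi is the remaining length
	   minus 2 * VEC_SIZE, so restore it first. */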
236 | addl $(VEC_SIZE * 2), %esi |
237 | VPCMPEQ (%rdi), %ymm0, %ymm1 |
238 | vpmovmskb %ymm1, %eax |
239 | testl %eax, %eax |
240 | |
241 | jnz L(first_vec_x0_check) |
242 | subl $VEC_SIZE, %esi |
243 | jle L(max) |
244 | |
245 | VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 |
246 | vpmovmskb %ymm1, %eax |
247 | testl %eax, %eax |
248 | jnz L(first_vec_x1_check) |
249 | movq %r8, %rax |
250 | # ifdef USE_AS_WCSLEN |
251 | shrq $2, %rax |
252 | # endif |
253 | VZEROUPPER |
254 | ret |
255 | |
256 | .p2align 4 |
257 | L(first_vec_x0_check): |
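	/* %eax is the compare mask for the vector at %rdi and %rsi the
	   number of bytes left; return the bound if the first null lies
	   at or beyond it. */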
258 | tzcntl %eax, %eax |
259 | /* Check the end of data. */ |
260 | cmpq %rax, %rsi |
261 | jbe L(max) |
262 | addq %rdi, %rax |
263 | subq %rdx, %rax |
264 | # ifdef USE_AS_WCSLEN |
265 | shrq $2, %rax |
266 | # endif |
267 | VZEROUPPER |
268 | ret |
269 | |
270 | .p2align 4 |
271 | L(first_vec_x1_check): |
272 | tzcntl %eax, %eax |
273 | /* Check the end of data. */ |
274 | cmpq %rax, %rsi |
275 | jbe L(max) |
276 | addq $VEC_SIZE, %rax |
277 | addq %rdi, %rax |
278 | subq %rdx, %rax |
279 | # ifdef USE_AS_WCSLEN |
280 | shrq $2, %rax |
281 | # endif |
282 | VZEROUPPER |
283 | ret |
284 | |
285 | .p2align 4 |
286 | L(first_vec_x2_check): |
287 | tzcntl %eax, %eax |
288 | /* Check the end of data. */ |
289 | cmpq %rax, %rsi |
290 | jbe L(max) |
291 | addq $(VEC_SIZE * 2), %rax |
292 | addq %rdi, %rax |
293 | subq %rdx, %rax |
294 | # ifdef USE_AS_WCSLEN |
295 | shrq $2, %rax |
296 | # endif |
297 | VZEROUPPER |
298 | ret |
299 | |
300 | .p2align 4 |
301 | L(first_vec_x3_check): |
302 | tzcntl %eax, %eax |
303 | /* Check the end of data. */ |
304 | cmpq %rax, %rsi |
305 | jbe L(max) |
306 | addq $(VEC_SIZE * 3), %rax |
307 | addq %rdi, %rax |
308 | subq %rdx, %rax |
309 | # ifdef USE_AS_WCSLEN |
310 | shrq $2, %rax |
311 | # endif |
312 | VZEROUPPER |
313 | ret |
314 | |
315 | .p2align 4 |
316 | L(max): |
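	/* No null found within the length limit: return the caller's
	   maximum length, saved in %r8 (in bytes). */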
317 | movq %r8, %rax |
318 | # ifdef USE_AS_WCSLEN |
319 | shrq $2, %rax |
320 | # endif |
321 | VZEROUPPER |
322 | ret |
323 | |
324 | .p2align 4 |
325 | L(zero): |
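	/* strnlen/wcsnlen was called with a zero maximum length. */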
326 | xorl %eax, %eax |
327 | ret |
328 | # endif |
329 | |
330 | .p2align 4 |
331 | L(first_vec_x0): |
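	/* tzcnt gives the byte offset of the first null in the vector at
	   %rdi; add %rdi and subtract the original pointer in %rdx to get
	   the length. */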
332 | tzcntl %eax, %eax |
333 | addq %rdi, %rax |
334 | subq %rdx, %rax |
335 | # ifdef USE_AS_WCSLEN |
336 | shrq $2, %rax |
337 | # endif |
338 | VZEROUPPER |
339 | ret |
340 | |
341 | .p2align 4 |
342 | L(first_vec_x1): |
343 | tzcntl %eax, %eax |
344 | addq $VEC_SIZE, %rax |
345 | addq %rdi, %rax |
346 | subq %rdx, %rax |
347 | # ifdef USE_AS_WCSLEN |
348 | shrq $2, %rax |
349 | # endif |
350 | VZEROUPPER |
351 | ret |
352 | |
353 | .p2align 4 |
354 | L(first_vec_x2): |
355 | tzcntl %eax, %eax |
356 | addq $(VEC_SIZE * 2), %rax |
357 | addq %rdi, %rax |
358 | subq %rdx, %rax |
359 | # ifdef USE_AS_WCSLEN |
360 | shrq $2, %rax |
361 | # endif |
362 | VZEROUPPER |
363 | ret |
364 | |
365 | .p2align 4 |
366 | L(4x_vec_end): |
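	/* At least one of the four vectors loaded in the loop contains a
	   null. Recheck each one in order to locate the first. */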
367 | VPCMPEQ %ymm1, %ymm0, %ymm1 |
368 | vpmovmskb %ymm1, %eax |
369 | testl %eax, %eax |
370 | jnz L(first_vec_x0) |
371 | VPCMPEQ %ymm2, %ymm0, %ymm2 |
372 | vpmovmskb %ymm2, %eax |
373 | testl %eax, %eax |
374 | jnz L(first_vec_x1) |
375 | VPCMPEQ %ymm3, %ymm0, %ymm3 |
376 | vpmovmskb %ymm3, %eax |
377 | testl %eax, %eax |
378 | jnz L(first_vec_x2) |
379 | VPCMPEQ %ymm4, %ymm0, %ymm4 |
380 | vpmovmskb %ymm4, %eax |
381 | testl %eax, %eax |
382 | L(first_vec_x3): |
383 | tzcntl %eax, %eax |
384 | addq $(VEC_SIZE * 3), %rax |
385 | addq %rdi, %rax |
386 | subq %rdx, %rax |
387 | # ifdef USE_AS_WCSLEN |
388 | shrq $2, %rax |
389 | # endif |
390 | VZEROUPPER |
391 | ret |
392 | |
393 | END (STRLEN) |
394 | #endif |
395 | |