1 | /* strncpy with AVX2 |
2 | Copyright (C) 2022-2023 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (3) |
22 | |
23 | # include <sysdep.h> |
24 | |
25 | |
26 | # ifndef VEC_SIZE |
27 | # include "x86-avx-vecs.h" |
28 | # endif |
29 | |
30 | # ifndef STRNCPY |
31 | # define STRNCPY __strncpy_avx2 |
32 | # endif |
33 | |
34 | |
35 | # ifdef USE_AS_WCSCPY |
36 | # define VPCMPEQ vpcmpeqd |
37 | # define VPMIN vpminud |
38 | # define CHAR_SIZE 4 |
39 | # else |
40 | # define VPCMPEQ vpcmpeqb |
41 | # define VPMIN vpminub |
42 | # define CHAR_SIZE 1 |
43 | # endif |
44 | |
45 | # include "strncpy-or-cat-overflow-def.h" |
46 | |
47 | # define PAGE_SIZE 4096 |
48 | |
49 | # define VZERO VMM(7) |
50 | # define VZERO_128 VMM_128(7) |
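/* VMM(7) stays zero for the whole function: it is the comparand for
   null-terminator detection and the source for zero-fill stores.  */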
51 | |
52 | |
	.section SECTION(.text), "ax", @progbits
54 | ENTRY(STRNCPY) |
55 | # ifdef __ILP32__ |
56 | /* Clear the upper 32 bits. */ |
57 | movl %edx, %edx |
58 | # endif |
	/* Filter out zero-length and absurdly long strings.  Zero
	   length strings just return; very long strings are handled
	   by running rep stos{b|l} to zero-fill the destination
	   (which will almost certainly segfault).  If that somehow
	   succeeds, OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy)
	   finishes the copy.  */
64 | # ifdef USE_AS_WCSCPY |
65 | decq %rdx |
66 | movq %rdx, %rax |
	/* Bits 56 and above are outside the maximum supported address
	   space, so such a length cannot describe a real buffer.  */
68 | shr $56, %rax |
69 | jnz L(zero_len) |
70 | salq $2, %rdx |
71 | # else |
72 | decq %rdx |
	/* `dec` can macro-fuse with `jl`.  If the branch needs to
	   become `jb`, replace `dec` with `sub` (`dec` does not set
	   CF).  */
75 | jl L(zero_len) |
76 | # endif |
77 | |
78 | vpxor %VZERO_128, %VZERO_128, %VZERO_128 |
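	/* Take the careful path below if an unaligned VEC_SIZE load
	   from (%rsi) could cross into the next page.  */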
79 | movl %esi, %eax |
80 | andl $(PAGE_SIZE - 1), %eax |
81 | cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
82 | ja L(page_cross) |
83 | |
84 | L(page_cross_continue): |
85 | VMOVU (%rsi), %VMM(0) |
86 | VPCMPEQ %VMM(0), %VZERO, %VMM(6) |
87 | vpmovmskb %VMM(6), %ecx |
88 | |
	/* If not STPCPY the return value is simply the destination,
	   so set it ahead of time.  */
90 | # ifndef USE_AS_STPCPY |
91 | movq %rdi, %rax |
92 | # elif defined USE_AS_WCSCPY |
	/* Clear %eax to break the dependency, as nearly all of the
	   wcpncpy return paths use `setc %al`.  */
95 | xorl %eax, %eax |
96 | # endif |
97 | |
98 | cmpq $(VEC_SIZE - CHAR_SIZE), %rdx |
	/* `jbe` because rdx now holds length - CHAR_SIZE; the equal
	   case (length == VEC_SIZE) is also handled by
	   L(less_1x_vec), which reuses ZF from this compare.  */
100 | jbe L(less_1x_vec) |
101 | |
	/* This may store past the null terminator, but that's fine
	   because we still need to zero-fill.  */
104 | VMOVU %VMM(0), (%rdi) |
105 | |
106 | testl %ecx, %ecx |
107 | jnz L(zfill) |
108 | |
	/* Align rsi up to the next VEC_SIZE boundary.  rdx becomes a
	   pointer CHAR_SIZE before the end of the length-limited
	   source and rdi becomes dst - src, so adding the aligned
	   rsi back recovers the matching destination.  */
110 | addq %rsi, %rdx |
111 | subq %rsi, %rdi |
112 | orq $(VEC_SIZE - 1), %rsi |
113 | incq %rsi |
114 | L(last_4x_vec): |
115 | addq %rsi, %rdi |
116 | L(loop_last_4x_vec): |
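	/* On entry rsi is VEC_SIZE aligned, rdi is the destination
	   matching rsi, and rdx points CHAR_SIZE before the end of
	   the length-limited source; the subtraction below turns rdx
	   into the remaining byte count (minus CHAR_SIZE).  */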
117 | subq %rsi, %rdx |
118 | |
119 | |
120 | VMOVA 0(%rsi), %VMM(1) |
121 | VPCMPEQ %VMM(1), %VZERO, %VMM(6) |
122 | vpmovmskb %VMM(6), %ecx |
123 | |
124 | cmpq $(VEC_SIZE * 2), %rdx |
125 | jae L(more_2x_vec) |
126 | |
127 | cmpl $(VEC_SIZE), %edx |
128 | jb L(ret_vec_x1_len) |
129 | |
130 | testl %ecx, %ecx |
131 | jnz L(ret_vec_x1) |
132 | |
133 | VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6) |
134 | VMOVU %VMM(1), (%rdi) |
135 | vpmovmskb %VMM(6), %ecx |
136 | shlq $VEC_SIZE, %rcx |
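	/* The shift places the 2nd VEC's mask in bits 32..63, so the
	   `tzcnt` below gives the null offset relative to (%rsi)
	   (the 1st VEC is known to be null-free on this path).  */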
137 | L(ret_vec_x1_len): |
138 | tzcntq %rcx, %rcx |
139 | cmpl %ecx, %edx |
140 | jbe L(ret_vec_x1_len_no_zfill) |
	/* The expected fall-through case is that the string is
	   shorter than the buffer, so the tail must be zero-filled.  */
142 | VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
143 | L(ret_vec_x1_len_no_zfill_mov): |
144 | movl %ecx, %edx |
145 | # ifdef USE_AS_STPCPY |
	/* Clear CF for the `setc`/`adc` in the STPCPY return below.  */
147 | xorl %ecx, %ecx |
148 | # endif |
149 | L(ret_vec_x1_len_no_zfill): |
150 | VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) |
151 | VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
152 | # ifdef USE_AS_STPCPY |
153 | # ifdef USE_AS_WCSCPY |
154 | setc %al |
155 | addq %rdx, %rdi |
156 | leaq (%rdi, %rax, CHAR_SIZE), %rax |
157 | # else |
158 | movl %edx, %eax |
159 | adcq %rdi, %rax |
160 | # endif |
161 | # endif |
162 | L(return_vzeroupper): |
163 | ZERO_UPPER_VEC_REGISTERS_RETURN |
164 | |
165 | .p2align 4,, 6 |
166 | L(ret_vec_x1): |
167 | bsfl %ecx, %ecx |
168 | VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
169 | subl %ecx, %edx |
170 | /* Check if we need to reload/store. */ |
171 | cmpl $VEC_SIZE, %edx |
172 | jb L(ret_vec_x1_len_no_zfill_mov) |
173 | /* Otherwise safe to just store directly. */ |
174 | VMOVU %VMM(1), (%rdi) |
175 | VMOVU %VZERO, (%rdi, %rcx) |
176 | # ifdef USE_AS_STPCPY |
177 | leaq (%rdi, %rcx), %rax |
178 | # endif |
179 | VZEROUPPER_RETURN |
180 | |
181 | .p2align 4,, 12 |
182 | L(more_2x_vec): |
183 | VMOVU %VMM(1), (%rdi) |
184 | testl %ecx, %ecx |
185 | /* Must fill at least 2x VEC. */ |
186 | jnz L(zfill_vec1) |
187 | |
188 | VMOVA VEC_SIZE(%rsi), %VMM(2) |
189 | VMOVU %VMM(2), VEC_SIZE(%rdi) |
190 | VPCMPEQ %VMM(2), %VZERO, %VMM(6) |
191 | vpmovmskb %VMM(6), %ecx |
192 | testl %ecx, %ecx |
193 | /* Must fill at least 1x VEC. */ |
194 | jnz L(zfill_vec2) |
195 | |
196 | VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3) |
197 | VPCMPEQ %VMM(3), %VZERO, %VMM(6) |
198 | vpmovmskb %VMM(6), %ecx |
199 | |
	/* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx
	   is len - CHAR_SIZE.  */
202 | cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx |
203 | ja L(more_4x_vec) |
204 | |
205 | subl $(VEC_SIZE * 3), %edx |
206 | jb L(ret_vec_x3_len) |
207 | |
208 | testl %ecx, %ecx |
209 | jnz L(ret_vec_x3) |
210 | |
211 | VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6) |
212 | VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) |
213 | vpmovmskb %VMM(6), %ecx |
214 | tzcntl %ecx, %ecx |
215 | cmpl %ecx, %edx |
216 | jbe L(ret_vec_x4_len_no_zfill) |
	/* The expected fall-through case is that the string is
	   shorter than the buffer, so the tail must be zero-filled.  */
218 | VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
219 | movl %ecx, %edx |
220 | L(ret_vec_x4_len_no_zfill): |
221 | VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) |
222 | VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
223 | # ifdef USE_AS_STPCPY |
224 | # ifdef USE_AS_WCSCPY |
225 | setc %al |
226 | addq %rdx, %rdi |
227 | leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax |
228 | # else |
	leal	(VEC_SIZE * 3 + 0)(%rdx), %eax
230 | adcq %rdi, %rax |
231 | # endif |
232 | # endif |
233 | VZEROUPPER_RETURN |
234 | |
235 | |
236 | L(ret_vec_x3_len): |
237 | addl $(VEC_SIZE * 1), %edx |
238 | tzcntl %ecx, %ecx |
239 | cmpl %ecx, %edx |
240 | jbe L(ret_vec_x3_len_no_zfill) |
	/* The expected fall-through case is that the string is
	   shorter than the buffer, so the tail must be zero-filled.  */
242 | VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
243 | L(ret_vec_x3_len_no_zfill_mov): |
244 | movl %ecx, %edx |
245 | # ifdef USE_AS_STPCPY |
	/* Clear CF for the `setc`/`adc` in the STPCPY return below.  */
247 | xorl %ecx, %ecx |
248 | # endif |
249 | .p2align 4,, 4 |
250 | L(ret_vec_x3_len_no_zfill): |
251 | VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) |
252 | VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) |
253 | # ifdef USE_AS_STPCPY |
254 | # ifdef USE_AS_WCSCPY |
255 | setc %al |
256 | addq %rdx, %rdi |
257 | leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
258 | # else |
259 | leal (VEC_SIZE * 2 + 0)(%rdx), %eax |
260 | adcq %rdi, %rax |
261 | # endif |
262 | # endif |
263 | VZEROUPPER_RETURN |
264 | |
265 | |
266 | .p2align 4,, 8 |
267 | L(ret_vec_x3): |
268 | bsfl %ecx, %ecx |
269 | VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx) |
270 | subl %ecx, %edx |
271 | jl L(ret_vec_x3_len_no_zfill_mov) |
272 | VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) |
273 | VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx) |
274 | # ifdef USE_AS_STPCPY |
275 | leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax |
276 | # endif |
277 | VZEROUPPER_RETURN |
278 | |
279 | .p2align 4,, 8 |
280 | L(more_4x_vec): |
281 | |
282 | VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) |
283 | testl %ecx, %ecx |
284 | jnz L(zfill_vec3) |
285 | |
286 | VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4) |
287 | VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi) |
288 | VPCMPEQ %VMM(4), %VZERO, %VMM(6) |
289 | vpmovmskb %VMM(6), %ecx |
290 | testl %ecx, %ecx |
291 | jnz L(zfill_vec4) |
292 | |
293 | movq %rdx, %rcx |
294 | addq %rsi, %rdx |
295 | subq %rsi, %rdi |
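	/* rdx now points CHAR_SIZE before the end of the
	   length-limited source and rdi holds dst - src, so adding
	   rsi back later recovers the matching destination.  */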
296 | subq $-(VEC_SIZE * 4), %rsi |
297 | /* Recheck length before aligning. */ |
298 | cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx |
299 | jbe L(last_4x_vec) |
300 | |
301 | andq $(VEC_SIZE * -4), %rsi |
302 | |
303 | /* Do first half of loop ahead of time so loop can just start by |
304 | storing. */ |
305 | VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) |
306 | VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) |
307 | VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) |
308 | VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) |
309 | |
310 | VPMIN %VMM(0), %VMM(1), %VMM(4) |
311 | VPMIN %VMM(2), %VMM(3), %VMM(6) |
312 | VPMIN %VMM(4), %VMM(6), %VMM(6) |
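	/* VMM(6) is the lane-wise minimum of all four VECs: a lane is
	   zero iff at least one of the four has a null there, so one
	   compare + movemask tests the whole 4x VEC block.  */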
313 | VPCMPEQ %VMM(6), %VZERO, %VMM(6) |
314 | vpmovmskb %VMM(6), %r8d |
315 | addq %rsi, %rdi |
316 | testl %r8d, %r8d |
317 | jnz L(loop_4x_done) |
318 | |
	/* Use r9 as the end bound: the loop exits once no more than
	   4x VEC bytes remain past rsi.  */
320 | leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9 |
321 | |
322 | .p2align 4,, 11 |
323 | L(loop_4x_vec): |
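	/* Store the 4x VEC loaded (and already checked for nulls) by
	   the previous iteration, then load and check the next
	   4x VEC.  */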
324 | |
325 | VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) |
326 | VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) |
327 | subq $(VEC_SIZE * -4), %rsi |
328 | VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) |
329 | VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) |
330 | |
331 | subq $(VEC_SIZE * -4), %rdi |
332 | cmpq %rsi, %r9 |
333 | jbe L(loop_last_4x_vec) |
334 | |
335 | VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) |
336 | VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) |
337 | VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) |
338 | VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) |
339 | |
340 | VPMIN %VMM(0), %VMM(1), %VMM(4) |
341 | VPMIN %VMM(2), %VMM(3), %VMM(6) |
342 | VPMIN %VMM(4), %VMM(6), %VMM(6) |
343 | VPCMPEQ %VMM(6), %VZERO, %VMM(6) |
344 | |
345 | vpmovmskb %VMM(6), %r8d |
346 | |
347 | testl %r8d, %r8d |
348 | jz L(loop_4x_vec) |
349 | |
350 | L(loop_4x_done): |
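	/* A null lies somewhere in the 4x VEC just loaded.  Store
	   each VEC and branch to the matching zero-fill path; rdx
	   becomes the remaining byte count (minus CHAR_SIZE).  */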
351 | subq %rsi, %rdx |
352 | VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) |
353 | VPCMPEQ %VMM(0), %VZERO, %VMM(6) |
354 | vpmovmskb %VMM(6), %ecx |
355 | testl %ecx, %ecx |
356 | jnz L(zfill_vec1) |
357 | |
358 | VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) |
359 | VPCMPEQ %VMM(1), %VZERO, %VMM(6) |
360 | vpmovmskb %VMM(6), %ecx |
361 | testl %ecx, %ecx |
362 | jnz L(zfill_vec2) |
363 | |
364 | VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) |
365 | VPCMPEQ %VMM(2), %VZERO, %VMM(6) |
366 | vpmovmskb %VMM(6), %ecx |
367 | testl %ecx, %ecx |
368 | jnz L(zfill_vec3) |
369 | |
370 | VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) |
371 | movl %r8d, %ecx |
372 | |
	/* The null is in the 4th VEC: since VMM(0..2) contained no
	   null, the combined mask in %r8d (copied to %ecx above) is
	   exactly VMM(3)'s null mask.  Fall through to zero-fill.  */
374 | |
375 | .p2align 4,, 4 |
376 | L(zfill_vec4): |
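	/* The null is in the 4th VEC.  Rebase rdi/rdx by 2x VEC so
	   the shared code below treats it as the 2nd VEC.  */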
377 | addq $(VEC_SIZE * 2), %rdi |
378 | subq $(VEC_SIZE * 2), %rdx |
379 | L(zfill_vec2): |
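	/* The mask is for the VEC at VEC_SIZE(%rdi); shift it up so
	   the `bsf` below yields the offset from (%rdi).  */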
380 | shlq $VEC_SIZE, %rcx |
381 | L(zfill): |
382 | bsfq %rcx, %rcx |
383 | subq %rcx, %rdx |
384 | addq %rcx, %rdi |
385 | # ifdef USE_AS_STPCPY |
386 | movq %rdi, %rax |
387 | # endif |
388 | L(zfill_from_page_cross): |
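	/* On entry (%rdi) already holds a null character and rdx
	   bytes starting at CHAR_SIZE(%rdi) remain to be zeroed.  */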
389 | cmpq $VEC_SIZE, %rdx |
390 | jb L(zfill_less_vec_vzeroupper) |
391 | |
392 | L(zfill_more_1x_vec): |
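	/* Zero one VEC starting just past the null at (%rdi) and one
	   VEC ending exactly at the end of the buffer; when fewer
	   than 2x VEC remain these (possibly overlapping) stores
	   cover everything.  */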
393 | VMOVU %VZERO, CHAR_SIZE(%rdi) |
394 | VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx) |
395 | cmpq $(VEC_SIZE * 2), %rdx |
396 | jae L(zfill_more_2x_vec) |
397 | L(zfill_done0): |
398 | VZEROUPPER_RETURN |
399 | |
400 | .p2align 4,, 8 |
401 | L(zfill_vec3): |
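	/* The null is in the 3rd VEC.  Rebase rdi/rdx by 2x VEC so
	   the shared code below treats it as the 1st VEC.  */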
402 | addq $(VEC_SIZE * 2), %rdi |
403 | subq $(VEC_SIZE * 2), %rdx |
404 | .p2align 4,, 2 |
405 | L(zfill_vec1): |
406 | bsfl %ecx, %ecx |
407 | addq %rcx, %rdi |
408 | subq %rcx, %rdx |
409 | # ifdef USE_AS_STPCPY |
410 | movq %rdi, %rax |
411 | # endif |
	/* A zfill entered from vec1/vec3 is guaranteed to need more
	   than 1x VEC of zero-fill, so the `< VEC_SIZE` check done in
	   L(zfill) can be skipped.  */
413 | |
414 | VMOVU %VZERO, CHAR_SIZE(%rdi) |
415 | VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx) |
416 | cmpq $(VEC_SIZE * 2), %rdx |
417 | jb L(zfill_done0) |
418 | L(zfill_more_2x_vec): |
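	/* Zero the 2nd VEC from the front and the 2nd VEC from the
	   back; together with the two stores above this covers up to
	   4x VEC.  Larger fills continue below.  */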
419 | VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx) |
420 | VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi) |
421 | subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx |
422 | jbe L(zfill_done) |
423 | |
424 | addq %rdi, %rdx |
425 | VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi) |
426 | VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi) |
427 | |
428 | |
429 | VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) |
430 | VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) |
431 | |
432 | subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi |
433 | cmpq %rdi, %rdx |
434 | jbe L(zfill_done) |
435 | |
436 | andq $-(VEC_SIZE), %rdi |
437 | .p2align 4,, 12 |
438 | L(zfill_loop_4x_vec): |
439 | VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) |
440 | VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) |
441 | VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) |
442 | VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) |
443 | subq $-(VEC_SIZE * 4), %rdi |
444 | cmpq %rdi, %rdx |
445 | ja L(zfill_loop_4x_vec) |
446 | L(zfill_done): |
447 | VZEROUPPER_RETURN |
448 | |
449 | |
450 | .p2align 4,, 8 |
451 | L(copy_1x): |
452 | VMOVU %VMM(0), (%rdi) |
453 | testl %ecx, %ecx |
454 | jz L(ret_32_32) |
455 | L(zfill_less_vec): |
456 | bsfl %ecx, %ecx |
457 | L(zfill_less_vec_no_bsf): |
458 | subq %rcx, %rdx |
459 | addq %rcx, %rdi |
460 | # ifdef USE_AS_STPCPY |
461 | movq %rdi, %rax |
462 | # endif |
463 | L(zfill_less_vec_vzeroupper): |
464 | COND_VZEROUPPER |
	/* We rely on the fact that to get here we must already have
	   written a null character at (%rdi), so overwriting it with
	   zero in the stores below is harmless.  */
468 | cmpl $16, %edx |
469 | jb L(zfill_less_16) |
470 | VMOVU %VZERO_128, (%rdi) |
471 | VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx) |
472 | ret |
473 | # ifdef USE_AS_STPCPY |
474 | L(ret_32_32): |
475 | leaq CHAR_SIZE(%rdi, %rdx), %rax |
476 | VZEROUPPER_RETURN |
477 | # endif |
478 | |
479 | .p2align 4,, 4 |
480 | L(copy_16_31): |
481 | /* Overfill to avoid branches. */ |
482 | vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1 |
483 | vmovdqu %xmm0, (%rdi) |
484 | vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx) |
485 | cmpl %ecx, %edx |
486 | ja L(zfill_less_vec_no_bsf) |
487 | # ifndef USE_AS_STPCPY |
488 | L(ret_32_32): |
489 | # else |
490 | # ifdef USE_AS_WCSCPY |
491 | setc %al |
492 | addq %rdx, %rdi |
493 | leaq (%rdi, %rax, CHAR_SIZE), %rax |
494 | # else |
495 | movl %edx, %eax |
496 | adcq %rdi, %rax |
497 | # endif |
498 | # endif |
499 | VZEROUPPER_RETURN |
500 | |
501 | .p2align 4,, 4 |
502 | L(copy_8_15): |
503 | /* Overfill to avoid branches. */ |
504 | movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi |
505 | vmovq %xmm0, (%rdi) |
506 | movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx) |
507 | cmpl %ecx, %edx |
508 | jbe L(ret_8_15) |
509 | subq %rcx, %rdx |
510 | addq %rcx, %rdi |
511 | # ifdef USE_AS_STPCPY |
512 | movq %rdi, %rax |
513 | # endif |
514 | .p2align 4,, 8 |
515 | L(zfill_less_16): |
516 | xorl %ecx, %ecx |
517 | cmpl $8, %edx |
518 | jb L(zfill_less_8) |
519 | movq %rcx, (%rdi) |
520 | movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx) |
521 | # ifndef USE_AS_STPCPY |
522 | L(ret_8_15): |
523 | # endif |
524 | ret |
525 | |
526 | |
527 | .p2align 4,, 8 |
528 | L(less_1x_vec): |
	/* Reuse the flags from the `cmpq $(VEC_SIZE - CHAR_SIZE),
	   %rdx` above.  The idea is that buffers are conventionally
	   sized to exactly VEC_SIZE, which sets ZF there.  */
531 | je L(copy_1x) |
532 | |
533 | tzcntl %ecx, %ecx |
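	/* ecx is now the offset of the null terminator, or VEC_SIZE
	   if there was no null in the first VEC.  */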
534 | cmpl $16, %edx |
535 | jae L(copy_16_31) |
536 | |
537 | COND_VZEROUPPER |
538 | cmpl $8, %edx |
539 | jae L(copy_8_15) |
540 | # ifdef USE_AS_WCSCPY |
541 | testl %ecx, %ecx |
542 | jz L(zfill_less_8_set_ret) |
543 | |
544 | movl (%rsi, %rdx), %esi |
545 | vmovd %xmm0, (%rdi) |
546 | movl %esi, (%rdi, %rdx) |
547 | |
548 | # ifdef USE_AS_STPCPY |
549 | cmpl %ecx, %edx |
550 | L(ret_8_15): |
551 | setc %al |
552 | addq %rdx, %rdi |
553 | leaq (%rdi, %rax, CHAR_SIZE), %rax |
554 | # endif |
555 | ret |
556 | L(zfill_less_8_set_ret): |
557 | xorl %ecx, %ecx |
558 | # ifdef USE_AS_STPCPY |
559 | movq %rdi, %rax |
560 | # endif |
561 | L(zfill_less_8): |
562 | movl %ecx, (%rdi) |
563 | movl %ecx, (%rdi, %rdx) |
564 | ret |
565 | |
566 | # else |
567 | cmpl $3, %edx |
568 | jb L(copy_0_3) |
569 | /* Overfill to avoid branches. */ |
570 | movl -3(%rsi, %rdx), %esi |
571 | vmovd %xmm0, (%rdi) |
572 | movl %esi, -3(%rdi, %rdx) |
573 | cmpl %ecx, %edx |
574 | jbe L(ret_4_7) |
575 | subq %rcx, %rdx |
576 | addq %rcx, %rdi |
577 | # ifdef USE_AS_STPCPY |
578 | movq %rdi, %rax |
579 | # endif |
580 | xorl %ecx, %ecx |
581 | .p2align 4,, 8 |
582 | L(zfill_less_8): |
583 | cmpl $3, %edx |
584 | jb L(zfill_less_3) |
585 | movl %ecx, (%rdi) |
586 | movl %ecx, -3(%rdi, %rdx) |
587 | # ifdef USE_AS_STPCPY |
588 | ret |
589 | # endif |
590 | |
591 | L(ret_4_7): |
592 | # ifdef USE_AS_STPCPY |
593 | L(ret_8_15): |
594 | movl %edx, %eax |
595 | adcq %rdi, %rax |
596 | # endif |
597 | ret |
598 | |
599 | .p2align 4,, 4 |
600 | L(zfill_less_3): |
601 | testl %edx, %edx |
602 | jz L(zfill_1) |
603 | movw %cx, (%rdi) |
604 | L(zfill_1): |
605 | movb %cl, (%rdi, %rdx) |
606 | ret |
607 | |
608 | .p2align 4,, 8 |
609 | L(copy_0_3): |
610 | vmovd %xmm0, %r8d |
611 | testl %edx, %edx |
612 | jz L(copy_1) |
613 | movw %r8w, (%rdi) |
614 | cmpl %ecx, %edx |
615 | ja L(zfill_from_1) |
616 | movzbl (%rsi, %rdx), %r8d |
617 | # ifdef USE_AS_STPCPY |
618 | movl %edx, %eax |
619 | adcq %rdi, %rax |
620 | movb %r8b, (%rdi, %rdx) |
621 | ret |
622 | # endif |
623 | |
624 | L(copy_1): |
625 | # ifdef USE_AS_STPCPY |
626 | movl %edx, %eax |
627 | cmpl %ecx, %edx |
628 | adcq %rdi, %rax |
629 | # endif |
630 | # ifdef USE_AS_WCSCPY |
631 | vmovd %xmm0, (%rdi) |
632 | # else |
633 | movb %r8b, (%rdi, %rdx) |
634 | # endif |
635 | ret |
636 | # endif |
637 | |
638 | .p2align 4,, 2 |
L(zero_len):
	/* Zero length just returns dst; an enormous length (which
	   also lands here) takes the best-effort path.  */
	incq	%rdx
	jne	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret
642 | # ifndef USE_AS_WCSCPY |
643 | .p2align 4,, 8 |
644 | L(zfill_from_1): |
645 | # ifdef USE_AS_STPCPY |
646 | leaq (%rdi, %rcx), %rax |
647 | # endif |
648 | movw $0, -1(%rdi, %rdx) |
649 | ret |
650 | # endif |
651 | |
652 | .p2align 4,, 4 |
653 | .p2align 6,, 8 |
654 | L(page_cross): |
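	/* rsi is within VEC_SIZE bytes of a page boundary.  Load from
	   the containing aligned VEC (which cannot touch the next
	   page) and shift the null mask down so bit 0 corresponds to
	   the byte at (%rsi).  */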
655 | movq %rsi, %rax |
656 | andq $(VEC_SIZE * -1), %rax |
657 | |
658 | VPCMPEQ (%rax), %VZERO, %VMM(6) |
659 | |
660 | vpmovmskb %VMM(6), %ecx |
661 | shrxl %esi, %ecx, %ecx |
662 | |
663 | subl %esi, %eax |
664 | andl $(VEC_SIZE - 1), %eax |
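	/* eax is the number of bytes from rsi to the next VEC_SIZE
	   boundary.  If the length-limited copy ends before that
	   boundary, use the small rep movs/stos path.  */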
665 | cmpq %rax, %rdx |
666 | jb L(page_cross_small) |
667 | /* Optimizing more aggressively for space as this is very cold |
668 | code. This saves 2x cache lines. */ |
669 | |
	/* Shift the mask left by CHAR_SIZE so `bsf` yields the byte
	   count to copy including the null terminator.  If the result
	   is zero there is no null before the boundary, so continue
	   on the normal path.  */
671 | shl $CHAR_SIZE, %ecx |
672 | jz L(page_cross_continue) |
673 | bsf %ecx, %ecx |
674 | |
675 | subq %rcx, %rdx |
676 | # ifdef USE_AS_STPCPY |
677 | leaq -CHAR_SIZE(%rdi, %rcx), %rax |
678 | # else |
679 | movq %rdi, %rax |
680 | # endif |
681 | |
682 | rep movsb |
683 | # ifdef USE_AS_WCSCPY |
684 | movl $0, (%rdi) |
685 | # else |
686 | movb $0, (%rdi) |
687 | # endif |
688 | jmp L(zfill_from_page_cross) |
689 | |
690 | L(page_cross_small): |
691 | tzcntl %ecx, %ecx |
692 | xorl %eax, %eax |
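	/* eax is zero: it is the fill byte for the `rep stosb` below
	   and, for STPCPY, the base for the `setc %al` return
	   computation.  */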
693 | cmpl %ecx, %edx |
694 | jbe L(page_cross_copy_only) |
695 | |
696 | /* Do a zfill of the tail before copying. */ |
697 | movq %rdi, %r9 |
698 | movl %ecx, %r8d |
699 | |
700 | subl %ecx, %edx |
701 | leaq CHAR_SIZE(%rdi, %rcx), %rdi |
702 | movl %edx, %ecx |
703 | rep stosb |
704 | movq %r9, %rdi |
705 | movl %r8d, %edx |
706 | L(page_cross_copy_only): |
707 | leal CHAR_SIZE(%rdx), %ecx |
708 | # ifdef USE_AS_STPCPY |
709 | # ifdef USE_AS_WCSCPY |
710 | setc %al |
711 | addq %rdi, %rdx |
712 | leaq (%rdx, %rax, CHAR_SIZE), %rax |
713 | # else |
714 | movl %edx, %eax |
715 | adcq %rdi, %rax |
716 | # endif |
717 | # else |
718 | movq %rdi, %rax |
719 | # endif |
720 | rep movsb |
721 | ret |
722 | |
723 | |
724 | L(best_effort_strncpy): |
725 | movq %rdx, %rcx |
726 | xorl %eax, %eax |
727 | movq %rdi, %r8 |
	/* The length is >= 2^63 bytes (or failed the address-space
	   check for wide characters), so we fully expect the rep stos
	   to segfault.  If it somehow succeeds, just finish with
	   OVERFLOW_STRCPY.  */
731 | # ifdef USE_AS_WCSCPY |
732 | rep stosl |
733 | # else |
734 | rep stosb |
735 | # endif |
736 | movq %r8, %rdi |
737 | jmp OVERFLOW_STRCPY |
738 | END(STRNCPY) |
739 | #endif |
740 | |