/* strcpy with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY	__strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* zero register */
#define xmmZ	xmm0
#define ymmZ	ymm0

/* mask register */
#define ymmM	ymm1

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax	/* save result */
#  endif

# endif

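/* %ymmZ is zeroed for the null-byte compares below.  %ecx gets the
   source offset within its (VEC_SIZE * 4) block; if that offset is at
   most VEC_SIZE * 2, the first two VEC_SIZE unaligned loads stay inside
   the block (so they cannot cross a page boundary) and the simpler
   path further below is used.  */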
	vpxor	%xmmZ, %xmmZ, %xmmZ

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

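/* Otherwise round %rsi down to a VEC_SIZE boundary and keep the
   misalignment in %ecx.  The aligned load may see bytes before the
   start of the string, so the null-byte mask is shifted right by %cl
   to drop the bits belonging to those bytes.  */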
	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2	/* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
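/* %rdi is rewound by the source misalignment so that (%rdi, %rcx) and
   (%rsi, %rcx) advance in lockstep with the VEC_SIZE-aligned %rsi.
   For strncpy the length in %r8 gets the same adjustment, saturating
   on overflow.  */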
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
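/* Main loop: copy VEC_SIZE * 4 bytes per iteration from the now
   (VEC_SIZE * 4)-aligned source.  vpminub folds the four vectors into
   one, so a single compare against %ymmM (all-zero on this path)
   detects a null byte in any of them.  Roughly, as a sketch:

	while (!has_zero_byte (min (v4, v5, v6, v7)))
	  {
	    store v4..v7 to dst; advance src/dst by 4 * VEC_SIZE;
	    reload v4..v7;
	  }
 */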
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

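/* A null byte was seen in one of the four vectors; work out which one
   and finish the copy up to and including the terminator (for strncpy
   the rest of the buffer is zero-filled afterwards).  */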
L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

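/* Here the source offset within its (VEC_SIZE * 4) block is at most
   VEC_SIZE * 2, so the first two VEC_SIZE loads can be done unaligned
   without crossing into the next block.  */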
L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

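/* In the exit labels below, "Case1" means the null terminator was found
   within the length limit, "Case2" means both the terminator and the
   strncpy limit fall in the current vector (whichever comes first
   wins), and "Case3" means the limit runs out with no terminator
   found.  */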
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
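/* Here %edx is the offset of the null terminator from the current %rsi
   (and %rdi); copy %edx + 1 bytes (terminator included) via the
   smallest matching exit below.  */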
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------End of labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes----*/

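/* L(Exit<N>) handles a terminator at index N (or in the given range),
   copying N + 1 bytes; the variable-sized ranges use a pair of
   overlapping loads/stores.  strncpy (without strcat) then falls
   through to zero-fill the unused part of the buffer.  */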
	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

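/* L(StrncpyExit<N>) copies exactly the remaining %r8 bytes, i.e. the
   length limit was reached no later than the terminator; ranges use a
   pair of overlapping copies.  Only the strcat build stores an extra
   terminating null after those bytes.  */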
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/* 0/32, 31/16 */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov	64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	VZEROUPPER_RETURN

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

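/* Zero-fill the rest of the strncpy destination.  %r8 is the number of
   bytes left to clear (%edx is zeroed for the small L(Fill*) stores).
   One unaligned VEC_SIZE store is done first, then %rdi is rounded down
   so the main loop can use aligned stores (overlapping that first one),
   clearing VEC_SIZE * 4 bytes per iteration; the leftover goes through
   the L(Fill*) exits.  */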
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
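/* At L(Fill), %r8 is the remaining fill count (between 0 and VEC_SIZE);
   dispatch to the matching L(Fill*) store, which may overlap earlier
   stores but never writes past the end of the buffer.  */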
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
#  endif

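/* The strncpy length limit ran out inside the four-vector loop.  %rdx
   still holds the combined null-byte mask from that iteration: if it is
   non-zero a terminator was also seen (Case2), otherwise only the limit
   applies (Case3).  */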
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
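/* The length limit is reached no later than the terminator: copy
   exactly the remaining %r8 bytes through the L(StrncpyExit*) ladder.  */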
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif
1002