/* strcpy with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)


# ifndef USE_AS_STRCAT
# include <sysdep.h>

# ifndef STRCPY
# define STRCPY __strcpy_avx2
# endif

# endif

/* Number of bytes in a vector register.  */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0

/* mask register */
#define ymmM ymm1

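/* The terminating null byte is located by comparing a full vector
   against the zero register (ymmZ) with VPCMPEQB, collapsing the
   byte-wise result into a bit mask with VPMOVMSKB and scanning the
   mask with BSF.  For strncpy the remaining byte count is carried in
   %r8 and checked before each vector is committed to the
   destination.  */
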
# ifndef USE_AS_STRCAT

        .section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
        mov %RDX_LP, %R8_LP
        test %R8_LP, %R8_LP
        jz L(ExitZero)
# endif
        mov %rsi, %rcx
# ifndef USE_AS_STPCPY
        mov %rdi, %rax /* save result */
# endif

# endif

        vpxor %xmmZ, %xmmZ, %xmmZ

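/* %ecx = offset of the source within a (VEC_SIZE * 4)-byte block.  If
   the offset is at most VEC_SIZE * 2, the two unaligned loads on the
   L(SourceStringAlignmentLessTwoVecSize) path stay inside that block
   and therefore cannot cross a page boundary.  */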
        and $((VEC_SIZE * 4) - 1), %ecx
        cmp $(VEC_SIZE * 2), %ecx
        jbe L(SourceStringAlignmentLessTwoVecSize)

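/* Align %rsi down to VEC_SIZE and keep the misalignment in %ecx.  The
   aligned compare below may flag null bytes that precede the real
   start of the string, so the mask is shifted right by %cl to discard
   them.  */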
        and $-VEC_SIZE, %rsi
        and $(VEC_SIZE - 1), %ecx

        vpcmpeqb (%rsi), %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        shr %cl, %rdx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
        mov $VEC_SIZE, %r10
        sub %rcx, %r10
        cmp %r10, %r8
# else
        mov $(VEC_SIZE + 1), %r10
        sub %rcx, %r10
        cmp %r10, %r8
# endif
        jbe L(CopyVecSizeTailCase2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyVecSizeTail)

        vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
        vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
        add $VEC_SIZE, %r10
        cmp %r10, %r8
        jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyTwoVecSize)

        vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
        vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
        .p2align 4
L(UnalignVecSizeBoth):
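/* From here on %rcx is the common offset: the destination is rewound
   by the source misalignment so that (%rdi, %rcx) and (%rsi, %rcx)
   address corresponding bytes, giving aligned loads from the source
   and unaligned stores to the destination.  For strncpy the remaining
   count is rebased to the aligned source (the %rcx bytes before the
   real string start do not count against it), and the SBB/OR pair
   saturates %r8 should the addition overflow.  */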
        sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
        add %rcx, %r8
        sbb %rcx, %rcx
        or %rcx, %r8
# endif
        mov $VEC_SIZE, %rcx
        vmovdqa (%rsi, %rcx), %ymm2
        vmovdqu %ymm2, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $(VEC_SIZE * 3), %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec2)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm2, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
        vpcmpeqb %ymm3, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec3)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm3, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
        vpcmpeqb %ymm4, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec4)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm4, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec2)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm2, (%rdi, %rcx)
        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec2)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
        vmovdqu %ymm2, (%rdi, %rcx)
        vpcmpeqb %ymm3, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
# endif
        test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec3)
# else
        jnz L(CopyVecSize)
# endif

        vmovdqu %ymm3, (%rdi, %rcx)
        mov %rsi, %rdx
        lea VEC_SIZE(%rsi, %rcx), %rsi
        and $-(VEC_SIZE * 4), %rsi
        sub %rsi, %rdx
        sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
        lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
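/* Main loop: four aligned vectors per iteration.  VPMINUB of the four
   vectors yields a byte that is zero iff the corresponding byte of
   any input vector is zero, so a single compare against zero per
   iteration detects a terminator anywhere in the VEC_SIZE * 4 block
   before it is stored.  */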
L(UnalignedFourVecSizeLoop):
        vmovdqa (%rsi), %ymm4
        vmovdqa VEC_SIZE(%rsi), %ymm5
        vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
        vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
        vpminub %ymm5, %ymm4, %ymm2
        vpminub %ymm7, %ymm6, %ymm3
        vpminub %ymm2, %ymm3, %ymm3
        vpcmpeqb %ymmM, %ymm3, %ymm3
        vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
        sub $(VEC_SIZE * 4), %r8
        jbe L(UnalignedLeaveCase2OrCase3)
# endif
        test %edx, %edx
        jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
        add $(VEC_SIZE * 4), %rdi
        add $(VEC_SIZE * 4), %rsi
        vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
        vmovdqa (%rsi), %ymm4
        vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
        vmovdqa VEC_SIZE(%rsi), %ymm5
        vpminub %ymm5, %ymm4, %ymm2
        vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
        vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
        vmovdqu %ymm7, -VEC_SIZE(%rdi)
        vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
        vpminub %ymm7, %ymm6, %ymm3
        vpminub %ymm2, %ymm3, %ymm3
        vpcmpeqb %ymmM, %ymm3, %ymm3
        vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
        sub $(VEC_SIZE * 4), %r8
        jbe L(UnalignedLeaveCase2OrCase3)
# endif
        test %edx, %edx
        jz L(UnalignedFourVecSizeLoop_start)

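/* A null byte lies somewhere in the current four-vector block: test
   the vectors in order to find which one holds it, store the complete
   vectors that precede it and finish the copy through the exit
   code.  */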
L(UnalignedFourVecSizeLeave):
        vpcmpeqb %ymm4, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        test %edx, %edx
        jnz L(CopyVecSizeUnaligned_0)

        vpcmpeqb %ymm5, %ymmZ, %ymmM
        vpmovmskb %ymmM, %ecx
        test %ecx, %ecx
        jnz L(CopyVecSizeUnaligned_16)

        vpcmpeqb %ymm6, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        test %edx, %edx
        jnz L(CopyVecSizeUnaligned_32)

        vpcmpeqb %ymm7, %ymmZ, %ymmM
        vpmovmskb %ymmM, %ecx
        bsf %ecx, %edx
        vmovdqu %ymm4, (%rdi)
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
        lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
        vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
        add $(VEC_SIZE - 1), %r8
        sub %rdx, %r8
        lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        add $(VEC_SIZE * 3), %rsi
        add $(VEC_SIZE * 3), %rdi
        jmp L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

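/* The source offset within the (VEC_SIZE * 4) block is at most
   VEC_SIZE * 2, so the first two vectors can be loaded unaligned
   without risking a page cross before the alignment code below is
   reached.  */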
L(SourceStringAlignmentLessTwoVecSize):
        vmovdqu (%rsi), %ymm3
        vmovdqu VEC_SIZE(%rsi), %ymm2
        vpcmpeqb %ymm3, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
        cmp $VEC_SIZE, %r8
# else
        cmp $(VEC_SIZE + 1), %r8
# endif
        jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyVecSizeTail1)

        vmovdqu %ymm3, (%rdi)
        vpcmpeqb %ymm2, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
        cmp $(VEC_SIZE * 2), %r8
# else
        cmp $((VEC_SIZE * 2) + 1), %r8
# endif
        jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
        test %edx, %edx
        jnz L(CopyTwoVecSize1)

        and $-VEC_SIZE, %rsi
        and $(VEC_SIZE - 1), %ecx
        jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

/* Case1 */

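/* Exit paths: %edx holds the VPMOVMSKB mask of the current vector;
   after BSF it is the index of the terminating null byte.  Each
   ExitN_M label copies the final %edx + 1 bytes (terminator included)
   with a pair of possibly overlapping loads and stores.  */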
# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
        .p2align 4
L(CopyVecSize):
        add %rcx, %rdi
# endif
L(CopyVecSizeTail):
        add %rcx, %rsi
L(CopyVecSizeTail1):
        bsf %edx, %edx
L(CopyVecSizeExit):
        cmp $32, %edx
        jae L(Exit32_63)
        cmp $16, %edx
        jae L(Exit16_31)
        cmp $8, %edx
        jae L(Exit8_15)
        cmp $4, %edx
        jae L(Exit4_7)
        cmp $3, %edx
        je L(Exit3)
        cmp $1, %edx
        ja L(Exit2)
        je L(Exit1)
        movb $0, (%rdi)
# ifdef USE_AS_STPCPY
        lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $1, %r8
        lea 1(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN

        .p2align 4
L(CopyTwoVecSize1):
        add $VEC_SIZE, %rsi
        add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $VEC_SIZE, %r8
# endif
        jmp L(CopyVecSizeTail1)

        .p2align 4
L(CopyTwoVecSize):
        bsf %edx, %edx
        add %rcx, %rsi
        add $VEC_SIZE, %edx
        sub %ecx, %edx
        jmp L(CopyVecSizeExit)

        .p2align 4
L(CopyVecSizeUnaligned_0):
        bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
        vmovdqu %ymm4, (%rdi)
        add $((VEC_SIZE * 4) - 1), %r8
        sub %rdx, %r8
        lea 1(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        jmp L(CopyVecSizeExit)
# endif

        .p2align 4
L(CopyVecSizeUnaligned_16):
        bsf %ecx, %edx
        vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
        lea VEC_SIZE(%rdi, %rdx), %rax
# endif
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        add $((VEC_SIZE * 3) - 1), %r8
        sub %rdx, %r8
        lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        add $VEC_SIZE, %rsi
        add $VEC_SIZE, %rdi
        jmp L(CopyVecSizeExit)
# endif

        .p2align 4
L(CopyVecSizeUnaligned_32):
        bsf %edx, %edx
        vmovdqu %ymm4, (%rdi)
        vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
        lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
        add $((VEC_SIZE * 2) - 1), %r8
        sub %rdx, %r8
        lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
        jmp L(StrncpyFillTailWithZero)
# else
        add $(VEC_SIZE * 2), %rsi
        add $(VEC_SIZE * 2), %rdi
        jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
        .p2align 4
L(CopyVecSizeUnalignedVec6):
        vmovdqu %ymm6, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)

        .p2align 4
L(CopyVecSizeUnalignedVec5):
        vmovdqu %ymm5, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)

        .p2align 4
L(CopyVecSizeUnalignedVec4):
        vmovdqu %ymm4, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)

        .p2align 4
L(CopyVecSizeUnalignedVec3):
        vmovdqu %ymm3, (%rdi, %rcx)
        jmp L(CopyVecSizeVecExit)
# endif

/* Case2 */

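/* Case 2: both a null byte and the strncpy bound fall inside the
   current vector range.  The BSF index of the null is compared with
   the remaining count in %r8: if the null comes first the normal exit
   path finishes the copy, otherwise exactly %r8 bytes are copied via
   L(StrncpyExit).  */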
        .p2align 4
L(CopyVecSizeCase2):
        add $VEC_SIZE, %r8
        add %rcx, %rdi
        add %rcx, %rsi
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

        .p2align 4
L(CopyTwoVecSizeCase2):
        add %rcx, %rsi
        bsf %edx, %edx
        add $VEC_SIZE, %edx
        sub %ecx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
        add %rcx, %rsi
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
        jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

        .p2align 4
L(CopyVecSizeCase2OrCase3):
        test %rdx, %rdx
        jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
        add $VEC_SIZE, %r8
        add %rcx, %rdi
        add %rcx, %rsi
        jmp L(StrncpyExit)

        .p2align 4
L(CopyTwoVecSizeCase2OrCase3):
        test %rdx, %rdx
        jnz L(CopyTwoVecSizeCase2)
        add %rcx, %rsi
        jmp L(StrncpyExit)

        .p2align 4
L(CopyVecSizeTailCase2OrCase3):
        test %rdx, %rdx
        jnz L(CopyVecSizeTailCase2)
        add %rcx, %rsi
        jmp L(StrncpyExit)

        .p2align 4
L(CopyTwoVecSize1Case2OrCase3):
        add $VEC_SIZE, %rdi
        add $VEC_SIZE, %rsi
        sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
        test %rdx, %rdx
        jnz L(CopyVecSizeTail1Case2)
        jmp L(StrncpyExit)
# endif

/*----- End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes -----*/

        .p2align 4
L(Exit1):
        movzwl (%rsi), %edx
        mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
        lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $2, %r8
        lea 2(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit2):
        movzwl (%rsi), %ecx
        mov %cx, (%rdi)
        movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
        lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $3, %r8
        lea 3(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit3):
        mov (%rsi), %edx
        mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
        lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub $4, %r8
        lea 4(%rdi), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit4_7):
        mov (%rsi), %ecx
        mov %ecx, (%rdi)
        mov -3(%rsi, %rdx), %ecx
        mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit8_15):
        mov (%rsi), %rcx
        mov -7(%rsi, %rdx), %r9
        mov %rcx, (%rdi)
        mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit16_31):
        vmovdqu (%rsi), %xmm2
        vmovdqu -15(%rsi, %rdx), %xmm3
        vmovdqu %xmm2, (%rdi)
        vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(Exit32_63):
        vmovdqu (%rsi), %ymm2
        vmovdqu -31(%rsi, %rdx), %ymm3
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
        sub %rdx, %r8
        sub $1, %r8
        lea 1(%rdi, %rdx), %rdi
        jnz L(StrncpyFillTailWithZero)
# endif
        VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

        .p2align 4
L(StrncpyExit1):
        movzbl (%rsi), %edx
        mov %dl, (%rdi)
# ifdef USE_AS_STPCPY
        lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, 1(%rdi)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit2):
        movzwl (%rsi), %edx
        mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
        lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, 2(%rdi)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit3_4):
        movzwl (%rsi), %ecx
        movzwl -2(%rsi, %r8), %edx
        mov %cx, (%rdi)
        mov %dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit5_8):
        mov (%rsi), %ecx
        mov -4(%rsi, %r8), %edx
        mov %ecx, (%rdi)
        mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit9_16):
        mov (%rsi), %rcx
        mov -8(%rsi, %r8), %rdx
        mov %rcx, (%rdi)
        mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit17_32):
        vmovdqu (%rsi), %xmm2
        vmovdqu -16(%rsi, %r8), %xmm3
        vmovdqu %xmm2, (%rdi)
        vmovdqu %xmm3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit33_64):
        /* 0/32, 31/16 */
        vmovdqu (%rsi), %ymm2
        vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
        lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (%rdi, %r8)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(StrncpyExit65):
        /* 0/32, 32/32, 64/1 */
        vmovdqu (%rsi), %ymm2
        vmovdqu 32(%rsi), %ymm3
        mov 64(%rsi), %cl
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm3, 32(%rdi)
        mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
        lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, 65(%rdi)
# endif
        VZEROUPPER_RETURN

# ifndef USE_AS_STRCAT

        .p2align 4
L(Fill1):
        mov %dl, (%rdi)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill2):
        mov %dx, (%rdi)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill3_4):
        mov %dx, (%rdi)
        mov %dx, -2(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill5_8):
        mov %edx, (%rdi)
        mov %edx, -4(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill9_16):
        mov %rdx, (%rdi)
        mov %rdx, -8(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(Fill17_32):
        vmovdqu %xmmZ, (%rdi)
        vmovdqu %xmmZ, -16(%rdi, %r8)
        VZEROUPPER_RETURN

        .p2align 4
L(CopyVecSizeUnalignedVec2):
        vmovdqu %ymm2, (%rdi, %rcx)

        .p2align 4
L(CopyVecSizeVecExit):
        bsf %edx, %edx
        add $(VEC_SIZE - 1), %r8
        add %rcx, %rdi
# ifdef USE_AS_STPCPY
        lea (%rdi, %rdx), %rax
# endif
        sub %rdx, %r8
        lea 1(%rdi, %rdx), %rdi

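/* strncpy tail fill: on entry %rdi points just past the stored
   terminator and %r8 is the number of bytes still to zero.  Short
   tails go straight to L(Fill); longer ones store one unaligned zero
   vector, round %rdi down to VEC_SIZE and clear VEC_SIZE * 4 bytes
   per iteration, leaving L(Fill) to finish the last few bytes.  */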
        .p2align 4
L(StrncpyFillTailWithZero):
        xor %edx, %edx
        sub $VEC_SIZE, %r8
        jbe L(StrncpyFillExit)

        vmovdqu %ymmZ, (%rdi)
        add $VEC_SIZE, %rdi

        mov %rdi, %rsi
        and $(VEC_SIZE - 1), %esi
        sub %rsi, %rdi
        add %rsi, %r8
        sub $(VEC_SIZE * 4), %r8
        jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
        vmovdqa %ymmZ, (%rdi)
        vmovdqa %ymmZ, VEC_SIZE(%rdi)
        vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
        vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
        add $(VEC_SIZE * 4), %rdi
        sub $(VEC_SIZE * 4), %r8
        jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
        add $(VEC_SIZE * 2), %r8
        jl L(StrncpyFillLessTwoVecSize)
        vmovdqa %ymmZ, (%rdi)
        vmovdqa %ymmZ, VEC_SIZE(%rdi)
        add $(VEC_SIZE * 2), %rdi
        sub $VEC_SIZE, %r8
        jl L(StrncpyFillExit)
        vmovdqa %ymmZ, (%rdi)
        add $VEC_SIZE, %rdi
        jmp L(Fill)

        .p2align 4
L(StrncpyFillLessTwoVecSize):
        add $VEC_SIZE, %r8
        jl L(StrncpyFillExit)
        vmovdqa %ymmZ, (%rdi)
        add $VEC_SIZE, %rdi
        jmp L(Fill)

        .p2align 4
L(StrncpyFillExit):
        add $VEC_SIZE, %r8
L(Fill):
        cmp $17, %r8d
        jae L(Fill17_32)
        cmp $9, %r8d
        jae L(Fill9_16)
        cmp $5, %r8d
        jae L(Fill5_8)
        cmp $3, %r8d
        jae L(Fill3_4)
        cmp $1, %r8d
        ja L(Fill2)
        je L(Fill1)
        VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
# endif

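/* The strncpy bound expires within the current four-vector block.
   Case 3 (%rdx == 0): no terminator was seen, so exactly the
   remaining %r8 bytes are copied.  Case 2: a terminator was seen and
   its position must still be compared against the remaining count.  */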
        .p2align 4
L(UnalignedLeaveCase2OrCase3):
        test %rdx, %rdx
        jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
        lea (VEC_SIZE * 4)(%r8), %rcx
        and $-VEC_SIZE, %rcx
        add $(VEC_SIZE * 3), %r8
        jl L(CopyVecSizeCase3)
        vmovdqu %ymm4, (%rdi)
        sub $VEC_SIZE, %r8
        jb L(CopyVecSizeCase3)
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        sub $VEC_SIZE, %r8
        jb L(CopyVecSizeCase3)
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
        sub $VEC_SIZE, %r8
        jb L(CopyVecSizeCase3)
        vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
        lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (VEC_SIZE * 4)(%rdi)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(UnalignedFourVecSizeLeaveCase2):
        xor %ecx, %ecx
        vpcmpeqb %ymm4, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        add $(VEC_SIZE * 3), %r8
        jle L(CopyVecSizeCase2OrCase3)
        test %edx, %edx
# ifndef USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec4)
# else
        jnz L(CopyVecSize)
# endif
        vpcmpeqb %ymm5, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        vmovdqu %ymm4, (%rdi)
        add $VEC_SIZE, %rcx
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
        test %edx, %edx
# ifndef USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec5)
# else
        jnz L(CopyVecSize)
# endif

        vpcmpeqb %ymm6, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        vmovdqu %ymm5, VEC_SIZE(%rdi)
        add $VEC_SIZE, %rcx
        sub $VEC_SIZE, %r8
        jbe L(CopyVecSizeCase2OrCase3)
        test %edx, %edx
# ifndef USE_AS_STRCAT
        jnz L(CopyVecSizeUnalignedVec6)
# else
        jnz L(CopyVecSize)
# endif

        vpcmpeqb %ymm7, %ymmZ, %ymmM
        vpmovmskb %ymmM, %edx
        vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
        lea VEC_SIZE(%rdi, %rcx), %rdi
        lea VEC_SIZE(%rsi, %rcx), %rsi
        bsf %edx, %edx
        cmp %r8d, %edx
        jb L(CopyVecSizeExit)
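/* L(StrncpyExit) copies exactly %r8 remaining bytes (at most
   VEC_SIZE * 2 + 1) with size-ranged, possibly overlapping loads and
   stores; no null terminator is appended except in the strcat case,
   which always writes one.  */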
L(StrncpyExit):
        cmp $65, %r8d
        je L(StrncpyExit65)
        cmp $33, %r8d
        jae L(StrncpyExit33_64)
        cmp $17, %r8d
        jae L(StrncpyExit17_32)
        cmp $9, %r8d
        jae L(StrncpyExit9_16)
        cmp $5, %r8d
        jae L(StrncpyExit5_8)
        cmp $3, %r8d
        jae L(StrncpyExit3_4)
        cmp $1, %r8d
        ja L(StrncpyExit2)
        je L(StrncpyExit1)
# ifdef USE_AS_STPCPY
        mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
        movb $0, (%rdi)
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
        mov %rdi, %rax
# endif
        VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif