/* strcpy with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
# include <sysdep.h>

# ifndef STRCPY
# define STRCPY __strcpy_evex
# endif

# endif

# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif

# define XMM2 xmm18
# define XMM3 xmm19

# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22
# define YMM7 ymm23

# ifndef USE_AS_STRCAT

/* zero register */
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMM1 ymm17

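/* Entry: %rdi = destination, %rsi = source; for the bounded variants
   (USE_AS_STRNCPY) %rdx holds the maximum number of bytes to copy.
   The return value is kept in %rax.  */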
	.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
	mov %RDX_LP, %R8_LP
	test %R8_LP, %R8_LP
	jz L(ExitZero)
# endif
	mov %rsi, %rcx
# ifndef USE_AS_STPCPY
	mov %rdi, %rax /* save result */
# endif

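	/* %YMMZERO stays all-zero for the whole function; comparing a
	   vector against it with vpcmpb yields a mask of its null bytes.  */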
	vpxorq %XMMZERO, %XMMZERO, %XMMZERO
# endif

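	/* %rcx holds the source address.  If its offset within a
	   4 * VEC_SIZE block is at most 2 * VEC_SIZE, the first two
	   vectors can be loaded unaligned without crossing a page
	   boundary; that case is handled separately below.  */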
	and $((VEC_SIZE * 4) - 1), %ecx
	cmp $(VEC_SIZE * 2), %ecx
	jbe L(SourceStringAlignmentLessTwoVecSize)

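	/* Align the source down to VEC_SIZE and keep the misalignment in
	   %ecx.  The null-byte mask from the first aligned vector is
	   shifted right by %cl so that bytes before the start of the
	   string are ignored.  */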
	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx

	vpcmpb $0, (%rsi), %YMMZERO, %k0
	kmovd %k0, %edx
	shr %cl, %rdx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov $VEC_SIZE, %r10
	sub %rcx, %r10
	cmp %r10, %r8
# else
	mov $(VEC_SIZE + 1), %r10
	sub %rcx, %r10
	cmp %r10, %r8
# endif
	jbe L(CopyVecSizeTailCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail)

	vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1
	kmovd %k1, %edx

# ifdef USE_AS_STRNCPY
	add $VEC_SIZE, %r10
	cmp %r10, %r8
	jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize)

	VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
	VMOVU %YMM2, (%rdi)

/* Source and destination alignments generally differ from here on: the
   source is read with aligned loads, the destination written with
   unaligned stores.  */
	.p2align 4
L(UnalignVecSizeBoth):
	sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
	add %rcx, %r8
	sbb %rcx, %rcx
	or %rcx, %r8
# endif
	mov $VEC_SIZE, %rcx
	VMOVA (%rsi, %rcx), %YMM2
	VMOVU %YMM2, (%rdi, %rcx)
	VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb $0, %YMM2, %YMMZERO, %k0
	kmovd %k0, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 3), %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	VMOVU %YMM2, (%rdi, %rcx)
	VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
	vpcmpb $0, %YMM3, %YMMZERO, %k0
	kmovd %k0, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	VMOVU %YMM3, (%rdi, %rcx)
	VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
	vpcmpb $0, %YMM4, %YMMZERO, %k0
	kmovd %k0, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif

	VMOVU %YMM4, (%rdi, %rcx)
	VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb $0, %YMM2, %YMMZERO, %k0
	kmovd %k0, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	VMOVU %YMM2, (%rdi, %rcx)
	VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb $0, %YMM2, %YMMZERO, %k0
	kmovd %k0, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
	VMOVU %YMM2, (%rdi, %rcx)
	vpcmpb $0, %YMM3, %YMMZERO, %k0
	kmovd %k0, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	VMOVU %YMM3, (%rdi, %rcx)
	mov %rsi, %rdx
	lea VEC_SIZE(%rsi, %rcx), %rsi
	and $-(VEC_SIZE * 4), %rsi
	sub %rsi, %rdx
	sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
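/* Main loop: copy 4 * VEC_SIZE bytes per iteration from a source that is
   now aligned to 4 * VEC_SIZE.  vpminub folds the four vectors into one;
   a zero byte in the result (and hence a set bit in %k7) means at least
   one of the four vectors contains a null byte.  */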
L(UnalignedFourVecSizeLoop):
	VMOVA (%rsi), %YMM4
	VMOVA VEC_SIZE(%rsi), %YMM5
	VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
	VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
	vpminub %YMM5, %YMM4, %YMM2
	vpminub %YMM7, %YMM6, %YMM3
	vpminub %YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte. */
	vpcmpb $0, %YMM2, %YMMZERO, %k7
	kmovd %k7, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add $(VEC_SIZE * 4), %rdi
	add $(VEC_SIZE * 4), %rsi
	VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi)
	VMOVA (%rsi), %YMM4
	VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
	VMOVA VEC_SIZE(%rsi), %YMM5
	vpminub %YMM5, %YMM4, %YMM2
	VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
	VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
	VMOVU %YMM7, -VEC_SIZE(%rdi)
	VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
	vpminub %YMM7, %YMM6, %YMM3
	vpminub %YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte. */
	vpcmpb $0, %YMM2, %YMMZERO, %k7
	kmovd %k7, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jz L(UnalignedFourVecSizeLoop_start)

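/* A null byte was seen somewhere in the last four vectors: find which
   vector holds it, store the vectors that precede it, and finish.  */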
L(UnalignedFourVecSizeLeave):
	vpcmpb $0, %YMM4, %YMMZERO, %k1
	kmovd %k1, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_0)

	vpcmpb $0, %YMM5, %YMMZERO, %k2
	kmovd %k2, %ecx
	test %ecx, %ecx
	jnz L(CopyVecSizeUnaligned_16)

	vpcmpb $0, %YMM6, %YMMZERO, %k3
	kmovd %k3, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_32)

	vpcmpb $0, %YMM7, %YMMZERO, %k4
	kmovd %k4, %ecx
	bsf %ecx, %edx
	VMOVU %YMM4, (%rdi)
	VMOVU %YMM5, VEC_SIZE(%rdi)
	VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 3), %rsi
	add $(VEC_SIZE * 3), %rdi
	jmp L(CopyVecSizeExit)
# endif

/* The source's offset within a 4 * VEC_SIZE block is at most
   2 * VEC_SIZE, so the first two vectors can be read with unaligned
   loads without crossing a page boundary.  */

L(SourceStringAlignmentLessTwoVecSize):
	VMOVU (%rsi), %YMM3
	VMOVU VEC_SIZE(%rsi), %YMM2
	vpcmpb $0, %YMM3, %YMMZERO, %k0
	kmovd %k0, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $VEC_SIZE, %r8
# else
	cmp $(VEC_SIZE + 1), %r8
# endif
	jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail1)

	VMOVU %YMM3, (%rdi)
	vpcmpb $0, %YMM2, %YMMZERO, %k0
	kmovd %k0, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $(VEC_SIZE * 2), %r8
# else
	cmp $((VEC_SIZE * 2) + 1), %r8
# endif
	jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize1)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx
	jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/
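/* Exit-path naming: "Case 1" means a null terminator was found and (for
   the bounded variants) is known to lie within the length limit; "Case 2"
   means a null byte was found in the last vector examined but the limit
   may end first; "Case 3" means the limit runs out before any null byte
   is found.  */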

/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add %rcx, %rdi
# endif
L(CopyVecSizeTail):
	add %rcx, %rsi
L(CopyVecSizeTail1):
	bsf %edx, %edx
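	/* %edx is now the offset of the null byte from the current source
	   position; the tail copy below moves %edx + 1 bytes, including
	   the terminator.  */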
L(CopyVecSizeExit):
	cmp $32, %edx
	jae L(Exit32_63)
	cmp $16, %edx
	jae L(Exit16_31)
	cmp $8, %edx
	jae L(Exit8_15)
	cmp $4, %edx
	jae L(Exit4_7)
	cmp $3, %edx
	je L(Exit3)
	cmp $1, %edx
	ja L(Exit2)
	je L(Exit1)
	movb $0, (%rdi)
# ifdef USE_AS_STPCPY
	lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $1, %r8
	lea 1(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $VEC_SIZE, %r8
# endif
	jmp L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf %edx, %edx
	add %rcx, %rsi
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	jmp L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	VMOVU %YMM4, (%rdi)
	add $((VEC_SIZE * 4) - 1), %r8
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf %ecx, %edx
	VMOVU %YMM4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea VEC_SIZE(%rdi, %rdx), %rax
# endif
	VMOVU %YMM5, VEC_SIZE(%rdi)
	add $((VEC_SIZE * 3) - 1), %r8
	sub %rdx, %r8
	lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf %edx, %edx
	VMOVU %YMM4, (%rdi)
	VMOVU %YMM5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
	add $((VEC_SIZE * 2) - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 2), %rsi
	add $(VEC_SIZE * 2), %rdi
	jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
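/* Helpers used by the bounded copy: store the vector in which the copy
   ends, then jump to the common exit that zero-fills the remainder of
   the destination (and sets the stpncpy return value).  */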
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	VMOVU %YMM6, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	VMOVU %YMM5, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	VMOVU %YMM4, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	VMOVU %YMM3, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)
# endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyTwoVecSizeCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTailCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add $VEC_SIZE, %rdi
	add $VEC_SIZE, %rsi
	sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTail1Case2)
	jmp L(StrncpyExit)
# endif

/*----- Exit labels for copying the final 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes. -----*/

	.p2align 4
L(Exit1):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $2, %r8
	lea 2(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit2):
	movzwl (%rsi), %ecx
	mov %cx, (%rdi)
	movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $3, %r8
	lea 3(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit3):
	mov (%rsi), %edx
	mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $4, %r8
	lea 4(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit4_7):
	mov (%rsi), %ecx
	mov %ecx, (%rdi)
	mov -3(%rsi, %rdx), %ecx
	mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit8_15):
	mov (%rsi), %rcx
	mov -7(%rsi, %rdx), %r9
	mov %rcx, (%rdi)
	mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit16_31):
	VMOVU (%rsi), %XMM2
	VMOVU -15(%rsi, %rdx), %XMM3
	VMOVU %XMM2, (%rdi)
	VMOVU %XMM3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit32_63):
	VMOVU (%rsi), %YMM2
	VMOVU -31(%rsi, %rdx), %YMM3
	VMOVU %YMM2, (%rdi)
	VMOVU %YMM3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	ret

# ifdef USE_AS_STRNCPY

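/* L(StrncpyExitN): the length limit ends before (or at) the null byte;
   copy exactly N = %r8 remaining bytes without a terminator.  Only the
   strncat variant (USE_AS_STRCAT) stores a trailing null here.  */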
	.p2align 4
L(StrncpyExit1):
	movzbl (%rsi), %edx
	mov %dl, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 1(%rdi)
# endif
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 2(%rdi)
# endif
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl (%rsi), %ecx
	movzwl -2(%rsi, %r8), %edx
	mov %cx, (%rdi)
	mov %dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov (%rsi), %ecx
	mov -4(%rsi, %r8), %edx
	mov %ecx, (%rdi)
	mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov (%rsi), %rcx
	mov -8(%rsi, %r8), %rdx
	mov %rcx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit17_32):
	VMOVU (%rsi), %XMM2
	VMOVU -16(%rsi, %r8), %XMM3
	VMOVU %XMM2, (%rdi)
	VMOVU %XMM3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* Copy the first and the last VEC_SIZE bytes of the %r8 (33..64)
	   byte range; the two stores may overlap.  */
	VMOVU (%rsi), %YMM2
	VMOVU -VEC_SIZE(%rsi, %r8), %YMM3
	VMOVU %YMM2, (%rdi)
	VMOVU %YMM3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	VMOVU (%rsi), %YMM2
	VMOVU 32(%rsi), %YMM3
	mov 64(%rsi), %cl
	VMOVU %YMM2, (%rdi)
	VMOVU %YMM3, 32(%rdi)
	mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
	lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 65(%rdi)
# endif
	ret

# ifndef USE_AS_STRCAT

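/* L(FillN): store the remaining %r8 (1..VEC_SIZE) zero bytes at %rdi;
   %rdx is always zero when these labels are reached.  */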
	.p2align 4
L(Fill1):
	mov %dl, (%rdi)
	ret

	.p2align 4
L(Fill2):
	mov %dx, (%rdi)
	ret

	.p2align 4
L(Fill3_4):
	mov %dx, (%rdi)
	mov %dx, -2(%rdi, %r8)
	ret

	.p2align 4
L(Fill5_8):
	mov %edx, (%rdi)
	mov %edx, -4(%rdi, %r8)
	ret

	.p2align 4
L(Fill9_16):
	mov %rdx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
	ret

	.p2align 4
L(Fill17_32):
	VMOVU %XMMZERO, (%rdi)
	VMOVU %XMMZERO, -16(%rdi, %r8)
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	VMOVU %YMM2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf %edx, %edx
	add $(VEC_SIZE - 1), %r8
	add %rcx, %rdi
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi

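/* strncpy must pad the destination with zeros up to the length limit;
   on entry %r8 is the number of bytes still to be zeroed and %rdi points
   just past the copied string.  */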
	.p2align 4
L(StrncpyFillTailWithZero):
	xor %edx, %edx
	sub $VEC_SIZE, %r8
	jbe L(StrncpyFillExit)

	VMOVU %YMMZERO, (%rdi)
	add $VEC_SIZE, %rdi

	mov %rdi, %rsi
	and $(VEC_SIZE - 1), %esi
	sub %rsi, %rdi
	add %rsi, %r8
	sub $(VEC_SIZE * 4), %r8
	jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	VMOVA %YMMZERO, (%rdi)
	VMOVA %YMMZERO, VEC_SIZE(%rdi)
	VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE * 4), %rdi
	sub $(VEC_SIZE * 4), %r8
	jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add $(VEC_SIZE * 2), %r8
	jl L(StrncpyFillLessTwoVecSize)
	VMOVA %YMMZERO, (%rdi)
	VMOVA %YMMZERO, VEC_SIZE(%rdi)
	add $(VEC_SIZE * 2), %rdi
	sub $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	VMOVA %YMMZERO, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	VMOVA %YMMZERO, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add $VEC_SIZE, %r8
L(Fill):
	cmp $17, %r8d
	jae L(Fill17_32)
	cmp $9, %r8d
	jae L(Fill9_16)
	cmp $5, %r8d
	jae L(Fill5_8)
	cmp $3, %r8d
	jae L(Fill3_4)
	cmp $1, %r8d
	ja L(Fill2)
	je L(Fill1)
	ret

/* end of ifndef USE_AS_STRCAT */
# endif

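/* The length limit was reached somewhere inside the last four vectors of
   the main loop; %rdx holds the null-byte mask for those vectors (zero if
   no null byte was seen).  */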
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test %rdx, %rdx
	jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea (VEC_SIZE * 4)(%r8), %rcx
	and $-VEC_SIZE, %rcx
	add $(VEC_SIZE * 3), %r8
	jl L(CopyVecSizeCase3)
	VMOVU %YMM4, (%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	VMOVU %YMM5, VEC_SIZE(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (VEC_SIZE * 4)(%rdi)
# endif
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor %ecx, %ecx
	vpcmpb $0, %YMM4, %YMMZERO, %k1
	kmovd %k1, %edx
	add $(VEC_SIZE * 3), %r8
	jle L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif
	vpcmpb $0, %YMM5, %YMMZERO, %k2
	kmovd %k2, %edx
	VMOVU %YMM4, (%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec5)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpb $0, %YMM6, %YMMZERO, %k3
	kmovd %k3, %edx
	VMOVU %YMM5, VEC_SIZE(%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec6)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpb $0, %YMM7, %YMMZERO, %k4
	kmovd %k4, %edx
	VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
	lea VEC_SIZE(%rdi, %rcx), %rdi
	lea VEC_SIZE(%rsi, %rcx), %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
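/* The limit ends at or before the null byte: copy exactly the remaining
   %r8 bytes.  */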
L(StrncpyExit):
	cmp $65, %r8d
	je L(StrncpyExit65)
	cmp $33, %r8d
	jae L(StrncpyExit33_64)
	cmp $17, %r8d
	jae L(StrncpyExit17_32)
	cmp $9, %r8d
	jae L(StrncpyExit9_16)
	cmp $5, %r8d
	jae L(StrncpyExit5_8)
	cmp $3, %r8d
	jae L(StrncpyExit3_4)
	cmp $1, %r8d
	ja L(StrncpyExit2)
	je L(StrncpyExit1)
# ifdef USE_AS_STPCPY
	mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi)
# endif
	ret

	.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
	mov %rdi, %rax
# endif
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif