/* strcpy with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY	__strcpy_evex
#  endif

# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# define XMM2	xmm18
# define XMM3	xmm19

# define YMM2	ymm18
# define YMM3	ymm19
# define YMM4	ymm20
# define YMM5	ymm21
# define YMM6	ymm22
# define YMM7	ymm23

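/* All of the register macros above map onto EVEX-only registers
   (%xmm16+/%ymm16+); presumably this keeps %ymm0-%ymm15 untouched so
   the function can return without a VZEROUPPER.  */
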
# ifndef USE_AS_STRCAT

/* zero register */
#  define XMMZERO	xmm16
#  define YMMZERO	ymm16
#  define YMM1	ymm17

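/* In the plain strcpy/stpcpy builds %rdi is the destination and %rsi
   the source; the n-variants also receive the length limit in %rdx,
   which is copied to %r8 below.  The code scans the source one vector
   (later four vectors) at a time, comparing against YMMZERO with
   vpcmpb to locate the null terminator, and copies whole vectors to
   the destination until the terminator (or, in the n-variants, the
   length limit) is reached.  */
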
	.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax	/* save result */
#  endif

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# endif

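/* %ecx = source offset within a (VEC_SIZE * 4)-byte block.  When that
   offset is at most VEC_SIZE * 2, the two unaligned VEC_SIZE loads on
   the L(SourceStringAlignmentLessTwoVecSize) path appear to stay
   within that block (and therefore within the page), so that path is
   taken; otherwise the source is first rounded down to a VEC_SIZE
   boundary.  */
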
	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpb	$0, (%rsi), %YMMZERO, %k0
	kmovd	%k0, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
	kmovd	%k1, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	VMOVU	(%rsi, %rcx), %YMM2	/* copy VEC_SIZE bytes */
	VMOVU	%YMM2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	VMOVA	(%rsi, %rcx), %YMM2
	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
	vpcmpb	$0, %YMM4, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM4, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	VMOVU	%YMM2, (%rdi, %rcx)
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
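/* Main loop: copy four aligned vectors per iteration.  The three
   vpminub steps fold the four vectors into one, so a single vpcmpb
   against zero tells whether any of the VEC_SIZE * 4 bytes is the
   null terminator.  */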
L(UnalignedFourVecSizeLoop):
	VMOVA	(%rsi), %YMM4
	VMOVA	VEC_SIZE(%rsi), %YMM5
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM5, %YMM4, %YMM2
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
	VMOVA	(%rsi), %YMM4
	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
	VMOVA	VEC_SIZE(%rsi), %YMM5
	vpminub	%YMM5, %YMM4, %YMM2
	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVU	%YMM7, -VEC_SIZE(%rdi)
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %ecx
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	VMOVU	(%rsi), %YMM3
	VMOVU	VEC_SIZE(%rsi), %YMM2
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	VMOVU	%YMM3, (%rdi)
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

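/* Exit-path naming used below (as far as can be told from the checks):
   Case1 - a null byte was found within the length limit; Case2 - a
   null byte and the length limit both fall inside the vector just
   examined, so the two offsets must be compared; Case3 - the length
   limit was reached without seeing a null byte.  */
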
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	VMOVU	%YMM6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	VMOVU	%YMM5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	VMOVU	%YMM4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	VMOVU	%YMM3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes------------*/

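/* L(Exit1)-L(Exit32_63): %rdx is the offset of the null byte, so
   %rdx + 1 bytes (terminator included) are stored; the ranged exits
   use two overlapping loads/stores to cover the exact length.  The
   strncpy-type builds then continue to zero-fill the remaining %r8
   bytes if any are left.  */
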
	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit16_31):
	VMOVU	(%rsi), %XMM2
	VMOVU	-15(%rsi, %rdx), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit32_63):
	VMOVU	(%rsi), %YMM2
	VMOVU	-31(%rsi, %rdx), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

# ifdef USE_AS_STRNCPY

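/* L(StrncpyExit<n>): the length limit was reached before the null
   byte; exactly %r8 bytes are copied and, following strncpy
   semantics, no terminator is appended (only the strcat build stores
   one).  */
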
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit17_32):
	VMOVU	(%rsi), %XMM2
	VMOVU	-16(%rsi, %r8), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* 0/32, 31/16 */
	VMOVU	(%rsi), %YMM2
	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	VMOVU	(%rsi), %YMM2
	VMOVU	32(%rsi), %YMM3
	mov	64(%rsi), %cl
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	ret

#  ifndef USE_AS_STRCAT

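/* L(Fill1)-L(Fill17_32): store %r8 zero bytes at %rdi (%rdx is zero
   when these are reached), padding the rest of an strncpy
   destination.  */
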
	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	ret

	.p2align 4
L(Fill17_32):
	VMOVU	%XMMZERO, (%rdi)
	VMOVU	%XMMZERO, -16(%rdi, %r8)
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	VMOVU	%YMM2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

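/* Zero-fill the remaining %r8 bytes of the destination: one unaligned
   vector store, then align %rdi and clear four vectors per iteration,
   finishing off the tail through L(Fill).  */
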
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	VMOVU	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	ret

/* end of ifndef USE_AS_STRCAT */
#  endif

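/* Reached from the four-vector loop when the length limit falls
   within the four vectors just read: Case2 if a null byte was seen as
   well (%rdx != 0), Case3 otherwise.  */
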
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	VMOVU	%YMM4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %edx
	VMOVU	%YMM4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %edx
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	ret

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif