/* strcpy with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)


# ifndef USE_AS_STRCAT
# include <sysdep.h>

# ifndef STRCPY
# define STRCPY __strcpy_avx2
# endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0

/* mask register */
#define ymmM ymm1
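
/* Overview:

   RDI holds the destination, RSI the source and, for the strncpy-style
   flavours (USE_AS_STRNCPY), RDX the maximum byte count, which is saved in
   R8.  ymmZ (ymm0) is kept at zero and compared byte-wise against source
   data with vpcmpeqb; vpmovmskb then yields a bit mask whose lowest set bit
   marks the terminating null byte.  The USE_AS_STPCPY / USE_AS_STRNCPY /
   USE_AS_STRCAT macros select the stpcpy, strncpy and strcat flavours that
   are built from this same file.

   For reference only, a scalar sketch of the basic semantics implemented
   here (illustrative, not part of the build):

	char *strcpy (char *dst, const char *src)
	{
	  char *ret = dst;	// stpcpy instead returns the final dst
	  while ((*dst++ = *src++) != 0)
	    ;
	  return ret;
	}

   The strncpy flavour additionally stops after N bytes and zero-fills any
   remaining destination bytes; the strcat flavour reuses this code for the
   copy after the end of the existing destination string has been found.  */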

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
	mov %RDX_LP, %R8_LP
	test %R8_LP, %R8_LP
	jz L(ExitZero)
# endif
	mov %rsi, %rcx
# ifndef USE_AS_STPCPY
	mov %rdi, %rax	/* save result */
# endif

# endif

	vpxor %xmmZ, %xmmZ, %xmmZ

	and $((VEC_SIZE * 4) - 1), %ecx
	cmp $(VEC_SIZE * 2), %ecx
	jbe L(SourceStringAlignmentLessTwoVecSize)
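
/* The source starts more than (VEC_SIZE * 2) bytes into its (VEC_SIZE * 4)
   block: round RSI down to a VEC_SIZE boundary, keep the misalignment in
   ECX, and shift the null-byte mask right by CL so that bytes before the
   real start of the string are ignored.  */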

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx

	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr %cl, %rdx
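
/* For strncpy, R10 is the number of source bytes the first aligned vector
   can supply past the misalignment (VEC_SIZE - RCX, or one more for plain
   strncpy); if the byte limit in R8 is not larger, handle everything in the
   bounded tail path (Case 2 or Case 3).  */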

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov $VEC_SIZE, %r10
	sub %rcx, %r10
	cmp %r10, %r8
# else
	mov $(VEC_SIZE + 1), %r10
	sub %rcx, %r10
	cmp %r10, %r8
# endif
	jbe L(CopyVecSizeTailCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add $VEC_SIZE, %r10
	cmp %r10, %r8
	jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize)
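
/* No null byte in the first two vectors: copy the first VEC_SIZE bytes from
   the original (unaligned) source position, then continue below with
   VEC_SIZE-aligned loads.  */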

	vmovdqu (%rsi, %rcx), %ymm2	/* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* The source is now VEC_SIZE-aligned while the destination may not be:
   rebase RDI, copy one vector at a time with aligned loads and unaligned
   stores, and check each vector for a null byte until RSI can be advanced
   to a (VEC_SIZE * 4) boundary for the main loop.  For strncpy the
   add/sbb/or sequence re-adds the alignment adjustment to the byte budget
   in R8, saturating it on overflow.  */
	.p2align 4
L(UnalignVecSizeBoth):
	sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
	add %rcx, %r8
	sbb %rcx, %rcx
	or %rcx, %r8
# endif
	mov $VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 3), %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov %rsi, %rdx
	lea VEC_SIZE(%rsi, %rcx), %rsi
	and $-(VEC_SIZE * 4), %rsi
	sub %rsi, %rdx
	sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
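
/* Main loop: RSI is now (VEC_SIZE * 4)-aligned.  Each iteration loads four
   aligned vectors, folds them with vpminub so that a single compare against
   zero detects a null byte in any of the four, and stores the previous
   iteration's four vectors with unaligned stores.  ymmM is all-zero on
   entry here (the last null check found no match), so comparing against it
   is a comparison with zero.  For strncpy, R8 is decremented by
   VEC_SIZE * 4 per iteration and the Case 2/3 path is taken when it is
   exhausted.  */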
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add $(VEC_SIZE * 4), %rdi
	add $(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jz L(UnalignedFourVecSizeLoop_start)
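
/* A null byte lies somewhere in the four vectors just examined (ymm4..ymm7,
   which have not been stored yet).  Test each vector in turn to find the
   one that holds it, store the preceding full vectors, and finish through
   the common exit code.  */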

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test %ecx, %ecx
	jnz L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 3), %rsi
	add $(VEC_SIZE * 3), %rdi
	jmp L(CopyVecSizeExit)
# endif

/* The source starts no more than (VEC_SIZE * 2) bytes into its
   (VEC_SIZE * 4) block, so the first two vectors can be loaded unaligned
   without crossing into the next block (and hence without risking a fault
   past the end of the string's page).  */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $VEC_SIZE, %r8
# else
	cmp $(VEC_SIZE + 1), %r8
# endif
	jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $(VEC_SIZE * 2), %r8
# else
	cmp $((VEC_SIZE * 2) + 1), %r8
# endif
	jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize1)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx
	jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/
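
/* Terminology for the strncpy labels below:
   Case 1 - the null terminator lies within the length limit, so the copy
            ends at the terminator.
   Case 2 - both the terminator and the length limit fall inside the current
            vector, so the two positions must be compared.
   Case 3 - only the length limit falls inside the current vector (no null
            byte seen), so exactly the remaining R8 bytes are copied.  */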

/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add %rcx, %rdi
# endif
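
/* Fall-through chain: L(CopyVecSize) rebases RDI, L(CopyVecSizeTail)
   rebases RSI, L(CopyVecSizeTail1) extracts the index of the null byte into
   EDX, and L(CopyVecSizeExit) dispatches to a size-bucketed exit that
   copies the final EDX + 1 bytes (terminator included).  */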
L(CopyVecSizeTail):
	add %rcx, %rsi
L(CopyVecSizeTail1):
	bsf %edx, %edx
L(CopyVecSizeExit):
	cmp $32, %edx
	jae L(Exit32_63)
	cmp $16, %edx
	jae L(Exit16_31)
	cmp $8, %edx
	jae L(Exit8_15)
	cmp $4, %edx
	jae L(Exit4_7)
	cmp $3, %edx
	je L(Exit3)
	cmp $1, %edx
	ja L(Exit2)
	je L(Exit1)
	movb $0, (%rdi)
# ifdef USE_AS_STPCPY
	lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $1, %r8
	lea 1(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $VEC_SIZE, %r8
# endif
	jmp L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf %edx, %edx
	add %rcx, %rsi
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	jmp L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	vmovdqu %ymm4, (%rdi)
	add $((VEC_SIZE * 4) - 1), %r8
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea VEC_SIZE(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $((VEC_SIZE * 3) - 1), %r8
	sub %rdx, %r8
	lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf %edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add $((VEC_SIZE * 2) - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 2), %rsi
	add $(VEC_SIZE * 2), %rdi
	jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)
# endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyTwoVecSizeCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTailCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add $VEC_SIZE, %rdi
	add $VEC_SIZE, %rsi
	sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTail1Case2)
	jmp L(StrncpyExit)
# endif

/*----------- End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes -----------*/
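
/* The L(Exit*) blocks below are reached with EDX holding the index of the
   null byte.  Each one copies EDX + 1 bytes with two (possibly overlapping)
   loads and stores, sets RAX to the address of the terminating null for
   stpcpy, and for strncpy branches to the zero-fill code when bytes of the
   limit remain.  */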

	.p2align 4
L(Exit1):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $2, %r8
	lea 2(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl (%rsi), %ecx
	mov %cx, (%rdi)
	movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $3, %r8
	lea 3(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov (%rsi), %edx
	mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $4, %r8
	lea 4(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit4_7):
	mov (%rsi), %ecx
	mov %ecx, (%rdi)
	mov -3(%rsi, %rdx), %ecx
	mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov (%rsi), %rcx
	mov -7(%rsi, %rdx), %r9
	mov %rcx, (%rdi)
	mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY
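
/* The L(StrncpyExit*) blocks are reached when the length limit runs out
   before a null byte is found: exactly R8 bytes are copied (again with
   overlapping loads and stores for the larger sizes) and no terminator is
   written, except for strcat, which always null-terminates.  */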

	.p2align 4
L(StrncpyExit1):
	movzbl (%rsi), %edx
	mov %dl, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 1(%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 2(%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl (%rsi), %ecx
	movzwl -2(%rsi, %r8), %edx
	mov %cx, (%rdi)
	mov %dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov (%rsi), %ecx
	mov -4(%rsi, %r8), %edx
	mov %ecx, (%rdi)
	mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov (%rsi), %rcx
	mov -8(%rsi, %r8), %rdx
	mov %rcx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/* Two overlapping VEC_SIZE-byte copies: bytes [0, VEC_SIZE) and
	   bytes [R8 - VEC_SIZE, R8).  */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov 64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
	lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 65(%rdi)
# endif
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCAT
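
/* strncpy must pad the destination with zero bytes up to the length limit.
   L(Fill1)..L(Fill17_32) write the last 1..32 padding bytes from the zeroed
   registers; L(StrncpyFillTailWithZero) handles larger amounts by aligning
   RDI and storing four zero vectors per iteration, then finishing through
   L(Fill).  */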

	.p2align 4
L(Fill1):
	mov %dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov %dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov %dx, (%rdi)
	mov %dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov %edx, (%rdi)
	mov %edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov %rdx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf %edx, %edx
	add $(VEC_SIZE - 1), %r8
	add %rcx, %rdi
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi

	.p2align 4
L(StrncpyFillTailWithZero):
	xor %edx, %edx
	sub $VEC_SIZE, %r8
	jbe L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi

	mov %rdi, %rsi
	and $(VEC_SIZE - 1), %esi
	sub %rsi, %rdi
	add %rsi, %r8
	sub $(VEC_SIZE * 4), %r8
	jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE * 4), %rdi
	sub $(VEC_SIZE * 4), %r8
	jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add $(VEC_SIZE * 2), %r8
	jl L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add $(VEC_SIZE * 2), %rdi
	sub $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add $VEC_SIZE, %r8
L(Fill):
	cmp $17, %r8d
	jae L(Fill17_32)
	cmp $9, %r8d
	jae L(Fill9_16)
	cmp $5, %r8d
	jae L(Fill5_8)
	cmp $3, %r8d
	jae L(Fill3_4)
	cmp $1, %r8d
	ja L(Fill2)
	je L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
# endif
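
/* Reached from the main four-vector loop when the strncpy length limit
   expires.  Case 3 (no null byte in the four vectors) stores whole vectors
   while budget remains and then copies the rest; Case 2 locates the null
   byte vector by vector and compares its position with the remaining
   count.  */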

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test %rdx, %rdx
	jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea (VEC_SIZE * 4)(%r8), %rcx
	and $-VEC_SIZE, %rcx
	add $(VEC_SIZE * 3), %r8
	jl L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (VEC_SIZE * 4)(%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor %ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $(VEC_SIZE * 3), %r8
	jle L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec5)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec6)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea VEC_SIZE(%rdi, %rcx), %rdi
	lea VEC_SIZE(%rsi, %rcx), %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
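
/* Copy exactly the remaining R8 bytes of the source (the limit was reached
   before the terminator); the dispatch below handles counts of up to
   VEC_SIZE * 2 + 1 (65) bytes.  */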
L(StrncpyExit):
	cmp $65, %r8d
	je L(StrncpyExit65)
	cmp $33, %r8d
	jae L(StrncpyExit33_64)
	cmp $17, %r8d
	jae L(StrncpyExit17_32)
	cmp $9, %r8d
	jae L(StrncpyExit9_16)
	cmp $5, %r8d
	jae L(StrncpyExit5_8)
	cmp $3, %r8d
	jae L(StrncpyExit3_4)
	cmp $1, %r8d
	ja L(StrncpyExit2)
	je L(StrncpyExit1)
# ifdef USE_AS_STPCPY
	mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
	mov %rdi, %rax
# endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif