/* strcpy with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
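
/* A short descriptive summary of the implementation (the code below is
   authoritative):

   The terminating null byte is located with VPCMPEQB against an all-zero
   YMM register (ymmZ) followed by VPMOVMSKB, which yields one mask bit
   per byte.

   1. The first one or two vectors are read with loads that cannot cross
      a page boundary: either the source offset within a (VEC_SIZE * 4)
      block is small enough for two unaligned loads, or the source is
      aligned down to VEC_SIZE and the null mask is shifted to skip the
      bytes read before the start of the string.
   2. Single vectors are then copied (aligned loads, unaligned stores)
      until the source reaches (VEC_SIZE * 4) alignment.
   3. The main loop reads four aligned vectors per iteration and folds
      them with VPMINUB, so a single VPCMPEQB/VPMOVMSKB tests all
      (VEC_SIZE * 4) bytes for a null byte; null-free blocks are written
      with unaligned stores.
   4. Once a null byte is found, its vector and byte index are computed
      and the tail, terminator included, is copied with possibly
      overlapping loads and stores.

   Roughly, the main loop corresponds to this sketch (the real code
   software-pipelines the stores):

     while (1)
       {
         load ymm4..ymm7 from the (VEC_SIZE * 4)-aligned source;
         min = vpminub (vpminub (ymm4, ymm5), vpminub (ymm6, ymm7));
         if (vpmovmskb (vpcmpeqb (min, zero)) != 0)
           break;       <- a null byte is somewhere in this block
         store ymm4..ymm7 to the destination with unaligned stores;
         advance source and destination by VEC_SIZE * 4;
       }

   For strncpy (USE_AS_STRNCPY) the remaining length is tracked in %r8.
   If the limit is reached before a null byte, exactly that many bytes
   are copied and no terminator is added (only the strncat variant
   appends one); if the null byte comes first, the remainder of the
   n-byte destination is zero-filled.  For stpcpy (USE_AS_STPCPY) %rax
   is set to the end of the copied string rather than to its start.
   When USE_AS_STRCAT is defined this file only supplies the copy code;
   the including strcat/strncat implementation provides the entry point
   and is expected to point %rdi at the destination's terminating null
   byte first.  */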

#if IS_IN (libc)

# ifndef USE_AS_STRCAT
# include <sysdep.h>

# ifndef STRCPY
# define STRCPY __strcpy_avx2
# endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
# define VEC_SIZE 32
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0

/* mask register */
#define ymmM ymm1
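
/* Rough register roles in the code below:
     %rsi  source pointer
     %rdi  destination pointer
     %rax  return value (for stpcpy the end of the copied string)
     %rcx  source offset within its aligned block / running copy offset
     %rdx  VPMOVMSKB null-byte mask, then the index of the null byte
           (also scratch in the small exit and fill paths)
     %r8   remaining byte count (strncpy/strncat only)
     %r10  scratch for the first-vector length check (strncpy only)  */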

# ifndef USE_AS_STRCAT

	.section .text.avx,"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
	mov %RDX_LP, %R8_LP
	test %R8_LP, %R8_LP
	jz L(ExitZero)
# endif
	mov %rsi, %rcx
# ifndef USE_AS_STPCPY
	mov %rdi, %rax /* save result */
# endif

# endif

	vpxor %xmmZ, %xmmZ, %xmmZ

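/* If the source offset within a (VEC_SIZE * 4) block is at most
   VEC_SIZE * 2, the first two unaligned VEC_SIZE-byte loads stay inside
   that block and cannot cross a page boundary; handle them at
   L(SourceStringAlignmentLessTwoVecSize).  Otherwise align the source
   down to VEC_SIZE and use aligned loads.  */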
	and $((VEC_SIZE * 4) - 1), %ecx
	cmp $(VEC_SIZE * 2), %ecx
	jbe L(SourceStringAlignmentLessTwoVecSize)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx

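/* The aligned load below also reads the bytes in front of the string;
   shift the null-byte mask right by the source offset (%cl) to discard
   them.  */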
	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr %cl, %rdx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov $VEC_SIZE, %r10
	sub %rcx, %r10
	cmp %r10, %r8
# else
	mov $(VEC_SIZE + 1), %r10
	sub %rcx, %r10
	cmp %r10, %r8
# endif
	jbe L(CopyVecSizeTailCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add $VEC_SIZE, %r10
	cmp %r10, %r8
	jbe L(CopyTwoVecSizeCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* Copy one vector at a time (aligned loads from the source, unaligned
   stores to the destination, whose alignment may differ) until the
   source reaches (VEC_SIZE * 4) alignment.  */
	.p2align 4
L(UnalignVecSizeBoth):
	sub %rcx, %rdi
# ifdef USE_AS_STRNCPY
	add %rcx, %r8
	sbb %rcx, %rcx
	or %rcx, %r8
# endif
	mov $VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 3), %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec2)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
# endif
	test %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec3)
# else
	jnz L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov %rsi, %rdx
	lea VEC_SIZE(%rsi, %rcx), %rsi
	and $-(VEC_SIZE * 4), %rsi
	sub %rsi, %rdx
	sub %rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
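
/* Main loop: process (VEC_SIZE * 4) source-aligned bytes per iteration.
   VPMINUB folds the four vectors so that a byte of the result is zero
   only if one of the vectors has a zero byte at that position; a single
   VPCMPEQB/VPMOVMSKB then tests the whole block.  %ymmM is known to be
   all zero here and serves as the zero operand.  The stores are
   software-pipelined: each iteration stores the vectors loaded by the
   previous one.  */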
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jnz L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add $(VEC_SIZE * 4), %rdi
	add $(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub $(VEC_SIZE * 4), %r8
	jbe L(UnalignedLeaveCase2OrCase3)
# endif
	test %edx, %edx
	jz L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test %ecx, %ecx
	jnz L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test %edx, %edx
	jnz L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 3), %rsi
	add $(VEC_SIZE * 3), %rdi
	jmp L(CopyVecSizeExit)
# endif

/* The source offset within a (VEC_SIZE * 4) block is at most
   VEC_SIZE * 2, so the first two unaligned loads cannot cross a page
   boundary.  */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $VEC_SIZE, %r8
# else
	cmp $(VEC_SIZE + 1), %r8
# endif
	jbe L(CopyVecSizeTail1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp $(VEC_SIZE * 2), %r8
# else
	cmp $((VEC_SIZE * 2) + 1), %r8
# endif
	jbe L(CopyTwoVecSize1Case2OrCase3)
# endif
	test %edx, %edx
	jnz L(CopyTwoVecSize1)

	and $-VEC_SIZE, %rsi
	and $(VEC_SIZE - 1), %ecx
	jmp L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

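/* The Case labels below distinguish how a chunk of the source ends:
     Case1: a null byte was found (and, for strncpy, the length limit
            has not been reached); copy up to and including the null.
     Case2: strncpy only - a null byte was found, but the length limit
            also falls within the current chunk; copy up to whichever
            comes first.
     Case3: strncpy only - the length limit is reached without seeing a
            null byte; copy exactly the remaining bytes.  */
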
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add %rcx, %rdi
# endif
L(CopyVecSizeTail):
	add %rcx, %rsi
L(CopyVecSizeTail1):
	bsf %edx, %edx
L(CopyVecSizeExit):
	cmp $32, %edx
	jae L(Exit32_63)
	cmp $16, %edx
	jae L(Exit16_31)
	cmp $8, %edx
	jae L(Exit8_15)
	cmp $4, %edx
	jae L(Exit4_7)
	cmp $3, %edx
	je L(Exit3)
	cmp $1, %edx
	ja L(Exit2)
	je L(Exit1)
	movb $0, (%rdi)
# ifdef USE_AS_STPCPY
	lea (%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $1, %r8
	lea 1(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $VEC_SIZE, %r8
# endif
	jmp L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf %edx, %edx
	add %rcx, %rsi
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	jmp L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf %edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	vmovdqu %ymm4, (%rdi)
	add $((VEC_SIZE * 4) - 1), %r8
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf %ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea VEC_SIZE(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $((VEC_SIZE * 3) - 1), %r8
	sub %rdx, %r8
	lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $VEC_SIZE, %rsi
	add $VEC_SIZE, %rdi
	jmp L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf %edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add $((VEC_SIZE * 2) - 1), %r8
	sub %rdx, %r8
	lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp L(StrncpyFillTailWithZero)
# else
	add $(VEC_SIZE * 2), %rsi
	add $(VEC_SIZE * 2), %rdi
	jmp L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
# ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp L(CopyVecSizeVecExit)
# endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	add $VEC_SIZE, %edx
	sub %ecx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add %rcx, %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
	jmp L(StrncpyExit)

/* Case2 or Case3, Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add $VEC_SIZE, %r8
	add %rcx, %rdi
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyTwoVecSizeCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTailCase2)
	add %rcx, %rsi
	jmp L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add $VEC_SIZE, %rdi
	add $VEC_SIZE, %rsi
	sub $VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test %rdx, %rdx
	jnz L(CopyVecSizeTail1Case2)
	jmp L(StrncpyExit)
# endif

/*------------End of labels for copying 1-VEC_SIZE bytes and 1-(VEC_SIZE * 2) bytes----*/

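/* L(Exit1) .. L(Exit32_63) finish the copy once the null byte has been
   located: %rdx holds its index relative to the current %rsi/%rdi and
   %rdx + 1 bytes (terminator included) are copied, the larger sizes via
   a pair of possibly overlapping loads and stores.  For strncpy the
   code then branches to L(StrncpyFillTailWithZero) if any of the n
   bytes remain to be cleared.  */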
	.p2align 4
L(Exit1):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $2, %r8
	lea 2(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit2):
	movzwl (%rsi), %ecx
	mov %cx, (%rdi)
	movb $0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $3, %r8
	lea 3(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit3):
	mov (%rsi), %edx
	mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub $4, %r8
	lea 4(%rdi), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit4_7):
	mov (%rsi), %ecx
	mov %ecx, (%rdi)
	mov -3(%rsi, %rdx), %ecx
	mov %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit8_15):
	mov (%rsi), %rcx
	mov -7(%rsi, %rdx), %r9
	mov %rcx, (%rdi)
	mov %r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub %rdx, %r8
	sub $1, %r8
	lea 1(%rdi, %rdx), %rdi
	jnz L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_STRNCPY

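/* L(StrncpyExit1) .. L(StrncpyExit65) are used when the length limit is
   reached before a null byte: exactly %r8 remaining bytes are copied
   and no terminating null is added (except for strncat, which must
   terminate the result).  */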
	.p2align 4
L(StrncpyExit1):
	movzbl (%rsi), %edx
	mov %dl, (%rdi)
# ifdef USE_AS_STPCPY
	lea 1(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 1(%rdi)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl (%rsi), %edx
	mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea 2(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 2(%rdi)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl (%rsi), %ecx
	movzwl -2(%rsi, %r8), %edx
	mov %cx, (%rdi)
	mov %dx, -2(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov (%rsi), %ecx
	mov -4(%rsi, %r8), %edx
	mov %ecx, (%rdi)
	mov %edx, -4(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov (%rsi), %rcx
	mov -8(%rsi, %r8), %rdx
	mov %rcx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* Copy the first VEC_SIZE bytes and the last VEC_SIZE bytes; the
	   two stores may overlap.  */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
	lea (%rdi, %r8), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi, %r8)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov 64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
	lea 65(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, 65(%rdi)
# endif
	VZEROUPPER
	ret

# ifndef USE_AS_STRCAT

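/* L(Fill1) .. L(Fill17_32) write the remaining %r8 (at most VEC_SIZE)
   zero bytes of the destination; %rdx and %xmmZ are zero at this
   point.  */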
	.p2align 4
L(Fill1):
	mov %dl, (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(Fill2):
	mov %dx, (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(Fill3_4):
	mov %dx, (%rdi)
	mov %dx, -2(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill5_8):
	mov %edx, (%rdi)
	mov %edx, -4(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill9_16):
	mov %rdx, (%rdi)
	mov %rdx, -8(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf %edx, %edx
	add $(VEC_SIZE - 1), %r8
	add %rcx, %rdi
# ifdef USE_AS_STPCPY
	lea (%rdi, %rdx), %rax
# endif
	sub %rdx, %r8
	lea 1(%rdi, %rdx), %rdi

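/* Zero-fill the rest of the n-byte destination after the copied string:
   write one unaligned zero vector, align %rdi, clear (VEC_SIZE * 4)
   bytes per iteration with aligned stores, then handle the remainder
   via L(Fill).  %r8 holds the number of bytes still to clear.  */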
	.p2align 4
L(StrncpyFillTailWithZero):
	xor %edx, %edx
	sub $VEC_SIZE, %r8
	jbe L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi

	mov %rdi, %rsi
	and $(VEC_SIZE - 1), %esi
	sub %rsi, %rdi
	add %rsi, %r8
	sub $(VEC_SIZE * 4), %r8
	jb L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add $(VEC_SIZE * 4), %rdi
	sub $(VEC_SIZE * 4), %r8
	jae L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add $(VEC_SIZE * 2), %r8
	jl L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add $(VEC_SIZE * 2), %rdi
	sub $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add $VEC_SIZE, %r8
	jl L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add $VEC_SIZE, %rdi
	jmp L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add $VEC_SIZE, %r8
L(Fill):
	cmp $17, %r8d
	jae L(Fill17_32)
	cmp $9, %r8d
	jae L(Fill9_16)
	cmp $5, %r8d
	jae L(Fill5_8)
	cmp $3, %r8d
	jae L(Fill3_4)
	cmp $1, %r8d
	ja L(Fill2)
	je L(Fill1)
	VZEROUPPER
	ret

/* end of ifndef USE_AS_STRCAT */
# endif

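/* The length limit expires within the current (VEC_SIZE * 4) block of
   the main loop.  Case3: no null byte was seen; store as many whole
   vectors as still fit and copy any remaining bytes via L(StrncpyExit).
   Case2: locate the vector that contains the null byte and dispatch to
   the matching copy/exit path.  */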
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test %rdx, %rdx
	jnz L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea (VEC_SIZE * 4)(%r8), %rcx
	and $-VEC_SIZE, %rcx
	add $(VEC_SIZE * 3), %r8
	jl L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub $VEC_SIZE, %r8
	jb L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
	lea (VEC_SIZE * 4)(%rdi), %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (VEC_SIZE * 4)(%rdi)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor %ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add $(VEC_SIZE * 3), %r8
	jle L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec4)
# else
	jnz L(CopyVecSize)
# endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec5)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add $VEC_SIZE, %rcx
	sub $VEC_SIZE, %r8
	jbe L(CopyVecSizeCase2OrCase3)
	test %edx, %edx
# ifndef USE_AS_STRCAT
	jnz L(CopyVecSizeUnalignedVec6)
# else
	jnz L(CopyVecSize)
# endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea VEC_SIZE(%rdi, %rcx), %rdi
	lea VEC_SIZE(%rsi, %rcx), %rsi
	bsf %edx, %edx
	cmp %r8d, %edx
	jb L(CopyVecSizeExit)
L(StrncpyExit):
	cmp $65, %r8d
	je L(StrncpyExit65)
	cmp $33, %r8d
	jae L(StrncpyExit33_64)
	cmp $17, %r8d
	jae L(StrncpyExit17_32)
	cmp $9, %r8d
	jae L(StrncpyExit9_16)
	cmp $5, %r8d
	jae L(StrncpyExit5_8)
	cmp $3, %r8d
	jae L(StrncpyExit3_4)
	cmp $1, %r8d
	ja L(StrncpyExit2)
	je L(StrncpyExit1)
# ifdef USE_AS_STPCPY
	mov %rdi, %rax
# endif
# ifdef USE_AS_STRCAT
	movb $0, (%rdi)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
	mov %rdi, %rax
# endif
	VZEROUPPER
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif