1 | /* strcpy with 256-bit EVEX instructions. |
2 | Copyright (C) 2021-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
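/* strcpy (and the stpcpy/strncpy/strcat variants selected by the
   USE_AS_* macros below) is implemented as:
   1.  Compare VEC_SIZE source bytes against a zero vector with vpcmpb;
       the resulting mask marks the terminating null byte, if any.
   2.  While no null byte is found, store full vectors to the
       destination, switching to aligned loads once the source has been
       aligned; the main loop handles VEC_SIZE * 4 bytes per iteration
       and folds the four vectors with vpminub so one compare covers
       all of them.
   3.  Once the null byte (or, for strncpy, the end of the count) is
       reached, copy the tail with the smallest suitable exit and, for
       strncpy, zero-fill the rest of the destination.  */
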
19 | #if IS_IN (libc) |
20 | |
21 | # ifndef USE_AS_STRCAT |
22 | # include <sysdep.h> |
23 | |
24 | # ifndef STRCPY |
25 | # define STRCPY __strcpy_evex |
26 | # endif |
27 | |
28 | # endif |
29 | |
30 | # define VMOVU vmovdqu64 |
31 | # define VMOVA vmovdqa64 |
32 | |
33 | /* Number of bytes in a vector register */ |
34 | # ifndef VEC_SIZE |
35 | # define VEC_SIZE 32 |
36 | # endif |
37 | |
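/* All vector registers used here are in the EVEX-only range
   (%xmm16-%xmm31/%ymm16-%ymm31), so no VZEROUPPER is needed before
   returning.  */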
38 | # define XMM2 xmm18 |
39 | # define XMM3 xmm19 |
40 | |
41 | # define YMM2 ymm18 |
42 | # define YMM3 ymm19 |
43 | # define YMM4 ymm20 |
44 | # define YMM5 ymm21 |
45 | # define YMM6 ymm22 |
46 | # define YMM7 ymm23 |
47 | |
48 | # ifndef USE_AS_STRCAT |
49 | |
/* Zero register, used both to locate the terminating null byte and to
   zero-fill for strncpy.  */
51 | # define XMMZERO xmm16 |
52 | # define YMMZERO ymm16 |
53 | # define YMM1 ymm17 |
54 | |
	.section .text.evex,"ax", @progbits
56 | ENTRY (STRCPY) |
57 | # ifdef USE_AS_STRNCPY |
58 | mov %RDX_LP, %R8_LP |
59 | test %R8_LP, %R8_LP |
60 | jz L(ExitZero) |
61 | # endif |
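	/* %rcx = source pointer; its low bits give the source alignment
	   used below.  */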
62 | mov %rsi, %rcx |
63 | # ifndef USE_AS_STPCPY |
64 | mov %rdi, %rax /* save result */ |
65 | # endif |
66 | |
67 | vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
68 | # endif |
69 | |
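	/* %ecx = source offset within a (VEC_SIZE * 4)-aligned block.  If
	   it is at most VEC_SIZE * 2, the first two VEC_SIZE loads stay
	   inside that block and cannot cross a page boundary, so they can
	   be done unaligned.  */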
70 | and $((VEC_SIZE * 4) - 1), %ecx |
71 | cmp $(VEC_SIZE * 2), %ecx |
72 | jbe L(SourceStringAlignmentLessTwoVecSize) |
73 | |
74 | and $-VEC_SIZE, %rsi |
75 | and $(VEC_SIZE - 1), %ecx |
76 | |
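	/* The source is now aligned down to VEC_SIZE.  Check the first
	   aligned vector for a null byte and shift out the mask bits that
	   belong to bytes before the original source start.  */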
77 | vpcmpb $0, (%rsi), %YMMZERO, %k0 |
78 | kmovd %k0, %edx |
79 | shr %cl, %rdx |
80 | |
81 | # ifdef USE_AS_STRNCPY |
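	/* %r10 = bytes left in the first vector starting from the original
	   source (plus one for the null terminator for plain strncpy).  If
	   the count does not reach beyond that, use the short-copy path.  */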
82 | # if defined USE_AS_STPCPY || defined USE_AS_STRCAT |
83 | mov $VEC_SIZE, %r10 |
84 | sub %rcx, %r10 |
85 | cmp %r10, %r8 |
86 | # else |
87 | mov $(VEC_SIZE + 1), %r10 |
88 | sub %rcx, %r10 |
89 | cmp %r10, %r8 |
90 | # endif |
91 | jbe L(CopyVecSizeTailCase2OrCase3) |
92 | # endif |
93 | test %edx, %edx |
94 | jnz L(CopyVecSizeTail) |
95 | |
96 | vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 |
97 | kmovd %k1, %edx |
98 | |
99 | # ifdef USE_AS_STRNCPY |
100 | add $VEC_SIZE, %r10 |
101 | cmp %r10, %r8 |
102 | jbe L(CopyTwoVecSizeCase2OrCase3) |
103 | # endif |
104 | test %edx, %edx |
105 | jnz L(CopyTwoVecSize) |
106 | |
107 | VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ |
108 | VMOVU %YMM2, (%rdi) |
109 | |
/* The source is VEC_SIZE-aligned on entry here while the destination
   may not be: copy with aligned loads and unaligned stores.  */
111 | .p2align 4 |
112 | L(UnalignVecSizeBoth): |
113 | sub %rcx, %rdi |
114 | # ifdef USE_AS_STRNCPY |
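	/* Add the source misalignment back to the remaining count and
	   saturate it to all ones if the addition wraps around.  */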
115 | add %rcx, %r8 |
116 | sbb %rcx, %rcx |
117 | or %rcx, %r8 |
118 | # endif |
119 | mov $VEC_SIZE, %rcx |
120 | VMOVA (%rsi, %rcx), %YMM2 |
121 | VMOVU %YMM2, (%rdi, %rcx) |
122 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 |
123 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
124 | kmovd %k0, %edx |
125 | add $VEC_SIZE, %rcx |
126 | # ifdef USE_AS_STRNCPY |
127 | sub $(VEC_SIZE * 3), %r8 |
128 | jbe L(CopyVecSizeCase2OrCase3) |
129 | # endif |
130 | test %edx, %edx |
131 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
132 | jnz L(CopyVecSizeUnalignedVec2) |
133 | # else |
134 | jnz L(CopyVecSize) |
135 | # endif |
136 | |
137 | VMOVU %YMM2, (%rdi, %rcx) |
138 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 |
139 | vpcmpb $0, %YMM3, %YMMZERO, %k0 |
140 | kmovd %k0, %edx |
141 | add $VEC_SIZE, %rcx |
142 | # ifdef USE_AS_STRNCPY |
143 | sub $VEC_SIZE, %r8 |
144 | jbe L(CopyVecSizeCase2OrCase3) |
145 | # endif |
146 | test %edx, %edx |
147 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
148 | jnz L(CopyVecSizeUnalignedVec3) |
149 | # else |
150 | jnz L(CopyVecSize) |
151 | # endif |
152 | |
153 | VMOVU %YMM3, (%rdi, %rcx) |
154 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 |
155 | vpcmpb $0, %YMM4, %YMMZERO, %k0 |
156 | kmovd %k0, %edx |
157 | add $VEC_SIZE, %rcx |
158 | # ifdef USE_AS_STRNCPY |
159 | sub $VEC_SIZE, %r8 |
160 | jbe L(CopyVecSizeCase2OrCase3) |
161 | # endif |
162 | test %edx, %edx |
163 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
164 | jnz L(CopyVecSizeUnalignedVec4) |
165 | # else |
166 | jnz L(CopyVecSize) |
167 | # endif |
168 | |
169 | VMOVU %YMM4, (%rdi, %rcx) |
170 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 |
171 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
172 | kmovd %k0, %edx |
173 | add $VEC_SIZE, %rcx |
174 | # ifdef USE_AS_STRNCPY |
175 | sub $VEC_SIZE, %r8 |
176 | jbe L(CopyVecSizeCase2OrCase3) |
177 | # endif |
178 | test %edx, %edx |
179 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
180 | jnz L(CopyVecSizeUnalignedVec2) |
181 | # else |
182 | jnz L(CopyVecSize) |
183 | # endif |
184 | |
185 | VMOVU %YMM2, (%rdi, %rcx) |
186 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 |
187 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
188 | kmovd %k0, %edx |
189 | add $VEC_SIZE, %rcx |
190 | # ifdef USE_AS_STRNCPY |
191 | sub $VEC_SIZE, %r8 |
192 | jbe L(CopyVecSizeCase2OrCase3) |
193 | # endif |
194 | test %edx, %edx |
195 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
196 | jnz L(CopyVecSizeUnalignedVec2) |
197 | # else |
198 | jnz L(CopyVecSize) |
199 | # endif |
200 | |
201 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 |
202 | VMOVU %YMM2, (%rdi, %rcx) |
203 | vpcmpb $0, %YMM3, %YMMZERO, %k0 |
204 | kmovd %k0, %edx |
205 | add $VEC_SIZE, %rcx |
206 | # ifdef USE_AS_STRNCPY |
207 | sub $VEC_SIZE, %r8 |
208 | jbe L(CopyVecSizeCase2OrCase3) |
209 | # endif |
210 | test %edx, %edx |
211 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
212 | jnz L(CopyVecSizeUnalignedVec3) |
213 | # else |
214 | jnz L(CopyVecSize) |
215 | # endif |
216 | |
217 | VMOVU %YMM3, (%rdi, %rcx) |
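	/* Round the source down to a (VEC_SIZE * 4) boundary (relative to
	   the next byte to copy) and adjust the destination and, for
	   strncpy, the remaining count by the same distance; the loop
	   below may therefore re-copy up to VEC_SIZE * 3 bytes.  */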
218 | mov %rsi, %rdx |
219 | lea VEC_SIZE(%rsi, %rcx), %rsi |
220 | and $-(VEC_SIZE * 4), %rsi |
221 | sub %rsi, %rdx |
222 | sub %rdx, %rdi |
223 | # ifdef USE_AS_STRNCPY |
224 | lea (VEC_SIZE * 8)(%r8, %rdx), %r8 |
225 | # endif |
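	/* Main loop: process VEC_SIZE * 4 bytes per iteration with aligned
	   loads.  vpminub folds the four vectors together so that a single
	   compare against zero detects a null byte in any of them.  */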
226 | L(UnalignedFourVecSizeLoop): |
227 | VMOVA (%rsi), %YMM4 |
228 | VMOVA VEC_SIZE(%rsi), %YMM5 |
229 | VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 |
230 | VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 |
231 | vpminub %YMM5, %YMM4, %YMM2 |
232 | vpminub %YMM7, %YMM6, %YMM3 |
233 | vpminub %YMM2, %YMM3, %YMM2 |
234 | /* If K7 != 0, there is a null byte. */ |
235 | vpcmpb $0, %YMM2, %YMMZERO, %k7 |
236 | kmovd %k7, %edx |
237 | # ifdef USE_AS_STRNCPY |
238 | sub $(VEC_SIZE * 4), %r8 |
239 | jbe L(UnalignedLeaveCase2OrCase3) |
240 | # endif |
241 | test %edx, %edx |
242 | jnz L(UnalignedFourVecSizeLeave) |
243 | |
244 | L(UnalignedFourVecSizeLoop_start): |
245 | add $(VEC_SIZE * 4), %rdi |
246 | add $(VEC_SIZE * 4), %rsi |
247 | VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) |
248 | VMOVA (%rsi), %YMM4 |
249 | VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) |
250 | VMOVA VEC_SIZE(%rsi), %YMM5 |
251 | vpminub %YMM5, %YMM4, %YMM2 |
252 | VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) |
253 | VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 |
254 | VMOVU %YMM7, -VEC_SIZE(%rdi) |
255 | VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 |
256 | vpminub %YMM7, %YMM6, %YMM3 |
257 | vpminub %YMM2, %YMM3, %YMM2 |
258 | /* If K7 != 0, there is a null byte. */ |
259 | vpcmpb $0, %YMM2, %YMMZERO, %k7 |
260 | kmovd %k7, %edx |
261 | # ifdef USE_AS_STRNCPY |
262 | sub $(VEC_SIZE * 4), %r8 |
263 | jbe L(UnalignedLeaveCase2OrCase3) |
264 | # endif |
265 | test %edx, %edx |
266 | jz L(UnalignedFourVecSizeLoop_start) |
267 | |
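	/* A null byte was found in the last four vectors; locate the
	   vector that contains it and finish the copy there.  */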
268 | L(UnalignedFourVecSizeLeave): |
269 | vpcmpb $0, %YMM4, %YMMZERO, %k1 |
270 | kmovd %k1, %edx |
271 | test %edx, %edx |
272 | jnz L(CopyVecSizeUnaligned_0) |
273 | |
274 | vpcmpb $0, %YMM5, %YMMZERO, %k2 |
275 | kmovd %k2, %ecx |
276 | test %ecx, %ecx |
277 | jnz L(CopyVecSizeUnaligned_16) |
278 | |
279 | vpcmpb $0, %YMM6, %YMMZERO, %k3 |
280 | kmovd %k3, %edx |
281 | test %edx, %edx |
282 | jnz L(CopyVecSizeUnaligned_32) |
283 | |
284 | vpcmpb $0, %YMM7, %YMMZERO, %k4 |
285 | kmovd %k4, %ecx |
286 | bsf %ecx, %edx |
287 | VMOVU %YMM4, (%rdi) |
288 | VMOVU %YMM5, VEC_SIZE(%rdi) |
289 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
290 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
291 | # ifdef USE_AS_STPCPY |
292 | lea (VEC_SIZE * 3)(%rdi, %rdx), %rax |
293 | # endif |
294 | VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) |
295 | add $(VEC_SIZE - 1), %r8 |
296 | sub %rdx, %r8 |
297 | lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi |
298 | jmp L(StrncpyFillTailWithZero) |
299 | # else |
300 | add $(VEC_SIZE * 3), %rsi |
301 | add $(VEC_SIZE * 3), %rdi |
302 | jmp L(CopyVecSizeExit) |
303 | # endif |
304 | |
/* The source offset within a (VEC_SIZE * 4)-aligned block is at most
   VEC_SIZE * 2, so the first two VEC_SIZE loads cannot cross a page
   boundary and are done unaligned.  */
306 | |
307 | L(SourceStringAlignmentLessTwoVecSize): |
308 | VMOVU (%rsi), %YMM3 |
309 | VMOVU VEC_SIZE(%rsi), %YMM2 |
310 | vpcmpb $0, %YMM3, %YMMZERO, %k0 |
311 | kmovd %k0, %edx |
312 | |
313 | # ifdef USE_AS_STRNCPY |
314 | # if defined USE_AS_STPCPY || defined USE_AS_STRCAT |
315 | cmp $VEC_SIZE, %r8 |
316 | # else |
317 | cmp $(VEC_SIZE + 1), %r8 |
318 | # endif |
319 | jbe L(CopyVecSizeTail1Case2OrCase3) |
320 | # endif |
321 | test %edx, %edx |
322 | jnz L(CopyVecSizeTail1) |
323 | |
324 | VMOVU %YMM3, (%rdi) |
325 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
326 | kmovd %k0, %edx |
327 | |
328 | # ifdef USE_AS_STRNCPY |
329 | # if defined USE_AS_STPCPY || defined USE_AS_STRCAT |
330 | cmp $(VEC_SIZE * 2), %r8 |
331 | # else |
332 | cmp $((VEC_SIZE * 2) + 1), %r8 |
333 | # endif |
334 | jbe L(CopyTwoVecSize1Case2OrCase3) |
335 | # endif |
336 | test %edx, %edx |
337 | jnz L(CopyTwoVecSize1) |
338 | |
339 | and $-VEC_SIZE, %rsi |
340 | and $(VEC_SIZE - 1), %ecx |
341 | jmp L(UnalignVecSizeBoth) |
342 | |
343 | /*------End of main part with loops---------------------*/ |
344 | |
/* Case 1: the terminating null byte is reached before the strncpy
   count runs out (or there is no count).  */
346 | |
347 | # if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) |
348 | .p2align 4 |
349 | L(CopyVecSize): |
350 | add %rcx, %rdi |
351 | # endif |
352 | L(CopyVecSizeTail): |
353 | add %rcx, %rsi |
354 | L(CopyVecSizeTail1): |
355 | bsf %edx, %edx |
356 | L(CopyVecSizeExit): |
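	/* %rdx is the offset of the terminating null byte; copy
	   %rdx + 1 bytes through the smallest suitable exit below.  */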
357 | cmp $32, %edx |
358 | jae L(Exit32_63) |
359 | cmp $16, %edx |
360 | jae L(Exit16_31) |
361 | cmp $8, %edx |
362 | jae L(Exit8_15) |
363 | cmp $4, %edx |
364 | jae L(Exit4_7) |
365 | cmp $3, %edx |
366 | je L(Exit3) |
367 | cmp $1, %edx |
368 | ja L(Exit2) |
369 | je L(Exit1) |
370 | movb $0, (%rdi) |
371 | # ifdef USE_AS_STPCPY |
372 | lea (%rdi), %rax |
373 | # endif |
374 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
375 | sub $1, %r8 |
376 | lea 1(%rdi), %rdi |
377 | jnz L(StrncpyFillTailWithZero) |
378 | # endif |
379 | ret |
380 | |
381 | .p2align 4 |
382 | L(CopyTwoVecSize1): |
383 | add $VEC_SIZE, %rsi |
384 | add $VEC_SIZE, %rdi |
385 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
386 | sub $VEC_SIZE, %r8 |
387 | # endif |
388 | jmp L(CopyVecSizeTail1) |
389 | |
390 | .p2align 4 |
391 | L(CopyTwoVecSize): |
392 | bsf %edx, %edx |
393 | add %rcx, %rsi |
394 | add $VEC_SIZE, %edx |
395 | sub %ecx, %edx |
396 | jmp L(CopyVecSizeExit) |
397 | |
398 | .p2align 4 |
399 | L(CopyVecSizeUnaligned_0): |
400 | bsf %edx, %edx |
401 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
402 | # ifdef USE_AS_STPCPY |
403 | lea (%rdi, %rdx), %rax |
404 | # endif |
405 | VMOVU %YMM4, (%rdi) |
406 | add $((VEC_SIZE * 4) - 1), %r8 |
407 | sub %rdx, %r8 |
408 | lea 1(%rdi, %rdx), %rdi |
409 | jmp L(StrncpyFillTailWithZero) |
410 | # else |
411 | jmp L(CopyVecSizeExit) |
412 | # endif |
413 | |
414 | .p2align 4 |
415 | L(CopyVecSizeUnaligned_16): |
416 | bsf %ecx, %edx |
417 | VMOVU %YMM4, (%rdi) |
418 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
419 | # ifdef USE_AS_STPCPY |
420 | lea VEC_SIZE(%rdi, %rdx), %rax |
421 | # endif |
422 | VMOVU %YMM5, VEC_SIZE(%rdi) |
423 | add $((VEC_SIZE * 3) - 1), %r8 |
424 | sub %rdx, %r8 |
425 | lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi |
426 | jmp L(StrncpyFillTailWithZero) |
427 | # else |
428 | add $VEC_SIZE, %rsi |
429 | add $VEC_SIZE, %rdi |
430 | jmp L(CopyVecSizeExit) |
431 | # endif |
432 | |
433 | .p2align 4 |
434 | L(CopyVecSizeUnaligned_32): |
435 | bsf %edx, %edx |
436 | VMOVU %YMM4, (%rdi) |
437 | VMOVU %YMM5, VEC_SIZE(%rdi) |
438 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
439 | # ifdef USE_AS_STPCPY |
440 | lea (VEC_SIZE * 2)(%rdi, %rdx), %rax |
441 | # endif |
442 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
443 | add $((VEC_SIZE * 2) - 1), %r8 |
444 | sub %rdx, %r8 |
445 | lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi |
446 | jmp L(StrncpyFillTailWithZero) |
447 | # else |
448 | add $(VEC_SIZE * 2), %rsi |
449 | add $(VEC_SIZE * 2), %rdi |
450 | jmp L(CopyVecSizeExit) |
451 | # endif |
452 | |
453 | # ifdef USE_AS_STRNCPY |
454 | # ifndef USE_AS_STRCAT |
455 | .p2align 4 |
456 | L(CopyVecSizeUnalignedVec6): |
457 | VMOVU %YMM6, (%rdi, %rcx) |
458 | jmp L(CopyVecSizeVecExit) |
459 | |
460 | .p2align 4 |
461 | L(CopyVecSizeUnalignedVec5): |
462 | VMOVU %YMM5, (%rdi, %rcx) |
463 | jmp L(CopyVecSizeVecExit) |
464 | |
465 | .p2align 4 |
466 | L(CopyVecSizeUnalignedVec4): |
467 | VMOVU %YMM4, (%rdi, %rcx) |
468 | jmp L(CopyVecSizeVecExit) |
469 | |
470 | .p2align 4 |
471 | L(CopyVecSizeUnalignedVec3): |
472 | VMOVU %YMM3, (%rdi, %rcx) |
473 | jmp L(CopyVecSizeVecExit) |
474 | # endif |
475 | |
/* Case 2: the strncpy count ends within the current vector and a null
   byte was also seen there; whichever comes first ends the copy.  */
477 | |
478 | .p2align 4 |
479 | L(CopyVecSizeCase2): |
480 | add $VEC_SIZE, %r8 |
481 | add %rcx, %rdi |
482 | add %rcx, %rsi |
483 | bsf %edx, %edx |
484 | cmp %r8d, %edx |
485 | jb L(CopyVecSizeExit) |
486 | jmp L(StrncpyExit) |
487 | |
488 | .p2align 4 |
489 | L(CopyTwoVecSizeCase2): |
490 | add %rcx, %rsi |
491 | bsf %edx, %edx |
492 | add $VEC_SIZE, %edx |
493 | sub %ecx, %edx |
494 | cmp %r8d, %edx |
495 | jb L(CopyVecSizeExit) |
496 | jmp L(StrncpyExit) |
497 | |
498 | L(CopyVecSizeTailCase2): |
499 | add %rcx, %rsi |
500 | bsf %edx, %edx |
501 | cmp %r8d, %edx |
502 | jb L(CopyVecSizeExit) |
503 | jmp L(StrncpyExit) |
504 | |
505 | L(CopyVecSizeTail1Case2): |
506 | bsf %edx, %edx |
507 | cmp %r8d, %edx |
508 | jb L(CopyVecSizeExit) |
509 | jmp L(StrncpyExit) |
510 | |
/* Case 2 or Case 3: the strncpy count ends within the current vector.
   Case 3: no null byte was seen there, so exactly the remaining count
   is copied.  */
512 | |
513 | .p2align 4 |
514 | L(CopyVecSizeCase2OrCase3): |
515 | test %rdx, %rdx |
516 | jnz L(CopyVecSizeCase2) |
517 | L(CopyVecSizeCase3): |
518 | add $VEC_SIZE, %r8 |
519 | add %rcx, %rdi |
520 | add %rcx, %rsi |
521 | jmp L(StrncpyExit) |
522 | |
523 | .p2align 4 |
524 | L(CopyTwoVecSizeCase2OrCase3): |
525 | test %rdx, %rdx |
526 | jnz L(CopyTwoVecSizeCase2) |
527 | add %rcx, %rsi |
528 | jmp L(StrncpyExit) |
529 | |
530 | .p2align 4 |
531 | L(CopyVecSizeTailCase2OrCase3): |
532 | test %rdx, %rdx |
533 | jnz L(CopyVecSizeTailCase2) |
534 | add %rcx, %rsi |
535 | jmp L(StrncpyExit) |
536 | |
537 | .p2align 4 |
538 | L(CopyTwoVecSize1Case2OrCase3): |
539 | add $VEC_SIZE, %rdi |
540 | add $VEC_SIZE, %rsi |
541 | sub $VEC_SIZE, %r8 |
542 | L(CopyVecSizeTail1Case2OrCase3): |
543 | test %rdx, %rdx |
544 | jnz L(CopyVecSizeTail1Case2) |
545 | jmp L(StrncpyExit) |
546 | # endif |
547 | |
/*------------End of labels for copying 1 to VEC_SIZE bytes and 1 to (VEC_SIZE * 2) bytes------------*/
549 | |
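	/* ExitN/ExitA_B: the terminating null byte is at offset %rdx (N,
	   or within [A, B]); copy %rdx + 1 bytes including the null.  For
	   strncpy, the rest of the destination is then zero-filled if the
	   count is not yet exhausted.  */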
550 | .p2align 4 |
551 | L(Exit1): |
552 | movzwl (%rsi), %edx |
553 | mov %dx, (%rdi) |
554 | # ifdef USE_AS_STPCPY |
555 | lea 1(%rdi), %rax |
556 | # endif |
557 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
558 | sub $2, %r8 |
559 | lea 2(%rdi), %rdi |
560 | jnz L(StrncpyFillTailWithZero) |
561 | # endif |
562 | ret |
563 | |
564 | .p2align 4 |
565 | L(Exit2): |
566 | movzwl (%rsi), %ecx |
567 | mov %cx, (%rdi) |
568 | movb $0, 2(%rdi) |
569 | # ifdef USE_AS_STPCPY |
570 | lea 2(%rdi), %rax |
571 | # endif |
572 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
573 | sub $3, %r8 |
574 | lea 3(%rdi), %rdi |
575 | jnz L(StrncpyFillTailWithZero) |
576 | # endif |
577 | ret |
578 | |
579 | .p2align 4 |
580 | L(Exit3): |
581 | mov (%rsi), %edx |
582 | mov %edx, (%rdi) |
583 | # ifdef USE_AS_STPCPY |
584 | lea 3(%rdi), %rax |
585 | # endif |
586 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
587 | sub $4, %r8 |
588 | lea 4(%rdi), %rdi |
589 | jnz L(StrncpyFillTailWithZero) |
590 | # endif |
591 | ret |
592 | |
593 | .p2align 4 |
594 | L(Exit4_7): |
595 | mov (%rsi), %ecx |
596 | mov %ecx, (%rdi) |
597 | mov -3(%rsi, %rdx), %ecx |
598 | mov %ecx, -3(%rdi, %rdx) |
599 | # ifdef USE_AS_STPCPY |
600 | lea (%rdi, %rdx), %rax |
601 | # endif |
602 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
603 | sub %rdx, %r8 |
604 | sub $1, %r8 |
605 | lea 1(%rdi, %rdx), %rdi |
606 | jnz L(StrncpyFillTailWithZero) |
607 | # endif |
608 | ret |
609 | |
610 | .p2align 4 |
611 | L(Exit8_15): |
612 | mov (%rsi), %rcx |
613 | mov -7(%rsi, %rdx), %r9 |
614 | mov %rcx, (%rdi) |
615 | mov %r9, -7(%rdi, %rdx) |
616 | # ifdef USE_AS_STPCPY |
617 | lea (%rdi, %rdx), %rax |
618 | # endif |
619 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
620 | sub %rdx, %r8 |
621 | sub $1, %r8 |
622 | lea 1(%rdi, %rdx), %rdi |
623 | jnz L(StrncpyFillTailWithZero) |
624 | # endif |
625 | ret |
626 | |
627 | .p2align 4 |
628 | L(Exit16_31): |
629 | VMOVU (%rsi), %XMM2 |
630 | VMOVU -15(%rsi, %rdx), %XMM3 |
631 | VMOVU %XMM2, (%rdi) |
632 | VMOVU %XMM3, -15(%rdi, %rdx) |
633 | # ifdef USE_AS_STPCPY |
634 | lea (%rdi, %rdx), %rax |
635 | # endif |
636 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
637 | sub %rdx, %r8 |
638 | sub $1, %r8 |
639 | lea 1(%rdi, %rdx), %rdi |
640 | jnz L(StrncpyFillTailWithZero) |
641 | # endif |
642 | ret |
643 | |
644 | .p2align 4 |
645 | L(Exit32_63): |
646 | VMOVU (%rsi), %YMM2 |
647 | VMOVU -31(%rsi, %rdx), %YMM3 |
648 | VMOVU %YMM2, (%rdi) |
649 | VMOVU %YMM3, -31(%rdi, %rdx) |
650 | # ifdef USE_AS_STPCPY |
651 | lea (%rdi, %rdx), %rax |
652 | # endif |
653 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
654 | sub %rdx, %r8 |
655 | sub $1, %r8 |
656 | lea 1(%rdi, %rdx), %rdi |
657 | jnz L(StrncpyFillTailWithZero) |
658 | # endif |
659 | ret |
660 | |
661 | # ifdef USE_AS_STRNCPY |
662 | |
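	/* StrncpyExitN/StrncpyExitA_B: the count ends at or before the
	   first null byte; copy exactly %r8 bytes (N, or within [A, B]).
	   strcat still appends a terminating null byte and stpcpy returns
	   %rdi + %r8.  */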
663 | .p2align 4 |
664 | L(StrncpyExit1): |
665 | movzbl (%rsi), %edx |
666 | mov %dl, (%rdi) |
667 | # ifdef USE_AS_STPCPY |
668 | lea 1(%rdi), %rax |
669 | # endif |
670 | # ifdef USE_AS_STRCAT |
671 | movb $0, 1(%rdi) |
672 | # endif |
673 | ret |
674 | |
675 | .p2align 4 |
676 | L(StrncpyExit2): |
677 | movzwl (%rsi), %edx |
678 | mov %dx, (%rdi) |
679 | # ifdef USE_AS_STPCPY |
680 | lea 2(%rdi), %rax |
681 | # endif |
682 | # ifdef USE_AS_STRCAT |
683 | movb $0, 2(%rdi) |
684 | # endif |
685 | ret |
686 | |
687 | .p2align 4 |
688 | L(StrncpyExit3_4): |
689 | movzwl (%rsi), %ecx |
690 | movzwl -2(%rsi, %r8), %edx |
691 | mov %cx, (%rdi) |
692 | mov %dx, -2(%rdi, %r8) |
693 | # ifdef USE_AS_STPCPY |
694 | lea (%rdi, %r8), %rax |
695 | # endif |
696 | # ifdef USE_AS_STRCAT |
697 | movb $0, (%rdi, %r8) |
698 | # endif |
699 | ret |
700 | |
701 | .p2align 4 |
702 | L(StrncpyExit5_8): |
703 | mov (%rsi), %ecx |
704 | mov -4(%rsi, %r8), %edx |
705 | mov %ecx, (%rdi) |
706 | mov %edx, -4(%rdi, %r8) |
707 | # ifdef USE_AS_STPCPY |
708 | lea (%rdi, %r8), %rax |
709 | # endif |
710 | # ifdef USE_AS_STRCAT |
711 | movb $0, (%rdi, %r8) |
712 | # endif |
713 | ret |
714 | |
715 | .p2align 4 |
716 | L(StrncpyExit9_16): |
717 | mov (%rsi), %rcx |
718 | mov -8(%rsi, %r8), %rdx |
719 | mov %rcx, (%rdi) |
720 | mov %rdx, -8(%rdi, %r8) |
721 | # ifdef USE_AS_STPCPY |
722 | lea (%rdi, %r8), %rax |
723 | # endif |
724 | # ifdef USE_AS_STRCAT |
725 | movb $0, (%rdi, %r8) |
726 | # endif |
727 | ret |
728 | |
729 | .p2align 4 |
730 | L(StrncpyExit17_32): |
731 | VMOVU (%rsi), %XMM2 |
732 | VMOVU -16(%rsi, %r8), %XMM3 |
733 | VMOVU %XMM2, (%rdi) |
734 | VMOVU %XMM3, -16(%rdi, %r8) |
735 | # ifdef USE_AS_STPCPY |
736 | lea (%rdi, %r8), %rax |
737 | # endif |
738 | # ifdef USE_AS_STRCAT |
739 | movb $0, (%rdi, %r8) |
740 | # endif |
741 | ret |
742 | |
743 | .p2align 4 |
744 | L(StrncpyExit33_64): |
745 | /* 0/32, 31/16 */ |
746 | VMOVU (%rsi), %YMM2 |
747 | VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 |
748 | VMOVU %YMM2, (%rdi) |
749 | VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) |
750 | # ifdef USE_AS_STPCPY |
751 | lea (%rdi, %r8), %rax |
752 | # endif |
753 | # ifdef USE_AS_STRCAT |
754 | movb $0, (%rdi, %r8) |
755 | # endif |
756 | ret |
757 | |
758 | .p2align 4 |
759 | L(StrncpyExit65): |
760 | /* 0/32, 32/32, 64/1 */ |
761 | VMOVU (%rsi), %YMM2 |
762 | VMOVU 32(%rsi), %YMM3 |
763 | mov 64(%rsi), %cl |
764 | VMOVU %YMM2, (%rdi) |
765 | VMOVU %YMM3, 32(%rdi) |
766 | mov %cl, 64(%rdi) |
767 | # ifdef USE_AS_STPCPY |
768 | lea 65(%rdi), %rax |
769 | # endif |
770 | # ifdef USE_AS_STRCAT |
771 | movb $0, 65(%rdi) |
772 | # endif |
773 | ret |
774 | |
775 | # ifndef USE_AS_STRCAT |
776 | |
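	/* FillN/FillA_B: write %r8 zero bytes (N, or within [A, B]) at
	   %rdi.  %rdx is already zero whenever these are reached.  */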
777 | .p2align 4 |
778 | L(Fill1): |
779 | mov %dl, (%rdi) |
780 | ret |
781 | |
782 | .p2align 4 |
783 | L(Fill2): |
784 | mov %dx, (%rdi) |
785 | ret |
786 | |
787 | .p2align 4 |
788 | L(Fill3_4): |
789 | mov %dx, (%rdi) |
790 | mov %dx, -2(%rdi, %r8) |
791 | ret |
792 | |
793 | .p2align 4 |
794 | L(Fill5_8): |
795 | mov %edx, (%rdi) |
796 | mov %edx, -4(%rdi, %r8) |
797 | ret |
798 | |
799 | .p2align 4 |
800 | L(Fill9_16): |
801 | mov %rdx, (%rdi) |
802 | mov %rdx, -8(%rdi, %r8) |
803 | ret |
804 | |
805 | .p2align 4 |
806 | L(Fill17_32): |
807 | VMOVU %XMMZERO, (%rdi) |
808 | VMOVU %XMMZERO, -16(%rdi, %r8) |
809 | ret |
810 | |
811 | .p2align 4 |
812 | L(CopyVecSizeUnalignedVec2): |
813 | VMOVU %YMM2, (%rdi, %rcx) |
814 | |
815 | .p2align 4 |
816 | L(CopyVecSizeVecExit): |
817 | bsf %edx, %edx |
818 | add $(VEC_SIZE - 1), %r8 |
819 | add %rcx, %rdi |
820 | # ifdef USE_AS_STPCPY |
821 | lea (%rdi, %rdx), %rax |
822 | # endif |
823 | sub %rdx, %r8 |
824 | lea 1(%rdi, %rdx), %rdi |
825 | |
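	/* strncpy pads the destination with null bytes up to the count:
	   zero the remaining %r8 bytes starting at %rdi.  */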
826 | .p2align 4 |
827 | L(StrncpyFillTailWithZero): |
828 | xor %edx, %edx |
829 | sub $VEC_SIZE, %r8 |
830 | jbe L(StrncpyFillExit) |
831 | |
832 | VMOVU %YMMZERO, (%rdi) |
833 | add $VEC_SIZE, %rdi |
834 | |
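	/* Align the destination and zero VEC_SIZE * 4 bytes per iteration,
	   then handle the remainder.  */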
835 | mov %rdi, %rsi |
836 | and $(VEC_SIZE - 1), %esi |
837 | sub %rsi, %rdi |
838 | add %rsi, %r8 |
839 | sub $(VEC_SIZE * 4), %r8 |
840 | jb L(StrncpyFillLessFourVecSize) |
841 | |
842 | L(StrncpyFillLoopVmovdqa): |
843 | VMOVA %YMMZERO, (%rdi) |
844 | VMOVA %YMMZERO, VEC_SIZE(%rdi) |
845 | VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) |
846 | VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) |
847 | add $(VEC_SIZE * 4), %rdi |
848 | sub $(VEC_SIZE * 4), %r8 |
849 | jae L(StrncpyFillLoopVmovdqa) |
850 | |
851 | L(StrncpyFillLessFourVecSize): |
852 | add $(VEC_SIZE * 2), %r8 |
853 | jl L(StrncpyFillLessTwoVecSize) |
854 | VMOVA %YMMZERO, (%rdi) |
855 | VMOVA %YMMZERO, VEC_SIZE(%rdi) |
856 | add $(VEC_SIZE * 2), %rdi |
857 | sub $VEC_SIZE, %r8 |
858 | jl L(StrncpyFillExit) |
859 | VMOVA %YMMZERO, (%rdi) |
860 | add $VEC_SIZE, %rdi |
861 | jmp L(Fill) |
862 | |
863 | .p2align 4 |
864 | L(StrncpyFillLessTwoVecSize): |
865 | add $VEC_SIZE, %r8 |
866 | jl L(StrncpyFillExit) |
867 | VMOVA %YMMZERO, (%rdi) |
868 | add $VEC_SIZE, %rdi |
869 | jmp L(Fill) |
870 | |
871 | .p2align 4 |
872 | L(StrncpyFillExit): |
873 | add $VEC_SIZE, %r8 |
874 | L(Fill): |
875 | cmp $17, %r8d |
876 | jae L(Fill17_32) |
877 | cmp $9, %r8d |
878 | jae L(Fill9_16) |
879 | cmp $5, %r8d |
880 | jae L(Fill5_8) |
881 | cmp $3, %r8d |
882 | jae L(Fill3_4) |
883 | cmp $1, %r8d |
884 | ja L(Fill2) |
885 | je L(Fill1) |
886 | ret |
887 | |
888 | /* end of ifndef USE_AS_STRCAT */ |
889 | # endif |
890 | |
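	/* The strncpy count ends within the current group of four vectors;
	   %rdx is nonzero if a null byte was also seen in the group.  */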
891 | .p2align 4 |
892 | L(UnalignedLeaveCase2OrCase3): |
893 | test %rdx, %rdx |
894 | jnz L(UnalignedFourVecSizeLeaveCase2) |
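	/* No null byte was seen in the four vectors: store the whole
	   vectors that still fit in the count, then copy the remaining
	   tail bytes.  */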
895 | L(UnalignedFourVecSizeLeaveCase3): |
896 | lea (VEC_SIZE * 4)(%r8), %rcx |
897 | and $-VEC_SIZE, %rcx |
898 | add $(VEC_SIZE * 3), %r8 |
899 | jl L(CopyVecSizeCase3) |
900 | VMOVU %YMM4, (%rdi) |
901 | sub $VEC_SIZE, %r8 |
902 | jb L(CopyVecSizeCase3) |
903 | VMOVU %YMM5, VEC_SIZE(%rdi) |
904 | sub $VEC_SIZE, %r8 |
905 | jb L(CopyVecSizeCase3) |
906 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
907 | sub $VEC_SIZE, %r8 |
908 | jb L(CopyVecSizeCase3) |
909 | VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) |
910 | # ifdef USE_AS_STPCPY |
911 | lea (VEC_SIZE * 4)(%rdi), %rax |
912 | # endif |
913 | # ifdef USE_AS_STRCAT |
914 | movb $0, (VEC_SIZE * 4)(%rdi) |
915 | # endif |
916 | ret |
917 | |
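	/* A null byte was seen in the group: scan the four vectors in
	   turn; finish the copy at the null byte if it falls within the
	   count, otherwise copy exactly the remaining count.  */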
918 | .p2align 4 |
919 | L(UnalignedFourVecSizeLeaveCase2): |
920 | xor %ecx, %ecx |
921 | vpcmpb $0, %YMM4, %YMMZERO, %k1 |
922 | kmovd %k1, %edx |
923 | add $(VEC_SIZE * 3), %r8 |
924 | jle L(CopyVecSizeCase2OrCase3) |
925 | test %edx, %edx |
926 | # ifndef USE_AS_STRCAT |
927 | jnz L(CopyVecSizeUnalignedVec4) |
928 | # else |
929 | jnz L(CopyVecSize) |
930 | # endif |
931 | vpcmpb $0, %YMM5, %YMMZERO, %k2 |
932 | kmovd %k2, %edx |
933 | VMOVU %YMM4, (%rdi) |
934 | add $VEC_SIZE, %rcx |
935 | sub $VEC_SIZE, %r8 |
936 | jbe L(CopyVecSizeCase2OrCase3) |
937 | test %edx, %edx |
938 | # ifndef USE_AS_STRCAT |
939 | jnz L(CopyVecSizeUnalignedVec5) |
940 | # else |
941 | jnz L(CopyVecSize) |
942 | # endif |
943 | |
944 | vpcmpb $0, %YMM6, %YMMZERO, %k3 |
945 | kmovd %k3, %edx |
946 | VMOVU %YMM5, VEC_SIZE(%rdi) |
947 | add $VEC_SIZE, %rcx |
948 | sub $VEC_SIZE, %r8 |
949 | jbe L(CopyVecSizeCase2OrCase3) |
950 | test %edx, %edx |
951 | # ifndef USE_AS_STRCAT |
952 | jnz L(CopyVecSizeUnalignedVec6) |
953 | # else |
954 | jnz L(CopyVecSize) |
955 | # endif |
956 | |
957 | vpcmpb $0, %YMM7, %YMMZERO, %k4 |
958 | kmovd %k4, %edx |
959 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
960 | lea VEC_SIZE(%rdi, %rcx), %rdi |
961 | lea VEC_SIZE(%rsi, %rcx), %rsi |
962 | bsf %edx, %edx |
963 | cmp %r8d, %edx |
964 | jb L(CopyVecSizeExit) |
965 | L(StrncpyExit): |
966 | cmp $65, %r8d |
967 | je L(StrncpyExit65) |
968 | cmp $33, %r8d |
969 | jae L(StrncpyExit33_64) |
970 | cmp $17, %r8d |
971 | jae L(StrncpyExit17_32) |
972 | cmp $9, %r8d |
973 | jae L(StrncpyExit9_16) |
974 | cmp $5, %r8d |
975 | jae L(StrncpyExit5_8) |
976 | cmp $3, %r8d |
977 | jae L(StrncpyExit3_4) |
978 | cmp $1, %r8d |
979 | ja L(StrncpyExit2) |
980 | je L(StrncpyExit1) |
981 | # ifdef USE_AS_STPCPY |
982 | mov %rdi, %rax |
983 | # endif |
984 | # ifdef USE_AS_STRCAT |
985 | movb $0, (%rdi) |
986 | # endif |
987 | ret |
988 | |
989 | .p2align 4 |
990 | L(ExitZero): |
991 | # ifndef USE_AS_STRCAT |
992 | mov %rdi, %rax |
993 | # endif |
994 | ret |
995 | |
996 | # endif |
997 | |
998 | # ifndef USE_AS_STRCAT |
999 | END (STRCPY) |
1000 | # else |
1001 | END (STRCAT) |
1002 | # endif |
1003 | #endif |
1004 | |