1 | /* strcpy with 256-bit EVEX instructions. |
2 | Copyright (C) 2021-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <isa-level.h> |
20 | |
21 | #if ISA_SHOULD_BUILD (4) |
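/* Built only for ISA level 4 (x86-64-v4): the EVEX encodings and
   mask-register instructions used below require the AVX512VL and
   AVX512BW extensions.  */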
22 | |
23 | |
24 | # ifndef USE_AS_STRCAT |
25 | # include <sysdep.h> |
26 | |
27 | # ifndef STRCPY |
28 | # define STRCPY __strcpy_evex |
29 | # endif |
30 | |
31 | # endif |
32 | |
33 | # define VMOVU vmovdqu64 |
34 | # define VMOVA vmovdqa64 |
35 | |
36 | /* Number of bytes in a vector register */ |
37 | # ifndef VEC_SIZE |
38 | # define VEC_SIZE 32 |
39 | # endif |
40 | |
41 | # define XMM2 xmm18 |
42 | # define XMM3 xmm19 |
43 | |
44 | # define YMM2 ymm18 |
45 | # define YMM3 ymm19 |
46 | # define YMM4 ymm20 |
47 | # define YMM5 ymm21 |
48 | # define YMM6 ymm22 |
49 | # define YMM7 ymm23 |
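
/* All vector registers used below are from the EVEX-only bank
   (xmm16/ymm16 and up), presumably so that the function does not need
   a VZEROUPPER before returning.  */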
50 | |
51 | # ifndef USE_AS_STRCAT |
52 | |
53 | /* zero register */ |
54 | # define XMMZERO xmm16 |
55 | # define YMMZERO ymm16 |
56 | # define YMM1 ymm17 |
57 | |
.section .text.evex,"ax", @progbits
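/* Inputs: %rdi = destination, %rsi = source; for strncpy/stpncpy the
   maximum byte count is in %rdx.  The return value is built in
   %rax.  */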
59 | ENTRY (STRCPY) |
60 | # ifdef USE_AS_STRNCPY |
61 | mov %RDX_LP, %R8_LP |
62 | test %R8_LP, %R8_LP |
63 | jz L(ExitZero) |
64 | # endif |
65 | mov %rsi, %rcx |
66 | # ifndef USE_AS_STPCPY |
67 | mov %rdi, %rax /* save result */ |
68 | # endif |
69 | |
70 | vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
71 | # endif |
72 | |
73 | and $((VEC_SIZE * 4) - 1), %ecx |
74 | cmp $(VEC_SIZE * 2), %ecx |
75 | jbe L(SourceStringAlignmentLessTwoVecSize) |
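
/* %ecx now holds the source offset within a 4 * VEC_SIZE block.
   Offsets of at most 2 * VEC_SIZE take the branch above and start with
   two unaligned vector loads; larger offsets fall through, align %rsi
   down to VEC_SIZE and keep only the within-vector offset in %ecx.
   Illustrative C sketch (not part of the build):

       ofs = (uintptr_t) src & (4 * VEC_SIZE - 1);
       if (ofs <= 2 * VEC_SIZE)
         goto SourceStringAlignmentLessTwoVecSize;
       src_aligned = (char *) ((uintptr_t) src & -VEC_SIZE);
       ofs &= VEC_SIZE - 1;
 */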
76 | |
77 | and $-VEC_SIZE, %rsi |
78 | and $(VEC_SIZE - 1), %ecx |
79 | |
80 | vpcmpb $0, (%rsi), %YMMZERO, %k0 |
81 | kmovd %k0, %edx |
82 | shr %cl, %rdx |
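
/* The low %cl bits of the null-byte mask correspond to bytes before
   the start of the string; they were shifted out above, so bit 0 of
   %rdx now refers to the first string byte.  */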
83 | |
84 | # ifdef USE_AS_STRNCPY |
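/* %r10 = number of bytes of this first vector that belong to the
   string (VEC_SIZE minus the misalignment; the plain strncpy variant
   gets one byte of extra slack).  If the remaining count in %r8 does
   not exceed it, the count limit falls within the first vector and is
   handled as Case 2 (the vector contains a null byte) or Case 3 (it
   does not).  */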
85 | # if defined USE_AS_STPCPY || defined USE_AS_STRCAT |
86 | mov $VEC_SIZE, %r10 |
87 | sub %rcx, %r10 |
88 | cmp %r10, %r8 |
89 | # else |
90 | mov $(VEC_SIZE + 1), %r10 |
91 | sub %rcx, %r10 |
92 | cmp %r10, %r8 |
93 | # endif |
94 | jbe L(CopyVecSizeTailCase2OrCase3) |
95 | # endif |
96 | test %edx, %edx |
97 | jnz L(CopyVecSizeTail) |
98 | |
99 | vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 |
100 | kmovd %k1, %edx |
101 | |
102 | # ifdef USE_AS_STRNCPY |
103 | add $VEC_SIZE, %r10 |
104 | cmp %r10, %r8 |
105 | jbe L(CopyTwoVecSizeCase2OrCase3) |
106 | # endif |
107 | test %edx, %edx |
108 | jnz L(CopyTwoVecSize) |
109 | |
110 | VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ |
111 | VMOVU %YMM2, (%rdi) |
112 | |
/* Continue with VEC_SIZE-aligned loads from the source and unaligned
   stores to the destination.  */
114 | .p2align 4 |
115 | L(UnalignVecSizeBoth): |
116 | sub %rcx, %rdi |
117 | # ifdef USE_AS_STRNCPY |
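/* The copy now proceeds in whole vectors indexed from the
   VEC_SIZE-aligned source, so the misalignment in %rcx is added back
   to the remaining count; SBB/OR saturate %r8 to SIZE_MAX if the
   addition overflows.  */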
118 | add %rcx, %r8 |
119 | sbb %rcx, %rcx |
120 | or %rcx, %r8 |
121 | # endif |
122 | mov $VEC_SIZE, %rcx |
123 | VMOVA (%rsi, %rcx), %YMM2 |
124 | VMOVU %YMM2, (%rdi, %rcx) |
125 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 |
126 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
127 | kmovd %k0, %edx |
128 | add $VEC_SIZE, %rcx |
129 | # ifdef USE_AS_STRNCPY |
130 | sub $(VEC_SIZE * 3), %r8 |
131 | jbe L(CopyVecSizeCase2OrCase3) |
132 | # endif |
133 | test %edx, %edx |
134 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
135 | jnz L(CopyVecSizeUnalignedVec2) |
136 | # else |
137 | jnz L(CopyVecSize) |
138 | # endif |
139 | |
140 | VMOVU %YMM2, (%rdi, %rcx) |
141 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 |
142 | vpcmpb $0, %YMM3, %YMMZERO, %k0 |
143 | kmovd %k0, %edx |
144 | add $VEC_SIZE, %rcx |
145 | # ifdef USE_AS_STRNCPY |
146 | sub $VEC_SIZE, %r8 |
147 | jbe L(CopyVecSizeCase2OrCase3) |
148 | # endif |
149 | test %edx, %edx |
150 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
151 | jnz L(CopyVecSizeUnalignedVec3) |
152 | # else |
153 | jnz L(CopyVecSize) |
154 | # endif |
155 | |
156 | VMOVU %YMM3, (%rdi, %rcx) |
157 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 |
158 | vpcmpb $0, %YMM4, %YMMZERO, %k0 |
159 | kmovd %k0, %edx |
160 | add $VEC_SIZE, %rcx |
161 | # ifdef USE_AS_STRNCPY |
162 | sub $VEC_SIZE, %r8 |
163 | jbe L(CopyVecSizeCase2OrCase3) |
164 | # endif |
165 | test %edx, %edx |
166 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
167 | jnz L(CopyVecSizeUnalignedVec4) |
168 | # else |
169 | jnz L(CopyVecSize) |
170 | # endif |
171 | |
172 | VMOVU %YMM4, (%rdi, %rcx) |
173 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 |
174 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
175 | kmovd %k0, %edx |
176 | add $VEC_SIZE, %rcx |
177 | # ifdef USE_AS_STRNCPY |
178 | sub $VEC_SIZE, %r8 |
179 | jbe L(CopyVecSizeCase2OrCase3) |
180 | # endif |
181 | test %edx, %edx |
182 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
183 | jnz L(CopyVecSizeUnalignedVec2) |
184 | # else |
185 | jnz L(CopyVecSize) |
186 | # endif |
187 | |
188 | VMOVU %YMM2, (%rdi, %rcx) |
189 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 |
190 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
191 | kmovd %k0, %edx |
192 | add $VEC_SIZE, %rcx |
193 | # ifdef USE_AS_STRNCPY |
194 | sub $VEC_SIZE, %r8 |
195 | jbe L(CopyVecSizeCase2OrCase3) |
196 | # endif |
197 | test %edx, %edx |
198 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
199 | jnz L(CopyVecSizeUnalignedVec2) |
200 | # else |
201 | jnz L(CopyVecSize) |
202 | # endif |
203 | |
204 | VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 |
205 | VMOVU %YMM2, (%rdi, %rcx) |
206 | vpcmpb $0, %YMM3, %YMMZERO, %k0 |
207 | kmovd %k0, %edx |
208 | add $VEC_SIZE, %rcx |
209 | # ifdef USE_AS_STRNCPY |
210 | sub $VEC_SIZE, %r8 |
211 | jbe L(CopyVecSizeCase2OrCase3) |
212 | # endif |
213 | test %edx, %edx |
214 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
215 | jnz L(CopyVecSizeUnalignedVec3) |
216 | # else |
217 | jnz L(CopyVecSize) |
218 | # endif |
219 | |
220 | VMOVU %YMM3, (%rdi, %rcx) |
221 | mov %rsi, %rdx |
222 | lea VEC_SIZE(%rsi, %rcx), %rsi |
223 | and $-(VEC_SIZE * 4), %rsi |
224 | sub %rsi, %rdx |
225 | sub %rdx, %rdi |
226 | # ifdef USE_AS_STRNCPY |
227 | lea (VEC_SIZE * 8)(%r8, %rdx), %r8 |
228 | # endif |
229 | L(UnalignedFourVecSizeLoop): |
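/* Main 4 * VEC_SIZE loop.  Four aligned vectors are loaded and folded
   with VPMINUB; the unsigned byte minimum is zero iff at least one of
   the inputs has a zero byte, so a single VPCMPB against zero tests
   all 4 * VEC_SIZE bytes.  The four stores are issued at the start of
   the next iteration, once the group is known to contain no null byte
   and to fit within the count.  */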
230 | VMOVA (%rsi), %YMM4 |
231 | VMOVA VEC_SIZE(%rsi), %YMM5 |
232 | VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 |
233 | VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 |
234 | vpminub %YMM5, %YMM4, %YMM2 |
235 | vpminub %YMM7, %YMM6, %YMM3 |
236 | vpminub %YMM2, %YMM3, %YMM2 |
237 | /* If K7 != 0, there is a null byte. */ |
238 | vpcmpb $0, %YMM2, %YMMZERO, %k7 |
239 | kmovd %k7, %edx |
240 | # ifdef USE_AS_STRNCPY |
241 | sub $(VEC_SIZE * 4), %r8 |
242 | jbe L(UnalignedLeaveCase2OrCase3) |
243 | # endif |
244 | test %edx, %edx |
245 | jnz L(UnalignedFourVecSizeLeave) |
246 | |
247 | L(UnalignedFourVecSizeLoop_start): |
248 | add $(VEC_SIZE * 4), %rdi |
249 | add $(VEC_SIZE * 4), %rsi |
250 | VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) |
251 | VMOVA (%rsi), %YMM4 |
252 | VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) |
253 | VMOVA VEC_SIZE(%rsi), %YMM5 |
254 | vpminub %YMM5, %YMM4, %YMM2 |
255 | VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) |
256 | VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 |
257 | VMOVU %YMM7, -VEC_SIZE(%rdi) |
258 | VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 |
259 | vpminub %YMM7, %YMM6, %YMM3 |
260 | vpminub %YMM2, %YMM3, %YMM2 |
261 | /* If K7 != 0, there is a null byte. */ |
262 | vpcmpb $0, %YMM2, %YMMZERO, %k7 |
263 | kmovd %k7, %edx |
264 | # ifdef USE_AS_STRNCPY |
265 | sub $(VEC_SIZE * 4), %r8 |
266 | jbe L(UnalignedLeaveCase2OrCase3) |
267 | # endif |
268 | test %edx, %edx |
269 | jz L(UnalignedFourVecSizeLoop_start) |
270 | |
271 | L(UnalignedFourVecSizeLeave): |
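/* A null byte stops the loop with the current four vectors still
   unstored in YMM4..YMM7.  Find the first vector that contains it and
   copy up to and including the terminator (strncpy then zero-fills
   the rest of the buffer).  */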
272 | vpcmpb $0, %YMM4, %YMMZERO, %k1 |
273 | kmovd %k1, %edx |
274 | test %edx, %edx |
275 | jnz L(CopyVecSizeUnaligned_0) |
276 | |
277 | vpcmpb $0, %YMM5, %YMMZERO, %k2 |
278 | kmovd %k2, %ecx |
279 | test %ecx, %ecx |
280 | jnz L(CopyVecSizeUnaligned_16) |
281 | |
282 | vpcmpb $0, %YMM6, %YMMZERO, %k3 |
283 | kmovd %k3, %edx |
284 | test %edx, %edx |
285 | jnz L(CopyVecSizeUnaligned_32) |
286 | |
287 | vpcmpb $0, %YMM7, %YMMZERO, %k4 |
288 | kmovd %k4, %ecx |
289 | bsf %ecx, %edx |
290 | VMOVU %YMM4, (%rdi) |
291 | VMOVU %YMM5, VEC_SIZE(%rdi) |
292 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
293 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
294 | # ifdef USE_AS_STPCPY |
295 | lea (VEC_SIZE * 3)(%rdi, %rdx), %rax |
296 | # endif |
297 | VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) |
298 | add $(VEC_SIZE - 1), %r8 |
299 | sub %rdx, %r8 |
300 | lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi |
301 | jmp L(StrncpyFillTailWithZero) |
302 | # else |
303 | add $(VEC_SIZE * 3), %rsi |
304 | add $(VEC_SIZE * 3), %rdi |
305 | jmp L(CopyVecSizeExit) |
306 | # endif |
307 | |
/* Source offset within a 4 * VEC_SIZE block is at most 2 * VEC_SIZE;
   read the first two vectors with unaligned loads.  */
309 | |
310 | L(SourceStringAlignmentLessTwoVecSize): |
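/* With a source offset of at most 2 * VEC_SIZE within a 4 * VEC_SIZE
   block, the two unaligned loads below stay inside that block and
   therefore cannot cross a page boundary.  */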
311 | VMOVU (%rsi), %YMM3 |
312 | VMOVU VEC_SIZE(%rsi), %YMM2 |
313 | vpcmpb $0, %YMM3, %YMMZERO, %k0 |
314 | kmovd %k0, %edx |
315 | |
316 | # ifdef USE_AS_STRNCPY |
317 | # if defined USE_AS_STPCPY || defined USE_AS_STRCAT |
318 | cmp $VEC_SIZE, %r8 |
319 | # else |
320 | cmp $(VEC_SIZE + 1), %r8 |
321 | # endif |
322 | jbe L(CopyVecSizeTail1Case2OrCase3) |
323 | # endif |
324 | test %edx, %edx |
325 | jnz L(CopyVecSizeTail1) |
326 | |
327 | VMOVU %YMM3, (%rdi) |
328 | vpcmpb $0, %YMM2, %YMMZERO, %k0 |
329 | kmovd %k0, %edx |
330 | |
331 | # ifdef USE_AS_STRNCPY |
332 | # if defined USE_AS_STPCPY || defined USE_AS_STRCAT |
333 | cmp $(VEC_SIZE * 2), %r8 |
334 | # else |
335 | cmp $((VEC_SIZE * 2) + 1), %r8 |
336 | # endif |
337 | jbe L(CopyTwoVecSize1Case2OrCase3) |
338 | # endif |
339 | test %edx, %edx |
340 | jnz L(CopyTwoVecSize1) |
341 | |
342 | and $-VEC_SIZE, %rsi |
343 | and $(VEC_SIZE - 1), %ecx |
344 | jmp L(UnalignVecSizeBoth) |
345 | |
346 | /*------End of main part with loops---------------------*/ |
347 | |
348 | /* Case1 */ |
349 | |
350 | # if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) |
351 | .p2align 4 |
352 | L(CopyVecSize): |
353 | add %rcx, %rdi |
354 | # endif |
355 | L(CopyVecSizeTail): |
356 | add %rcx, %rsi |
357 | L(CopyVecSizeTail1): |
358 | bsf %edx, %edx |
359 | L(CopyVecSizeExit): |
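/* %rdx is the byte index of the null terminator relative to the
   current position; copy %rdx + 1 bytes (string plus terminator)
   through the size-binned exits below.  */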
360 | cmp $32, %edx |
361 | jae L(Exit32_63) |
362 | cmp $16, %edx |
363 | jae L(Exit16_31) |
364 | cmp $8, %edx |
365 | jae L(Exit8_15) |
366 | cmp $4, %edx |
367 | jae L(Exit4_7) |
368 | cmp $3, %edx |
369 | je L(Exit3) |
370 | cmp $1, %edx |
371 | ja L(Exit2) |
372 | je L(Exit1) |
373 | movb $0, (%rdi) |
374 | # ifdef USE_AS_STPCPY |
375 | lea (%rdi), %rax |
376 | # endif |
377 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
378 | sub $1, %r8 |
379 | lea 1(%rdi), %rdi |
380 | jnz L(StrncpyFillTailWithZero) |
381 | # endif |
382 | ret |
383 | |
384 | .p2align 4 |
385 | L(CopyTwoVecSize1): |
386 | add $VEC_SIZE, %rsi |
387 | add $VEC_SIZE, %rdi |
388 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
389 | sub $VEC_SIZE, %r8 |
390 | # endif |
391 | jmp L(CopyVecSizeTail1) |
392 | |
393 | .p2align 4 |
394 | L(CopyTwoVecSize): |
395 | bsf %edx, %edx |
396 | add %rcx, %rsi |
397 | add $VEC_SIZE, %edx |
398 | sub %ecx, %edx |
399 | jmp L(CopyVecSizeExit) |
400 | |
401 | .p2align 4 |
402 | L(CopyVecSizeUnaligned_0): |
403 | bsf %edx, %edx |
404 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
405 | # ifdef USE_AS_STPCPY |
406 | lea (%rdi, %rdx), %rax |
407 | # endif |
408 | VMOVU %YMM4, (%rdi) |
409 | add $((VEC_SIZE * 4) - 1), %r8 |
410 | sub %rdx, %r8 |
411 | lea 1(%rdi, %rdx), %rdi |
412 | jmp L(StrncpyFillTailWithZero) |
413 | # else |
414 | jmp L(CopyVecSizeExit) |
415 | # endif |
416 | |
417 | .p2align 4 |
418 | L(CopyVecSizeUnaligned_16): |
419 | bsf %ecx, %edx |
420 | VMOVU %YMM4, (%rdi) |
421 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
422 | # ifdef USE_AS_STPCPY |
423 | lea VEC_SIZE(%rdi, %rdx), %rax |
424 | # endif |
425 | VMOVU %YMM5, VEC_SIZE(%rdi) |
426 | add $((VEC_SIZE * 3) - 1), %r8 |
427 | sub %rdx, %r8 |
428 | lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi |
429 | jmp L(StrncpyFillTailWithZero) |
430 | # else |
431 | add $VEC_SIZE, %rsi |
432 | add $VEC_SIZE, %rdi |
433 | jmp L(CopyVecSizeExit) |
434 | # endif |
435 | |
436 | .p2align 4 |
437 | L(CopyVecSizeUnaligned_32): |
438 | bsf %edx, %edx |
439 | VMOVU %YMM4, (%rdi) |
440 | VMOVU %YMM5, VEC_SIZE(%rdi) |
441 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
442 | # ifdef USE_AS_STPCPY |
443 | lea (VEC_SIZE * 2)(%rdi, %rdx), %rax |
444 | # endif |
445 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
446 | add $((VEC_SIZE * 2) - 1), %r8 |
447 | sub %rdx, %r8 |
448 | lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi |
449 | jmp L(StrncpyFillTailWithZero) |
450 | # else |
451 | add $(VEC_SIZE * 2), %rsi |
452 | add $(VEC_SIZE * 2), %rdi |
453 | jmp L(CopyVecSizeExit) |
454 | # endif |
455 | |
456 | # ifdef USE_AS_STRNCPY |
457 | # ifndef USE_AS_STRCAT |
458 | .p2align 4 |
459 | L(CopyVecSizeUnalignedVec6): |
460 | VMOVU %YMM6, (%rdi, %rcx) |
461 | jmp L(CopyVecSizeVecExit) |
462 | |
463 | .p2align 4 |
464 | L(CopyVecSizeUnalignedVec5): |
465 | VMOVU %YMM5, (%rdi, %rcx) |
466 | jmp L(CopyVecSizeVecExit) |
467 | |
468 | .p2align 4 |
469 | L(CopyVecSizeUnalignedVec4): |
470 | VMOVU %YMM4, (%rdi, %rcx) |
471 | jmp L(CopyVecSizeVecExit) |
472 | |
473 | .p2align 4 |
474 | L(CopyVecSizeUnalignedVec3): |
475 | VMOVU %YMM3, (%rdi, %rcx) |
476 | jmp L(CopyVecSizeVecExit) |
477 | # endif |
478 | |
479 | /* Case2 */ |
480 | |
481 | .p2align 4 |
482 | L(CopyVecSizeCase2): |
483 | add $VEC_SIZE, %r8 |
484 | add %rcx, %rdi |
485 | add %rcx, %rsi |
486 | bsf %edx, %edx |
487 | cmp %r8d, %edx |
488 | jb L(CopyVecSizeExit) |
489 | jmp L(StrncpyExit) |
490 | |
491 | .p2align 4 |
492 | L(CopyTwoVecSizeCase2): |
493 | add %rcx, %rsi |
494 | bsf %edx, %edx |
495 | add $VEC_SIZE, %edx |
496 | sub %ecx, %edx |
497 | cmp %r8d, %edx |
498 | jb L(CopyVecSizeExit) |
499 | jmp L(StrncpyExit) |
500 | |
501 | L(CopyVecSizeTailCase2): |
502 | add %rcx, %rsi |
503 | bsf %edx, %edx |
504 | cmp %r8d, %edx |
505 | jb L(CopyVecSizeExit) |
506 | jmp L(StrncpyExit) |
507 | |
508 | L(CopyVecSizeTail1Case2): |
509 | bsf %edx, %edx |
510 | cmp %r8d, %edx |
511 | jb L(CopyVecSizeExit) |
512 | jmp L(StrncpyExit) |
513 | |
514 | /* Case2 or Case3, Case3 */ |
515 | |
516 | .p2align 4 |
517 | L(CopyVecSizeCase2OrCase3): |
518 | test %rdx, %rdx |
519 | jnz L(CopyVecSizeCase2) |
520 | L(CopyVecSizeCase3): |
521 | add $VEC_SIZE, %r8 |
522 | add %rcx, %rdi |
523 | add %rcx, %rsi |
524 | jmp L(StrncpyExit) |
525 | |
526 | .p2align 4 |
527 | L(CopyTwoVecSizeCase2OrCase3): |
528 | test %rdx, %rdx |
529 | jnz L(CopyTwoVecSizeCase2) |
530 | add %rcx, %rsi |
531 | jmp L(StrncpyExit) |
532 | |
533 | .p2align 4 |
534 | L(CopyVecSizeTailCase2OrCase3): |
535 | test %rdx, %rdx |
536 | jnz L(CopyVecSizeTailCase2) |
537 | add %rcx, %rsi |
538 | jmp L(StrncpyExit) |
539 | |
540 | .p2align 4 |
541 | L(CopyTwoVecSize1Case2OrCase3): |
542 | add $VEC_SIZE, %rdi |
543 | add $VEC_SIZE, %rsi |
544 | sub $VEC_SIZE, %r8 |
545 | L(CopyVecSizeTail1Case2OrCase3): |
546 | test %rdx, %rdx |
547 | jnz L(CopyVecSizeTail1Case2) |
548 | jmp L(StrncpyExit) |
549 | # endif |
550 | |
/*-----End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes-----*/
552 | |
553 | .p2align 4 |
554 | L(Exit1): |
555 | movzwl (%rsi), %edx |
556 | mov %dx, (%rdi) |
557 | # ifdef USE_AS_STPCPY |
558 | lea 1(%rdi), %rax |
559 | # endif |
560 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
561 | sub $2, %r8 |
562 | lea 2(%rdi), %rdi |
563 | jnz L(StrncpyFillTailWithZero) |
564 | # endif |
565 | ret |
566 | |
567 | .p2align 4 |
568 | L(Exit2): |
569 | movzwl (%rsi), %ecx |
570 | mov %cx, (%rdi) |
571 | movb $0, 2(%rdi) |
572 | # ifdef USE_AS_STPCPY |
573 | lea 2(%rdi), %rax |
574 | # endif |
575 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
576 | sub $3, %r8 |
577 | lea 3(%rdi), %rdi |
578 | jnz L(StrncpyFillTailWithZero) |
579 | # endif |
580 | ret |
581 | |
582 | .p2align 4 |
583 | L(Exit3): |
584 | mov (%rsi), %edx |
585 | mov %edx, (%rdi) |
586 | # ifdef USE_AS_STPCPY |
587 | lea 3(%rdi), %rax |
588 | # endif |
589 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
590 | sub $4, %r8 |
591 | lea 4(%rdi), %rdi |
592 | jnz L(StrncpyFillTailWithZero) |
593 | # endif |
594 | ret |
595 | |
596 | .p2align 4 |
597 | L(Exit4_7): |
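/* Null terminator at index 4..7 (%rdx): copy the first four and the
   last four bytes of the %rdx + 1 byte result; the two copies may
   overlap.  Illustrative C sketch (len == %rdx + 1):

       memcpy (dst, src, 4);
       memcpy (dst + len - 4, src + len - 4, 4);
 */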
598 | mov (%rsi), %ecx |
599 | mov %ecx, (%rdi) |
600 | mov -3(%rsi, %rdx), %ecx |
601 | mov %ecx, -3(%rdi, %rdx) |
602 | # ifdef USE_AS_STPCPY |
603 | lea (%rdi, %rdx), %rax |
604 | # endif |
605 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
606 | sub %rdx, %r8 |
607 | sub $1, %r8 |
608 | lea 1(%rdi, %rdx), %rdi |
609 | jnz L(StrncpyFillTailWithZero) |
610 | # endif |
611 | ret |
612 | |
613 | .p2align 4 |
614 | L(Exit8_15): |
615 | mov (%rsi), %rcx |
616 | mov -7(%rsi, %rdx), %r9 |
617 | mov %rcx, (%rdi) |
618 | mov %r9, -7(%rdi, %rdx) |
619 | # ifdef USE_AS_STPCPY |
620 | lea (%rdi, %rdx), %rax |
621 | # endif |
622 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
623 | sub %rdx, %r8 |
624 | sub $1, %r8 |
625 | lea 1(%rdi, %rdx), %rdi |
626 | jnz L(StrncpyFillTailWithZero) |
627 | # endif |
628 | ret |
629 | |
630 | .p2align 4 |
631 | L(Exit16_31): |
632 | VMOVU (%rsi), %XMM2 |
633 | VMOVU -15(%rsi, %rdx), %XMM3 |
634 | VMOVU %XMM2, (%rdi) |
635 | VMOVU %XMM3, -15(%rdi, %rdx) |
636 | # ifdef USE_AS_STPCPY |
637 | lea (%rdi, %rdx), %rax |
638 | # endif |
639 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
640 | sub %rdx, %r8 |
641 | sub $1, %r8 |
642 | lea 1(%rdi, %rdx), %rdi |
643 | jnz L(StrncpyFillTailWithZero) |
644 | # endif |
645 | ret |
646 | |
647 | .p2align 4 |
648 | L(Exit32_63): |
649 | VMOVU (%rsi), %YMM2 |
650 | VMOVU -31(%rsi, %rdx), %YMM3 |
651 | VMOVU %YMM2, (%rdi) |
652 | VMOVU %YMM3, -31(%rdi, %rdx) |
653 | # ifdef USE_AS_STPCPY |
654 | lea (%rdi, %rdx), %rax |
655 | # endif |
656 | # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT |
657 | sub %rdx, %r8 |
658 | sub $1, %r8 |
659 | lea 1(%rdi, %rdx), %rdi |
660 | jnz L(StrncpyFillTailWithZero) |
661 | # endif |
662 | ret |
663 | |
664 | # ifdef USE_AS_STRNCPY |
665 | |
666 | .p2align 4 |
667 | L(StrncpyExit1): |
668 | movzbl (%rsi), %edx |
669 | mov %dl, (%rdi) |
670 | # ifdef USE_AS_STPCPY |
671 | lea 1(%rdi), %rax |
672 | # endif |
673 | # ifdef USE_AS_STRCAT |
674 | movb $0, 1(%rdi) |
675 | # endif |
676 | ret |
677 | |
678 | .p2align 4 |
679 | L(StrncpyExit2): |
680 | movzwl (%rsi), %edx |
681 | mov %dx, (%rdi) |
682 | # ifdef USE_AS_STPCPY |
683 | lea 2(%rdi), %rax |
684 | # endif |
685 | # ifdef USE_AS_STRCAT |
686 | movb $0, 2(%rdi) |
687 | # endif |
688 | ret |
689 | |
690 | .p2align 4 |
691 | L(StrncpyExit3_4): |
692 | movzwl (%rsi), %ecx |
693 | movzwl -2(%rsi, %r8), %edx |
694 | mov %cx, (%rdi) |
695 | mov %dx, -2(%rdi, %r8) |
696 | # ifdef USE_AS_STPCPY |
697 | lea (%rdi, %r8), %rax |
698 | # endif |
699 | # ifdef USE_AS_STRCAT |
700 | movb $0, (%rdi, %r8) |
701 | # endif |
702 | ret |
703 | |
704 | .p2align 4 |
705 | L(StrncpyExit5_8): |
706 | mov (%rsi), %ecx |
707 | mov -4(%rsi, %r8), %edx |
708 | mov %ecx, (%rdi) |
709 | mov %edx, -4(%rdi, %r8) |
710 | # ifdef USE_AS_STPCPY |
711 | lea (%rdi, %r8), %rax |
712 | # endif |
713 | # ifdef USE_AS_STRCAT |
714 | movb $0, (%rdi, %r8) |
715 | # endif |
716 | ret |
717 | |
718 | .p2align 4 |
719 | L(StrncpyExit9_16): |
720 | mov (%rsi), %rcx |
721 | mov -8(%rsi, %r8), %rdx |
722 | mov %rcx, (%rdi) |
723 | mov %rdx, -8(%rdi, %r8) |
724 | # ifdef USE_AS_STPCPY |
725 | lea (%rdi, %r8), %rax |
726 | # endif |
727 | # ifdef USE_AS_STRCAT |
728 | movb $0, (%rdi, %r8) |
729 | # endif |
730 | ret |
731 | |
732 | .p2align 4 |
733 | L(StrncpyExit17_32): |
734 | VMOVU (%rsi), %XMM2 |
735 | VMOVU -16(%rsi, %r8), %XMM3 |
736 | VMOVU %XMM2, (%rdi) |
737 | VMOVU %XMM3, -16(%rdi, %r8) |
738 | # ifdef USE_AS_STPCPY |
739 | lea (%rdi, %r8), %rax |
740 | # endif |
741 | # ifdef USE_AS_STRCAT |
742 | movb $0, (%rdi, %r8) |
743 | # endif |
744 | ret |
745 | |
746 | .p2align 4 |
747 | L(StrncpyExit33_64): |
748 | /* 0/32, 31/16 */ |
749 | VMOVU (%rsi), %YMM2 |
750 | VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 |
751 | VMOVU %YMM2, (%rdi) |
752 | VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) |
753 | # ifdef USE_AS_STPCPY |
754 | lea (%rdi, %r8), %rax |
755 | # endif |
756 | # ifdef USE_AS_STRCAT |
757 | movb $0, (%rdi, %r8) |
758 | # endif |
759 | ret |
760 | |
761 | .p2align 4 |
762 | L(StrncpyExit65): |
763 | /* 0/32, 32/32, 64/1 */ |
764 | VMOVU (%rsi), %YMM2 |
765 | VMOVU 32(%rsi), %YMM3 |
766 | mov 64(%rsi), %cl |
767 | VMOVU %YMM2, (%rdi) |
768 | VMOVU %YMM3, 32(%rdi) |
769 | mov %cl, 64(%rdi) |
770 | # ifdef USE_AS_STPCPY |
771 | lea 65(%rdi), %rax |
772 | # endif |
773 | # ifdef USE_AS_STRCAT |
774 | movb $0, 65(%rdi) |
775 | # endif |
776 | ret |
777 | |
778 | # ifndef USE_AS_STRCAT |
779 | |
780 | .p2align 4 |
781 | L(Fill1): |
782 | mov %dl, (%rdi) |
783 | ret |
784 | |
785 | .p2align 4 |
786 | L(Fill2): |
787 | mov %dx, (%rdi) |
788 | ret |
789 | |
790 | .p2align 4 |
791 | L(Fill3_4): |
792 | mov %dx, (%rdi) |
793 | mov %dx, -2(%rdi, %r8) |
794 | ret |
795 | |
796 | .p2align 4 |
797 | L(Fill5_8): |
798 | mov %edx, (%rdi) |
799 | mov %edx, -4(%rdi, %r8) |
800 | ret |
801 | |
802 | .p2align 4 |
803 | L(Fill9_16): |
804 | mov %rdx, (%rdi) |
805 | mov %rdx, -8(%rdi, %r8) |
806 | ret |
807 | |
808 | .p2align 4 |
809 | L(Fill17_32): |
810 | VMOVU %XMMZERO, (%rdi) |
811 | VMOVU %XMMZERO, -16(%rdi, %r8) |
812 | ret |
813 | |
814 | .p2align 4 |
815 | L(CopyVecSizeUnalignedVec2): |
816 | VMOVU %YMM2, (%rdi, %rcx) |
817 | |
818 | .p2align 4 |
819 | L(CopyVecSizeVecExit): |
820 | bsf %edx, %edx |
821 | add $(VEC_SIZE - 1), %r8 |
822 | add %rcx, %rdi |
823 | # ifdef USE_AS_STPCPY |
824 | lea (%rdi, %rdx), %rax |
825 | # endif |
826 | sub %rdx, %r8 |
827 | lea 1(%rdi, %rdx), %rdi |
828 | |
829 | .p2align 4 |
830 | L(StrncpyFillTailWithZero): |
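/* strncpy pads the destination with null bytes up to the count: zero
   the remaining %r8 bytes starting at %rdi.  %rdx is cleared so the
   small L(Fill*) cases can use it as the source of zero bytes.  */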
831 | xor %edx, %edx |
832 | sub $VEC_SIZE, %r8 |
833 | jbe L(StrncpyFillExit) |
834 | |
835 | VMOVU %YMMZERO, (%rdi) |
836 | add $VEC_SIZE, %rdi |
837 | |
838 | mov %rdi, %rsi |
839 | and $(VEC_SIZE - 1), %esi |
840 | sub %rsi, %rdi |
841 | add %rsi, %r8 |
842 | sub $(VEC_SIZE * 4), %r8 |
843 | jb L(StrncpyFillLessFourVecSize) |
844 | |
845 | L(StrncpyFillLoopVmovdqa): |
846 | VMOVA %YMMZERO, (%rdi) |
847 | VMOVA %YMMZERO, VEC_SIZE(%rdi) |
848 | VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) |
849 | VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) |
850 | add $(VEC_SIZE * 4), %rdi |
851 | sub $(VEC_SIZE * 4), %r8 |
852 | jae L(StrncpyFillLoopVmovdqa) |
853 | |
854 | L(StrncpyFillLessFourVecSize): |
855 | add $(VEC_SIZE * 2), %r8 |
856 | jl L(StrncpyFillLessTwoVecSize) |
857 | VMOVA %YMMZERO, (%rdi) |
858 | VMOVA %YMMZERO, VEC_SIZE(%rdi) |
859 | add $(VEC_SIZE * 2), %rdi |
860 | sub $VEC_SIZE, %r8 |
861 | jl L(StrncpyFillExit) |
862 | VMOVA %YMMZERO, (%rdi) |
863 | add $VEC_SIZE, %rdi |
864 | jmp L(Fill) |
865 | |
866 | .p2align 4 |
867 | L(StrncpyFillLessTwoVecSize): |
868 | add $VEC_SIZE, %r8 |
869 | jl L(StrncpyFillExit) |
870 | VMOVA %YMMZERO, (%rdi) |
871 | add $VEC_SIZE, %rdi |
872 | jmp L(Fill) |
873 | |
874 | .p2align 4 |
875 | L(StrncpyFillExit): |
876 | add $VEC_SIZE, %r8 |
877 | L(Fill): |
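/* Zero the final %r8 bytes (at most VEC_SIZE) with size-binned,
   possibly overlapping stores; %rdx is zero here.  */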
878 | cmp $17, %r8d |
879 | jae L(Fill17_32) |
880 | cmp $9, %r8d |
881 | jae L(Fill9_16) |
882 | cmp $5, %r8d |
883 | jae L(Fill5_8) |
884 | cmp $3, %r8d |
885 | jae L(Fill3_4) |
886 | cmp $1, %r8d |
887 | ja L(Fill2) |
888 | je L(Fill1) |
889 | ret |
890 | |
891 | /* end of ifndef USE_AS_STRCAT */ |
892 | # endif |
893 | |
894 | .p2align 4 |
895 | L(UnalignedLeaveCase2OrCase3): |
896 | test %rdx, %rdx |
897 | jnz L(UnalignedFourVecSizeLeaveCase2) |
898 | L(UnalignedFourVecSizeLeaveCase3): |
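/* The count limit is reached within these four vectors and none of
   them contains a null byte: store whole vectors while the budget
   lasts, then jump to L(CopyVecSizeCase3), which copies the final
   partial vector via L(StrncpyExit).  */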
899 | lea (VEC_SIZE * 4)(%r8), %rcx |
900 | and $-VEC_SIZE, %rcx |
901 | add $(VEC_SIZE * 3), %r8 |
902 | jl L(CopyVecSizeCase3) |
903 | VMOVU %YMM4, (%rdi) |
904 | sub $VEC_SIZE, %r8 |
905 | jb L(CopyVecSizeCase3) |
906 | VMOVU %YMM5, VEC_SIZE(%rdi) |
907 | sub $VEC_SIZE, %r8 |
908 | jb L(CopyVecSizeCase3) |
909 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
910 | sub $VEC_SIZE, %r8 |
911 | jb L(CopyVecSizeCase3) |
912 | VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) |
913 | # ifdef USE_AS_STPCPY |
914 | lea (VEC_SIZE * 4)(%rdi), %rax |
915 | # endif |
916 | # ifdef USE_AS_STRCAT |
917 | movb $0, (VEC_SIZE * 4)(%rdi) |
918 | # endif |
919 | ret |
920 | |
921 | .p2align 4 |
922 | L(UnalignedFourVecSizeLeaveCase2): |
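/* Both a null byte and the count limit fall within these four
   vectors: walk them one at a time and stop at whichever comes
   first.  */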
923 | xor %ecx, %ecx |
924 | vpcmpb $0, %YMM4, %YMMZERO, %k1 |
925 | kmovd %k1, %edx |
926 | add $(VEC_SIZE * 3), %r8 |
927 | jle L(CopyVecSizeCase2OrCase3) |
928 | test %edx, %edx |
929 | # ifndef USE_AS_STRCAT |
930 | jnz L(CopyVecSizeUnalignedVec4) |
931 | # else |
932 | jnz L(CopyVecSize) |
933 | # endif |
934 | vpcmpb $0, %YMM5, %YMMZERO, %k2 |
935 | kmovd %k2, %edx |
936 | VMOVU %YMM4, (%rdi) |
937 | add $VEC_SIZE, %rcx |
938 | sub $VEC_SIZE, %r8 |
939 | jbe L(CopyVecSizeCase2OrCase3) |
940 | test %edx, %edx |
941 | # ifndef USE_AS_STRCAT |
942 | jnz L(CopyVecSizeUnalignedVec5) |
943 | # else |
944 | jnz L(CopyVecSize) |
945 | # endif |
946 | |
947 | vpcmpb $0, %YMM6, %YMMZERO, %k3 |
948 | kmovd %k3, %edx |
949 | VMOVU %YMM5, VEC_SIZE(%rdi) |
950 | add $VEC_SIZE, %rcx |
951 | sub $VEC_SIZE, %r8 |
952 | jbe L(CopyVecSizeCase2OrCase3) |
953 | test %edx, %edx |
954 | # ifndef USE_AS_STRCAT |
955 | jnz L(CopyVecSizeUnalignedVec6) |
956 | # else |
957 | jnz L(CopyVecSize) |
958 | # endif |
959 | |
960 | vpcmpb $0, %YMM7, %YMMZERO, %k4 |
961 | kmovd %k4, %edx |
962 | VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) |
963 | lea VEC_SIZE(%rdi, %rcx), %rdi |
964 | lea VEC_SIZE(%rsi, %rcx), %rsi |
965 | bsf %edx, %edx |
966 | cmp %r8d, %edx |
967 | jb L(CopyVecSizeExit) |
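/* No null byte among the remaining %r8 bytes to copy: copy exactly
   %r8 bytes (at most 2 * VEC_SIZE + 1); stpcpy returns %rdi + %r8 and
   strcat stores the trailing null byte.  */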
968 | L(StrncpyExit): |
969 | cmp $65, %r8d |
970 | je L(StrncpyExit65) |
971 | cmp $33, %r8d |
972 | jae L(StrncpyExit33_64) |
973 | cmp $17, %r8d |
974 | jae L(StrncpyExit17_32) |
975 | cmp $9, %r8d |
976 | jae L(StrncpyExit9_16) |
977 | cmp $5, %r8d |
978 | jae L(StrncpyExit5_8) |
979 | cmp $3, %r8d |
980 | jae L(StrncpyExit3_4) |
981 | cmp $1, %r8d |
982 | ja L(StrncpyExit2) |
983 | je L(StrncpyExit1) |
984 | # ifdef USE_AS_STPCPY |
985 | mov %rdi, %rax |
986 | # endif |
987 | # ifdef USE_AS_STRCAT |
988 | movb $0, (%rdi) |
989 | # endif |
990 | ret |
991 | |
992 | .p2align 4 |
993 | L(ExitZero): |
994 | # ifndef USE_AS_STRCAT |
995 | mov %rdi, %rax |
996 | # endif |
997 | ret |
998 | |
999 | # endif |
1000 | |
1001 | # ifndef USE_AS_STRCAT |
1002 | END (STRCPY) |
1003 | # else |
1004 | END (STRCAT) |
1005 | # endif |
1006 | #endif |
1007 | |