/* wcscpy with SSSE3
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 4 because there are no V3/V4
   implementations of wcscpy, so this file must also be built for
   ISA V3/V4 builds.  */
#if ISA_SHOULD_BUILD (4)

# ifndef WCSCPY
#  define WCSCPY	__wcscpy_ssse3
# endif

# include <sysdep.h>

	.section .text.ssse3,"ax",@progbits
ENTRY (WCSCPY)

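	/* %rdi is the destination (also the return value) and %rsi the
	   source.  Work on copies in %rdx and %rcx so %rdi survives to
	   the return.  */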
	mov	%rsi, %rcx
	mov	%rdi, %rdx

	cmpl	$0, (%rcx)
	jz	L(Exit4)
	cmpl	$0, 4(%rcx)
	jz	L(Exit8)
	cmpl	$0, 8(%rcx)
	jz	L(Exit12)
	cmpl	$0, 12(%rcx)
	jz	L(Exit16)

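	/* No null wide char in the first 16 bytes.  Round the source up
	   to the next 16-byte boundary for aligned vector loads.  */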
	lea	16(%rcx), %rsi
	and	$-16, %rsi

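	/* Copy the first 16 bytes with two 8-byte moves and test the
	   first aligned 16-byte source block for a null wide char.
	   After the sub, %rsi is the offset from %rcx to that aligned
	   block.  */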
	pxor	%xmm0, %xmm0
	mov	(%rcx), %r9
	mov	%r9, (%rdx)

	pcmpeqd	(%rsi), %xmm0
	mov	8(%rcx), %r9
	mov	%r9, 8(%rdx)

	pmovmskb %xmm0, %rax
	sub	%rcx, %rsi

	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

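	/* Round the destination up to a 16-byte boundary and advance the
	   source by the same amount.  %rax then holds the source address
	   modulo 16, i.e. the source/destination misalignment, which is
	   0, 4, 8 or 12 for wide-character strings.  */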
	mov	%rdx, %rax
	addq	$16, %rdx
	and	$-16, %rdx
	sub	%rdx, %rax
	sub	%rax, %rcx
	mov	%rcx, %rax
	and	$0xf, %rax
	mov	$0, %rsi

	/* Case: the source shares the destination's 16-byte offset, so
	   both sides can use aligned loads and stores.  */

	jz	L(Align16Both)

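	/* Otherwise dispatch on the 4-, 8- or 12-byte relative
	   misalignment.  */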
	cmp	$4, %rax
	je	L(Shl4)
	cmp	$8, %rax
	je	L(Shl8)
	jmp	L(Shl12)

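	/* Source and destination are mutually 16-byte aligned.  Copy 16
	   bytes per step, checking the following block for a null wide
	   char each time, then round the source down to a 64-byte
	   boundary for the main loop.  */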
L(Align16Both):
	movaps	(%rcx), %xmm1
	movaps	16(%rcx), %xmm2
	movaps	%xmm1, (%rdx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm4
	movaps	%xmm3, (%rdx, %rsi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm1
	movaps	%xmm4, (%rdx, %rsi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm2
	movaps	%xmm1, (%rdx, %rsi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%rcx, %rsi), %xmm3
	movaps	%xmm2, (%rdx, %rsi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq	$16, %rsi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm3, (%rdx, %rsi)
	mov	%rcx, %rax
	lea	16(%rcx, %rsi), %rcx
	and	$-0x40, %rcx
	sub	%rcx, %rax
	sub	%rax, %rdx

	mov	$-0x40, %rsi

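	/* Main loop: the source is now 64-byte aligned.  Load 64 bytes,
	   fold the four vectors with pminub so any zero byte survives
	   into the combined minimum, and test it for a zero wide char.
	   A hit can be a false positive built from different vectors, so
	   L(Aligned64Leave) rechecks each vector individually.  */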
	.p2align 4
L(Aligned64Loop):
	movaps	(%rcx), %xmm2
	movaps	%xmm2, %xmm4
	movaps	16(%rcx), %xmm5
	movaps	32(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	48(%rcx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	pcmpeqd	%xmm0, %xmm3
	pmovmskb %xmm3, %eax
	addq	$64, %rdx
	addq	$64, %rcx
	testl	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%rdx)
	movaps	%xmm5, -48(%rdx)
	movaps	%xmm6, -32(%rdx)
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

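	/* A zero wide char may lie in one of the four blocks just read.
	   Check each vector in turn, storing the preceding ones; %rsi
	   (starting at -0x40) lets L(CopyFrom1To16Bytes) rewind %rcx and
	   %rdx to the block that holds the terminator.  */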
L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0

	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%rdx)
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0

	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%rdx)
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%rdx)
	pcmpeqd	%xmm7, %xmm0

	pmovmskb %xmm0, %eax
	addq	$16, %rsi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	mov	$-0x40, %rsi
	movaps	%xmm7, -16(%rdx)
	jmp	L(Aligned64Loop)

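	/* L(Shl4): the source is misaligned by 4 bytes relative to the
	   16-byte-aligned destination.  Load aligned source blocks and
	   recombine adjacent pairs with palignr $4 before each aligned
	   store, checking every block for a null wide char.  */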
	.p2align 4
L(Shl4):
	movaps	-4(%rcx), %xmm1
	movaps	12(%rcx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	28(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$28, %rcx
	addq	$16, %rdx

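	/* Re-align for the unrolled loop: round the source down to a
	   64-byte boundary, move the destination back by the rounding
	   amount, then bias the source by -12 so the 12(%rcx) loads in
	   L(Shl4LoopStart) stay 16-byte aligned.  */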
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-12, %rcx
	sub	%rax, %rdx

	movaps	-4(%rcx), %xmm1

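	/* Unrolled 64-byte loop for the 4-byte skew: load four aligned
	   blocks, test them for a null wide char via the combined pminub
	   minimum (rechecked in L(Shl4Start) on a hit), then merge
	   neighbouring blocks with palignr and store 64 bytes.  */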
	.p2align 4
L(Shl4LoopStart):
	movaps	12(%rcx), %xmm2
	movaps	28(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl4LoopStart)

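	/* The aligned block at 12(%rcx) contains a null wide char.  Copy
	   the 12 bytes up to it with one unaligned 16-byte move and let
	   L(CopyFrom1To16Bytes) finish from that block (%rsi = 12).  */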
L(Shl4LoopExit):
	movdqu	-4(%rcx), %xmm1
	mov	$12, %rsi
	movdqu	%xmm1, -4(%rdx)
	jmp	L(CopyFrom1To16Bytes)

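	/* L(Shl8): same scheme as L(Shl4), for a source that is
	   misaligned by 8 bytes relative to the destination.  */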
	.p2align 4
L(Shl8):
	movaps	-8(%rcx), %xmm1
	movaps	8(%rcx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	24(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$24, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-8, %rcx
	sub	%rax, %rdx

	movaps	-8(%rcx), %xmm1

	.p2align 4
L(Shl8LoopStart):
	movaps	8(%rcx), %xmm2
	movaps	24(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl8LoopStart)

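	/* The aligned block at 8(%rcx) contains a null wide char.  Copy
	   the remaining 8 bytes before it and let L(CopyFrom1To16Bytes)
	   finish from that block (%rsi = 8).  */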
L(Shl8LoopExit):
	mov	(%rcx), %r9
	mov	$8, %rsi
	mov	%r9, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

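	/* L(Shl12): same scheme as L(Shl4), for a source that is
	   misaligned by 12 bytes relative to the destination.  */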
	.p2align 4
L(Shl12):
	movaps	-12(%rcx), %xmm1
	movaps	4(%rcx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	addq	$16, %rdx
	pmovmskb %xmm0, %eax
	addq	$16, %rcx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	addq	$20, %rcx
	addq	$16, %rdx

	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	addq	$-4, %rcx
	sub	%rax, %rdx

	movaps	-12(%rcx), %xmm1

	.p2align 4
L(Shl12LoopStart):
	movaps	4(%rcx), %xmm2
	movaps	20(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%rcx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl12Start)
	palignr	$12, %xmm2, %xmm3
	addq	$64, %rcx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	addq	$64, %rdx
	jmp	L(Shl12LoopStart)

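	/* The aligned block at 4(%rcx) contains a null wide char.  Copy
	   the remaining 4 bytes before it and let L(CopyFrom1To16Bytes)
	   finish from that block (%rsi = 4).  */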
L(Shl12LoopExit):
	mov	(%rcx), %r9d
	mov	$4, %rsi
	mov	%r9d, (%rdx)
	jmp	L(CopyFrom1To16Bytes)

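	/* %rcx + %rsi points at the 16-byte block holding the null
	   terminator and %eax holds its pcmpeqd/pmovmskb mask.  Advance
	   both pointers, then copy the final 4, 8, 12 or 16 bytes
	   depending on which dword of the block is zero.  */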
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%rsi, %rdx
	add	%rsi, %rcx

	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)

	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit12)

	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

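	/* L(ExitN): the null wide char is the last of the N bytes to
	   copy; copy exactly N bytes and return the original destination
	   pointer.  */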
	.p2align 4
L(Exit4):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit8):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit12):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %eax
	mov	%eax, 8(%rdx)
	mov	%rdi, %rax
	ret

	.p2align 4
L(Exit16):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
	mov	%rdi, %rax
	ret

END(WCSCPY)
#endif