/* wcscpy with SSSE3
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

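/* wchar_t *__wcscpy_ssse3 (wchar_t *dest, const wchar_t *src)
   Copy the wide string SRC, including its terminating L'\0', to DEST
   and return DEST.  On x86-64 a wchar_t is 4 bytes, so the terminator
   is located with dword compares (pcmpeqd) against a zero vector.
   Register use below: %rdi = DEST (kept for the return value),
   %rdx = current destination pointer, %rcx = current source pointer,
   %rsi = scratch offset.  */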
        .section .text.ssse3,"ax",@progbits
ENTRY (__wcscpy_ssse3)

        mov %rsi, %rcx
        mov %rdi, %rdx

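/* Inspect the first four wide chars of SRC; strings of at most four
   wide chars (terminator included) are finished by the exit stubs.  */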
        cmpl $0, (%rcx)
        jz L(Exit4)
        cmpl $0, 4(%rcx)
        jz L(Exit8)
        cmpl $0, 8(%rcx)
        jz L(Exit12)
        cmpl $0, 12(%rcx)
        jz L(Exit16)

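/* No terminator in the first 16 bytes.  Copy them with two 8-byte
   moves while probing the next 16-byte-aligned source block (its
   address is rounded up into %rsi) for a zero wide char; afterwards
   %rsi holds that block's offset from SRC.  */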
        lea 16(%rcx), %rsi
        and $-16, %rsi

        pxor %xmm0, %xmm0
        mov (%rcx), %r9
        mov %r9, (%rdx)

        pcmpeqd (%rsi), %xmm0
        mov 8(%rcx), %r9
        mov %r9, 8(%rdx)

        pmovmskb %xmm0, %rax
        sub %rcx, %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

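/* Round DEST up to a 16-byte boundary and advance SRC by the same
   amount (the skipped bytes were already copied above).  %rax then
   holds SRC's remaining misalignment, which can only be 0, 4, 8 or 12
   because both pointers are wchar_t-aligned.  */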
        mov %rdx, %rax
        lea 16(%rdx), %rdx
        and $-16, %rdx
        sub %rdx, %rax
        sub %rax, %rcx
        mov %rcx, %rax
        and $0xf, %rax
        mov $0, %rsi

/* Case: %rcx and %rdx now share the same offset within a 16-byte
   block.  */

        jz L(Align16Both)

        cmp $4, %rax
        je L(Shl4)
        cmp $8, %rax
        je L(Shl8)
        jmp L(Shl12)

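/* SRC and DEST are both 16-byte aligned.  Copy 16 bytes at a time
   with aligned loads and stores, checking the block that follows the
   one being stored for a zero wide char; after seven blocks, align
   SRC down to a 64-byte boundary for the main loop.  %rsi tracks the
   offset of the block currently being examined.  */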
L(Align16Both):
        movaps (%rcx), %xmm1
        movaps 16(%rcx), %xmm2
        movaps %xmm1, (%rdx)
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%rcx, %rsi), %xmm3
        movaps %xmm2, (%rdx, %rsi)
        pcmpeqd %xmm3, %xmm0
        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%rcx, %rsi), %xmm4
        movaps %xmm3, (%rdx, %rsi)
        pcmpeqd %xmm4, %xmm0
        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%rcx, %rsi), %xmm1
        movaps %xmm4, (%rdx, %rsi)
        pcmpeqd %xmm1, %xmm0
        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%rcx, %rsi), %xmm2
        movaps %xmm1, (%rdx, %rsi)
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        movaps 16(%rcx, %rsi), %xmm3
        movaps %xmm2, (%rdx, %rsi)
        pcmpeqd %xmm3, %xmm0
        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi

        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        movaps %xmm3, (%rdx, %rsi)
        mov %rcx, %rax
        lea 16(%rcx, %rsi), %rcx
        and $-0x40, %rcx
        sub %rcx, %rax
        sub %rax, %rdx

        mov $-0x40, %rsi

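/* Main loop: SRC (%rcx) is 64-byte aligned.  Load four 16-byte
   blocks, fold them together with byte-wise minima (pminub) and test
   the result with pcmpeqd: a zero wide char anywhere in the 64 bytes
   forces a zero dword in the folded value.  Only when no candidate is
   found are all 64 bytes stored; otherwise L(Aligned64Leave) rechecks
   each block individually.  */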
        .p2align 4
L(Aligned64Loop):
        movaps (%rcx), %xmm2
        movaps %xmm2, %xmm4
        movaps 16(%rcx), %xmm5
        movaps 32(%rcx), %xmm3
        movaps %xmm3, %xmm6
        movaps 48(%rcx), %xmm7
        pminub %xmm5, %xmm2
        pminub %xmm7, %xmm3
        pminub %xmm2, %xmm3
        pcmpeqd %xmm0, %xmm3
        pmovmskb %xmm3, %rax
        lea 64(%rdx), %rdx
        lea 64(%rcx), %rcx
        test %rax, %rax
        jnz L(Aligned64Leave)
        movaps %xmm4, -64(%rdx)
        movaps %xmm5, -48(%rdx)
        movaps %xmm6, -32(%rdx)
        movaps %xmm7, -16(%rdx)
        jmp L(Aligned64Loop)

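/* A possible terminator was seen in the current 64 bytes.  Recheck
   the four blocks one by one, storing each block that precedes the
   terminator; if none of them really contains L'\0', store the rest
   and resume the main loop.  */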
L(Aligned64Leave):
        pcmpeqd %xmm4, %xmm0
        pmovmskb %xmm0, %rax
        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        pcmpeqd %xmm5, %xmm0

        pmovmskb %xmm0, %rax
        movaps %xmm4, -64(%rdx)
        test %rax, %rax
        lea 16(%rsi), %rsi
        jnz L(CopyFrom1To16Bytes)

        pcmpeqd %xmm6, %xmm0

        pmovmskb %xmm0, %rax
        movaps %xmm5, -48(%rdx)
        test %rax, %rax
        lea 16(%rsi), %rsi
        jnz L(CopyFrom1To16Bytes)

        movaps %xmm6, -32(%rdx)
        pcmpeqd %xmm7, %xmm0

        pmovmskb %xmm0, %rax
        lea 16(%rsi), %rsi
        test %rax, %rax
        jnz L(CopyFrom1To16Bytes)

        mov $-0x40, %rsi
        movaps %xmm7, -16(%rdx)
        jmp L(Aligned64Loop)

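/* DEST is 16-byte aligned but SRC is offset by 4 bytes (one wchar_t).
   Each aligned 16-byte store is assembled from two consecutive
   aligned source loads joined with palignr, while pcmpeqd keeps
   scanning the newly loaded block for the terminator.  */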
        .p2align 4
L(Shl4):
        movaps -4(%rcx), %xmm1
        movaps 12(%rcx), %xmm2
L(Shl4Start):
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %rax
        movaps %xmm2, %xmm3

        test %rax, %rax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm1, %xmm2
        movaps %xmm2, (%rdx)
        movaps 28(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx
        movaps %xmm2, %xmm1

        test %rax, %rax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm3, %xmm2
        movaps %xmm2, (%rdx)
        movaps 28(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx
        movaps %xmm2, %xmm3

        test %rax, %rax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm1, %xmm2
        movaps %xmm2, (%rdx)
        movaps 28(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx

        test %rax, %rax
        jnz L(Shl4LoopExit)

        palignr $4, %xmm3, %xmm2
        movaps %xmm2, (%rdx)
        lea 28(%rcx), %rcx
        lea 16(%rdx), %rdx

        mov %rcx, %rax
        and $-0x40, %rcx
        sub %rcx, %rax
        lea -12(%rcx), %rcx
        sub %rax, %rdx

        movaps -4(%rcx), %xmm1

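/* Unrolled 64-bytes-per-iteration version of the 4-byte-shift copy:
   the terminator test (pminub/pcmpeqd over all four blocks) mirrors
   L(Aligned64Loop); on a candidate hit the code falls back to
   L(Shl4Start) to locate it block by block.  */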
        .p2align 4
L(Shl4LoopStart):
        movaps 12(%rcx), %xmm2
        movaps 28(%rcx), %xmm3
        movaps %xmm3, %xmm6
        movaps 44(%rcx), %xmm4
        movaps %xmm4, %xmm7
        movaps 60(%rcx), %xmm5
        pminub %xmm2, %xmm6
        pminub %xmm5, %xmm7
        pminub %xmm6, %xmm7
        pcmpeqd %xmm0, %xmm7
        pmovmskb %xmm7, %rax
        movaps %xmm5, %xmm7
        palignr $4, %xmm4, %xmm5
        test %rax, %rax
        palignr $4, %xmm3, %xmm4
        jnz L(Shl4Start)

        palignr $4, %xmm2, %xmm3
        lea 64(%rcx), %rcx
        palignr $4, %xmm1, %xmm2
        movaps %xmm7, %xmm1
        movaps %xmm5, 48(%rdx)
        movaps %xmm4, 32(%rdx)
        movaps %xmm3, 16(%rdx)
        movaps %xmm2, (%rdx)
        lea 64(%rdx), %rdx
        jmp L(Shl4LoopStart)

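/* Terminator found in the block at 12(%rcx): copy the 16 unaligned
   bytes ending just below that block, then let the common tail code
   finish (%rsi = 12 repositions both pointers onto the terminating
   block).  */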
L(Shl4LoopExit):
        movdqu -4(%rcx), %xmm1
        mov $12, %rsi
        movdqu %xmm1, -4(%rdx)
        jmp L(CopyFrom1To16Bytes)

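/* Same scheme as L(Shl4), for a source offset of 8 bytes (two wide
   chars) relative to the aligned destination.  */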
        .p2align 4
L(Shl8):
        movaps -8(%rcx), %xmm1
        movaps 8(%rcx), %xmm2
L(Shl8Start):
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %rax
        movaps %xmm2, %xmm3

        test %rax, %rax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm1, %xmm2
        movaps %xmm2, (%rdx)
        movaps 24(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx
        movaps %xmm2, %xmm1

        test %rax, %rax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm3, %xmm2
        movaps %xmm2, (%rdx)
        movaps 24(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx
        movaps %xmm2, %xmm3

        test %rax, %rax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm1, %xmm2
        movaps %xmm2, (%rdx)
        movaps 24(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx

        test %rax, %rax
        jnz L(Shl8LoopExit)

        palignr $8, %xmm3, %xmm2
        movaps %xmm2, (%rdx)
        lea 24(%rcx), %rcx
        lea 16(%rdx), %rdx

        mov %rcx, %rax
        and $-0x40, %rcx
        sub %rcx, %rax
        lea -8(%rcx), %rcx
        sub %rax, %rdx

        movaps -8(%rcx), %xmm1

        .p2align 4
L(Shl8LoopStart):
        movaps 8(%rcx), %xmm2
        movaps 24(%rcx), %xmm3
        movaps %xmm3, %xmm6
        movaps 40(%rcx), %xmm4
        movaps %xmm4, %xmm7
        movaps 56(%rcx), %xmm5
        pminub %xmm2, %xmm6
        pminub %xmm5, %xmm7
        pminub %xmm6, %xmm7
        pcmpeqd %xmm0, %xmm7
        pmovmskb %xmm7, %rax
        movaps %xmm5, %xmm7
        palignr $8, %xmm4, %xmm5
        test %rax, %rax
        palignr $8, %xmm3, %xmm4
        jnz L(Shl8Start)

        palignr $8, %xmm2, %xmm3
        lea 64(%rcx), %rcx
        palignr $8, %xmm1, %xmm2
        movaps %xmm7, %xmm1
        movaps %xmm5, 48(%rdx)
        movaps %xmm4, 32(%rdx)
        movaps %xmm3, 16(%rdx)
        movaps %xmm2, (%rdx)
        lea 64(%rdx), %rdx
        jmp L(Shl8LoopStart)

L(Shl8LoopExit):
        mov (%rcx), %r9
        mov $8, %rsi
        mov %r9, (%rdx)
        jmp L(CopyFrom1To16Bytes)

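/* Same scheme as L(Shl4), for a source offset of 12 bytes (three wide
   chars) relative to the aligned destination.  */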
        .p2align 4
L(Shl12):
        movaps -12(%rcx), %xmm1
        movaps 4(%rcx), %xmm2
L(Shl12Start):
        pcmpeqd %xmm2, %xmm0
        pmovmskb %xmm0, %rax
        movaps %xmm2, %xmm3

        test %rax, %rax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm1, %xmm2
        movaps %xmm2, (%rdx)
        movaps 20(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx
        movaps %xmm2, %xmm1

        test %rax, %rax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm3, %xmm2
        movaps %xmm2, (%rdx)
        movaps 20(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx
        movaps %xmm2, %xmm3

        test %rax, %rax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm1, %xmm2
        movaps %xmm2, (%rdx)
        movaps 20(%rcx), %xmm2

        pcmpeqd %xmm2, %xmm0
        lea 16(%rdx), %rdx
        pmovmskb %xmm0, %rax
        lea 16(%rcx), %rcx

        test %rax, %rax
        jnz L(Shl12LoopExit)

        palignr $12, %xmm3, %xmm2
        movaps %xmm2, (%rdx)
        lea 20(%rcx), %rcx
        lea 16(%rdx), %rdx

        mov %rcx, %rax
        and $-0x40, %rcx
        sub %rcx, %rax
        lea -4(%rcx), %rcx
        sub %rax, %rdx

        movaps -12(%rcx), %xmm1

        .p2align 4
L(Shl12LoopStart):
        movaps 4(%rcx), %xmm2
        movaps 20(%rcx), %xmm3
        movaps %xmm3, %xmm6
        movaps 36(%rcx), %xmm4
        movaps %xmm4, %xmm7
        movaps 52(%rcx), %xmm5
        pminub %xmm2, %xmm6
        pminub %xmm5, %xmm7
        pminub %xmm6, %xmm7
        pcmpeqd %xmm0, %xmm7
        pmovmskb %xmm7, %rax
        movaps %xmm5, %xmm7
        palignr $12, %xmm4, %xmm5
        test %rax, %rax
        palignr $12, %xmm3, %xmm4
        jnz L(Shl12Start)
        palignr $12, %xmm2, %xmm3
        lea 64(%rcx), %rcx
        palignr $12, %xmm1, %xmm2
        movaps %xmm7, %xmm1
        movaps %xmm5, 48(%rdx)
        movaps %xmm4, 32(%rdx)
        movaps %xmm3, 16(%rdx)
        movaps %xmm2, (%rdx)
        lea 64(%rdx), %rdx
        jmp L(Shl12LoopStart)

L(Shl12LoopExit):
        mov (%rcx), %r9d
        mov $4, %rsi
        mov %r9d, (%rdx)
        jmp L(CopyFrom1To16Bytes)

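/* Common tail: %rsi is the offset of the 16-byte block that contains
   the terminator, relative to the current %rcx/%rdx, and %rax holds
   the pmovmskb mask for that block (four set bits per zero wide
   char).  Advance both pointers and copy the final 4, 8, 12 or 16
   bytes, ending with the terminating L'\0'; DEST (%rdi) is the return
   value.  */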
        .p2align 4
L(CopyFrom1To16Bytes):
        add %rsi, %rdx
        add %rsi, %rcx

        test %al, %al
        jz L(ExitHigh)
        test $0x01, %al
        jnz L(Exit4)

        mov (%rcx), %rax
        mov %rax, (%rdx)
        mov %rdi, %rax
        ret

        .p2align 4
L(ExitHigh):
        test $0x01, %ah
        jnz L(Exit12)

        mov (%rcx), %rax
        mov %rax, (%rdx)
        mov 8(%rcx), %rax
        mov %rax, 8(%rdx)
        mov %rdi, %rax
        ret

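/* Short-string exits, also reached directly from the entry checks:
   copy exactly 4, 8, 12 or 16 bytes (the last wide char copied is the
   terminating L'\0') and return the original DEST.  */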
        .p2align 4
L(Exit4):
        movl (%rcx), %eax
        movl %eax, (%rdx)
        mov %rdi, %rax
        ret

        .p2align 4
L(Exit8):
        mov (%rcx), %rax
        mov %rax, (%rdx)
        mov %rdi, %rax
        ret

        .p2align 4
L(Exit12):
        mov (%rcx), %rax
        mov %rax, (%rdx)
        mov 8(%rcx), %eax
        mov %eax, 8(%rdx)
        mov %rdi, %rax
        ret

        .p2align 4
L(Exit16):
        mov (%rcx), %rax
        mov %rax, (%rdx)
        mov 8(%rcx), %rax
        mov %rax, 8(%rdx)
        mov %rdi, %rax
        ret

END(__wcscpy_ssse3)
#endif