/*
   Optimized memcpy for x86-64.

   Copyright (C) 2007-2016 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/

#include <sysdep.h>
#include "asm-syntax.h"

/* Stack slots in the red-zone. */

#ifdef USE_AS_MEMPCPY
# define RETVAL	(0)
#else
# define RETVAL	(-8)
# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
#  define memcpy	__memcpy
#  undef libc_hidden_builtin_def
#  define libc_hidden_builtin_def(name) \
   .globl __GI_memcpy; __GI_memcpy = __memcpy
# endif
#endif
#define SAVE0	(RETVAL - 8)
#define SAVE1	(SAVE0 - 8)
#define SAVE2	(SAVE1 - 8)
#define SAVE3	(SAVE2 - 8)
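
/*
	All of these slots live in the 128-byte red zone below %rsp defined
	by the x86-64 SysV ABI, so no stack frame has to be set up.  RETVAL
	holds the saved return value for memcpy; for mempcpy the running
	%rdi is itself the return value, so no slot is needed and the
	register-save area starts right at -8(%rsp).  SAVE0..SAVE3 are used
	by the large-block loops below to spill the callee-saved registers
	%rbx and %r12-%r14.
*/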

	.text
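
/*
	__memcpy_chk (dst, src, len, dstlen): %rcx holds the size of the
	destination buffer; bail out through __chk_fail if it is smaller
	than the number of bytes to copy in %rdx, otherwise fall through
	into memcpy.
*/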

#if defined PIC && IS_IN (libc)
ENTRY_CHK (__memcpy_chk)

	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)

END_CHK (__memcpy_chk)
#endif

ENTRY(memcpy)				/* (void *, const void*, size_t) */

/* Handle tiny blocks. */
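/*
	Sizes below 32 bytes are copied by peeling off one power of two at a
	time: 1 byte if bit 0 of the length is set, then 2, 4 and 8 bytes,
	and finally a 16-byte loop for whatever multiple of 16 remains.
	L(1) is also the common tail that the larger-block paths jump back
	to for their leftovers (up to 127 bytes), which is why the 16-byte
	step is a loop rather than a single copy.
*/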

L(1try):				/* up to 32B */
	cmpq	$32, %rdx
#ifndef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* save return value */
#endif
	jae	L(1after)

L(1):					/* 1-byte once */
	testb	$1, %dl
	jz	L(1a)

	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)

	incq	%rsi
	incq	%rdi

	.p2align 4,, 4

L(1a):					/* 2-byte once */
	testb	$2, %dl
	jz	L(1b)

	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)

	addq	$2, %rsi
	addq	$2, %rdi

	.p2align 4,, 4

L(1b):					/* 4-byte once */
	testb	$4, %dl
	jz	L(1c)

	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)

	addq	$4, %rsi
	addq	$4, %rdi

	.p2align 4,, 4

L(1c):					/* 8-byte once */
	testb	$8, %dl
	jz	L(1d)

	movq	(%rsi), %rcx
	movq	%rcx, (%rdi)

	addq	$8, %rsi
	addq	$8, %rdi

	.p2align 4,, 4

L(1d):					/* 16-byte loop */
	andl	$0xf0, %edx
	jz	L(exit)

	.p2align 4

L(1loop):
	movq	(%rsi), %rcx
	movq	8(%rsi), %r8
	movq	%rcx, (%rdi)
	movq	%r8, 8(%rdi)

	subl	$16, %edx

	leaq	16(%rsi), %rsi
	leaq	16(%rdi), %rdi

	jnz	L(1loop)

	.p2align 4,, 4

L(exit):				/* exit */
#ifdef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* return value */
#else
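	/* "rep; ret" rather than a plain "ret": the prefix is ignored, but
	   the two-byte return reportedly avoids a branch-prediction penalty
	   on some AMD processors when RET is a branch target or directly
	   follows a conditional branch.  */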
	rep
#endif
	retq

	.p2align 4

L(1after):
#ifndef USE_AS_MEMPCPY
	movq	%rax, RETVAL(%rsp)	/* save return value */
#endif

/* Align to the natural word size. */
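/*
	Only the source is aligned: single bytes are copied until %rsi is
	8-byte aligned, and %rdx is reduced to the count remaining after the
	alignment copy.  The destination may stay misaligned; the copy loops
	below tolerate that.
*/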

L(aligntry):
	movl	%esi, %ecx		/* align by source */

	andl	$7, %ecx
	jz	L(alignafter)		/* already aligned */

L(align):				/* align */
	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
	subl	$8, %ecx

	.p2align 4

L(alignloop):				/* 1-byte alignment loop */
	movzbl	(%rsi), %eax
	movb	%al, (%rdi)

	incl	%ecx

	leaq	1(%rsi), %rsi
	leaq	1(%rdi), %rdi

	jnz	L(alignloop)

	.p2align 4

L(alignafter):

/* Handle mid-sized blocks. */
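/*
	Blocks of up to 1 KiB are copied 32 bytes per iteration.  The loop
	body is unrolled twice; the early exit after the first half ("help
	out smaller blocks") saves the backward branch when an odd number of
	32-byte chunks remains.  The final 0..31 bytes are handled by
	jumping back to L(1).
*/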

L(32try):				/* up to 1KB */
	cmpq	$1024, %rdx
	ja	L(32after)

L(32):					/* 32-byte loop */
	movl	%edx, %ecx
	shrl	$5, %ecx
	jz	L(32skip)

	.p2align 4

L(32loop):
	decl	%ecx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jz	L(32skip)		/* help out smaller blocks */

	decl	%ecx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jnz	L(32loop)

	.p2align 4

L(32skip):
	andl	$31, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(32after):

/*
	In order to minimize code size, the algorithms specific to larger
	blocks are excluded when building for RTLD.
*/

/* Handle blocks smaller than 1/2 L1. */
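/*
	__x86_data_cache_size_half is set up by glibc's cache detection and
	holds half the size of the L1 data cache.  Up to that many bytes are
	copied with REP MOVSQ.  Outside libc.so this is the last algorithm,
	so REP MOVSQ then handles the whole remaining block, whatever its
	size.
*/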

L(fasttry):				/* first 1/2 L1 */
#if IS_IN (libc)			/* only up to this algorithm outside of libc.so */
	mov	__x86_data_cache_size_half(%rip), %R11_LP
	cmpq	%rdx, %r11		/* calculate the smaller of */
	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
#endif

L(fast):				/* good ol' MOVS */
#if IS_IN (libc)
	movq	%r11, %rcx
	andq	$-8, %r11
#else
	movq	%rdx, %rcx
#endif
	shrq	$3, %rcx
	jz	L(fastskip)

	rep
	movsq

	.p2align 4,, 4

L(fastskip):
#if IS_IN (libc)
	subq	%r11, %rdx		/* check for more */
	testq	$-8, %rdx
	jnz	L(fastafter)
#endif

	andl	$7, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#if IS_IN (libc)			/* none of the algorithms below for RTLD */

	.p2align 4

L(fastafter):

/* Handle large blocks smaller than 1/2 L2. */
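/*
	__x86_shared_cache_size_half holds half the size of the shared cache
	(the "1/2 L2" above).  Blocks up to that size are copied one 64-byte
	cache line per iteration, unrolled twice, with software prefetch
	about 896 bytes (14 lines) ahead of the copy.  When the CPU supports
	PREFETCHW (__x86_prefetchw non-zero), the destination lines are
	prefetched with intent to write, which avoids a separate
	read-for-ownership and so reduces cache-probe traffic on
	multiprocessor systems; otherwise plain PREFETCHT0 is used on the
	destination as well.
*/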

L(pretry):				/* first 1/2 L2 */
	mov	__x86_shared_cache_size_half(%rip), %R8_LP
	cmpq	%rdx, %r8		/* calculate the lesser of */
	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */

L(pre):					/* 64-byte with prefetching */
	movq	%r8, %rcx
	andq	$-64, %r8
	shrq	$6, %rcx
	jz	L(preskip)

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)
	movq	%rbx, SAVE3(%rsp)
	cfi_rel_offset (%rbx, SAVE3)

	cmpl	$0, __x86_prefetchw(%rip)
	jz	L(preloop)		/* check if PREFETCHW OK */

	.p2align 4

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */

L(prewloop):				/* cache-line in state M */
	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 + 0(%rsi)
	prefetcht0 896 + 64(%rsi)

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	prefetchw 896 - 64(%rdi)
	prefetchw 896 - 0(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(prewloop)
	jmp	L(prebail)

	.p2align 4

/* ... when PREFETCHW is not available. */

L(preloop):				/* cache-line in state E */
	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 + 0(%rsi)
	prefetcht0 896 + 64(%rsi)

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 - 64(%rdi)
	prefetcht0 896 - 0(%rdi)

	movq	%rax, (%rdi)
	movq	%rbx, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(preloop)

L(prebail):
	movq	SAVE3(%rsp), %rbx
	cfi_restore (%rbx)
	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

	/* .p2align 4 */

L(preskip):
	subq	%r8, %rdx		/* check for more */
	testq	$-64, %rdx
	jnz	L(preafter)

	andl	$63, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(preafter):

/* Handle huge blocks. */
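/*
	Blocks larger than half the shared cache are copied 128 bytes per
	iteration with non-temporal MOVNTI stores, which bypass the cache
	and avoid evicting its contents, while PREFETCHNTA streams the
	source in.  The non-temporal stores are weakly ordered, so an SFENCE
	is issued before returning.
*/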

L(NTtry):

L(NT):					/* non-temporal 128-byte */
	movq	%rdx, %rcx
	shrq	$7, %rcx
	jz	L(NTskip)

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)

	.p2align 4

L(NTloop):
	prefetchnta 768(%rsi)
	prefetchnta 832(%rsi)

	decq	%rcx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movntiq	%rax, (%rdi)
	movntiq	%r8, 8(%rdi)
	movntiq	%r9, 16(%rdi)
	movntiq	%r10, 24(%rdi)
	movntiq	%r11, 32(%rdi)
	movntiq	%r12, 40(%rdi)
	movntiq	%r13, 48(%rdi)
	movntiq	%r14, 56(%rdi)

	movq	64(%rsi), %rax
	movq	72(%rsi), %r8
	movq	80(%rsi), %r9
	movq	88(%rsi), %r10
	movq	96(%rsi), %r11
	movq	104(%rsi), %r12
	movq	112(%rsi), %r13
	movq	120(%rsi), %r14

	movntiq	%rax, 64(%rdi)
	movntiq	%r8, 72(%rdi)
	movntiq	%r9, 80(%rdi)
	movntiq	%r10, 88(%rdi)
	movntiq	%r11, 96(%rdi)
	movntiq	%r12, 104(%rdi)
	movntiq	%r13, 112(%rdi)
	movntiq	%r14, 120(%rdi)

	leaq	128(%rsi), %rsi
	leaq	128(%rdi), %rdi

	jnz	L(NTloop)

	sfence				/* serialize memory stores */

	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

L(NTskip):
	andl	$127, %edx		/* check for leftovers */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#endif /* IS_IN (libc) */

END(memcpy)
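
/*
	In the non-multiarch shared libc build the code above is assembled
	under the name __memcpy (see the redefinition near the top of this
	file); libc_hidden_builtin_def then provides the internal
	__GI_memcpy alias and versioned_symbol exports the function as
	memcpy@GLIBC_2.14.
*/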

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
#  undef memcpy
#  include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# endif
#endif