/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB is used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7, if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if the destination's page alignment minus the
      source's page alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.
   A rough C sketch of this dispatch is given in the comment below.  */
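
/* Illustrative note (comment only): a rough C sketch of the dispatch
   described above, in approximately (but not exactly) the order the checks
   are made below.  All helper names are hypothetical stand-ins for the
   labels in this file, and the thresholds are the runtime tunables named
   above.

     void *memmove_sketch (void *dst, const void *src, size_t n)
     {
       if (n < VEC_SIZE)
         copy_less_than_vec (dst, src, n);         // L(less_vec)
       else if (n <= 2 * VEC_SIZE)
         copy_head_tail_vec (dst, src, n);         // points 1-2
       else if (n <= 8 * VEC_SIZE)
         copy_up_to_8_vec (dst, src, n);           // point 3
       else if (n >= rep_movsb_threshold
                && n < rep_movsb_stop_threshold
                && rep_movsb_is_profitable (dst, src))
         rep_movsb_copy (dst, src, n);             // point 6
       else if (n >= non_temporal_threshold
                && !ranges_overlap (dst, src, n))
         large_memcpy_2x_or_4x (dst, src, n);      // points 7-9
       else if ((char *) dst > (const char *) src)
         backward_4x_vec_loop (dst, src, n);       // point 4
       else
         forward_4x_vec_loop (dst, src, n);        // point 5
       return dst;
     }  */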

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0	xmm0
#endif

#ifndef YMM0
# define YMM0	ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes loaded from each page per iteration of the large_memcpy inner
   loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift rdx by to compare for large_memcpy_4x.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short-distance REP MOVSB only with non-SSE vectors.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
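
/* Illustrative note (comment only): PREFETCH_ONE_SET emits one prefetcht0
   per 64-byte line covered by the next PREFETCHED_LOAD_SIZE bytes, with the
   dir argument stepping the extra lines forward (+1) or backward (-1).  For
   example, with VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE),

	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)

   expands roughly to

	prefetcht0 (VEC_SIZE * 4)(%rsi)
	prefetcht0 (VEC_SIZE * 4 + 64)(%rsi)

   i.e. the two cache lines of the next 4-VEC load set.  */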

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU (offset)base, vec0; \
	VMOVU ((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT vec0, (offset)base; \
	VMOVNT vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU (offset)base, vec0; \
	VMOVU ((offset) + VEC_SIZE)base, vec1; \
	VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT vec0, (offset)base; \
	VMOVNT vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
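
/* Illustrative note (comment only): LOAD_ONE_SET fetches one LARGE_LOAD_SIZE
   block from a page with unaligned vector loads and STORE_ONE_SET writes it
   back with non-temporal stores.  For example, with VEC_SIZE == 32,

	LOAD_ONE_SET ((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))

   expands roughly to

	VMOVU (PAGE_SIZE)(%rsi), %VEC(4)
	VMOVU ((PAGE_SIZE) + VEC_SIZE)(%rsi), %VEC(5)
	VMOVU ((PAGE_SIZE) + VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU ((PAGE_SIZE) + VEC_SIZE * 3)(%rsi), %VEC(7)

   which is how the large_memcpy loops below read from two (or four) pages
   in the same iteration.  */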

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
	ret
#else
	VZEROUPPER_RETURN
#endif
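
/* Illustrative note (comment only): the branch-free sequence above loads the
   first and the last VEC of the buffer before storing either, so the same
   four instructions are correct for any size in [VEC_SIZE, 2 * VEC_SIZE] and
   for overlapping buffers.  A rough C equivalent (hypothetical helper names,
   assuming an unaligned vector type vec_t):

     void head_tail_vec_copy (char *dst, const char *src, size_t n)
     {
       vec_t head = vec_loadu (src);                  // first VEC
       vec_t tail = vec_loadu (src + n - VEC_SIZE);   // last VEC, may overlap head
       vec_storeu (dst, head);
       vec_storeu (dst + n - VEC_SIZE, tail);
     }

   Loading both vectors before the first store is what makes this safe for
   memmove-style overlap.  */
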
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif
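
/* Illustrative note (comment only): __memmove_erms above chooses the copy
   direction so that REP MOVSB never overwrites source bytes it has not read
   yet.  A rough C sketch of that decision (hypothetical helper names):

     void memmove_erms_sketch (char *dst, const char *src, size_t n)
     {
       if (n == 0 || dst == src)
         return;
       if (dst < src || dst >= src + n)
         rep_movsb_forward (dst, src, n);      // DF = 0
       else
         rep_movsb_backward (dst + n - 1,      // DF = 1: std; rep movsb; cld
                             src + n - 1, n);
     }

   The backward case is taken only when dst > src and the two buffers
   overlap.  */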

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
#if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	ret
#endif

L(movsb):
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Use testl so that __x86_string_control is not modified.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	3f
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
# endif
1:
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Use testl so that __x86_string_control is not modified.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	3f
	movq	%rsi, %rcx
	subq	%rdi, %rcx
2:
/* Avoid "rep movsb" if RCX, the distance between source and destination,
   is N*4GB + [1..63] with N >= 0.  */
	cmpl	$63, %ecx
	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
3:
# endif
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif
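
/* Illustrative note (comment only): with Avoid_Short_Distance_REP_MOVSB the
   code above computes the distance between source and destination and falls
   back to the vector path when its low 32 bits are 63 or less, since REP
   MOVSB is slow for such short (mod 4GB) distances.  A rough C sketch of the
   check (hypothetical helper name):

     #include <stdint.h>
     // Nonzero when REP MOVSB should be avoided for this copy.
     static int short_distance (uintptr_t dst, uintptr_t src)
     {
       uintptr_t dist = dst < src ? src - dst : dst - src;
       return (uint32_t) dist <= 63;   // cmpl $63, %ecx; jbe ...
     }

   When the check fires, control continues at L(more_2x_vec) instead of
   using REP MOVSB.  */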

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %YMM0
	VMOVU	-32(%rsi,%rdx), %YMM1
	VMOVU	%YMM0, (%rdi)
	VMOVU	%YMM1, -32(%rdi,%rdx)
	VZEROUPPER_RETURN
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	VMOVU	(%rsi), %XMM0
	VMOVU	-16(%rsi,%rdx), %XMM1
	VMOVU	%XMM0, (%rdi)
	VMOVU	%XMM1, -16(%rdi,%rdx)
	VZEROUPPER_RETURN
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER_RETURN
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER_RETURN

L(more_8x_vec):
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* Entry if rdx is greater than the non-temporal threshold but
	   there is overlap.  */
L(more_8x_vec_check):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	.p2align 4
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addq	$-(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER_RETURN
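
/* Illustrative note (comment only): the forward path above preloads the
   first VEC and the last 4 VECs, then rounds the destination up to a
   VEC_SIZE boundary so the loop can use aligned stores.  Rough C sketch of
   the adjustment, with dst, src, n and VEC_SIZE as in the surrounding code:

     size_t    misalign = (uintptr_t) dst & (VEC_SIZE - 1);
     ptrdiff_t neg_off  = (ptrdiff_t) misalign - VEC_SIZE;  // subq $VEC_SIZE, %r8
     src -= neg_off;   // advance by VEC_SIZE - misalign
     dst -= neg_off;   // dst is now VEC_SIZE-aligned
     n   += neg_off;   // shrink the remaining length accordingly
     while (n > 4 * VEC_SIZE)
       {
         copy_4_vec_unaligned_load_aligned_store (dst, src);   // hypothetical
         src += 4 * VEC_SIZE;  dst += 4 * VEC_SIZE;  n -= 4 * VEC_SIZE;
       }

   The preloaded first VEC and last 4 VECs are stored after the loop, which
   covers both the bytes skipped by the alignment step and the tail left by
   the loop exit, and keeps the copy correct when src and dst overlap.  */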

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx

	.p2align 4
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	addq	$-(VEC_SIZE * 4), %rcx
	addq	$-(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	addq	$-(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER_RETURN
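
/* Illustrative note (comment only): the backward path mirrors the forward
   one.  It preloads the first 4 VECs and the last VEC, aligns the address of
   the last destination VEC downwards, then copies 4 * VEC per iteration from
   high addresses to low with aligned stores, and finally stores the saved
   head/tail registers.  Rough C sketch of the setup (names hypothetical,
   dst, src, n and VEC_SIZE as in the surrounding code):

     char  *dst_last = dst + n - VEC_SIZE;          // leaq -VEC_SIZE(%rdi,%rdx), %r11
     const char *src_last = src + n - VEC_SIZE;
     size_t misalign = (uintptr_t) dst_last & (VEC_SIZE - 1);
     src_last -= misalign;                          // backward load cursor
     char  *dst_aligned_last = dst_last - misalign; // aligned store cursor
     n -= misalign;                                 // bytes handled by the loop  */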

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	.p2align 4
L(large_memcpy_2x):
	/* Compute absolute value of difference between source and
	   destination.  */
	movq	%rdi, %r9
	subq	%rsi, %r9
	movq	%r9, %r8
	leaq	-1(%r9), %rcx
	sarq	$63, %r8
	xorq	%r8, %r9
	subq	%r8, %r9
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache when
	   source is loaded.  */
	cmpq	%r9, %rdx
	ja	L(more_8x_vec_check)
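
/* Illustrative note (comment only): the sequence above computes |dst - src|
   without a branch and also keeps dst - src - 1 in %rcx for the later
   page-alias test.  Rough C sketch (assuming an arithmetic right shift on a
   64-bit two's-complement machine):

     #include <stdint.h>
     static uint64_t abs_diff (uint64_t dst, uint64_t src)
     {
       int64_t  d    = (int64_t) (dst - src);
       uint64_t sign = (uint64_t) (d >> 63);   // 0 or all-ones (sarq $63)
       return ((uint64_t) d ^ sign) - sign;    // xorq; subq
     }

   Non-temporal copying is only used when n <= |dst - src|, i.e. when the
   ranges cannot overlap; otherwise control returns to L(more_8x_vec_check).  */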

	/* Cache-align destination.  First store the first 64 bytes, then
	   adjust alignment.  */
	VMOVU	(%rsi), %VEC(8)
#if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VEC(9)
#if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
#endif
#endif
	VMOVU	%VEC(8), (%rdi)
#if VEC_SIZE < 64
	VMOVU	%VEC(9), VEC_SIZE(%rdi)
#if VEC_SIZE < 32
	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
#endif
#endif
	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	movq	%rdx, %r10
	shrq	$LOG_4X_MEMCPY_THRESH, %r10
	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
	jae	L(large_memcpy_4x)

	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the remaining bytes (less than 2 * PAGE_SIZE).  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC at a time forward with regular aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
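
/* Illustrative note (comment only): the 2x loop above walks two pages in
   lockstep, so each iteration streams from two different 4 KiB pages while
   the non-temporal stores bypass the cache.  Rough C sketch of one outer
   iteration (hypothetical helper names):

     for (size_t off = 0; off < PAGE_SIZE; off += LARGE_LOAD_SIZE)
       {
         nt_store_block (dst + off,             load_block (src + off));
         nt_store_block (dst + off + PAGE_SIZE, load_block (src + off + PAGE_SIZE));
       }
     src += 2 * PAGE_SIZE;
     dst += 2 * PAGE_SIZE;

   After all full 2-page chunks, an sfence orders the non-temporal stores and
   the remainder is copied with regular vector stores.  The 4x variant below
   does the same with four pages per iteration.  */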

	.p2align 4
L(large_memcpy_4x):
	movq	%rdx, %r10
	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page, as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the remaining bytes (less than 4 * PAGE_SIZE).  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC at a time forward with regular aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))