/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold, or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.
   A rough C sketch of this dispatch follows below.  */
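
/* The dispatch just described can be summarized with the following
   rough C sketch.  It is illustrative only, not the compiled
   implementation: the helper functions are hypothetical stand-ins for
   the labeled code paths in this file, VEC_SIZE is the build-time
   vector width, and the exact ordering of the checks is simplified.

     void *memmove_sketch (void *dst, const void *src, size_t n)
     {
       if (n < VEC_SIZE)
         return copy_less_vec (dst, src, n);          // byte/word/dword/xmm pairs
       if (n <= 2 * VEC_SIZE)
         return copy_overlapping_2_vec (dst, src, n); // head VEC + tail VEC
       if (n <= 8 * VEC_SIZE)
         return copy_overlapping_8_vec (dst, src, n); // first 4 + last 4 VECs
       if (have_erms () && n >= __x86_rep_movsb_threshold
           && n < __x86_rep_movsb_stop_threshold)
         return copy_rep_movsb (dst, src, n);
       if (n >= __x86_shared_non_temporal_threshold
           && !overlaps (dst, src, n))
         return copy_non_temporal (dst, src, n);      // 2 or 4 pages at a time
       return prefer_backward (dst, src, n)
              ? copy_backward_4x_vec (dst, src, n)
              : copy_forward_4x_vec (dst, src, n);
     }  */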

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0	xmm0
#endif

#ifndef YMM0
# define YMM0	ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want the destination
   64-byte aligned, and it is not worth loading 4x VEC for
   VEC_SIZE == 16.  */
#define ALIGN_MOVSB	(VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO	64

#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
#define LARGE_MOV_SIZE	(MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET	(4)
#else
# define SMALL_SIZE_OFFSET	(0)
#endif

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes per page for the large_memcpy inner loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift rdx by when comparing against the memcpy_large_4x
   threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short-distance rep movsb only with non-SSE vectors.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
# error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU (offset)base, vec0; \
	VMOVU ((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT vec0, (offset)base; \
	VMOVNT vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU (offset)base, vec0; \
	VMOVU ((offset) + VEC_SIZE)base, vec1; \
	VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT vec0, (offset)base; \
	VMOVNT vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE, this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret
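
/* A hedged C sketch of the sub-VEC_SIZE path above (sizes below 16 are
   shown; the 16-31 and 32-63 classes in the blocks that follow use the
   same pattern with XMM/YMM registers).  Each size class does one load
   from the start and one overlapping load from the end, then stores
   both, so there is no copy loop and overlap is handled for free.
   Fixed-size memcpy stands in for the fixed-width mov instructions;
   this is not the compiled code.

     #include <stdint.h>
     #include <string.h>

     static void copy_below_16 (char *dst, const char *src, size_t n)
     {
       if (n >= 8)             // 8..15: two overlapping 8-byte moves
         {
           uint64_t head, tail;
           memcpy (&head, src, 8);
           memcpy (&tail, src + n - 8, 8);
           memcpy (dst, &head, 8);
           memcpy (dst + n - 8, &tail, 8);
         }
       else if (n >= 4)        // 4..7: two overlapping 4-byte moves
         {
           uint32_t head, tail;
           memcpy (&head, src, 4);
           memcpy (&tail, src + n - 4, 4);
           memcpy (dst, &head, 4);
           memcpy (dst + n - 4, &tail, 4);
         }
       else if (n >= 1)        // 1..3: first byte + overlapping 2-byte tail
         {
           char head = src[0];
           if (n > 1)
             {
               uint16_t tail;
               memcpy (&tail, src + n - 2, 2);
               memcpy (dst + n - 2, &tail, 2);
             }
           dst[0] = head;
         }
     }  */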

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %YMM0
	VMOVU	-32(%rsi, %rdx), %YMM1
	VMOVU	%YMM0, (%rdi)
	VMOVU	%YMM1, -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN
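
/* A rough C equivalent of the 4..8 VEC case above, taking VEC_SIZE ==
   32 (AVX) as the concrete example; illustrative only, not the
   compiled code.  All eight vectors are loaded before any store, so
   the copy is correct even when source and destination overlap.

     #include <immintrin.h>
     #include <stddef.h>

     // Assumes 4 * 32 < n <= 8 * 32.
     static void copy_4x_to_8x_vec (char *dst, const char *src, size_t n)
     {
       __m256i f0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
       __m256i f1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
       __m256i f2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
       __m256i f3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
       __m256i l0 = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       __m256i l1 = _mm256_loadu_si256 ((const __m256i *) (src + n - 64));
       __m256i l2 = _mm256_loadu_si256 ((const __m256i *) (src + n - 96));
       __m256i l3 = _mm256_loadu_si256 ((const __m256i *) (src + n - 128));
       _mm256_storeu_si256 ((__m256i *) (dst + 0), f0);
       _mm256_storeu_si256 ((__m256i *) (dst + 32), f1);
       _mm256_storeu_si256 ((__m256i *) (dst + 64), f2);
       _mm256_storeu_si256 ((__m256i *) (dst + 96), f3);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 32), l0);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 64), l1);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 96), l2);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 128), l3);
     }  */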

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Go to the backward temporal copy whenever there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* To reach this point there cannot be overlap with dst > src.  So
	   check for overlap with src > dst, in which case correctness
	   requires a forward copy.  Otherwise decide between backward and
	   forward copy depending on address aliasing.  */

	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
	   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has a different sign than rcx then there is overlap so we
	   must do a forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just the sign bit of r8.  */
	shrq	$63, %r8
	/* Get 4k difference dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must copy forward for correctness.
	   Otherwise the jump is taken only when ecx is also zero, i.e.
	   (dst - src) modulo 4k is less than 256; the forward loop would
	   then suffer from 4k false aliasing, so copy backward.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)
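
/* A hedged C sketch of the decision just made; illustrative only, not
   the compiled code.  The code above branches to
   L(more_8x_vec_backward) exactly when this predicate would return
   nonzero and otherwise falls through to the forward copy.

     #include <stdint.h>
     #include <stddef.h>

     static int prefer_backward_copy (const char *dst, const char *src,
                                      size_t n)
     {
       intptr_t diff = (intptr_t) ((uintptr_t) dst - (uintptr_t) src);
       // Sign of diff differs from sign of diff + n only when src > dst
       // and the regions overlap; a forward copy is then required.
       int must_forward = ((diff ^ (diff + (intptr_t) n)) < 0);
       // (diff mod 4 KiB) < 256: dst and src nearly share a page
       // offset, so the forward loop would hit 4k store-to-load false
       // aliasing.
       int same_4k_offset = ((diff & (4096 - 256)) == 0);
       return !must_forward && same_4k_offset;
     }  */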

	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or from the short-distance movsb check.  */
L(more_8x_vec_forward):
	/* Load the first and last 4 * VEC to support overlapping
	   addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Align dst to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with the new value for aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx

	/* Don't use a multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(1)
	VMOVU	VEC_SIZE(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	%VEC(1), (%rdi)
	VMOVA	%VEC(2), VEC_SIZE(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VEC(7), VEC_SIZE(%rdx)
	VMOVU	%VEC(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VEC(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
	 */
L(nop_backward):
	VZEROUPPER_RETURN
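
/* A hedged C rendering of L(more_8x_vec_forward) and
   L(loop_4x_vec_forward) above, using VEC_SIZE == 32 (AVX) as the
   concrete example; illustrative only, not the compiled code.  The
   first VEC and last 4 VECs are loaded before the loop and stored
   after it, which is what makes overlapping forward copies safe.

     #include <immintrin.h>
     #include <stdint.h>
     #include <stddef.h>

     // Assumes n > 8 * 32.
     static void copy_forward_4x_vec (char *dst, const char *src, size_t n)
     {
       __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i t0 = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       __m256i t1 = _mm256_loadu_si256 ((const __m256i *) (src + n - 64));
       __m256i t2 = _mm256_loadu_si256 ((const __m256i *) (src + n - 96));
       __m256i t3 = _mm256_loadu_si256 ((const __m256i *) (src + n - 128));

       char *d = (char *) (((uintptr_t) dst | 31) + 1);   // align dst up
       const char *s = src + (d - dst);
       char *end = dst + n - 128;                         // stop 4 VECs early
       while (d < end)
         {
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (s + 0));
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (s + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (s + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (s + 96));
           s += 128;
           _mm256_store_si256 ((__m256i *) (d + 0), v0);  // aligned stores
           _mm256_store_si256 ((__m256i *) (d + 32), v1);
           _mm256_store_si256 ((__m256i *) (d + 64), v2);
           _mm256_store_si256 ((__m256i *) (d + 96), v3);
           d += 128;
         }
       // Store the saved tail and head last.
       _mm256_storeu_si256 ((__m256i *) (dst + n - 32), t0);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 64), t1);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 96), t2);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 128), t3);
       _mm256_storeu_si256 ((__m256i *) dst, head);
     }  */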

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was also loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi

	/* Don't use a multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align src
	   here because the bottleneck will be loads due to the false
	   dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with the new value for aligned
	   src.  */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
# endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Go to the backward temporal copy whenever there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dest for storing aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold this is most likely a
	   candidate for NT moves as well.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if CPU has FSRM.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	L(skip_short_movsb_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  The early check for backward-copy
	   conditions means the only remaining slow-movsb case, src =
	   dst + [0, 63], has ecx in [-63, 0].  Use an unsigned
	   comparison with -64 to check for that case.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
# endif
# endif
# if ALIGN_MOVSB
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
	/* Falling through means the CPU has FSRM.  In that case
	   exclusively align the destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with the new values for aligned
	   dst.  */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store VECs loaded for aligning.  */
	VMOVU	%VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
# endif
	VZEROUPPER_RETURN
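
/* A hedged C sketch of the FSRM path just above: align the destination
   up to 64 bytes, run rep movsb from there, then store the saved head
   VECs over the bytes that were skipped.  VEC_SIZE == 32 is used as the
   concrete example, the inline asm is only a stand-in for the rep movsb
   issued above, and n is assumed to be well above 64; this is not the
   compiled implementation.

     #include <immintrin.h>
     #include <stdint.h>
     #include <stddef.h>

     static void copy_movsb_align_dst (char *dst, const char *src, size_t n)
     {
       // Save the first 64 source bytes; rep movsb starts past the
       // corresponding destination bytes.
       __m256i head0 = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i head1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
       char *d = (char *) (((uintptr_t) dst + 63) & ~(uintptr_t) 63);
       const char *s = src + (d - dst);
       size_t len = (size_t) ((dst + n) - d);
       __asm__ __volatile__ ("rep movsb"
                             : "+D" (d), "+S" (s), "+c" (len)
                             :
                             : "memory");
       // Store the saved head over the destination's first bytes.
       _mm256_storeu_si256 ((__m256i *) dst, head0);
       _mm256_storeu_si256 ((__m256i *) (dst + 32), head1);
     }  */
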
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	jb	L(more_8x_vec_check)
L(large_memcpy_2x):
	/* To reach this point it is impossible to have both dst > src and
	   overlap.  Remaining to check is src > dst with overlap.  rcx
	   already contains dst - src.  Negate rcx to get src - dst.  If
	   length > rcx then there is overlap and a forward copy is
	   best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)

	/* Cache align destination.  First store the first 64 bytes then
	   adjust alignments.  */

	/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
# if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
# endif
# endif
	VMOVU	%VEC(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
# if VEC_SIZE < 32
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
# endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of the offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */

	/* ecx contains -(dst - src).  NOT of ecx gives dst - src - 1,
	   which works for testing aliasing.  */
	notl	%ecx
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	movq	%rdx, %r10
	shrq	$LOG_4X_MEMCPY_THRESH, %r10
	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
	jae	L(large_memcpy_4x)

	/* edx will store the remainder size for copying the tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores the outer loop counter.  */
	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
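
/* A hedged C sketch of the 2-page interleaved non-temporal loop that
   follows (VEC_SIZE == 32 as the concrete example; prefetching and the
   tail handling are omitted and the loop structure is simplified);
   this is not the compiled code.  Interleaving two 4 KiB pages per
   iteration keeps two independent streams of NT stores in flight, and
   the sfence at the end orders the streaming stores before returning.

     #include <immintrin.h>
     #include <stddef.h>

     // Copies blocks * 2 pages.  dst must be 64-byte aligned, as
     // arranged by the code above.
     static void copy_nt_2_pages (char *dst, const char *src, size_t blocks)
     {
       for (size_t b = 0; b < blocks; b++, src += 2 * 4096, dst += 2 * 4096)
         for (size_t off = 0; off < 4096; off += 32)
           for (size_t page = 0; page <= 4096; page += 4096)
             {
               __m256i v = _mm256_loadu_si256 ((const __m256i *)
                                               (src + page + off));
               _mm256_stream_si256 ((__m256i *) (dst + page + off), v);
             }
       _mm_sfence ();
     }  */
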
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores the inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only the last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC at a time forward with aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
L(large_memcpy_4x):
	movq	%rdx, %r10
	/* edx will store the remainder size for copying the tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores the outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores the inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page as doing 4 pages gives the
	   prefetcher more time to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only the last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC at a time forward with aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
# endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))