/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
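
/* For illustration only, a rough C sketch of the dispatch described
   above; VEC, load_unaligned and store_unaligned are hypothetical
   helpers standing in for VEC_SIZE-byte registers and VMOVU, not part
   of this file:

     void *memmove_sketch (char *dst, const char *src, size_t n)
     {
       if (n < VEC_SIZE)
         ;  // Same overlapping trick with 32/16/8/4/2/1-byte moves.
       else if (n <= 2 * VEC_SIZE)
         {
           // Steps 1 and 2: both loads are done before either store.
           VEC head = load_unaligned (src);
           VEC tail = load_unaligned (src + n - VEC_SIZE);
           store_unaligned (dst, head);
           store_unaligned (dst + n - VEC_SIZE, tail);
         }
       else if (n <= 8 * VEC_SIZE)
         ;  // Step 3: load up to 8 VECs, then store them all.
       else if (dst > src)
         ;  // Step 4: backward 4 * VEC loop with aligned stores.
       else
         ;  // Steps 5 and 6: forward 4 * VEC loop, switching to
            // non-temporal stores above the shared threshold.
       return dst;
     }  */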

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
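
/* For example, with the default definition above the threshold is
   2048 * (VEC_SIZE / 16) bytes: 2 KB for VEC_SIZE == 16 (SSE2), 4 KB
   for VEC_SIZE == 32 (AVX) and 8 KB for VEC_SIZE == 64 (AVX-512).  */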

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
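
/* As an illustration, with VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE ==
   128 == 2 * PREFETCH_SIZE) a forward prefetch such as

     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)

   expands to prefetcht0 256(%rsi); prefetcht0 320(%rsi), touching the
   two cache lines of the load block two iterations ahead.  */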

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
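
/* The copy just above is the branch-free overlap trick from step 1 of
   the header comment.  A minimal C sketch, assuming VEC_SIZE <= n <=
   2 * VEC_SIZE and the same hypothetical helpers as in the sketch at
   the top of the file:

     VEC head = load_unaligned (src);                 // bytes [0, VEC_SIZE)
     VEC tail = load_unaligned (src + n - VEC_SIZE);  // bytes [n - VEC_SIZE, n)
     store_unaligned (dst, head);
     store_unaligned (dst + n - VEC_SIZE, tail);

   Where the two ranges overlap both stores write the same source byte,
   and both loads happen before either store, so overlapping source and
   destination buffers are still copied correctly.  */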
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	testq	%rdx, %rdx
	jz	2f
	addq	%rdx, %rax
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	testq	%rdx, %rdx
	jz	2f
L(start_movsb):
	movq	%rdx, %rcx
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	leaq	(%rsi,%rcx), %rdx
	cmpq	%rdx, %rdi
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	movq	%rdx, %rcx
	rep movsb
L(nop):
	ret
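
/* The overlap checks above amount to (source and destination already
   known to be unequal):

     if (dst < src || dst >= src + size)
       copy forward with REP MOVSB;
     else
       goto more_8x_vec_backward;  // dst falls inside (src, src + size):
                                   // a forward REP MOVSB would overwrite
                                   // source bytes before they are read,
                                   // and backward REP MOVSB is slow.  */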
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
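
/* All of the size classes above use the same overlapping trick with
   scalar loads.  For example, for size == 6 the 4-to-7 path loads
   source bytes [2, 6) and [0, 4) into registers, then stores them to
   the same offsets in the destination, so bytes 2 and 3 are simply
   written twice with identical values.  */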

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
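	/* A hypothetical example of the adjustment above, with
	   VEC_SIZE == 32: if %rdi enters as 0x1007, then
	   %r8 = (0x1007 & 31) - 32 = -25, so subtracting %r8 advances
	   %rdi to the aligned address 0x1020 and %rsi by the same 25
	   bytes, while %rdx shrinks by 25.  The bytes skipped at the
	   head are covered by the unaligned store of %VEC(4) to %r11
	   after the loop.  */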
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
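	/* This mirrors the forward case: %r8 is the misalignment of the
	   last destination VEC, so after the subtractions %r9 is the
	   highest VEC_SIZE-aligned store address, %rcx the matching
	   source address, and %rdx is reduced by the same amount.  The
	   unaligned tail up to the end of the destination is covered by
	   the store of %VEC(8) to %r11 after the loop.  */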
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
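	/* Only reached when %rsi >= %rdi + %rdx, i.e. the remaining
	   source region lies entirely at or above the end of the
	   destination, so there is no overlap and the concern in the
	   comment above does not apply.  */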
L(loop_large_forward):
	/* Copy 4 * VEC a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))