/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */

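/* Illustrative sketch of item 1 (not part of the build): for sizes with
   VEC_SIZE <= n <= 2 * VEC_SIZE the copy is branch-free because both
   ends are loaded before anything is stored, in C-like pseudocode with
   hypothetical load_unaligned/store_unaligned helpers:

     head = load_unaligned (src);
     tail = load_unaligned (src + n - VEC_SIZE);
     store_unaligned (dst, head);
     store_unaligned (dst + n - VEC_SIZE, tail);

   Issuing all loads before the stores is also what makes the sequence
   safe for overlapping source and destination (item 2).  */
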
#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

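/* When VEC_SIZE > 16 the vector code uses the upper YMM/ZMM register
   halves, so VZEROUPPER is issued on the return paths to avoid the
   AVX/SSE transition penalty.  */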
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
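
/* Each __*_chk entry below only checks that the copy length in %rdx does
   not exceed the destination buffer size in %rcx and then falls through
   to the corresponding unchecked entry point.  */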
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
#  if defined SHARED
ENTRY (__mempcpy_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_movsb)
END (__mempcpy_erms)
#  endif

ENTRY (__memmove_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	movq	%rdx, %rcx
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	leaq	(%rsi,%rcx), %rdx
	cmpq	%rdx, %rdi
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
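	/* Copy backward: point %rsi/%rdi at the last byte of each buffer,
	   set the direction flag so REP MOVSB decrements, and clear the
	   flag again before returning since the ABI requires DF to be
	   clear on function exit.  */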
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
#  if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
#  endif
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
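	/* Above the non-temporal threshold, skip REP MOVSB and take the
	   8x VEC path, which can use non-temporal stores instead.  */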
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	movq	%rdx, %rcx
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
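	/* Size is exactly 1 here; size 0 jumps over the byte copy.  */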
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
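	/* After these adjustments %rdi is VEC_SIZE-aligned; the head bytes
	   skipped here are covered by the unaligned VEC(4) store after
	   the loop.  */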
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
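	/* %r9 now points to a VEC_SIZE-aligned end for the aligned stores;
	   the tail bytes beyond it are covered by the unaligned VEC(8)
	   store at (%r11) after the loop.  */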
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
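	/* sfence orders the weakly-ordered non-temporal stores before the
	   regular stores that follow.  */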
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#ifdef SHARED
# if IS_IN (libc)
#  ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))
#endif