/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
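
/* Illustrative sketch only (not assembled): step 1 above, the
   overlapping load/store trick used for sizes between VEC_SIZE and
   2 * VEC_SIZE, corresponds roughly to the following C, where vec_t
   is a hypothetical VEC_SIZE-byte type and the load/store helpers
   stand for VMOVU:

     void
     copy_vec_to_2x_vec (char *dst, const char *src, size_t size)
     {
       vec_t head = load_unaligned (src);
       vec_t tail = load_unaligned (src + size - VEC_SIZE);
       store_unaligned (dst, head);
       store_unaligned (dst + size - VEC_SIZE, tail);
     }

   Both loads are done before either store, so the sequence is safe for
   overlapping buffers, and the two stores cover the whole range with
   no branch on the exact size.  */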

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

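/* VZEROUPPER is only needed when the wide ymm/zmm registers are used
   (VEC_SIZE > 16); it clears the upper register halves to avoid
   SSE/AVX transition penalties in later SSE code.  */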
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
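
/* With the default above this evaluates to 2048 bytes for
   VEC_SIZE == 16 (SSE2), 4096 bytes for VEC_SIZE == 32 (AVX) and
   8192 bytes for VEC_SIZE == 64 (AVX-512).  */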

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
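
/* For example, with VEC_SIZE == 32 each loop iteration below moves
   PREFETCHED_LOAD_SIZE == 128 bytes, so PREFETCH_ONE_SET expands to two
   prefetcht0 instructions, one per 64-byte line; dir is 1 in the
   forward loop and -1 in the backward loop.  */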

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

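/* __mempcpy returns a pointer just past the last byte written, so the
   return value is set to dst + size up front before jumping into the
   shared memmove code at L(start).  */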
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
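	/* E.g. with VEC_SIZE == 32 and size == 40 the two stores below
	   cover bytes [0, 32) and [8, 40); both loads are done before
	   either store, so overlapping buffers are copied correctly.  */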
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	movq	%rdx, %rcx
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	leaq	(%rsi,%rcx), %rdx
	cmpq	%rdx, %rdi
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
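	/* The destination starts inside the source buffer, so a forward
	   REP MOVSB would overwrite source bytes before reading them.
	   Copy backward instead, byte by byte, with the direction flag
	   set.  */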
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
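	/* L(movsb) is reached only when size > REP_MOVSB_THRESHOLD, and
	   the check above guarantees REP_MOVSB_THRESHOLD > 8 * VEC_SIZE,
	   so the size assumption of the backward path below holds.  */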
	jb	L(more_8x_vec_backward)
1:
	movq	%rdx, %rcx
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
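
/* Each small-size case above follows the same pattern as the vector
   path: load the first and the last chunk of the copy before doing any
   store, then store both.  For example, for size == 3 the two 2-byte
   loads cover bytes [0, 2) and [1, 3); byte 1 is stored twice, both
   times with the value of source byte 1.  */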

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
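	/* Example: with VEC_SIZE == 32 and a destination misaligned by
	   5 bytes, %r8 is 5 - 32 = -27, so source and destination
	   advance by 27 bytes and the length shrinks by 27.  The bytes
	   skipped here are already in %VEC(4), which is stored at the
	   end.  If the destination is already aligned, the adjustment
	   is a full VEC_SIZE.  */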
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
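	/* Mirror image of the forward setup: %r8 is the misalignment of
	   the last destination VEC, so %r9 and %rcx step back by %r8
	   and the length shrinks by the same amount.  The unaligned
	   tail is already in %VEC(8), which is stored at %r11 after the
	   loop.  */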
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
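	/* In the non-temporal loops the prefetches target source data
	   two and three iterations (2 and 3 * PREFETCHED_LOAD_SIZE
	   bytes) ahead of the current loads.  */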
L(loop_large_forward):
	/* Copy 4 * VEC a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))