/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
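
/* As an illustration of steps 1 and 2 above, a copy of SIZE bytes with
   VEC_SIZE <= SIZE <= 2 * VEC_SIZE is done branch-free with two loads
   followed by two stores, roughly (a C sketch only, not the actual
   implementation; load_unaligned/store_unaligned stand in for VMOVU):

     VEC head = load_unaligned (src);
     VEC tail = load_unaligned (src + size - VEC_SIZE);
     store_unaligned (dst, head);
     store_unaligned (dst + size - VEC_SIZE, tail);

   The two vectors overlap unless SIZE == 2 * VEC_SIZE, and performing
   both loads before either store keeps the copy correct when the
   source and destination buffers themselves overlap.  */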

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
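
/* For example, assuming VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE == 128
   == 2 * PREFETCH_SIZE), PREFETCH_ONE_SET (1, (%rsi), 256) expands to
   the equivalent of

	prefetcht0 256(%rsi); prefetcht0 320(%rsi)

   i.e. one prefetch per 64-byte line of a 4 * VEC block ahead of the
   current source position, with DIR selecting the copy direction
   (1 forward, -1 backward).  */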

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
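	/* E.g. with VEC_SIZE == 32 and a 40-byte copy, bytes 0-31 and
	   8-39 of the source are loaded first, then stored; the 24-byte
	   overlap in the middle is simply written twice, avoiding any
	   length-dependent branch.  */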
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
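	/* Copy backward: point RSI/RDI at the last byte of each buffer
	   and set the direction flag so that "rep movsb" walks down
	   through memory; clear the flag again afterwards since the ABI
	   requires DF == 0 on function return.  */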
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
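	/* At this point the size is above __x86_rep_movsb_threshold but
	   below __x86_shared_non_temporal_threshold: choose between
	   "rep movsb" and the vector loops based on the copy direction
	   and the distance between source and destination.  */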
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
# endif
1:
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rsi, %rcx
	subq	%rdi, %rcx
2:
/* Avoid "rep movsb" if RCX, the distance between source and destination,
   is N*4GB + [1..63] with N >= 0.  */
	cmpl	$63, %ecx
	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
# endif
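/* The AVOID_SHORT_DISTANCE_REP_MOVSB check above is, roughly, in C
   (illustrative only, not the generated code):

     unsigned long int dist = dst > src ? dst - src : src - dst;
     if ((unsigned int) dist <= 63)
       goto more_2x_vec;

   i.e. fall back to the vector copy whenever the low 32 bits of the
   distance are 63 or less.  */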
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
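	/* E.g. with VEC_SIZE == 32 and a destination misaligned by 13
	   bytes, %r8 becomes 13 - 32 = -19, so RSI and RDI advance by 19
	   bytes and RDX shrinks by 19: the loop then stores on 32-byte
	   destination boundaries, and the 19 bytes skipped here are
	   covered by the first VEC saved in %VEC(4) and stored after the
	   loop.  */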
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
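	/* As in the forward path: %r8 now holds the misalignment of the
	   last destination VEC, so %r9 (aligned store pointer), %rcx
	   (matching source pointer) and %rdx (length) all step back by
	   that amount, while the tail bytes skipped here are covered by
	   %VEC(8), stored to (%r11) after the loop.  */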
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
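	/* In C terms (illustrative only): if (src < dst + len) the
	   buffers overlap, so use the regular cached loop above;
	   otherwise stream the copy with non-temporal stores.  */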
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
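	/* VMOVNT stores are weakly ordered, so fence before the ordinary
	   trailing stores below, which may touch the same cache lines.  */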
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))