memmove-vec-unaligned-erms.S source code [glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S]

1	/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
2	Copyright (C) 2016-2018 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<http://www.gnu.org/licenses/>. */
18
19	/* memmove/memcpy/mempcpy is implemented as:
20	1. Use overlapping load and store to avoid branch.
21	2. Load all sources into registers and store them together to avoid
22	possible address overlap between source and destination.
23	3. If size is 8 * VEC_SIZE or less, load all sources into registers
24	and store them together.
25	4. If address of destination > address of source, backward copy
26	4 * VEC_SIZE at a time with unaligned load and aligned store.
27	Load the first 4 * VEC and last VEC before the loop and store
28	them after the loop to support overlapping addresses.
29	5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
30	load and aligned store. Load the last 4 * VEC and first VEC
31	before the loop and store them after the loop to support
32	overlapping addresses.
33	6. If size >= __x86_shared_non_temporal_threshold and there is no
34	overlap between destination and source, use non-temporal store
35	instead of aligned store. */
36
37	#include <sysdep.h>
38
/* Fallback: if the including file has not supplied MEMCPY_SYMBOL, memcpy
   symbol names are built with the same MEMMOVE_SYMBOL macro as memmove.
   MEMMOVE_SYMBOL itself is not defined in this file.  */
39	#ifndef MEMCPY_SYMBOL
40	# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
41	#endif
42
/* Fallback: if the including file has not supplied MEMPCPY_SYMBOL, mempcpy
   symbol names are built with MEMMOVE_SYMBOL.  */
43	#ifndef MEMPCPY_SYMBOL
44	# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
45	#endif
46
/* Fallback: if the including file has not supplied MEMMOVE_CHK_SYMBOL, the
   names of the *_chk entry points are built with MEMMOVE_SYMBOL.  */
47	#ifndef MEMMOVE_CHK_SYMBOL
48	# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
49	#endif
50
/* VZEROUPPER is placed before most returns in the vector copy paths.  Unless
   the including file overrides it, it expands to the vzeroupper instruction
   when VEC_SIZE is larger than 16 bytes and to nothing otherwise.  */
51	#ifndef VZEROUPPER
52	# if VEC_SIZE > 16
53	# define VZEROUPPER vzeroupper
54	# else
55	# define VZEROUPPER
56	# endif
57	#endif
58
59	/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
60	up REP MOVSB operation, REP MOVSB isn't faster on short data. The
61	memcpy micro benchmark in glibc shows that 2KB is the approximate
62	value above which REP MOVSB becomes faster than SSE2 optimization
63	on processors with Enhanced REP MOVSB. Since larger register size
64	can move more data with a single load and store, the threshold is
65	higher with larger register size. */
/* Default size, in bytes, above which L(movsb_more_2x_vec) hands the copy to
   REP MOVSB: 2048 scaled by VEC_SIZE / 16.  The including file may override
   it.  L(movsb) rejects, with #error, any value that is not larger than
   8 * VEC_SIZE.  */
66	#ifndef REP_MOVSB_THRESHOLD
67	# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
68	#endif
69
/* Instruction used by PREFETCH_ONE_SET to prefetch one address.  Defaults to
   prefetcht0 unless the including file provides its own definition.  */
70	#ifndef PREFETCH
71	# define PREFETCH(addr) prefetcht0 addr
72	#endif
73
74	/* Assume 64-byte prefetch size. */
/* Number of bytes assumed to be covered by one PREFETCH.  Only the value 64
   is accepted by the PREFETCH_ONE_SET selection further down.  */
75	#ifndef PREFETCH_SIZE
76	# define PREFETCH_SIZE 64
77	#endif
78
/* Bytes consumed by one iteration of the non-temporal loops (four vectors);
   used both as the loop step and as the unit for prefetch distances.  */
79	#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
80
/* PREFETCH_ONE_SET (dir, base, offset) issues enough PREFETCH instructions to
   cover PREFETCHED_LOAD_SIZE bytes starting at displacement OFFSET from BASE.
     dir    - +1 or -1 at the call sites in this file; successive prefetches
              step by dir * PREFETCH_SIZE.
     base   - a parenthesised register operand such as (%rsi); it is pasted
              directly after the displacement to form "disp(reg)".
     offset - displacement of the first prefetch.
   One, two or four prefetches are emitted depending on how many
   PREFETCH_SIZE units fit in PREFETCHED_LOAD_SIZE; any other ratio, or a
   PREFETCH_SIZE other than 64, is a build error.  */
81	#if PREFETCH_SIZE == 64
82	# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
83	# define PREFETCH_ONE_SET(dir, base, offset) \
84	PREFETCH ((offset)base)
85	# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
86	# define PREFETCH_ONE_SET(dir, base, offset) \
87	PREFETCH ((offset)base); \
88	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
89	# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
90	# define PREFETCH_ONE_SET(dir, base, offset) \
91	PREFETCH ((offset)base); \
92	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
93	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
94	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
95	# else
96	# error Unsupported PREFETCHED_LOAD_SIZE!
97	# endif
98	#else
99	# error Unsupported PREFETCH_SIZE!
100	#endif
101
/* SECTION must be supplied by the including file; it is used below to name
   the output section for all code in this file.  */
102	#ifndef SECTION
103	# error SECTION is not defined!
104	#endif
105
/* All code below is placed in the section named by SECTION(.text).

   __mempcpy_chk (unaligned variant), built only for SHARED libc.
   Compares %rcx with %rdx (the byte count used by the copy code) and jumps
   to __chk_fail when %rcx is below %rdx (unsigned).
   NOTE(review): %rcx is presumably the destination object size passed as the
   fourth argument -- confirm against the caller.
   There is no ret or unconditional jump before END: when the check passes,
   execution continues into the code that follows this entry.  */
106	.section SECTION(.text),"ax",@progbits
107	#if defined SHARED && IS_IN (libc)
108	ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
109	cmpq %rdx, %rcx
110	jb HIDDEN_JUMPTARGET (__chk_fail)
111	END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
112	#endif
113
/* __mempcpy (unaligned variant).
   In:  %rdi = destination, %rsi = source, %rdx = byte count.
   Out: %rax = %rdi + %rdx (address just past the last byte written).
   Sets up the return value and then shares the copy code at L(start).  */
114	ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
115	movq %rdi, %rax
116	addq %rdx, %rax
117	jmp L(start)
118	END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
119
/* __memmove_chk (unaligned variant), built only for SHARED libc.
   Jumps to __chk_fail when %rcx is below %rdx (unsigned); otherwise there is
   no ret before END and execution continues into the code that follows.
   NOTE(review): %rcx is presumably the destination object size -- confirm.  */
120	#if defined SHARED && IS_IN (libc)
121	ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
122	cmpq %rdx, %rcx
123	jb HIDDEN_JUMPTARGET (__chk_fail)
124	END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
125	#endif
126
/* __memmove (unaligned variant).
   In:  %rdi = destination, %rsi = source, %rdx = byte count.
   Out: %rax = destination.
   Dispatch on size:
     size <  VEC_SIZE      -> L(less_vec)
     size >  2 * VEC_SIZE  -> L(more_2x_vec)
     otherwise             -> copied here with two vectors.
   The two-vector copy loads the first VEC_SIZE bytes and the last VEC_SIZE
   bytes (the two ranges may overlap each other) and only then performs the
   stores, so every source byte is read before any destination byte is
   written.
   L(last_2x_vec) and L(nop) are defined here only when this is not a
   multiarch libc build; otherwise the unaligned_erms code below provides
   them.  In a multiarch libc build the function is closed with END right
   after the ret.
   NOTE(review): the backslash-escaped "\|\|" in the #if lines and the
   backticks around numeric literals look like text-extraction artifacts.  */
127	ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
128	movq %rdi, %rax
129	L(start):
130	cmpq $VEC_SIZE, %rdx
131	jb L(less_vec)
132	cmpq $(VEC_SIZE * `2`), %rdx
133	ja L(more_2x_vec)
134	#if !defined USE_MULTIARCH \|\| !IS_IN (libc)
135	L(last_2x_vec):
136	#endif
137	/ From VEC and to 2 * VEC. No branch when size == VEC_SIZE. /
138	VMOVU (%rsi), %VEC(`0`)
139	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`1`)
140	VMOVU %VEC(`0`), (%rdi)
141	VMOVU %VEC(`1`), -VEC_SIZE(%rdi,%rdx)
142	VZEROUPPER
143	#if !defined USE_MULTIARCH \|\| !IS_IN (libc)
144	L(nop):
145	#endif
146	ret
147	#if defined USE_MULTIARCH && IS_IN (libc)
148	END (MEMMOVE_SYMBOL (__memmove, unaligned))
149
/* The plain REP MOVSB entry points from here to the matching "# endif" are
   built only when VEC_SIZE == 16.

   __mempcpy_chk_erms: jumps to __chk_fail when %rcx is below %rdx
   (unsigned); otherwise there is no ret before END and execution continues
   into the code that follows.  */
150	# if VEC_SIZE == 16
151	ENTRY (__mempcpy_chk_erms)
152	cmpq %rdx, %rcx
153	jb HIDDEN_JUMPTARGET (__chk_fail)
154	END (__mempcpy_chk_erms)
155
/* __mempcpy_erms.
   In:  %rdi = destination, %rsi = source, %rdx = byte count.
   Out: %rax = %rdi when the count is zero, otherwise %rdi + %rdx.
   A zero count returns through local label 2, which is defined inside
   __memmove_erms; any other count joins __memmove_erms at L(start_movsb).  */
156	/ Only used to measure performance of REP MOVSB. /
157	ENTRY (__mempcpy_erms)
158	movq %rdi, %rax
159	/ Skip zero length. /
160	testq %rdx, %rdx
161	jz `2f`
162	addq %rdx, %rax
163	jmp L(start_movsb)
164	END (__mempcpy_erms)
165
/* __memmove_chk_erms: jumps to __chk_fail when %rcx is below %rdx
   (unsigned); otherwise there is no ret before END and execution continues
   into the code that follows.  */
166	ENTRY (__memmove_chk_erms)
167	cmpq %rdx, %rcx
168	jb HIDDEN_JUMPTARGET (__chk_fail)
169	END (__memmove_chk_erms)
170
/* __memmove_erms: memmove implemented purely with REP MOVSB.
   In:  %rdi = destination, %rsi = source, %rdx = byte count.
   Out: %rax = destination.
   Flow:
     count == 0                       -> return immediately (label 2)
     dst <  src                       -> forward REP MOVSB (label 1)
     dst == src                       -> return (label 2)
     dst <  src + count (dst > src)   -> L(movsb_backward)
     otherwise                        -> forward REP MOVSB
   %rcx holds the count for REP MOVSB; %rdx is reused as scratch for
   src + count.
   L(movsb_backward) points %rdi and %rsi at the last byte of each buffer,
   sets the direction flag so REP MOVSB walks downwards, and clears it again
   before returning.
   The two strong_alias lines make the memcpy names refer to the memmove
   implementations, and the final "# endif" closes the VEC_SIZE == 16
   region.  */
171	ENTRY (__memmove_erms)
172	movq %rdi, %rax
173	/ Skip zero length. /
174	testq %rdx, %rdx
175	jz `2f`
176	L(start_movsb):
177	movq %rdx, %rcx
178	cmpq %rsi, %rdi
179	jb `1f`
180	/ Source == destination is less common. /
181	je `2f`
182	leaq (%rsi,%rcx), %rdx
183	cmpq %rdx, %rdi
184	jb L(movsb_backward)
185	`1`:
186	rep movsb
187	`2`:
188	ret
189	L(movsb_backward):
190	leaq -`1`(%rdi,%rcx), %rdi
191	leaq -`1`(%rsi,%rcx), %rsi
192	std
193	rep movsb
194	cld
195	ret
196	END (__memmove_erms)
197	strong_alias (__memmove_erms, __memcpy_erms)
198	strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
199	# endif
200
/* __mempcpy_chk (unaligned_erms variant), built only for SHARED.
   Jumps to __chk_fail when %rcx is below %rdx (unsigned); otherwise there is
   no ret before END and execution continues into the code that follows.  */
201	# ifdef SHARED
202	ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
203	cmpq %rdx, %rcx
204	jb HIDDEN_JUMPTARGET (__chk_fail)
205	END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
206	# endif
207
/* __mempcpy (unaligned_erms variant).
   In:  %rdi = destination, %rsi = source, %rdx = byte count.
   Out: %rax = %rdi + %rdx.
   Sets up the return value and then shares the copy code at L(start_erms).  */
208	ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
209	movq %rdi, %rax
210	addq %rdx, %rax
211	jmp L(start_erms)
212	END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
213
/* __memmove_chk (unaligned_erms variant), built only for SHARED.
   Jumps to __chk_fail when %rcx is below %rdx (unsigned); otherwise there is
   no ret before END and execution continues into the code that follows.  */
214	# ifdef SHARED
215	ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
216	cmpq %rdx, %rcx
217	jb HIDDEN_JUMPTARGET (__chk_fail)
218	END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
219	# endif
220
/* __memmove (unaligned_erms variant).
   In:  %rdi = destination, %rsi = source, %rdx = byte count.
   Out: %rax = destination.
   Dispatch on size:
     size <  VEC_SIZE      -> L(less_vec)
     size >  2 * VEC_SIZE  -> L(movsb_more_2x_vec), which may pick REP MOVSB
     otherwise             -> L(last_2x_vec): load the first and the last
                              VEC_SIZE bytes, then store both.
   L(return) is the shared "VZEROUPPER; ret" tail.  */
221	ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
222	movq %rdi, %rax
223	L(start_erms):
224	cmpq $VEC_SIZE, %rdx
225	jb L(less_vec)
226	cmpq $(VEC_SIZE * `2`), %rdx
227	ja L(movsb_more_2x_vec)
228	L(last_2x_vec):
229	/ From VEC and to 2 * VEC. No branch when size == VEC_SIZE. /
230	VMOVU (%rsi), %VEC(`0`)
231	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`1`)
232	VMOVU %VEC(`0`), (%rdi)
233	VMOVU %VEC(`1`), -VEC_SIZE(%rdi,%rdx)
234	L(return):
235	VZEROUPPER
236	ret
237
/* L(movsb): reached from L(movsb_more_2x_vec) when the size is above
   REP_MOVSB_THRESHOLD.
     size >= __x86_shared_non_temporal_threshold -> L(more_8x_vec)
     dst <  src                                  -> forward REP MOVSB
     dst == src                                  -> L(nop), nothing to copy
     dst <  src + size (dst > src)               -> L(more_8x_vec_backward)
     otherwise                                   -> forward REP MOVSB
   %r9 is scratch for src + size.  REP MOVSB is never run backwards here;
   the overlapping case uses the vector loop instead.  The #error guarantees
   REP_MOVSB_THRESHOLD is above 8 * VEC_SIZE, so every size that reaches the
   vector loops from here is larger than 8 vectors.
   The closing #endif ends the "USE_MULTIARCH && IS_IN (libc)" region opened
   after the unaligned __memmove entry.  */
238	L(movsb):
239	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
240	jae L(more_8x_vec)
241	cmpq %rsi, %rdi
242	jb `1f`
243	/ Source == destination is less common. /
244	je L(nop)
245	leaq (%rsi,%rdx), %r9
246	cmpq %r9, %rdi
247	/ Avoid slow backward REP MOVSB. /
248	# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
249	# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
250	# endif
251	jb L(more_8x_vec_backward)
252	`1`:
253	movq %rdx, %rcx
254	rep movsb
255	L(nop):
256	ret
257	#endif
258
/* L(less_vec): size is below VEC_SIZE.  VEC_SIZE is restricted to 16, 32 or
   64 by the #error below, so the size is at most 63 and the comparisons can
   use the low byte %dl.
   Dispatches to the L(between_*) handler for the size range; the 32..63 and
   16..31 tests exist only when VEC_SIZE is large enough for those sizes to
   reach this label.
   After the "cmpb $1" the remaining cases are:
     size >  1 -> L(between_2_3)
     size <  1 -> return without copying (label 1)
     size == 1 -> copy a single byte through %cl.  */
259	L(less_vec):
260	/ Less than 1 VEC. /
261	#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
262	# error Unsupported VEC_SIZE!
263	#endif
264	#if VEC_SIZE > 32
265	cmpb $`32`, %dl
266	jae L(between_32_63)
267	#endif
268	#if VEC_SIZE > 16
269	cmpb $`16`, %dl
270	jae L(between_16_31)
271	#endif
272	cmpb $`8`, %dl
273	jae L(between_8_15)
274	cmpb $`4`, %dl
275	jae L(between_4_7)
276	cmpb $`1`, %dl
277	ja L(between_2_3)
278	jb `1f`
279	movzbl (%rsi), %ecx
280	movb %cl, (%rdi)
281	`1`:
282	ret
/* L(between_32_63): built only when VEC_SIZE > 32.  Copies 32..63 bytes with
   two 32-byte loads (first 32 bytes and last 32 bytes, possibly overlapping
   each other), both performed before either store.  */
283	#if VEC_SIZE > 32
284	L(between_32_63):
285	/ From 32 to 63. No branch when size == 32. /
286	vmovdqu (%rsi), %ymm0
287	vmovdqu -`32`(%rsi,%rdx), %ymm1
288	vmovdqu %ymm0, (%rdi)
289	vmovdqu %ymm1, -`32`(%rdi,%rdx)
290	VZEROUPPER
291	ret
292	#endif
/* L(between_16_31): built only when VEC_SIZE > 16.  Copies 16..31 bytes with
   two 16-byte loads (first 16 bytes and last 16 bytes, possibly overlapping
   each other), both performed before either store.  Only %xmm registers are
   written and no VZEROUPPER is issued on this path.  */
293	#if VEC_SIZE > 16
294	/ From 16 to 31. No branch when size == 16. /
295	L(between_16_31):
296	vmovdqu (%rsi), %xmm0
297	vmovdqu -`16`(%rsi,%rdx), %xmm1
298	vmovdqu %xmm0, (%rdi)
299	vmovdqu %xmm1, -`16`(%rdi,%rdx)
300	ret
301	#endif
/* L(between_8_15): copies 8..15 bytes with two 8-byte moves (last 8 bytes and
   first 8 bytes, possibly overlapping each other).  Both loads happen before
   either store.  %rsi is reused to hold the first 8 data bytes once it is no
   longer needed as the source pointer; %rcx holds the last 8.  */
302	L(between_8_15):
303	/ From 8 to 15. No branch when size == 8. /
304	movq -`8`(%rsi,%rdx), %rcx
305	movq (%rsi), %rsi
306	movq %rcx, -`8`(%rdi,%rdx)
307	movq %rsi, (%rdi)
308	ret
/* L(between_4_7): copies 4..7 bytes with two 4-byte moves (last 4 bytes and
   first 4 bytes, possibly overlapping each other).  Both loads happen before
   either store; %esi is reused for the first 4 data bytes.  */
309	L(between_4_7):
310	/ From 4 to 7. No branch when size == 4. /
311	movl -`4`(%rsi,%rdx), %ecx
312	movl (%rsi), %esi
313	movl %ecx, -`4`(%rdi,%rdx)
314	movl %esi, (%rdi)
315	ret
/* L(between_2_3): copies 2..3 bytes with two 2-byte moves (last 2 bytes and
   first 2 bytes, possibly overlapping each other).  Both zero-extending
   loads happen before either store; %esi is reused for the first 2 data
   bytes.  */
316	L(between_2_3):
317	/ From 2 to 3. No branch when size == 2. /
318	movzwl -`2`(%rsi,%rdx), %ecx
319	movzwl (%rsi), %esi
320	movw %cx, -`2`(%rdi,%rdx)
321	movw %si, (%rdi)
322	ret
323
/* L(movsb_more_2x_vec): multiarch libc only.  Entered from L(start_erms) with
   size above 2 * VEC_SIZE.  Sizes above REP_MOVSB_THRESHOLD go to L(movsb);
   all others continue into L(more_2x_vec), which follows directly.  */
324	#if defined USE_MULTIARCH && IS_IN (libc)
325	L(movsb_more_2x_vec):
326	cmpq $REP_MOVSB_THRESHOLD, %rdx
327	ja L(movsb)
328	#endif
/* L(more_2x_vec): size is above 2 * VEC_SIZE.
     size >  8 * VEC_SIZE -> L(more_8x_vec)
     size <  4 * VEC_SIZE -> L(last_4x_vec)
     otherwise (4..8 VEC) -> copied here.
   The 4..8 VEC copy loads the first four vectors into VEC(0)-VEC(3) and the
   last four into VEC(4)-VEC(7) (the two groups may overlap each other), and
   performs all eight loads before any store so overlapping source and
   destination are handled.  */
329	L(more_2x_vec):
330	/ More than 2 * VEC and there may be overlap between destination*
331	and source. /*
332	cmpq $(VEC_SIZE * `8`), %rdx
333	ja L(more_8x_vec)
334	cmpq $(VEC_SIZE * `4`), %rdx
335	jb L(last_4x_vec)
336	/ Copy from 4 * VEC to 8 * VEC, inclusively. /
337	VMOVU (%rsi), %VEC(`0`)
338	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
339	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
340	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
341	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`4`)
342	VMOVU -(VEC_SIZE * `2`)(%rsi,%rdx), %VEC(`5`)
343	VMOVU -(VEC_SIZE * `3`)(%rsi,%rdx), %VEC(`6`)
344	VMOVU -(VEC_SIZE * `4`)(%rsi,%rdx), %VEC(`7`)
345	VMOVU %VEC(`0`), (%rdi)
346	VMOVU %VEC(`1`), VEC_SIZE(%rdi)
347	VMOVU %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
348	VMOVU %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
349	VMOVU %VEC(`4`), -VEC_SIZE(%rdi,%rdx)
350	VMOVU %VEC(`5`), -(VEC_SIZE * `2`)(%rdi,%rdx)
351	VMOVU %VEC(`6`), -(VEC_SIZE * `3`)(%rdi,%rdx)
352	VMOVU %VEC(`7`), -(VEC_SIZE * `4`)(%rdi,%rdx)
353	VZEROUPPER
354	ret
/* L(last_4x_vec): reached from L(more_2x_vec) with size above 2 * VEC_SIZE
   and below 4 * VEC_SIZE.  Loads the first two vectors and the last two
   vectors (the two pairs may overlap each other), then stores all four.  */
355	L(last_4x_vec):
356	/ Copy from 2 * VEC to 4 * VEC. /
357	VMOVU (%rsi), %VEC(`0`)
358	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
359	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`2`)
360	VMOVU -(VEC_SIZE * `2`)(%rsi,%rdx), %VEC(`3`)
361	VMOVU %VEC(`0`), (%rdi)
362	VMOVU %VEC(`1`), VEC_SIZE(%rdi)
363	VMOVU %VEC(`2`), -VEC_SIZE(%rdi,%rdx)
364	VMOVU %VEC(`3`), -(VEC_SIZE * `2`)(%rdi,%rdx)
365	VZEROUPPER
366	ret
367
/* L(more_8x_vec): large copy.  dst > src goes to L(more_8x_vec_backward),
   dst == src returns through L(nop), and dst < src is copied forward here.

   Register roles in the forward path:
     VEC(4)          first VEC_SIZE bytes of the source (saved head)
     VEC(5)-VEC(8)   last 4 * VEC_SIZE bytes of the source (saved tail),
                     VEC(5) being the very last vector
     %r11            original destination, where the head is stored
     %rcx            destination + size - VEC_SIZE, where the tail is stored
     %r8             (dst & (VEC_SIZE - 1)) - VEC_SIZE, a negative value in
                     [-VEC_SIZE, -1]
     %rsi, %rdi      advanced by -%r8 so that %rdi becomes VEC_SIZE-aligned
     %rdx            remaining length, reduced by the same amount

   The loop moves 4 vectors per iteration with unaligned loads and aligned
   stores while more than 4 * VEC_SIZE bytes remain.  The bytes skipped for
   alignment and the final remainder are covered by the saved head and tail
   vectors, which are stored only after the loop; they were loaded before
   any store, so overlapping buffers are handled.
   NOTE(review): the non-temporal threshold test here uses ja (strictly
   above) while the one in L(movsb) uses jae -- confirm this is intended.  */
368	L(more_8x_vec):
369	cmpq %rsi, %rdi
370	ja L(more_8x_vec_backward)
371	/ Source == destination is less common. /
372	je L(nop)
373	/ Load the first VEC and last 4 * VEC to support overlapping*
374	addresses. /*
375	VMOVU (%rsi), %VEC(`4`)
376	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(`5`)
377	VMOVU -(VEC_SIZE * `2`)(%rsi, %rdx), %VEC(`6`)
378	VMOVU -(VEC_SIZE * `3`)(%rsi, %rdx), %VEC(`7`)
379	VMOVU -(VEC_SIZE * `4`)(%rsi, %rdx), %VEC(`8`)
380	/ Save start and stop of the destination buffer. /
381	movq %rdi, %r11
382	leaq -VEC_SIZE(%rdi, %rdx), %rcx
383	/ Align destination for aligned stores in the loop. Compute*
384	how much destination is misaligned. /*
385	movq %rdi, %r8
386	andq $(VEC_SIZE - `1`), %r8
387	/ Get the negative of offset for alignment. /
388	subq $VEC_SIZE, %r8
389	/ Adjust source. /
390	subq %r8, %rsi
391	/ Adjust destination which should be aligned now. /
392	subq %r8, %rdi
393	/ Adjust length. /
394	addq %r8, %rdx
395	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
396	/ Check non-temporal store threshold. /
397	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
398	ja L(large_forward)
399	#endif
400	L(loop_4x_vec_forward):
401	/ Copy 4 * VEC a time forward. /
402	VMOVU (%rsi), %VEC(`0`)
403	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
404	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
405	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
406	addq $(VEC_SIZE * `4`), %rsi
407	subq $(VEC_SIZE * `4`), %rdx
408	VMOVA %VEC(`0`), (%rdi)
409	VMOVA %VEC(`1`), VEC_SIZE(%rdi)
410	VMOVA %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
411	VMOVA %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
412	addq $(VEC_SIZE * `4`), %rdi
413	cmpq $(VEC_SIZE * `4`), %rdx
414	ja L(loop_4x_vec_forward)
415	/ Store the last 4 * VEC. /
416	VMOVU %VEC(`5`), (%rcx)
417	VMOVU %VEC(`6`), -VEC_SIZE(%rcx)
418	VMOVU %VEC(`7`), -(VEC_SIZE * `2`)(%rcx)
419	VMOVU %VEC(`8`), -(VEC_SIZE * `3`)(%rcx)
420	/ Store the first VEC. /
421	VMOVU %VEC(`4`), (%r11)
422	VZEROUPPER
423	ret
424
/* L(more_8x_vec_backward): large copy performed from the end of the buffers
   towards the start.  Entered from L(more_8x_vec) when dst > src, and from
   L(movsb) when dst lies inside the source range.

   Register roles:
     VEC(4)-VEC(7)   first 4 * VEC_SIZE bytes of the source (saved head)
     VEC(8)          last VEC_SIZE bytes of the source (saved tail)
     %r11            destination + size - VEC_SIZE, where the tail is stored
     %r8             %r11 & (VEC_SIZE - 1), misalignment of that address
     %r9             %r11 - %r8, the VEC_SIZE-aligned store cursor
     %rcx            source + size - VEC_SIZE - %r8, the load cursor
     %rdx            remaining length, reduced by %r8
     %rdi            left untouched; the head is stored there at the end

   The loop moves 4 vectors per iteration downwards with unaligned loads and
   aligned stores while more than 4 * VEC_SIZE bytes remain.  The saved head
   and tail vectors are stored only after the loop and cover the remainder
   at both ends.
   NOTE(review): the non-temporal threshold test here uses ja (strictly
   above) while the one in L(movsb) uses jae -- confirm this is intended.  */
425	L(more_8x_vec_backward):
426	/ Load the first 4 * VEC and last VEC to support overlapping*
427	addresses. /*
428	VMOVU (%rsi), %VEC(`4`)
429	VMOVU VEC_SIZE(%rsi), %VEC(`5`)
430	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`6`)
431	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`7`)
432	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`8`)
433	/ Save stop of the destination buffer. /
434	leaq -VEC_SIZE(%rdi, %rdx), %r11
435	/ Align destination end for aligned stores in the loop. Compute*
436	how much destination end is misaligned. /*
437	leaq -VEC_SIZE(%rsi, %rdx), %rcx
438	movq %r11, %r9
439	movq %r11, %r8
440	andq $(VEC_SIZE - `1`), %r8
441	/ Adjust source. /
442	subq %r8, %rcx
443	/ Adjust the end of destination which should be aligned now. /
444	subq %r8, %r9
445	/ Adjust length. /
446	subq %r8, %rdx
447	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
448	/ Check non-temporal store threshold. /
449	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
450	ja L(large_backward)
451	#endif
452	L(loop_4x_vec_backward):
453	/ Copy 4 * VEC a time backward. /
454	VMOVU (%rcx), %VEC(`0`)
455	VMOVU -VEC_SIZE(%rcx), %VEC(`1`)
456	VMOVU -(VEC_SIZE * `2`)(%rcx), %VEC(`2`)
457	VMOVU -(VEC_SIZE * `3`)(%rcx), %VEC(`3`)
458	subq $(VEC_SIZE * `4`), %rcx
459	subq $(VEC_SIZE * `4`), %rdx
460	VMOVA %VEC(`0`), (%r9)
461	VMOVA %VEC(`1`), -VEC_SIZE(%r9)
462	VMOVA %VEC(`2`), -(VEC_SIZE * `2`)(%r9)
463	VMOVA %VEC(`3`), -(VEC_SIZE * `3`)(%r9)
464	subq $(VEC_SIZE * `4`), %r9
465	cmpq $(VEC_SIZE * `4`), %rdx
466	ja L(loop_4x_vec_backward)
467	/ Store the first 4 * VEC. /
468	VMOVU %VEC(`4`), (%rdi)
469	VMOVU %VEC(`5`), VEC_SIZE(%rdi)
470	VMOVU %VEC(`6`), (VEC_SIZE * `2`)(%rdi)
471	VMOVU %VEC(`7`), (VEC_SIZE * `3`)(%rdi)
472	/ Store the last VEC. /
473	VMOVU %VEC(`8`), (%r11)
474	VZEROUPPER
475	ret
476
/* L(large_forward): forward copy with non-temporal stores.  Entered from
   L(more_8x_vec) after alignment, so %rdi, %rsi, %rdx, %rcx, %r11 and
   VEC(4)-VEC(8) are already set up as in that path.

   %r10 = %rdi + %rdx; if %rsi is below it the copy is treated as overlapping
   and handed back to the regular L(loop_4x_vec_forward).

   Each iteration prefetches the source 2 and 3 PREFETCHED_LOAD_SIZE blocks
   ahead of %rsi, loads 4 vectors, and writes them with VMOVNT.  The loop
   runs while more than PREFETCHED_LOAD_SIZE bytes remain.  An sfence follows
   the loop, and the saved tail and head vectors are then stored with
   ordinary unaligned stores.  */
477	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
478	L(large_forward):
479	/ Don't use non-temporal store if there is overlap between*
480	destination and source since destination may be in cache
481	when source is loaded. /*
482	leaq (%rdi, %rdx), %r10
483	cmpq %r10, %rsi
484	jb L(loop_4x_vec_forward)
485	L(loop_large_forward):
486	/ Copy 4 * VEC a time forward with non-temporal stores. /
487	PREFETCH_ONE_SET (`1`, (%rsi), PREFETCHED_LOAD_SIZE * `2`)
488	PREFETCH_ONE_SET (`1`, (%rsi), PREFETCHED_LOAD_SIZE * `3`)
489	VMOVU (%rsi), %VEC(`0`)
490	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
491	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
492	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
493	addq $PREFETCHED_LOAD_SIZE, %rsi
494	subq $PREFETCHED_LOAD_SIZE, %rdx
495	VMOVNT %VEC(`0`), (%rdi)
496	VMOVNT %VEC(`1`), VEC_SIZE(%rdi)
497	VMOVNT %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
498	VMOVNT %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
499	addq $PREFETCHED_LOAD_SIZE, %rdi
500	cmpq $PREFETCHED_LOAD_SIZE, %rdx
501	ja L(loop_large_forward)
502	sfence
503	/ Store the last 4 * VEC. /
504	VMOVU %VEC(`5`), (%rcx)
505	VMOVU %VEC(`6`), -VEC_SIZE(%rcx)
506	VMOVU %VEC(`7`), -(VEC_SIZE * `2`)(%rcx)
507	VMOVU %VEC(`8`), -(VEC_SIZE * `3`)(%rcx)
508	/ Store the first VEC. /
509	VMOVU %VEC(`4`), (%r11)
510	VZEROUPPER
511	ret
512
/* L(large_backward): backward copy with non-temporal stores.  Entered from
   L(more_8x_vec_backward) after alignment, so %rcx (load cursor), %r9
   (aligned store cursor), %rdx, %rdi, %r11 and VEC(4)-VEC(8) are already
   set up as in that path.

   %r10 = %rcx + %rdx; if %r9 is below it the copy is treated as overlapping
   and handed back to the regular L(loop_4x_vec_backward).

   Each iteration prefetches the source 2 and 3 PREFETCHED_LOAD_SIZE blocks
   below %rcx, loads 4 vectors, and writes them with VMOVNT.  The loop runs
   while more than PREFETCHED_LOAD_SIZE bytes remain.  An sfence follows the
   loop, and the saved head and tail vectors are then stored with ordinary
   unaligned stores.
   The #endif closes the conditional region that contains both non-temporal
   loops, and END closes the __memmove (unaligned_erms) function.  */
513	L(large_backward):
514	/ Don't use non-temporal store if there is overlap between*
515	destination and source since destination may be in cache
516	when source is loaded. /*
517	leaq (%rcx, %rdx), %r10
518	cmpq %r10, %r9
519	jb L(loop_4x_vec_backward)
520	L(loop_large_backward):
521	/ Copy 4 * VEC a time backward with non-temporal stores. /
522	PREFETCH_ONE_SET (-`1`, (%rcx), -PREFETCHED_LOAD_SIZE * `2`)
523	PREFETCH_ONE_SET (-`1`, (%rcx), -PREFETCHED_LOAD_SIZE * `3`)
524	VMOVU (%rcx), %VEC(`0`)
525	VMOVU -VEC_SIZE(%rcx), %VEC(`1`)
526	VMOVU -(VEC_SIZE * `2`)(%rcx), %VEC(`2`)
527	VMOVU -(VEC_SIZE * `3`)(%rcx), %VEC(`3`)
528	subq $PREFETCHED_LOAD_SIZE, %rcx
529	subq $PREFETCHED_LOAD_SIZE, %rdx
530	VMOVNT %VEC(`0`), (%r9)
531	VMOVNT %VEC(`1`), -VEC_SIZE(%r9)
532	VMOVNT %VEC(`2`), -(VEC_SIZE * `2`)(%r9)
533	VMOVNT %VEC(`3`), -(VEC_SIZE * `3`)(%r9)
534	subq $PREFETCHED_LOAD_SIZE, %r9
535	cmpq $PREFETCHED_LOAD_SIZE, %rdx
536	ja L(loop_large_backward)
537	sfence
538	/ Store the first 4 * VEC. /
539	VMOVU %VEC(`4`), (%rdi)
540	VMOVU %VEC(`5`), VEC_SIZE(%rdi)
541	VMOVU %VEC(`6`), (VEC_SIZE * `2`)(%rdi)
542	VMOVU %VEC(`7`), (VEC_SIZE * `3`)(%rdi)
543	/ Store the last VEC. /
544	VMOVU %VEC(`8`), (%r11)
545	VZEROUPPER
546	ret
547	#endif
548	END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
549
/* memcpy entry points are aliases of the memmove ones:
     - in a multiarch libc build, __memcpy (unaligned_erms) aliases
       __memmove (unaligned_erms), and for SHARED the _chk variant as well;
     - in a SHARED libc build, __memcpy_chk (unaligned) aliases
       __memmove_chk (unaligned);
     - unconditionally, __memcpy (unaligned) aliases __memmove (unaligned).  */
550	#if IS_IN (libc)
551	# ifdef USE_MULTIARCH
552	strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
553	MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
554	# ifdef SHARED
555	strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
556	MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
557	# endif
558	# endif
559	# ifdef SHARED
560	strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
561	MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
562	# endif
563	#endif
564	strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
565	MEMCPY_SYMBOL (__memcpy, unaligned))
566

Browse the source code of glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S