/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch (see the example
      after this list).
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
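
/* As an example of technique 1, an 11-byte copy needs no branches:
   load bytes 0..7 and bytes 3..10 of the source into two registers,
   then store both to the same offsets in the destination, letting the
   stores overlap:

	movq	(%rsi), %rcx
	movq	3(%rsi), %r8
	movq	%rcx, (%rdi)
	movq	%r8, 3(%rdi)

   L(between_8_15) below uses this pattern with the second access at
   -8(%rsi,%rdx) and -8(%rdi,%rdx), so it covers any size from 8 to
   15 bytes.  */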

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

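/* When VEC_SIZE > 16 the copy uses VEX-encoded YMM/ZMM registers, so
   clear the upper vector state with VZEROUPPER before returning to
   avoid AVX/SSE transition penalties in the caller.  */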
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
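/* With the definition below this is 2048 bytes for VEC_SIZE == 16,
   4096 bytes for VEC_SIZE == 32 and 8192 bytes for VEC_SIZE == 64.  */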
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
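
/* For example, with VEC_SIZE == 32 (PREFETCHED_LOAD_SIZE == 128) the
   call PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) expands
   to

	prefetcht0	256(%rsi)
	prefetcht0	320(%rsi)

   i.e. one prefetch per 64-byte cache line of that set.  */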

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
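/* The *_chk entries only check that the destination buffer (size in
   %rcx) is not smaller than the copy length in %rdx and then fall
   through into the corresponding non-checking entry point.  */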
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

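/* void *__memmove (void *dst, const void *src, size_t len), with dst
   in %rdi, src in %rsi and len in %rdx.  The value to return (dst, or
   dst + len for mempcpy) is placed in %rax before reaching L(start).  */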
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes: copy with two possibly
	   overlapping loads and stores.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
#  if defined SHARED
ENTRY (__mempcpy_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_movsb)
END (__mempcpy_erms)
#  endif

ENTRY (__memmove_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	movq	%rdx, %rcx
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	2f
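	/* The destination is above the source, so a forward REP MOVSB
	   is only safe if the destination does not start inside the
	   source region [src, src + len).  */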
	leaq	(%rsi,%rcx), %rdx
	cmpq	%rdx, %rdi
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
#  if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
#  endif
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	movq	%rdi, %rax
	addq	%rdx, %rax
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes: copy with two possibly
	   overlapping loads and stores.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
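	/* At or above the non-temporal threshold prefer the 4 * VEC
	   loops, which can use non-temporal stores, over REP MOVSB.  */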
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	movq	%rdx, %rcx
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
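	/* 0 to 3 bytes remain: sizes 2 and 3 branch below, size 0 skips
	   the store and exactly one byte is copied for size 1.  */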
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
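	/* Use REP MOVSB only when the size is above REP_MOVSB_THRESHOLD;
	   smaller copies take the vector paths below.  */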
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Turn the misalignment into minus the number of bytes (1 to
	   VEC_SIZE) needed to align the destination.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination, which is now VEC_SIZE-aligned.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
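	/* The 1 to VEC_SIZE destination bytes skipped by this adjustment
	   are covered by %VEC(4), which is stored to the saved start
	   (%r11) after the loop.  */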
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of the destination, which is now VEC_SIZE-aligned.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
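	/* The destination bytes above the aligned stop (%r9 + VEC_SIZE)
	   skipped by this adjustment are covered by %VEC(8), which is
	   stored to the saved stop (%r11) after the loop.  */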
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
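	/* Prefetch the source two and three iterations ahead of the
	   loads below.  */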
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
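	/* Non-temporal stores are weakly ordered; make them globally
	   visible before the ordinary stores of the saved first and
	   last vectors below.  */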
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#ifdef SHARED
# if IS_IN (libc)
#  ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))
#endif