memmove-vec-unaligned-erms.S source code [glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S]

1	/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
2	Copyright (C) 2016 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<http://www.gnu.org/licenses/>.  */
18
19	/* memmove/memcpy/mempcpy is implemented as:
20	1. Use overlapping load and store to avoid branch.
21	2. Load all sources into registers and store them together to avoid
22	possible address overlap between source and destination.
23	3. If size is 8 * VEC_SIZE or less, load all sources into registers
24	and store them together.
25	4. If address of destination > address of source, backward copy
26	4 * VEC_SIZE at a time with unaligned load and aligned store.
27	Load the first 4 * VEC and last VEC before the loop and store
28	them after the loop to support overlapping addresses.
29	5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
30	load and aligned store. Load the last 4 * VEC and first VEC
31	before the loop and store them after the loop to support
32	overlapping addresses.
33	6. If size >= __x86_shared_non_temporal_threshold and there is no
34	overlap between destination and source, use non-temporal store
35	instead of aligned store.  */
36
37	#include <sysdep.h>
38
39	#if !defined MEMCPY_SYMBOL	/* Default: memcpy names expand through MEMMOVE_SYMBOL.  */
40	# define MEMCPY_SYMBOL(prefix, suffix) MEMMOVE_SYMBOL(prefix, suffix)
41	#endif
42
43	#if !defined MEMPCPY_SYMBOL	/* Default: mempcpy names expand through MEMMOVE_SYMBOL.  */
44	# define MEMPCPY_SYMBOL(prefix, suffix) MEMMOVE_SYMBOL(prefix, suffix)
45	#endif
46
47	#if !defined MEMMOVE_CHK_SYMBOL	/* Default: _chk names expand through MEMMOVE_SYMBOL.  */
48	# define MEMMOVE_CHK_SYMBOL(prefix, suffix) MEMMOVE_SYMBOL(prefix, suffix)
49	#endif
50
51	#if !defined VZEROUPPER	/* vzeroupper is emitted only for vectors wider than 16 bytes.  */
52	# if VEC_SIZE <= 16
53	# define VZEROUPPER
54	# else
55	# define VZEROUPPER vzeroupper
56	# endif
57	#endif
58
59	/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
60	up REP MOVSB operation, REP MOVSB isn't faster on short data. The
61	memcpy micro benchmark in glibc shows that 2KB is the approximate
62	value above which REP MOVSB becomes faster than SSE2 optimization
63	on processors with Enhanced REP MOVSB. Since larger register size
64	can move more data with a single load and store, the threshold is
65	higher with larger register size.  */
66	#ifndef REP_MOVSB_THRESHOLD
67	# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))	/* 2048 bytes when VEC_SIZE is 16, scaled linearly with VEC_SIZE.  */
68	#endif
69
70	#ifndef PREFETCH
71	# define PREFETCH(addr) prefetcht0 addr	/* Default prefetch instruction; may be predefined by the includer.  */
72	#endif
73
74	/* Assume 64-byte prefetch size.  */
75	#ifndef PREFETCH_SIZE
76	# define PREFETCH_SIZE 64
77	#endif
78
79	#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)	/* Bytes moved per iteration of the non-temporal copy loops.  */
80
81	#if PREFETCH_SIZE == 64
	/* PREFETCH_ONE_SET (dir, base, offset) prefetches one block of
	   PREFETCHED_LOAD_SIZE bytes: one PREFETCH per PREFETCH_SIZE-byte
	   line, starting at offset(base) and stepping by dir * PREFETCH_SIZE.
	   dir is 1 in the forward loop and -1 in the backward loop.  Each
	   line of the block is prefetched exactly once; the 4-line variant
	   previously issued the "+ dir * PREFETCH_SIZE" prefetch twice.  */
82	# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
83	# define PREFETCH_ONE_SET(dir, base, offset) \
84	PREFETCH ((offset)base)
85	# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
86	# define PREFETCH_ONE_SET(dir, base, offset) \
87	PREFETCH ((offset)base); \
88	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
89	# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
90	# define PREFETCH_ONE_SET(dir, base, offset) \
91	PREFETCH ((offset)base); \
92	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
94	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
95	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
96	# else
97	# error Unsupported PREFETCHED_LOAD_SIZE!
98	# endif
99	#else
100	# error Unsupported PREFETCH_SIZE!
101	#endif
102
103	#ifndef SECTION	/* The including file must define SECTION(name).  */
104	# error SECTION is not defined!
105	#endif
106
107	.section SECTION(.text),"ax",@progbits	/* Allocatable, executable section whose name is produced by SECTION.  */
108	#if defined SHARED && IS_IN (libc)
109	ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	/* Checked mempcpy entry.  Jumps to __chk_fail when %rcx is below the
	   byte count in %rdx (unsigned compare); otherwise execution falls
	   through into the entry point that follows.
	   NOTE(review): %rcx is presumably the destination object size passed
	   as the fourth argument -- confirm against the __mempcpy_chk prototype.  */
110	cmpq %rdx, %rcx
111	jb HIDDEN_JUMPTARGET (__chk_fail)
112	END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
113	#endif
114
115	#if VEC_SIZE == 16 \|\| defined SHARED
116	ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	/* mempcpy entry: set the return value to %rdi + %rdx (destination
	   plus byte count), then join the memmove body at L(start), which
	   leaves %rax untouched.  */
117	movq %rdi, %rax
118	addq %rdx, %rax
119	jmp L(start)
120	END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
121	#endif
122
123	#if defined SHARED && IS_IN (libc)
124	ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	/* Checked memmove entry.  Jumps to __chk_fail when %rcx is below the
	   byte count in %rdx (unsigned compare); otherwise execution falls
	   through into the memmove entry point that follows.
	   NOTE(review): %rcx is presumably the destination object size passed
	   as the fourth argument -- confirm against the __memmove_chk prototype.  */
125	cmpq %rdx, %rcx
126	jb HIDDEN_JUMPTARGET (__chk_fail)
127	END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
128	#endif
129
130	ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	/* memmove body without REP MOVSB.
	   In:  %rdi = destination, %rsi = source, %rdx = byte count.
	   Out: %rax = destination.  The mempcpy entry jumps to L(start) with
	   %rax already set to destination + count.  */
131	movq %rdi, %rax
132	L(start):
133	cmpq $VEC_SIZE, %rdx
134	jb L(less_vec)	/* count < VEC_SIZE  */
135	cmpq $(VEC_SIZE * `2`), %rdx
136	ja L(more_2x_vec)	/* count > 2 * VEC_SIZE  */
	/* In multiarch libc builds L(last_2x_vec) and L(nop) are defined in
	   the unaligned_erms body instead, hence the conditionals below.  */
137	#if !defined USE_MULTIARCH \|\| !IS_IN (libc)
138	L(last_2x_vec):
139	#endif
140	/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.  */
	/* The first and the last VEC_SIZE bytes are both loaded before either
	   is stored, so the two accesses may overlap each other and the
	   source may overlap the destination.  */
141	VMOVU (%rsi), %VEC(`0`)
142	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`1`)
143	VMOVU %VEC(`0`), (%rdi)
144	VMOVU %VEC(`1`), -VEC_SIZE(%rdi,%rdx)
145	VZEROUPPER
146	#if !defined USE_MULTIARCH \|\| !IS_IN (libc)
147	L(nop):
148	#endif
149	ret
150	#if defined USE_MULTIARCH && IS_IN (libc)
151	END (MEMMOVE_SYMBOL (__memmove, unaligned))
152
153	# if VEC_SIZE == 16	/* The REP MOVSB-only variants are assembled only when VEC_SIZE is 16.  */
154	# if defined SHARED
155	/* Only used to measure performance of REP MOVSB.  */
156	ENTRY (__mempcpy_erms)
157	movq %rdi, %rax
158	addq %rdx, %rax	/* Return value = destination + count.  */
159	jmp L(start_movsb)
160	END (__mempcpy_erms)
161	# endif
162
163	ENTRY (__memmove_erms)
	/* memmove implemented with REP MOVSB only.
	   In:  %rdi = destination, %rsi = source, %rdx = byte count.
	   Out: %rax = destination.  */
164	movq %rdi, %rax
165	L(start_movsb):
166	movq %rdx, %rcx	/* REP MOVSB takes its count in %rcx.  */
167	cmpq %rsi, %rdi
168	jb `1f`	/* Destination below source: forward copy.  */
169	/* Source == destination is less common.  */
170	je `2f`
171	leaq (%rsi,%rcx), %rdx	/* %rdx = source + count.  */
172	cmpq %rdx, %rdi
173	jb L(movsb_backward)	/* Destination inside the source range: copy backward.  */
174	`1`:
175	rep movsb
176	`2`:
177	ret
178	L(movsb_backward):
	/* Point both registers at the last byte, copy with the direction
	   flag set, then clear the flag again before returning.  */
179	leaq -`1`(%rdi,%rcx), %rdi
180	leaq -`1`(%rsi,%rcx), %rsi
181	std
182	rep movsb
183	cld
184	ret
185	END (__memmove_erms)
186	# if defined SHARED
187	strong_alias (__memmove_erms, __memcpy_erms)
188	# endif
189	# endif
190
191	# ifdef SHARED
192	ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	/* Checked mempcpy entry: jump to __chk_fail when %rcx is below the
	   byte count in %rdx (unsigned), otherwise fall through into the
	   mempcpy entry below.
	   NOTE(review): %rcx is presumably the destination object size -- confirm.  */
193	cmpq %rdx, %rcx
194	jb HIDDEN_JUMPTARGET (__chk_fail)
195	END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
196
197	ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
198	movq %rdi, %rax
199	addq %rdx, %rax	/* Return value = destination + count.  */
200	jmp L(start_erms)
201	END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
202
203	ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	/* Checked memmove entry: same unsigned %rcx-versus-%rdx test, then
	   fall through into the memmove entry that follows.  */
204	cmpq %rdx, %rcx
205	jb HIDDEN_JUMPTARGET (__chk_fail)
206	END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
207	# endif
208
209	ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	/* memmove body that may switch to REP MOVSB for large sizes.
	   In:  %rdi = destination, %rsi = source, %rdx = byte count.
	   Out: %rax = destination.  The mempcpy entry jumps to L(start_erms)
	   with %rax already set to destination + count.  */
210	movq %rdi, %rax
211	L(start_erms):
212	cmpq $VEC_SIZE, %rdx
213	jb L(less_vec)	/* count < VEC_SIZE  */
214	cmpq $(VEC_SIZE * `2`), %rdx
215	ja L(movsb_more_2x_vec)	/* count > 2 * VEC_SIZE  */
216	L(last_2x_vec):
217	/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.  */
218	VMOVU (%rsi), %VEC(`0`)
219	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`1`)
220	VMOVU %VEC(`0`), (%rdi)
221	VMOVU %VEC(`1`), -VEC_SIZE(%rdi,%rdx)
222	L(return):
223	VZEROUPPER
224	ret
225
226	L(movsb):
	/* Reached from L(movsb_more_2x_vec) when count > REP_MOVSB_THRESHOLD.
	   Counts at or above the non-temporal threshold go to the vector
	   loops instead of REP MOVSB.  */
227	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
228	jae L(more_8x_vec)
229	cmpq %rsi, %rdi
230	jb `1f`	/* Destination below source: forward REP MOVSB.  */
231	/* Source == destination is less common.  */
232	je L(nop)
233	leaq (%rsi,%rdx), %r9	/* %r9 = source + count.  */
234	cmpq %r9, %rdi
235	/* Avoid slow backward REP MOVSB.  */
	/* The guard below requires REP_MOVSB_THRESHOLD > 8 * VEC_SIZE,
	   presumably so that every count reaching this point is large enough
	   for L(more_8x_vec_backward).  The flags from the cmpq above are
	   consumed by the jb after the guard.  */
236	# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
237	# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
238	# endif
239	jb L(more_8x_vec_backward)
240	`1`:
241	movq %rdx, %rcx	/* REP MOVSB takes its count in %rcx.  */
242	rep movsb
243	L(nop):
244	ret
245	#endif
246
247	L(less_vec):
248	/* Less than 1 VEC.  */
	/* Reached with %rdx < VEC_SIZE (at most 63), so the whole count fits
	   in %dl and byte-sized compares suffice.  Each size class of two or
	   more bytes copies with two possibly overlapping accesses and issues
	   both loads before either store.  */
249	#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
250	# error Unsupported VEC_SIZE!
251	#endif
252	#if VEC_SIZE > 32
253	cmpb $`32`, %dl
254	jae L(between_32_63)
255	#endif
256	#if VEC_SIZE > 16
257	cmpb $`16`, %dl
258	jae L(between_16_31)
259	#endif
260	cmpb $`8`, %dl
261	jae L(between_8_15)
262	cmpb $`4`, %dl
263	jae L(between_4_7)
264	cmpb $`1`, %dl
265	ja L(between_2_3)	/* count is 2 or 3  */
266	jb `1f`	/* count is 0: nothing to copy  */
267	movzbl (%rsi), %ecx	/* count is 1: copy the single byte  */
268	movb %cl, (%rdi)
269	`1`:
270	ret
271	#if VEC_SIZE > 32
272	L(between_32_63):
273	/* From 32 to 63. No branch when size == 32.  */
274	vmovdqu (%rsi), %ymm0
275	vmovdqu -`32`(%rsi,%rdx), %ymm1
276	vmovdqu %ymm0, (%rdi)
277	vmovdqu %ymm1, -`32`(%rdi,%rdx)
278	VZEROUPPER
279	ret
280	#endif
281	#if VEC_SIZE > 16
282	/* From 16 to 31. No branch when size == 16.  */
283	L(between_16_31):
284	vmovdqu (%rsi), %xmm0
285	vmovdqu -`16`(%rsi,%rdx), %xmm1
286	vmovdqu %xmm0, (%rdi)
287	vmovdqu %xmm1, -`16`(%rdi,%rdx)
288	ret
289	#endif
290	L(between_8_15):
291	/* From 8 to 15. No branch when size == 8.  */
292	movq -`8`(%rsi,%rdx), %rcx	/* Last 8 bytes.  */
293	movq (%rsi), %rsi	/* First 8 bytes; %rsi is reused as a data register from here on.  */
294	movq %rcx, -`8`(%rdi,%rdx)
295	movq %rsi, (%rdi)
296	ret
297	L(between_4_7):
298	/* From 4 to 7. No branch when size == 4.  */
299	movl -`4`(%rsi,%rdx), %ecx	/* Last 4 bytes.  */
300	movl (%rsi), %esi	/* First 4 bytes; the source pointer is no longer needed.  */
301	movl %ecx, -`4`(%rdi,%rdx)
302	movl %esi, (%rdi)
303	ret
304	L(between_2_3):
305	/* From 2 to 3. No branch when size == 2.  */
306	movzwl -`2`(%rsi,%rdx), %ecx	/* Last 2 bytes.  */
307	movzwl (%rsi), %esi	/* First 2 bytes; the source pointer is no longer needed.  */
308	movw %cx, -`2`(%rdi,%rdx)
309	movw %si, (%rdi)
310	ret
311
312	#if defined USE_MULTIARCH && IS_IN (libc)
313	L(movsb_more_2x_vec):
314	cmpq $REP_MOVSB_THRESHOLD, %rdx
315	ja L(movsb)	/* count > REP_MOVSB_THRESHOLD: consider REP MOVSB.  */
316	#endif
317	L(more_2x_vec):
318	/* More than 2 * VEC and there may be overlap between destination
319	and source.  */
320	cmpq $(VEC_SIZE * `8`), %rdx
321	ja L(more_8x_vec)
322	cmpq $(VEC_SIZE * `4`), %rdx
323	jb L(last_4x_vec)
324	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	/* The first four and the last four vectors are all loaded before the
	   first store, so overlap between source and destination is safe.  */
325	VMOVU (%rsi), %VEC(`0`)
326	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
327	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
328	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
329	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`4`)
330	VMOVU -(VEC_SIZE * `2`)(%rsi,%rdx), %VEC(`5`)
331	VMOVU -(VEC_SIZE * `3`)(%rsi,%rdx), %VEC(`6`)
332	VMOVU -(VEC_SIZE * `4`)(%rsi,%rdx), %VEC(`7`)
333	VMOVU %VEC(`0`), (%rdi)
334	VMOVU %VEC(`1`), VEC_SIZE(%rdi)
335	VMOVU %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
336	VMOVU %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
337	VMOVU %VEC(`4`), -VEC_SIZE(%rdi,%rdx)
338	VMOVU %VEC(`5`), -(VEC_SIZE * `2`)(%rdi,%rdx)
339	VMOVU %VEC(`6`), -(VEC_SIZE * `3`)(%rdi,%rdx)
340	VMOVU %VEC(`7`), -(VEC_SIZE * `4`)(%rdi,%rdx)
341	VZEROUPPER
342	ret
343	L(last_4x_vec):
344	/* Copy from 2 * VEC to 4 * VEC.  */
	/* Two vectors from the start and two from the end, all loaded before
	   the first store.  */
345	VMOVU (%rsi), %VEC(`0`)
346	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
347	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`2`)
348	VMOVU -(VEC_SIZE * `2`)(%rsi,%rdx), %VEC(`3`)
349	VMOVU %VEC(`0`), (%rdi)
350	VMOVU %VEC(`1`), VEC_SIZE(%rdi)
351	VMOVU %VEC(`2`), -VEC_SIZE(%rdi,%rdx)
352	VMOVU %VEC(`3`), -(VEC_SIZE * `2`)(%rdi,%rdx)
353	VZEROUPPER
354	ret
355
356	L(more_8x_vec):
	/* Large copy.  Destination above source goes to the backward loop;
	   identical pointers return immediately; otherwise copy forward.  */
357	cmpq %rsi, %rdi
358	ja L(more_8x_vec_backward)
359	/* Source == destination is less common.  */
360	je L(nop)
361	/* Load the first VEC and last 4 * VEC to support overlapping
362	addresses.  */
363	VMOVU (%rsi), %VEC(`4`)
364	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(`5`)
365	VMOVU -(VEC_SIZE * `2`)(%rsi, %rdx), %VEC(`6`)
366	VMOVU -(VEC_SIZE * `3`)(%rsi, %rdx), %VEC(`7`)
367	VMOVU -(VEC_SIZE * `4`)(%rsi, %rdx), %VEC(`8`)
368	/* Save start and stop of the destination buffer.  */
369	movq %rdi, %r11	/* %r11 = original destination.  */
370	leaq -VEC_SIZE(%rdi, %rdx), %rcx	/* %rcx = address of the last VEC of the destination.  */
371	/* Align destination for aligned stores in the loop. Compute
372	how much destination is misaligned.  */
373	movq %rdi, %r8
374	andq $(VEC_SIZE - `1`), %r8
375	/* Get the negative of offset for alignment.  */
376	subq $VEC_SIZE, %r8	/* %r8 is now in [-VEC_SIZE, -1].  */
377	/* Adjust source.  */
378	subq %r8, %rsi
379	/* Adjust destination which should be aligned now.  */
380	subq %r8, %rdi
381	/* Adjust length.  */
382	addq %r8, %rdx
383	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
384	/* Check non-temporal store threshold.  */
385	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
386	ja L(large_forward)
387	#endif
388	L(loop_4x_vec_forward):
389	/* Copy 4 * VEC a time forward.  */
390	VMOVU (%rsi), %VEC(`0`)
391	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
392	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
393	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
394	addq $(VEC_SIZE * `4`), %rsi
395	subq $(VEC_SIZE * `4`), %rdx
396	VMOVA %VEC(`0`), (%rdi)
397	VMOVA %VEC(`1`), VEC_SIZE(%rdi)
398	VMOVA %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
399	VMOVA %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
400	addq $(VEC_SIZE * `4`), %rdi
401	cmpq $(VEC_SIZE * `4`), %rdx
402	ja L(loop_4x_vec_forward)	/* Loop while more than 4 * VEC_SIZE bytes remain.  */
403	/* Store the last 4 * VEC.  */
	/* The remaining bytes (at most 4 * VEC_SIZE) are covered by the
	   vectors saved before the loop.  */
404	VMOVU %VEC(`5`), (%rcx)
405	VMOVU %VEC(`6`), -VEC_SIZE(%rcx)
406	VMOVU %VEC(`7`), -(VEC_SIZE * `2`)(%rcx)
407	VMOVU %VEC(`8`), -(VEC_SIZE * `3`)(%rcx)
408	/* Store the first VEC.  */
409	VMOVU %VEC(`4`), (%r11)
410	VZEROUPPER
411	ret
412
413	L(more_8x_vec_backward):
414	/* Load the first 4 * VEC and last VEC to support overlapping
415	addresses.  */
416	VMOVU (%rsi), %VEC(`4`)
417	VMOVU VEC_SIZE(%rsi), %VEC(`5`)
418	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`6`)
419	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`7`)
420	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`8`)
421	/* Save stop of the destination buffer.  */
422	leaq -VEC_SIZE(%rdi, %rdx), %r11	/* %r11 = address of the last VEC of the destination.  */
423	/* Align destination end for aligned stores in the loop. Compute
424	how much destination end is misaligned.  */
425	leaq -VEC_SIZE(%rsi, %rdx), %rcx	/* %rcx = address of the last VEC of the source.  */
426	movq %r11, %r9
427	movq %r11, %r8
428	andq $(VEC_SIZE - `1`), %r8	/* %r8 = misalignment of %r11, in [0, VEC_SIZE - 1].  */
429	/* Adjust source.  */
430	subq %r8, %rcx
431	/* Adjust the end of destination which should be aligned now.  */
432	subq %r8, %r9
433	/* Adjust length.  */
434	subq %r8, %rdx
435	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
436	/* Check non-temporal store threshold.  */
437	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
438	ja L(large_backward)
439	#endif
440	L(loop_4x_vec_backward):
441	/* Copy 4 * VEC a time backward.  */
	/* %rcx (source) and %r9 (aligned destination) both walk downward by
	   4 * VEC_SIZE per iteration.  */
442	VMOVU (%rcx), %VEC(`0`)
443	VMOVU -VEC_SIZE(%rcx), %VEC(`1`)
444	VMOVU -(VEC_SIZE * `2`)(%rcx), %VEC(`2`)
445	VMOVU -(VEC_SIZE * `3`)(%rcx), %VEC(`3`)
446	subq $(VEC_SIZE * `4`), %rcx
447	subq $(VEC_SIZE * `4`), %rdx
448	VMOVA %VEC(`0`), (%r9)
449	VMOVA %VEC(`1`), -VEC_SIZE(%r9)
450	VMOVA %VEC(`2`), -(VEC_SIZE * `2`)(%r9)
451	VMOVA %VEC(`3`), -(VEC_SIZE * `3`)(%r9)
452	subq $(VEC_SIZE * `4`), %r9
453	cmpq $(VEC_SIZE * `4`), %rdx
454	ja L(loop_4x_vec_backward)	/* Loop while more than 4 * VEC_SIZE bytes remain.  */
455	/* Store the first 4 * VEC.  */
	/* %rdi still holds the original destination; the vectors saved
	   before the loop cover the remaining head and the unaligned tail.  */
456	VMOVU %VEC(`4`), (%rdi)
457	VMOVU %VEC(`5`), VEC_SIZE(%rdi)
458	VMOVU %VEC(`6`), (VEC_SIZE * `2`)(%rdi)
459	VMOVU %VEC(`7`), (VEC_SIZE * `3`)(%rdi)
460	/* Store the last VEC.  */
461	VMOVU %VEC(`8`), (%r11)
462	VZEROUPPER
463	ret
464
465	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
466	L(large_forward):
467	/* Don't use non-temporal store if there is overlap between
468	destination and source since destination may be in cache
469	when source is loaded.  */
470	leaq (%rdi, %rdx), %r10	/* %r10 = end of the remaining destination range.  */
471	cmpq %r10, %rsi
472	jb L(loop_4x_vec_forward)	/* Source starts inside that range: use the ordinary loop.  */
473	L(loop_large_forward):
474	/* Copy 4 * VEC a time forward with non-temporal stores.  */
	/* Prefetch the source blocks two and three iterations ahead.  */
475	PREFETCH_ONE_SET (`1`, (%rsi), PREFETCHED_LOAD_SIZE * `2`)
476	PREFETCH_ONE_SET (`1`, (%rsi), PREFETCHED_LOAD_SIZE * `3`)
477	VMOVU (%rsi), %VEC(`0`)
478	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
479	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
480	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
481	addq $PREFETCHED_LOAD_SIZE, %rsi
482	subq $PREFETCHED_LOAD_SIZE, %rdx
483	VMOVNT %VEC(`0`), (%rdi)
484	VMOVNT %VEC(`1`), VEC_SIZE(%rdi)
485	VMOVNT %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
486	VMOVNT %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
487	addq $PREFETCHED_LOAD_SIZE, %rdi
488	cmpq $PREFETCHED_LOAD_SIZE, %rdx
489	ja L(loop_large_forward)
490	sfence	/* Fence after the non-temporal stores, before the ordinary tail stores.  */
491	/* Store the last 4 * VEC.  */
	/* %rcx, %r11 and VEC 4-8 were set up in L(more_8x_vec) before the
	   jump to this label.  */
492	VMOVU %VEC(`5`), (%rcx)
493	VMOVU %VEC(`6`), -VEC_SIZE(%rcx)
494	VMOVU %VEC(`7`), -(VEC_SIZE * `2`)(%rcx)
495	VMOVU %VEC(`8`), -(VEC_SIZE * `3`)(%rcx)
496	/* Store the first VEC.  */
497	VMOVU %VEC(`4`), (%r11)
498	VZEROUPPER
499	ret
500
501	L(large_backward):
502	/* Don't use non-temporal store if there is overlap between
503	destination and source since destination may be in cache
504	when source is loaded.  */
505	leaq (%rcx, %rdx), %r10	/* %r10 = source cursor + remaining length.  */
506	cmpq %r10, %r9
507	jb L(loop_4x_vec_backward)	/* Destination cursor below %r10: use the ordinary loop.  */
508	L(loop_large_backward):
509	/* Copy 4 * VEC a time backward with non-temporal stores.  */
	/* Prefetch the source blocks two and three iterations below %rcx.  */
510	PREFETCH_ONE_SET (-`1`, (%rcx), -PREFETCHED_LOAD_SIZE * `2`)
511	PREFETCH_ONE_SET (-`1`, (%rcx), -PREFETCHED_LOAD_SIZE * `3`)
512	VMOVU (%rcx), %VEC(`0`)
513	VMOVU -VEC_SIZE(%rcx), %VEC(`1`)
514	VMOVU -(VEC_SIZE * `2`)(%rcx), %VEC(`2`)
515	VMOVU -(VEC_SIZE * `3`)(%rcx), %VEC(`3`)
516	subq $PREFETCHED_LOAD_SIZE, %rcx
517	subq $PREFETCHED_LOAD_SIZE, %rdx
518	VMOVNT %VEC(`0`), (%r9)
519	VMOVNT %VEC(`1`), -VEC_SIZE(%r9)
520	VMOVNT %VEC(`2`), -(VEC_SIZE * `2`)(%r9)
521	VMOVNT %VEC(`3`), -(VEC_SIZE * `3`)(%r9)
522	subq $PREFETCHED_LOAD_SIZE, %r9
523	cmpq $PREFETCHED_LOAD_SIZE, %rdx
524	ja L(loop_large_backward)
525	sfence	/* Fence after the non-temporal stores, before the ordinary head and tail stores.  */
526	/* Store the first 4 * VEC.  */
	/* %rdi, %r11 and VEC 4-8 were set up in L(more_8x_vec_backward)
	   before the jump to this label.  */
527	VMOVU %VEC(`4`), (%rdi)
528	VMOVU %VEC(`5`), VEC_SIZE(%rdi)
529	VMOVU %VEC(`6`), (VEC_SIZE * `2`)(%rdi)
530	VMOVU %VEC(`7`), (VEC_SIZE * `3`)(%rdi)
531	/* Store the last VEC.  */
532	VMOVU %VEC(`8`), (%r11)
533	VZEROUPPER
534	ret
535	#endif
536	END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
537
538	#ifdef SHARED
	/* The memcpy and memcpy_chk names are strong aliases of the
	   corresponding memmove entry points defined in this file.  */
539	# if IS_IN (libc)
540	# ifdef USE_MULTIARCH
541	strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
542	MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
543	strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
544	MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
545	# endif
546	strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
547	MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
548	# endif
549	#endif
550	#if VEC_SIZE == 16 \|\| defined SHARED
551	strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
552	MEMCPY_SYMBOL (__memcpy, unaligned))
553	#endif
554

Browse the source code of glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S