/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid a branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */

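/* A minimal, non-normative C-like sketch of the dispatch described
   above.  The helper names below are illustrative only and do not
   exist in glibc; only __x86_shared_non_temporal_threshold is a real
   tunable:

       if (size <= 2 * VEC_SIZE)				-- steps 1-2
	 copy_two_overlapping_vecs (dst, src, size);
       else if (size <= 8 * VEC_SIZE)				-- step 3
	 load_all_vecs_then_store_all (dst, src, size);
       else if (size >= __x86_shared_non_temporal_threshold
		&& !ranges_overlap (dst, src, size))		-- step 6
	 copy_4x_vec_loop_nontemporal (dst, src, size);
       else if (dst > src)					-- step 4
	 copy_4x_vec_loop_backward (dst, src, size);
       else							-- step 5
	 copy_4x_vec_loop_forward (dst, src, size);  */
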
#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

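/* Number of bytes copied per iteration of the 4 * VEC copy loops below.
   The non-temporal loops prefetch two and three iterations ahead of the
   current source position.  */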
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
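
/* The PREFETCH_ONE_SET variants above issue one prefetch per 64-byte
   line so that a whole 4 * VEC block starting at base + offset is
   covered.  dir is 1 in the forward copy loops and -1 in the backward
   copy loops.  */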

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
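	/* The destination starts inside the source, so copy from the
	   last byte down with the direction flag set; this way no
	   source byte is overwritten before it has been read.  */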
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

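	/* Reached from L(movsb_more_2x_vec) when size is above
	   __x86_rep_movsb_threshold.  Fall back to the 8x VEC loops when
	   size is at least __x86_shared_non_temporal_threshold, or when
	   the destination starts inside the source, since backward
	   REP MOVSB is slow.  */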
L(movsb):
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
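	/* %r8 is in [-VEC_SIZE, -1], so the adjustments above advance
	   both pointers by 1 to VEC_SIZE bytes and shrink the length by
	   the same amount, leaving %rdi VEC_SIZE-aligned.  The bytes
	   skipped here were already loaded into %VEC(4) and are stored
	   via %r11 after the loop.  */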
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
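	/* %r8 is the misalignment of the last destination VEC, so the
	   adjustments above pull the source and destination end pointers
	   back by up to VEC_SIZE - 1 bytes, leaving %r9 VEC_SIZE-aligned.
	   The tail bytes skipped here were already loaded into %VEC(8)
	   and are stored via %r11 after the loop.  */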
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))