1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, copy backward
      4 * VEC_SIZE at a time with unaligned loads and aligned stores.
      Load the first 4 * VEC and the last VEC before the loop and
      store them after the loop to support overlapping addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      loads and aligned stores.  Load the last 4 * VEC and the first
      VEC before the loop and store them after the loop to support
      overlapping addresses.
   6. In the unaligned_erms variants, if size is above
      __x86_rep_movsb_threshold and below
      __x86_shared_non_temporal_threshold, use REP MOVSB when a
      forward copy is safe.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
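
/* For illustration only, the overlapping load/store trick of steps
   1-3 looks roughly like this in C for the 8-to-15-byte bucket
   handled by L(between_8_15) below (copy_8_to_15 is a hypothetical
   helper, not part of this file; 8 <= n <= 15):

     #include <stdint.h>
     #include <string.h>

     static void copy_8_to_15 (char *dst, const char *src, size_t n)
     {
       uint64_t head, tail;
       memcpy (&head, src, 8);          // first 8 bytes
       memcpy (&tail, src + n - 8, 8);  // last 8 bytes (overlaps the head)
       // Both loads are done before either store, so the copy is
       // correct even when dst and src overlap, with no branch on n.
       memcpy (dst, &head, 8);
       memcpy (dst + n - 8, &tail, 8);
     }

   The other size buckets and the 2 * VEC to 8 * VEC paths below do
   the same thing with 1, 2, 4 or 8 vector registers instead of two
   integer registers.  */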
36 | |
37 | #include <sysdep.h> |
38 | |
39 | #ifndef MEMCPY_SYMBOL |
40 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
41 | #endif |
42 | |
43 | #ifndef MEMPCPY_SYMBOL |
44 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
45 | #endif |
46 | |
47 | #ifndef MEMMOVE_CHK_SYMBOL |
48 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
49 | #endif |
50 | |
51 | #ifndef VZEROUPPER |
52 | # if VEC_SIZE > 16 |
53 | # define VZEROUPPER vzeroupper |
54 | # else |
55 | # define VZEROUPPER |
56 | # endif |
57 | #endif |
58 | |
/* Avoid short-distance REP MOVSB only when using non-SSE vectors
   (VEC_SIZE > 16).  */
60 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
61 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
62 | #else |
63 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
64 | #endif |
65 | |
66 | #ifndef PREFETCH |
67 | # define PREFETCH(addr) prefetcht0 addr |
68 | #endif |
69 | |
70 | /* Assume 64-byte prefetch size. */ |
71 | #ifndef PREFETCH_SIZE |
72 | # define PREFETCH_SIZE 64 |
73 | #endif |
74 | |
75 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
76 | |
77 | #if PREFETCH_SIZE == 64 |
78 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
79 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
80 | PREFETCH ((offset)base) |
81 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
82 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
83 | PREFETCH ((offset)base); \ |
84 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
85 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
86 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
87 | PREFETCH ((offset)base); \ |
88 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
89 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
90 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
91 | # else |
92 | # error Unsupported PREFETCHED_LOAD_SIZE! |
93 | # endif |
94 | #else |
95 | # error Unsupported PREFETCH_SIZE! |
96 | #endif |
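
/* For example, with VEC_SIZE == 32 (PREFETCHED_LOAD_SIZE == 128) the
   non-temporal loops below issue, among others,
   PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2), which
   prefetches the cache lines at source offsets 256 and 320, i.e.
   roughly the C equivalent of

     __builtin_prefetch (src + 256, 0, 3);
     __builtin_prefetch (src + 320, 0, 3);

   two 4 * VEC_SIZE iterations ahead of the loads.  The backward loop
   passes dir == -1 so the extra prefetches step downwards instead.  */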
97 | |
98 | #ifndef SECTION |
99 | # error SECTION is not defined! |
100 | #endif |
101 | |
	.section SECTION(.text),"ax",@progbits
103 | #if defined SHARED && IS_IN (libc) |
104 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
105 | cmp %RDX_LP, %RCX_LP |
106 | jb HIDDEN_JUMPTARGET (__chk_fail) |
107 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
108 | #endif |
109 | |
110 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
111 | mov %RDI_LP, %RAX_LP |
112 | add %RDX_LP, %RAX_LP |
113 | jmp L(start) |
114 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
115 | |
116 | #if defined SHARED && IS_IN (libc) |
117 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
118 | cmp %RDX_LP, %RCX_LP |
119 | jb HIDDEN_JUMPTARGET (__chk_fail) |
120 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
121 | #endif |
122 | |
123 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
124 | movq %rdi, %rax |
125 | L(start): |
126 | # ifdef __ILP32__ |
127 | /* Clear the upper 32 bits. */ |
128 | movl %edx, %edx |
129 | # endif |
130 | cmp $VEC_SIZE, %RDX_LP |
131 | jb L(less_vec) |
132 | cmp $(VEC_SIZE * 2), %RDX_LP |
133 | ja L(more_2x_vec) |
134 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
135 | L(last_2x_vec): |
136 | #endif |
	/* Copy from VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
138 | VMOVU (%rsi), %VEC(0) |
139 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
140 | VMOVU %VEC(0), (%rdi) |
141 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
142 | VZEROUPPER |
143 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
144 | L(nop): |
145 | #endif |
146 | ret |
147 | #if defined USE_MULTIARCH && IS_IN (libc) |
148 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
149 | |
150 | # if VEC_SIZE == 16 |
151 | ENTRY (__mempcpy_chk_erms) |
152 | cmp %RDX_LP, %RCX_LP |
153 | jb HIDDEN_JUMPTARGET (__chk_fail) |
154 | END (__mempcpy_chk_erms) |
155 | |
156 | /* Only used to measure performance of REP MOVSB. */ |
157 | ENTRY (__mempcpy_erms) |
158 | mov %RDI_LP, %RAX_LP |
159 | /* Skip zero length. */ |
160 | test %RDX_LP, %RDX_LP |
161 | jz 2f |
162 | add %RDX_LP, %RAX_LP |
163 | jmp L(start_movsb) |
164 | END (__mempcpy_erms) |
165 | |
166 | ENTRY (__memmove_chk_erms) |
167 | cmp %RDX_LP, %RCX_LP |
168 | jb HIDDEN_JUMPTARGET (__chk_fail) |
169 | END (__memmove_chk_erms) |
170 | |
171 | ENTRY (__memmove_erms) |
172 | movq %rdi, %rax |
173 | /* Skip zero length. */ |
174 | test %RDX_LP, %RDX_LP |
175 | jz 2f |
176 | L(start_movsb): |
177 | mov %RDX_LP, %RCX_LP |
178 | cmp %RSI_LP, %RDI_LP |
179 | jb 1f |
180 | /* Source == destination is less common. */ |
181 | je 2f |
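	/* Here RDI > RSI.  Copy backward only if the destination
	   overlaps the source (RDI < RSI + RCX); otherwise a forward
	   REP MOVSB is safe.  */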
182 | lea (%rsi,%rcx), %RDX_LP |
183 | cmp %RDX_LP, %RDI_LP |
184 | jb L(movsb_backward) |
185 | 1: |
186 | rep movsb |
187 | 2: |
188 | ret |
189 | L(movsb_backward): |
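	/* Point RSI/RDI at the last byte of each buffer and copy
	   backward with the direction flag set.  Clear DF again
	   afterwards; it must be clear on function return.  */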
190 | leaq -1(%rdi,%rcx), %rdi |
191 | leaq -1(%rsi,%rcx), %rsi |
192 | std |
193 | rep movsb |
194 | cld |
195 | ret |
196 | END (__memmove_erms) |
197 | strong_alias (__memmove_erms, __memcpy_erms) |
198 | strong_alias (__memmove_chk_erms, __memcpy_chk_erms) |
199 | # endif |
200 | |
201 | # ifdef SHARED |
202 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
203 | cmp %RDX_LP, %RCX_LP |
204 | jb HIDDEN_JUMPTARGET (__chk_fail) |
205 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
206 | # endif |
207 | |
208 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
209 | mov %RDI_LP, %RAX_LP |
210 | add %RDX_LP, %RAX_LP |
211 | jmp L(start_erms) |
212 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
213 | |
214 | # ifdef SHARED |
215 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
216 | cmp %RDX_LP, %RCX_LP |
217 | jb HIDDEN_JUMPTARGET (__chk_fail) |
218 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
219 | # endif |
220 | |
221 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
222 | movq %rdi, %rax |
223 | L(start_erms): |
224 | # ifdef __ILP32__ |
225 | /* Clear the upper 32 bits. */ |
226 | movl %edx, %edx |
227 | # endif |
228 | cmp $VEC_SIZE, %RDX_LP |
229 | jb L(less_vec) |
230 | cmp $(VEC_SIZE * 2), %RDX_LP |
231 | ja L(movsb_more_2x_vec) |
232 | L(last_2x_vec): |
	/* Copy from VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
234 | VMOVU (%rsi), %VEC(0) |
235 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
236 | VMOVU %VEC(0), (%rdi) |
237 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
238 | L(return): |
239 | VZEROUPPER |
240 | ret |
241 | |
242 | L(movsb): |
243 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
244 | jae L(more_8x_vec) |
245 | cmpq %rsi, %rdi |
246 | jb 1f |
247 | /* Source == destination is less common. */ |
248 | je L(nop) |
249 | leaq (%rsi,%rdx), %r9 |
250 | cmpq %r9, %rdi |
251 | /* Avoid slow backward REP MOVSB. */ |
252 | jb L(more_8x_vec_backward) |
253 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
254 | movq %rdi, %rcx |
255 | subq %rsi, %rcx |
256 | jmp 2f |
257 | # endif |
258 | 1: |
259 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
260 | movq %rsi, %rcx |
261 | subq %rdi, %rcx |
262 | 2: |
263 | /* Avoid "rep movsb" if RCX, the distance between source and destination, |
264 | is N*4GB + [1..63] with N >= 0. */ |
265 | cmpl $63, %ecx |
266 | jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ |
267 | # endif |
268 | mov %RDX_LP, %RCX_LP |
269 | rep movsb |
270 | L(nop): |
271 | ret |
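
/* For illustration only, the conditions under which the L(movsb) code
   above actually issues REP MOVSB, written as a hypothetical C
   predicate (on entry size is already above __x86_rep_movsb_threshold):

     #include <stdbool.h>
     #include <stddef.h>
     #include <stdint.h>

     static bool
     use_rep_movsb (uintptr_t dst, uintptr_t src, size_t n,
                    size_t non_temporal_threshold)
     {
       if (n >= non_temporal_threshold)
         return false;               // take the 8x VEC path instead
       if (dst == src)
         return false;               // nothing to copy
       if (dst > src && dst < src + n)
         return false;               // backward copy needed
     #if AVOID_SHORT_DISTANCE_REP_MOVSB
       // Only the low 32 bits of the distance are checked, hence the
       // "N*4GB + [1..63]" wording above.
       if ((uint32_t) (dst > src ? dst - src : src - dst) <= 63)
         return false;               // short distance: use vector copy
     #endif
       return true;
     }
*/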
272 | #endif |
273 | |
274 | L(less_vec): |
275 | /* Less than 1 VEC. */ |
276 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
277 | # error Unsupported VEC_SIZE! |
278 | #endif |
279 | #if VEC_SIZE > 32 |
280 | cmpb $32, %dl |
281 | jae L(between_32_63) |
282 | #endif |
283 | #if VEC_SIZE > 16 |
284 | cmpb $16, %dl |
285 | jae L(between_16_31) |
286 | #endif |
287 | cmpb $8, %dl |
288 | jae L(between_8_15) |
289 | cmpb $4, %dl |
290 | jae L(between_4_7) |
291 | cmpb $1, %dl |
292 | ja L(between_2_3) |
293 | jb 1f |
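	/* Size == 1 here; size == 0 jumped straight to the return.  */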
294 | movzbl (%rsi), %ecx |
295 | movb %cl, (%rdi) |
296 | 1: |
297 | ret |
298 | #if VEC_SIZE > 32 |
299 | L(between_32_63): |
300 | /* From 32 to 63. No branch when size == 32. */ |
301 | vmovdqu (%rsi), %ymm0 |
302 | vmovdqu -32(%rsi,%rdx), %ymm1 |
303 | vmovdqu %ymm0, (%rdi) |
304 | vmovdqu %ymm1, -32(%rdi,%rdx) |
305 | VZEROUPPER |
306 | ret |
307 | #endif |
308 | #if VEC_SIZE > 16 |
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  */
311 | vmovdqu (%rsi), %xmm0 |
312 | vmovdqu -16(%rsi,%rdx), %xmm1 |
313 | vmovdqu %xmm0, (%rdi) |
314 | vmovdqu %xmm1, -16(%rdi,%rdx) |
315 | ret |
316 | #endif |
317 | L(between_8_15): |
318 | /* From 8 to 15. No branch when size == 8. */ |
319 | movq -8(%rsi,%rdx), %rcx |
320 | movq (%rsi), %rsi |
321 | movq %rcx, -8(%rdi,%rdx) |
322 | movq %rsi, (%rdi) |
323 | ret |
324 | L(between_4_7): |
325 | /* From 4 to 7. No branch when size == 4. */ |
326 | movl -4(%rsi,%rdx), %ecx |
327 | movl (%rsi), %esi |
328 | movl %ecx, -4(%rdi,%rdx) |
329 | movl %esi, (%rdi) |
330 | ret |
331 | L(between_2_3): |
332 | /* From 2 to 3. No branch when size == 2. */ |
333 | movzwl -2(%rsi,%rdx), %ecx |
334 | movzwl (%rsi), %esi |
335 | movw %cx, -2(%rdi,%rdx) |
336 | movw %si, (%rdi) |
337 | ret |
338 | |
339 | #if defined USE_MULTIARCH && IS_IN (libc) |
340 | L(movsb_more_2x_vec): |
341 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
342 | ja L(movsb) |
343 | #endif |
344 | L(more_2x_vec): |
345 | /* More than 2 * VEC and there may be overlap between destination |
346 | and source. */ |
347 | cmpq $(VEC_SIZE * 8), %rdx |
348 | ja L(more_8x_vec) |
349 | cmpq $(VEC_SIZE * 4), %rdx |
350 | jb L(last_4x_vec) |
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
352 | VMOVU (%rsi), %VEC(0) |
353 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
354 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
355 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
356 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) |
357 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) |
358 | VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) |
359 | VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) |
360 | VMOVU %VEC(0), (%rdi) |
361 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
362 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
363 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
364 | VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) |
365 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) |
366 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) |
367 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) |
368 | VZEROUPPER |
369 | ret |
370 | L(last_4x_vec): |
371 | /* Copy from 2 * VEC to 4 * VEC. */ |
372 | VMOVU (%rsi), %VEC(0) |
373 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
374 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) |
375 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) |
376 | VMOVU %VEC(0), (%rdi) |
377 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
378 | VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) |
379 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) |
380 | VZEROUPPER |
381 | ret |
382 | |
383 | L(more_8x_vec): |
384 | cmpq %rsi, %rdi |
385 | ja L(more_8x_vec_backward) |
386 | /* Source == destination is less common. */ |
387 | je L(nop) |
388 | /* Load the first VEC and last 4 * VEC to support overlapping |
389 | addresses. */ |
390 | VMOVU (%rsi), %VEC(4) |
391 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
392 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
393 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
394 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
395 | /* Save start and stop of the destination buffer. */ |
396 | movq %rdi, %r11 |
397 | leaq -VEC_SIZE(%rdi, %rdx), %rcx |
398 | /* Align destination for aligned stores in the loop. Compute |
399 | how much destination is misaligned. */ |
400 | movq %rdi, %r8 |
401 | andq $(VEC_SIZE - 1), %r8 |
	/* Get the negative of the number of bytes needed to reach the
	   next VEC_SIZE boundary.  */
403 | subq $VEC_SIZE, %r8 |
404 | /* Adjust source. */ |
405 | subq %r8, %rsi |
406 | /* Adjust destination which should be aligned now. */ |
407 | subq %r8, %rdi |
408 | /* Adjust length. */ |
409 | addq %r8, %rdx |
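	/* For example (hypothetical values), with VEC_SIZE == 32 and
	   RDI == 0x1005: R8 = (0x1005 & 31) - 32 = -27, so RSI/RDI
	   advance by 27 bytes, RDX shrinks by 27 and RDI == 0x1020 is
	   32-byte aligned for the loop.  The skipped bytes are covered
	   by the first VEC loaded above and stored after the loop.  */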
410 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
411 | /* Check non-temporal store threshold. */ |
412 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
413 | ja L(large_forward) |
414 | #endif |
415 | L(loop_4x_vec_forward): |
	/* Copy 4 * VEC at a time forward.  */
417 | VMOVU (%rsi), %VEC(0) |
418 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
419 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
420 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
421 | addq $(VEC_SIZE * 4), %rsi |
422 | subq $(VEC_SIZE * 4), %rdx |
423 | VMOVA %VEC(0), (%rdi) |
424 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
425 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
426 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
427 | addq $(VEC_SIZE * 4), %rdi |
428 | cmpq $(VEC_SIZE * 4), %rdx |
429 | ja L(loop_4x_vec_forward) |
430 | /* Store the last 4 * VEC. */ |
431 | VMOVU %VEC(5), (%rcx) |
432 | VMOVU %VEC(6), -VEC_SIZE(%rcx) |
433 | VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
434 | VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
435 | /* Store the first VEC. */ |
436 | VMOVU %VEC(4), (%r11) |
437 | VZEROUPPER |
438 | ret |
439 | |
440 | L(more_8x_vec_backward): |
441 | /* Load the first 4 * VEC and last VEC to support overlapping |
442 | addresses. */ |
443 | VMOVU (%rsi), %VEC(4) |
444 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
445 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
446 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
447 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) |
448 | /* Save stop of the destination buffer. */ |
449 | leaq -VEC_SIZE(%rdi, %rdx), %r11 |
450 | /* Align destination end for aligned stores in the loop. Compute |
451 | how much destination end is misaligned. */ |
452 | leaq -VEC_SIZE(%rsi, %rdx), %rcx |
453 | movq %r11, %r9 |
454 | movq %r11, %r8 |
455 | andq $(VEC_SIZE - 1), %r8 |
456 | /* Adjust source. */ |
457 | subq %r8, %rcx |
458 | /* Adjust the end of destination which should be aligned now. */ |
459 | subq %r8, %r9 |
460 | /* Adjust length. */ |
461 | subq %r8, %rdx |
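	/* Mirror image of the forward setup above: R9 is the aligned
	   store position and RCX the matching unaligned load position,
	   both stepping downwards in the loop.  */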
462 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
463 | /* Check non-temporal store threshold. */ |
464 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
465 | ja L(large_backward) |
466 | #endif |
467 | L(loop_4x_vec_backward): |
	/* Copy 4 * VEC at a time backward.  */
469 | VMOVU (%rcx), %VEC(0) |
470 | VMOVU -VEC_SIZE(%rcx), %VEC(1) |
471 | VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
472 | VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
473 | subq $(VEC_SIZE * 4), %rcx |
474 | subq $(VEC_SIZE * 4), %rdx |
475 | VMOVA %VEC(0), (%r9) |
476 | VMOVA %VEC(1), -VEC_SIZE(%r9) |
477 | VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) |
478 | VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) |
479 | subq $(VEC_SIZE * 4), %r9 |
480 | cmpq $(VEC_SIZE * 4), %rdx |
481 | ja L(loop_4x_vec_backward) |
482 | /* Store the first 4 * VEC. */ |
483 | VMOVU %VEC(4), (%rdi) |
484 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
485 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
486 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
487 | /* Store the last VEC. */ |
488 | VMOVU %VEC(8), (%r11) |
489 | VZEROUPPER |
490 | ret |
491 | |
492 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
493 | L(large_forward): |
494 | /* Don't use non-temporal store if there is overlap between |
495 | destination and source since destination may be in cache |
496 | when source is loaded. */ |
497 | leaq (%rdi, %rdx), %r10 |
498 | cmpq %r10, %rsi |
499 | jb L(loop_4x_vec_forward) |
500 | L(loop_large_forward): |
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
502 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
503 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) |
504 | VMOVU (%rsi), %VEC(0) |
505 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
506 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
507 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
508 | addq $PREFETCHED_LOAD_SIZE, %rsi |
509 | subq $PREFETCHED_LOAD_SIZE, %rdx |
510 | VMOVNT %VEC(0), (%rdi) |
511 | VMOVNT %VEC(1), VEC_SIZE(%rdi) |
512 | VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) |
513 | VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) |
514 | addq $PREFETCHED_LOAD_SIZE, %rdi |
515 | cmpq $PREFETCHED_LOAD_SIZE, %rdx |
516 | ja L(loop_large_forward) |
517 | sfence |
518 | /* Store the last 4 * VEC. */ |
519 | VMOVU %VEC(5), (%rcx) |
520 | VMOVU %VEC(6), -VEC_SIZE(%rcx) |
521 | VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
522 | VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
523 | /* Store the first VEC. */ |
524 | VMOVU %VEC(4), (%r11) |
525 | VZEROUPPER |
526 | ret |
527 | |
528 | L(large_backward): |
529 | /* Don't use non-temporal store if there is overlap between |
530 | destination and source since destination may be in cache |
531 | when source is loaded. */ |
532 | leaq (%rcx, %rdx), %r10 |
533 | cmpq %r10, %r9 |
534 | jb L(loop_4x_vec_backward) |
535 | L(loop_large_backward): |
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
537 | PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) |
538 | PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) |
539 | VMOVU (%rcx), %VEC(0) |
540 | VMOVU -VEC_SIZE(%rcx), %VEC(1) |
541 | VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
542 | VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
543 | subq $PREFETCHED_LOAD_SIZE, %rcx |
544 | subq $PREFETCHED_LOAD_SIZE, %rdx |
545 | VMOVNT %VEC(0), (%r9) |
546 | VMOVNT %VEC(1), -VEC_SIZE(%r9) |
547 | VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) |
548 | VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) |
549 | subq $PREFETCHED_LOAD_SIZE, %r9 |
550 | cmpq $PREFETCHED_LOAD_SIZE, %rdx |
551 | ja L(loop_large_backward) |
552 | sfence |
553 | /* Store the first 4 * VEC. */ |
554 | VMOVU %VEC(4), (%rdi) |
555 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
556 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
557 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
558 | /* Store the last VEC. */ |
559 | VMOVU %VEC(8), (%r11) |
560 | VZEROUPPER |
561 | ret |
562 | #endif |
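
/* For illustration only, the non-temporal forward loop above sketched
   with AVX intrinsics, assuming VEC_SIZE == 32 and omitting the
   software prefetches.  stream_copy_forward is a hypothetical helper,
   not part of this file:

     #include <immintrin.h>
     #include <stddef.h>

     static void stream_copy_forward (char *dst, const char *src, size_t n)
     {
       // dst is 32-byte aligned on entry and n > 4 * VEC_SIZE.
       while (n > 128)
         {
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           _mm256_stream_si256 ((__m256i *) (dst + 0), v0);  // bypasses cache
           _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
           _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
           _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
           src += 128;  dst += 128;  n -= 128;
         }
       // Order the streaming stores before the ordinary stores of the
       // saved head/tail vectors, as the sfence above does.
       _mm_sfence ();
     }

   Streaming stores are only used when source and destination do not
   overlap, since the destination may otherwise already be in cache
   when the source is loaded.  */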
563 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
564 | |
565 | #if IS_IN (libc) |
566 | # ifdef USE_MULTIARCH |
567 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
568 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
569 | # ifdef SHARED |
570 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
571 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
572 | # endif |
573 | # endif |
574 | # ifdef SHARED |
575 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
576 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
577 | # endif |
578 | #endif |
579 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
580 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
581 | |