1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2021 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* memmove/memcpy/mempcpy is implemented as: |
20 | 1. Use overlapping load and store to avoid branch. |
21 | 2. Load all sources into registers and store them together to avoid |
22 | possible address overlap between source and destination. |
23 | 3. If size is 8 * VEC_SIZE or less, load all sources into registers |
24 | and store them together. |
25 | 4. If address of destination > address of source, backward copy |
26 | 4 * VEC_SIZE at a time with unaligned load and aligned store. |
27 | Load the first 4 * VEC and last VEC before the loop and store |
28 | them after the loop to support overlapping addresses. |
29 | 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned |
30 | load and aligned store. Load the last 4 * VEC and first VEC |
31 | before the loop and store them after the loop to support |
32 | overlapping addresses. |
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
36 | 7. If size >= __x86_shared_non_temporal_threshold and there is no |
37 | overlap between destination and source, use non-temporal store |
38 | instead of aligned store copying from either 2 or 4 pages at |
39 | once. |
40 | 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold |
41 | and source and destination do not page alias, copy from 2 pages |
42 | at once using non-temporal stores. Page aliasing in this case is |
      considered true if destination's page alignment - source's page
44 | alignment is less than 8 * VEC_SIZE. |
45 | 9. If size >= 16 * __x86_shared_non_temporal_threshold or source |
46 | and destination do page alias copy from 4 pages at once using |
47 | non-temporal stores. */ |
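
/* For orientation, a rough C sketch of the size-based dispatch described
   above.  This is illustrative only: the parameter names stand in for the
   __x86_* tunables (movsb_lo/movsb_hi for __x86_rep_movsb_threshold and
   __x86_rep_movsb_stop_threshold, nt_threshold for
   __x86_shared_non_temporal_threshold), and the real code below also folds
   in the exact overlap tests, alignment handling and the ERMS/non-ERMS
   entry points.

   #include <stddef.h>

   enum copy_path
   {
     COPY_LESS_VEC,     // size < VEC_SIZE: small-size ladder
     COPY_2X_VEC,       // up to 2 * VEC_SIZE: two overlapping VEC moves
     COPY_8X_VEC,       // up to 8 * VEC_SIZE: load everything, then store
     COPY_REP_MOVSB,    // ERMS window: rep movsb
     COPY_4X_LOOP,      // forward/backward 4 * VEC loop, aligned stores
     COPY_NT_2_PAGES,   // non-temporal stores, 2 pages per iteration
     COPY_NT_4_PAGES    // non-temporal stores, 4 pages per iteration
   };

   static enum copy_path
   classify (size_t size, size_t vec_size, size_t movsb_lo, size_t movsb_hi,
             size_t nt_threshold, int overlap, int page_alias)
   {
     if (size < vec_size)
       return COPY_LESS_VEC;
     if (size <= 2 * vec_size)
       return COPY_2X_VEC;
     if (size <= 8 * vec_size)
       return COPY_8X_VEC;
     if (size >= movsb_lo && size < movsb_hi)
       return COPY_REP_MOVSB;
     if (size >= nt_threshold && !overlap)
       return (size >= 16 * nt_threshold || page_alias)
              ? COPY_NT_4_PAGES : COPY_NT_2_PAGES;
     return COPY_4X_LOOP;
   }  */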
48 | |
49 | #include <sysdep.h> |
50 | |
51 | #ifndef MEMCPY_SYMBOL |
52 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
53 | #endif |
54 | |
55 | #ifndef MEMPCPY_SYMBOL |
56 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
57 | #endif |
58 | |
59 | #ifndef MEMMOVE_CHK_SYMBOL |
60 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
61 | #endif |
62 | |
63 | #ifndef XMM0 |
64 | # define XMM0 xmm0 |
65 | #endif |
66 | |
67 | #ifndef YMM0 |
68 | # define YMM0 ymm0 |
69 | #endif |
70 | |
71 | #ifndef VZEROUPPER |
72 | # if VEC_SIZE > 16 |
73 | # define VZEROUPPER vzeroupper |
74 | # else |
75 | # define VZEROUPPER |
76 | # endif |
77 | #endif |
78 | |
79 | #ifndef PAGE_SIZE |
80 | # define PAGE_SIZE 4096 |
81 | #endif |
82 | |
83 | #if PAGE_SIZE != 4096 |
84 | # error Unsupported PAGE_SIZE |
85 | #endif |
86 | |
87 | #ifndef LOG_PAGE_SIZE |
88 | # define LOG_PAGE_SIZE 12 |
89 | #endif |
90 | |
91 | #if PAGE_SIZE != (1 << LOG_PAGE_SIZE) |
92 | # error Invalid LOG_PAGE_SIZE |
93 | #endif |
94 | |
/* Bytes loaded from each page per iteration of the large_memcpy
   inner loop.  */
96 | #if VEC_SIZE == 64 |
97 | # define LARGE_LOAD_SIZE (VEC_SIZE * 2) |
98 | #else |
99 | # define LARGE_LOAD_SIZE (VEC_SIZE * 4) |
100 | #endif |
101 | |
/* Amount to shift rdx by when comparing against the non-temporal
   threshold to select the large_memcpy_4x path.  */
103 | #ifndef LOG_4X_MEMCPY_THRESH |
104 | # define LOG_4X_MEMCPY_THRESH 4 |
105 | #endif |
106 | |
/* Avoid short-distance REP MOVSB only with non-SSE vectors
   (VEC_SIZE > 16).  */
108 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
109 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
110 | #else |
111 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
112 | #endif |
113 | |
114 | #ifndef PREFETCH |
115 | # define PREFETCH(addr) prefetcht0 addr |
116 | #endif |
117 | |
118 | /* Assume 64-byte prefetch size. */ |
119 | #ifndef PREFETCH_SIZE |
120 | # define PREFETCH_SIZE 64 |
121 | #endif |
122 | |
123 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
124 | |
125 | #if PREFETCH_SIZE == 64 |
126 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
127 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
128 | PREFETCH ((offset)base) |
129 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
130 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
131 | PREFETCH ((offset)base); \ |
132 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
133 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
134 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
135 | PREFETCH ((offset)base); \ |
136 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
137 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
138 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
139 | # else |
140 | # error Unsupported PREFETCHED_LOAD_SIZE! |
141 | # endif |
142 | #else |
143 | # error Unsupported PREFETCH_SIZE! |
144 | #endif |
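
/* For example, assuming VEC_SIZE == 32 (so PREFETCHED_LOAD_SIZE == 128
   == 2 * PREFETCH_SIZE), PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
   evaluates to
	prefetcht0 128(%rsi); prefetcht0 192(%rsi)
   i.e. one prefetch per 64-byte line of the next 4 * VEC block.  */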
145 | |
146 | #if LARGE_LOAD_SIZE == (VEC_SIZE * 2) |
147 | # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ |
148 | VMOVU (offset)base, vec0; \ |
149 | VMOVU ((offset) + VEC_SIZE)base, vec1; |
150 | # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ |
151 | VMOVNT vec0, (offset)base; \ |
152 | VMOVNT vec1, ((offset) + VEC_SIZE)base; |
153 | #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) |
154 | # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
155 | VMOVU (offset)base, vec0; \ |
156 | VMOVU ((offset) + VEC_SIZE)base, vec1; \ |
157 | VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ |
158 | VMOVU ((offset) + VEC_SIZE * 3)base, vec3; |
159 | # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
160 | VMOVNT vec0, (offset)base; \ |
161 | VMOVNT vec1, ((offset) + VEC_SIZE)base; \ |
162 | VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ |
163 | VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; |
164 | #else |
165 | # error Invalid LARGE_LOAD_SIZE |
166 | #endif |
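
/* For example, with LARGE_LOAD_SIZE == (VEC_SIZE * 4),
   LOAD_ONE_SET ((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) performs
   four unaligned VMOVU loads from offsets 0, VEC_SIZE, VEC_SIZE * 2 and
   VEC_SIZE * 3 past %rsi, and the matching STORE_ONE_SET emits the
   corresponding non-temporal VMOVNT stores to the destination.  */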
167 | |
168 | #ifndef SECTION |
169 | # error SECTION is not defined! |
170 | #endif |
171 | |
	.section SECTION(.text),"ax",@progbits
173 | #if defined SHARED && IS_IN (libc) |
174 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
175 | cmp %RDX_LP, %RCX_LP |
176 | jb HIDDEN_JUMPTARGET (__chk_fail) |
177 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
178 | #endif |
179 | |
180 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
181 | mov %RDI_LP, %RAX_LP |
182 | add %RDX_LP, %RAX_LP |
183 | jmp L(start) |
184 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
185 | |
186 | #if defined SHARED && IS_IN (libc) |
187 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
188 | cmp %RDX_LP, %RCX_LP |
189 | jb HIDDEN_JUMPTARGET (__chk_fail) |
190 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
191 | #endif |
192 | |
193 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
194 | movq %rdi, %rax |
195 | L(start): |
196 | # ifdef __ILP32__ |
197 | /* Clear the upper 32 bits. */ |
198 | movl %edx, %edx |
199 | # endif |
200 | cmp $VEC_SIZE, %RDX_LP |
201 | jb L(less_vec) |
202 | cmp $(VEC_SIZE * 2), %RDX_LP |
203 | ja L(more_2x_vec) |
204 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
205 | L(last_2x_vec): |
206 | #endif |
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
208 | VMOVU (%rsi), %VEC(0) |
209 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
210 | VMOVU %VEC(0), (%rdi) |
211 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
212 | #if !defined USE_MULTIARCH || !IS_IN (libc) |
213 | L(nop): |
214 | ret |
215 | #else |
216 | VZEROUPPER_RETURN |
217 | #endif |
218 | #if defined USE_MULTIARCH && IS_IN (libc) |
219 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
220 | |
221 | # if VEC_SIZE == 16 |
222 | ENTRY (__mempcpy_chk_erms) |
223 | cmp %RDX_LP, %RCX_LP |
224 | jb HIDDEN_JUMPTARGET (__chk_fail) |
225 | END (__mempcpy_chk_erms) |
226 | |
227 | /* Only used to measure performance of REP MOVSB. */ |
228 | ENTRY (__mempcpy_erms) |
229 | mov %RDI_LP, %RAX_LP |
230 | /* Skip zero length. */ |
231 | test %RDX_LP, %RDX_LP |
232 | jz 2f |
233 | add %RDX_LP, %RAX_LP |
234 | jmp L(start_movsb) |
235 | END (__mempcpy_erms) |
236 | |
237 | ENTRY (__memmove_chk_erms) |
238 | cmp %RDX_LP, %RCX_LP |
239 | jb HIDDEN_JUMPTARGET (__chk_fail) |
240 | END (__memmove_chk_erms) |
241 | |
242 | ENTRY (__memmove_erms) |
243 | movq %rdi, %rax |
244 | /* Skip zero length. */ |
245 | test %RDX_LP, %RDX_LP |
246 | jz 2f |
247 | L(start_movsb): |
248 | mov %RDX_LP, %RCX_LP |
249 | cmp %RSI_LP, %RDI_LP |
250 | jb 1f |
251 | /* Source == destination is less common. */ |
252 | je 2f |
253 | lea (%rsi,%rcx), %RDX_LP |
254 | cmp %RDX_LP, %RDI_LP |
255 | jb L(movsb_backward) |
256 | 1: |
257 | rep movsb |
258 | 2: |
259 | ret |
260 | L(movsb_backward): |
261 | leaq -1(%rdi,%rcx), %rdi |
262 | leaq -1(%rsi,%rcx), %rsi |
263 | std |
264 | rep movsb |
265 | cld |
266 | ret |
267 | END (__memmove_erms) |
268 | strong_alias (__memmove_erms, __memcpy_erms) |
269 | strong_alias (__memmove_chk_erms, __memcpy_chk_erms) |
270 | # endif |
271 | |
272 | # ifdef SHARED |
273 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
274 | cmp %RDX_LP, %RCX_LP |
275 | jb HIDDEN_JUMPTARGET (__chk_fail) |
276 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
277 | # endif |
278 | |
279 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
280 | mov %RDI_LP, %RAX_LP |
281 | add %RDX_LP, %RAX_LP |
282 | jmp L(start_erms) |
283 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
284 | |
285 | # ifdef SHARED |
286 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
287 | cmp %RDX_LP, %RCX_LP |
288 | jb HIDDEN_JUMPTARGET (__chk_fail) |
289 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
290 | # endif |
291 | |
292 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
293 | movq %rdi, %rax |
294 | L(start_erms): |
295 | # ifdef __ILP32__ |
296 | /* Clear the upper 32 bits. */ |
297 | movl %edx, %edx |
298 | # endif |
299 | cmp $VEC_SIZE, %RDX_LP |
300 | jb L(less_vec) |
301 | cmp $(VEC_SIZE * 2), %RDX_LP |
302 | ja L(movsb_more_2x_vec) |
303 | L(last_2x_vec): |
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
305 | VMOVU (%rsi), %VEC(0) |
306 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
307 | VMOVU %VEC(0), (%rdi) |
308 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
309 | L(return): |
310 | #if VEC_SIZE > 16 |
311 | ZERO_UPPER_VEC_REGISTERS_RETURN |
312 | #else |
313 | ret |
314 | #endif |
315 | |
316 | L(movsb): |
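	/* REP MOVSB is only used for sizes below
	   __x86_rep_movsb_stop_threshold; at or above it, fall back to
	   the 8x-vector / non-temporal paths.  */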
317 | cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
318 | jae L(more_8x_vec) |
319 | cmpq %rsi, %rdi |
320 | jb 1f |
321 | /* Source == destination is less common. */ |
322 | je L(nop) |
323 | leaq (%rsi,%rdx), %r9 |
324 | cmpq %r9, %rdi |
325 | /* Avoid slow backward REP MOVSB. */ |
326 | jb L(more_8x_vec_backward) |
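	/* If the Avoid_Short_Distance_REP_MOVSB control bit is set,
	   compute the distance between source and destination (here for
	   the destination-above-source case, under 1: for the opposite
	   case) and reject REP MOVSB for short distances at the check
	   under 2: below.  */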
327 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
329 | jz 3f |
330 | movq %rdi, %rcx |
331 | subq %rsi, %rcx |
332 | jmp 2f |
333 | # endif |
334 | 1: |
335 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
337 | jz 3f |
338 | movq %rsi, %rcx |
339 | subq %rdi, %rcx |
340 | 2: |
341 | /* Avoid "rep movsb" if RCX, the distance between source and destination, |
342 | is N*4GB + [1..63] with N >= 0. */ |
343 | cmpl $63, %ecx |
344 | jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ |
345 | 3: |
346 | # endif |
347 | mov %RDX_LP, %RCX_LP |
348 | rep movsb |
349 | L(nop): |
350 | ret |
351 | #endif |
352 | |
353 | L(less_vec): |
354 | /* Less than 1 VEC. */ |
355 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
356 | # error Unsupported VEC_SIZE! |
357 | #endif |
358 | #if VEC_SIZE > 32 |
359 | cmpb $32, %dl |
360 | jae L(between_32_63) |
361 | #endif |
362 | #if VEC_SIZE > 16 |
363 | cmpb $16, %dl |
364 | jae L(between_16_31) |
365 | #endif |
366 | cmpb $8, %dl |
367 | jae L(between_8_15) |
368 | cmpb $4, %dl |
369 | jae L(between_4_7) |
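	/* Size is in [0, 3] here: 2 and 3 bytes go to L(between_2_3);
	   otherwise copy a single byte unless the size is zero.  */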
370 | cmpb $1, %dl |
371 | ja L(between_2_3) |
372 | jb 1f |
373 | movzbl (%rsi), %ecx |
374 | movb %cl, (%rdi) |
375 | 1: |
376 | ret |
377 | #if VEC_SIZE > 32 |
378 | L(between_32_63): |
379 | /* From 32 to 63. No branch when size == 32. */ |
380 | VMOVU (%rsi), %YMM0 |
381 | VMOVU -32(%rsi,%rdx), %YMM1 |
382 | VMOVU %YMM0, (%rdi) |
383 | VMOVU %YMM1, -32(%rdi,%rdx) |
384 | VZEROUPPER_RETURN |
385 | #endif |
386 | #if VEC_SIZE > 16 |
387 | /* From 16 to 31. No branch when size == 16. */ |
388 | L(between_16_31): |
389 | VMOVU (%rsi), %XMM0 |
390 | VMOVU -16(%rsi,%rdx), %XMM1 |
391 | VMOVU %XMM0, (%rdi) |
392 | VMOVU %XMM1, -16(%rdi,%rdx) |
393 | VZEROUPPER_RETURN |
394 | #endif |
395 | L(between_8_15): |
396 | /* From 8 to 15. No branch when size == 8. */ |
397 | movq -8(%rsi,%rdx), %rcx |
398 | movq (%rsi), %rsi |
399 | movq %rcx, -8(%rdi,%rdx) |
400 | movq %rsi, (%rdi) |
401 | ret |
402 | L(between_4_7): |
403 | /* From 4 to 7. No branch when size == 4. */ |
404 | movl -4(%rsi,%rdx), %ecx |
405 | movl (%rsi), %esi |
406 | movl %ecx, -4(%rdi,%rdx) |
407 | movl %esi, (%rdi) |
408 | ret |
409 | L(between_2_3): |
410 | /* From 2 to 3. No branch when size == 2. */ |
411 | movzwl -2(%rsi,%rdx), %ecx |
412 | movzwl (%rsi), %esi |
413 | movw %cx, -2(%rdi,%rdx) |
414 | movw %si, (%rdi) |
415 | ret |
416 | |
417 | #if defined USE_MULTIARCH && IS_IN (libc) |
418 | L(movsb_more_2x_vec): |
419 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
420 | ja L(movsb) |
421 | #endif |
422 | L(more_2x_vec): |
423 | /* More than 2 * VEC and there may be overlap between destination |
424 | and source. */ |
425 | cmpq $(VEC_SIZE * 8), %rdx |
426 | ja L(more_8x_vec) |
427 | cmpq $(VEC_SIZE * 4), %rdx |
428 | jbe L(last_4x_vec) |
	/* Copy from 4 * VEC + 1 to 8 * VEC bytes, inclusive.  */
430 | VMOVU (%rsi), %VEC(0) |
431 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
432 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
433 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
434 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) |
435 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) |
436 | VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) |
437 | VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) |
438 | VMOVU %VEC(0), (%rdi) |
439 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
440 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
441 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
442 | VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) |
443 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) |
444 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) |
445 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) |
446 | VZEROUPPER_RETURN |
447 | L(last_4x_vec): |
	/* Copy from 2 * VEC + 1 to 4 * VEC bytes, inclusive.  */
449 | VMOVU (%rsi), %VEC(0) |
450 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
451 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) |
452 | VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) |
453 | VMOVU %VEC(0), (%rdi) |
454 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
455 | VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) |
456 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) |
457 | VZEROUPPER_RETURN |
458 | |
459 | L(more_8x_vec): |
460 | /* Check if non-temporal move candidate. */ |
461 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
462 | /* Check non-temporal store threshold. */ |
463 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
464 | ja L(large_memcpy_2x) |
465 | #endif |
466 | /* Entry if rdx is greater than non-temporal threshold but there |
467 | is overlap. */ |
468 | L(more_8x_vec_check): |
469 | cmpq %rsi, %rdi |
470 | ja L(more_8x_vec_backward) |
471 | /* Source == destination is less common. */ |
472 | je L(nop) |
473 | /* Load the first VEC and last 4 * VEC to support overlapping |
474 | addresses. */ |
475 | VMOVU (%rsi), %VEC(4) |
476 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
477 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
478 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
479 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
480 | /* Save start and stop of the destination buffer. */ |
481 | movq %rdi, %r11 |
482 | leaq -VEC_SIZE(%rdi, %rdx), %rcx |
483 | /* Align destination for aligned stores in the loop. Compute |
484 | how much destination is misaligned. */ |
485 | movq %rdi, %r8 |
486 | andq $(VEC_SIZE - 1), %r8 |
487 | /* Get the negative of offset for alignment. */ |
488 | subq $VEC_SIZE, %r8 |
489 | /* Adjust source. */ |
490 | subq %r8, %rsi |
491 | /* Adjust destination which should be aligned now. */ |
492 | subq %r8, %rdi |
493 | /* Adjust length. */ |
494 | addq %r8, %rdx |
495 | |
496 | .p2align 4 |
497 | L(loop_4x_vec_forward): |
	/* Copy 4 * VEC at a time forward.  */
499 | VMOVU (%rsi), %VEC(0) |
500 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
501 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
502 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
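	/* subq $-(VEC_SIZE * 4) is equivalent to addq $(VEC_SIZE * 4);
	   the negated form can use a sign-extended 8-bit immediate when
	   VEC_SIZE * 4 == 128.  */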
503 | subq $-(VEC_SIZE * 4), %rsi |
504 | addq $-(VEC_SIZE * 4), %rdx |
505 | VMOVA %VEC(0), (%rdi) |
506 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
507 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
508 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
509 | subq $-(VEC_SIZE * 4), %rdi |
510 | cmpq $(VEC_SIZE * 4), %rdx |
511 | ja L(loop_4x_vec_forward) |
512 | /* Store the last 4 * VEC. */ |
513 | VMOVU %VEC(5), (%rcx) |
514 | VMOVU %VEC(6), -VEC_SIZE(%rcx) |
515 | VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
516 | VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
517 | /* Store the first VEC. */ |
518 | VMOVU %VEC(4), (%r11) |
519 | VZEROUPPER_RETURN |
520 | |
521 | L(more_8x_vec_backward): |
522 | /* Load the first 4 * VEC and last VEC to support overlapping |
523 | addresses. */ |
524 | VMOVU (%rsi), %VEC(4) |
525 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
526 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
527 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
528 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) |
529 | /* Save stop of the destination buffer. */ |
530 | leaq -VEC_SIZE(%rdi, %rdx), %r11 |
531 | /* Align destination end for aligned stores in the loop. Compute |
532 | how much destination end is misaligned. */ |
533 | leaq -VEC_SIZE(%rsi, %rdx), %rcx |
534 | movq %r11, %r9 |
535 | movq %r11, %r8 |
536 | andq $(VEC_SIZE - 1), %r8 |
537 | /* Adjust source. */ |
538 | subq %r8, %rcx |
539 | /* Adjust the end of destination which should be aligned now. */ |
540 | subq %r8, %r9 |
541 | /* Adjust length. */ |
542 | subq %r8, %rdx |
543 | |
544 | .p2align 4 |
545 | L(loop_4x_vec_backward): |
	/* Copy 4 * VEC at a time backward.  */
547 | VMOVU (%rcx), %VEC(0) |
548 | VMOVU -VEC_SIZE(%rcx), %VEC(1) |
549 | VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
550 | VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
551 | addq $-(VEC_SIZE * 4), %rcx |
552 | addq $-(VEC_SIZE * 4), %rdx |
553 | VMOVA %VEC(0), (%r9) |
554 | VMOVA %VEC(1), -VEC_SIZE(%r9) |
555 | VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) |
556 | VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) |
557 | addq $-(VEC_SIZE * 4), %r9 |
558 | cmpq $(VEC_SIZE * 4), %rdx |
559 | ja L(loop_4x_vec_backward) |
560 | /* Store the first 4 * VEC. */ |
561 | VMOVU %VEC(4), (%rdi) |
562 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
563 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
564 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
565 | /* Store the last VEC. */ |
566 | VMOVU %VEC(8), (%r11) |
567 | VZEROUPPER_RETURN |
568 | |
569 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
570 | .p2align 4 |
571 | L(large_memcpy_2x): |
572 | /* Compute absolute value of difference between source and |
573 | destination. */ |
574 | movq %rdi, %r9 |
575 | subq %rsi, %r9 |
576 | movq %r9, %r8 |
577 | leaq -1(%r9), %rcx |
578 | sarq $63, %r8 |
579 | xorq %r8, %r9 |
580 | subq %r8, %r9 |
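	/* r9 = |dst - src| (branchless absolute value using the sign
	   mask in r8); rcx keeps (dst - src) - 1 for the page-alias
	   test below.  */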
581 | /* Don't use non-temporal store if there is overlap between |
582 | destination and source since destination may be in cache when |
583 | source is loaded. */ |
584 | cmpq %r9, %rdx |
585 | ja L(more_8x_vec_check) |
586 | |
	/* Cache-align the destination.  First store the first 64 bytes,
	   then adjust source, destination and length so that the
	   destination is 64-byte aligned.  */
589 | VMOVU (%rsi), %VEC(8) |
590 | #if VEC_SIZE < 64 |
591 | VMOVU VEC_SIZE(%rsi), %VEC(9) |
592 | #if VEC_SIZE < 32 |
593 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) |
594 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) |
595 | #endif |
596 | #endif |
597 | VMOVU %VEC(8), (%rdi) |
598 | #if VEC_SIZE < 64 |
599 | VMOVU %VEC(9), VEC_SIZE(%rdi) |
600 | #if VEC_SIZE < 32 |
601 | VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) |
602 | VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) |
603 | #endif |
604 | #endif |
605 | /* Adjust source, destination, and size. */ |
606 | movq %rdi, %r8 |
607 | andq $63, %r8 |
608 | /* Get the negative of offset for alignment. */ |
609 | subq $64, %r8 |
610 | /* Adjust source. */ |
611 | subq %r8, %rsi |
612 | /* Adjust destination which should be aligned now. */ |
613 | subq %r8, %rdi |
614 | /* Adjust length. */ |
615 | addq %r8, %rdx |
616 | |
	/* Test whether the source and destination addresses page-alias.
	   If they do, the larger pipeline in large_memcpy_4x alleviates
	   the performance drop.  */
620 | testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
621 | jz L(large_memcpy_4x) |
622 | |
623 | movq %rdx, %r10 |
624 | shrq $LOG_4X_MEMCPY_THRESH, %r10 |
625 | cmp __x86_shared_non_temporal_threshold(%rip), %r10 |
626 | jae L(large_memcpy_4x) |
627 | |
628 | /* edx will store remainder size for copying tail. */ |
629 | andl $(PAGE_SIZE * 2 - 1), %edx |
630 | /* r10 stores outer loop counter. */ |
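	/* r10 already holds size >> LOG_4X_MEMCPY_THRESH, so this leaves
	   size / (PAGE_SIZE * 2), the number of 2-page chunks.  */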
631 | shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 |
632 | /* Copy 4x VEC at a time from 2 pages. */ |
633 | .p2align 4 |
634 | L(loop_large_memcpy_2x_outer): |
635 | /* ecx stores inner loop counter. */ |
636 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
637 | L(loop_large_memcpy_2x_inner): |
638 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
639 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
640 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
641 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) |
642 | /* Load vectors from rsi. */ |
643 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
644 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
645 | subq $-LARGE_LOAD_SIZE, %rsi |
646 | /* Non-temporal store vectors to rdi. */ |
647 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
648 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
649 | subq $-LARGE_LOAD_SIZE, %rdi |
650 | decl %ecx |
651 | jnz L(loop_large_memcpy_2x_inner) |
652 | addq $PAGE_SIZE, %rdi |
653 | addq $PAGE_SIZE, %rsi |
654 | decq %r10 |
655 | jne L(loop_large_memcpy_2x_outer) |
656 | sfence |
657 | |
658 | /* Check if only last 4 loads are needed. */ |
659 | cmpl $(VEC_SIZE * 4), %edx |
660 | jbe L(large_memcpy_2x_end) |
661 | |
662 | /* Handle the last 2 * PAGE_SIZE bytes. */ |
663 | L(loop_large_memcpy_2x_tail): |
	/* Copy 4 * VEC at a time forward with regular (aligned) stores.  */
665 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
666 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
667 | VMOVU (%rsi), %VEC(0) |
668 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
669 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
670 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
671 | subq $-(VEC_SIZE * 4), %rsi |
672 | addl $-(VEC_SIZE * 4), %edx |
673 | VMOVA %VEC(0), (%rdi) |
674 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
675 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
676 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
677 | subq $-(VEC_SIZE * 4), %rdi |
678 | cmpl $(VEC_SIZE * 4), %edx |
679 | ja L(loop_large_memcpy_2x_tail) |
680 | |
681 | L(large_memcpy_2x_end): |
682 | /* Store the last 4 * VEC. */ |
683 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
684 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
685 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
686 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
687 | |
688 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
689 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
690 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
691 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
692 | VZEROUPPER_RETURN |
693 | |
694 | .p2align 4 |
695 | L(large_memcpy_4x): |
696 | movq %rdx, %r10 |
697 | /* edx will store remainder size for copying tail. */ |
698 | andl $(PAGE_SIZE * 4 - 1), %edx |
699 | /* r10 stores outer loop counter. */ |
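	/* r10 = size / (PAGE_SIZE * 4), the number of 4-page chunks.  */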
700 | shrq $(LOG_PAGE_SIZE + 2), %r10 |
701 | /* Copy 4x VEC at a time from 4 pages. */ |
702 | .p2align 4 |
703 | L(loop_large_memcpy_4x_outer): |
704 | /* ecx stores inner loop counter. */ |
705 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
706 | L(loop_large_memcpy_4x_inner): |
	/* Only one prefetch set per page, as copying 4 pages at once
	   gives the prefetcher more time to keep up.  */
709 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
710 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
711 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |
712 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) |
713 | /* Load vectors from rsi. */ |
714 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
715 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
716 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
717 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
718 | subq $-LARGE_LOAD_SIZE, %rsi |
719 | /* Non-temporal store vectors to rdi. */ |
720 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
721 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
722 | STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
723 | STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
724 | subq $-LARGE_LOAD_SIZE, %rdi |
725 | decl %ecx |
726 | jnz L(loop_large_memcpy_4x_inner) |
727 | addq $(PAGE_SIZE * 3), %rdi |
728 | addq $(PAGE_SIZE * 3), %rsi |
729 | decq %r10 |
730 | jne L(loop_large_memcpy_4x_outer) |
731 | sfence |
732 | /* Check if only last 4 loads are needed. */ |
733 | cmpl $(VEC_SIZE * 4), %edx |
734 | jbe L(large_memcpy_4x_end) |
735 | |
736 | /* Handle the last 4 * PAGE_SIZE bytes. */ |
737 | L(loop_large_memcpy_4x_tail): |
	/* Copy 4 * VEC at a time forward with regular (aligned) stores.  */
739 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
740 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
741 | VMOVU (%rsi), %VEC(0) |
742 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
743 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
744 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
745 | subq $-(VEC_SIZE * 4), %rsi |
746 | addl $-(VEC_SIZE * 4), %edx |
747 | VMOVA %VEC(0), (%rdi) |
748 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
749 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
750 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
751 | subq $-(VEC_SIZE * 4), %rdi |
752 | cmpl $(VEC_SIZE * 4), %edx |
753 | ja L(loop_large_memcpy_4x_tail) |
754 | |
755 | L(large_memcpy_4x_end): |
756 | /* Store the last 4 * VEC. */ |
757 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
758 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
759 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
760 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
761 | |
762 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
763 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
764 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
765 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
766 | VZEROUPPER_RETURN |
767 | #endif |
768 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
769 | |
770 | #if IS_IN (libc) |
771 | # ifdef USE_MULTIARCH |
772 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
773 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
774 | # ifdef SHARED |
775 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
776 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
777 | # endif |
778 | # endif |
779 | # ifdef SHARED |
780 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
781 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
782 | # endif |
783 | #endif |
784 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
785 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
786 | |