memmove-vec-unaligned-erms.S source code [glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S]

1	/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
2	Copyright (C) 2016-2018 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<http://www.gnu.org/licenses/>. */
18
19	/* memmove/memcpy/mempcpy is implemented as:
20	1. Use overlapping load and store to avoid branch.
21	2. Load all sources into registers and store them together to avoid
22	possible address overlap between source and destination.
23	3. If size is 8 * VEC_SIZE or less, load all sources into registers
24	and store them together.
25	4. If address of destination > address of source, backward copy
26	4 * VEC_SIZE at a time with unaligned load and aligned store.
27	Load the first 4 * VEC and last VEC before the loop and store
28	them after the loop to support overlapping addresses.
29	5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
30	load and aligned store. Load the last 4 * VEC and first VEC
31	before the loop and store them after the loop to support
32	overlapping addresses.
33	6. If size >= __x86_shared_non_temporal_threshold and there is no
34	overlap between destination and source, use non-temporal store
35	instead of aligned store. */
36
37	#include <sysdep.h>
38
/* Default symbol-name wrappers.  When the including file has not
   defined MEMCPY_SYMBOL, MEMPCPY_SYMBOL or MEMMOVE_CHK_SYMBOL, each one
   falls back to MEMMOVE_SYMBOL (p, s), so every entry point in this
   file is named through the same macro.  MEMMOVE_SYMBOL itself is not
   defined here and must come from the including file.  */
39	#ifndef MEMCPY_SYMBOL
40	# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
41	#endif
42
43	#ifndef MEMPCPY_SYMBOL
44	# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
45	#endif
46
47	#ifndef MEMMOVE_CHK_SYMBOL
48	# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
49	#endif
50
/* VZEROUPPER is placed before the returns of the vector paths below.
   Unless overridden, it expands to the vzeroupper instruction when
   VEC_SIZE > 16 and to nothing for 16-byte vectors.  */
51	#ifndef VZEROUPPER
52	# if VEC_SIZE > 16
53	# define VZEROUPPER vzeroupper
54	# else
55	# define VZEROUPPER
56	# endif
57	#endif
58
59	/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
60	up REP MOVSB operation, REP MOVSB isn't faster on short data. The
61	memcpy micro benchmark in glibc shows that 2KB is the approximate
62	value above which REP MOVSB becomes faster than SSE2 optimization
63	on processors with Enhanced REP MOVSB. Since larger register size
64	can move more data with a single load and store, the threshold is
65	higher with larger register size. */
/* Default REP MOVSB threshold in bytes: 2048 scaled by VEC_SIZE / 16,
   i.e. 2048 for 16-byte, 4096 for 32-byte and 8192 for 64-byte vectors.
   L(movsb) additionally requires it to be greater than VEC_SIZE * 8.  */
66	#ifndef REP_MOVSB_THRESHOLD
67	# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
68	#endif
69
/* Instruction used by PREFETCH_ONE_SET; defaults to prefetcht0 unless
   the including file supplies its own PREFETCH.  */
70	#ifndef PREFETCH
71	# define PREFETCH(addr) prefetcht0 addr
72	#endif
73
74	/* Assume 64-byte prefetch size. */
75	#ifndef PREFETCH_SIZE
76	# define PREFETCH_SIZE 64
77	#endif
78
/* Bytes of source consumed per iteration of the non-temporal copy
   loops (four vectors).  One PREFETCH_ONE_SET covers a region of this
   size.  */
79	#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
80
/* PREFETCH_ONE_SET (dir, base, offset)
   Issue exactly one PREFETCH for each PREFETCH_SIZE-byte chunk of a
   PREFETCHED_LOAD_SIZE-byte region.
     dir    - +1 to step towards higher addresses, -1 towards lower.
     base   - parenthesised base-register operand, e.g. (%rsi); it is
	      pasted directly after the displacement as "(disp)base".
     offset - displacement of the first chunk from base.
   Only a 64-byte PREFETCH_SIZE with a region of 1, 2 or 4 chunks is
   supported; anything else is a compile-time error.  */
81	#if PREFETCH_SIZE == 64
82	# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
83	# define PREFETCH_ONE_SET(dir, base, offset) \
84	PREFETCH ((offset)base)
85	# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
86	# define PREFETCH_ONE_SET(dir, base, offset) \
87	PREFETCH ((offset)base); \
88	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
89	# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
90	# define PREFETCH_ONE_SET(dir, base, offset) \
91	PREFETCH ((offset)base); \
92	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
94	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
95	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
96	# else
97	# error Unsupported PREFETCHED_LOAD_SIZE!
98	# endif
99	#else
100	# error Unsupported PREFETCH_SIZE!
101	#endif
102
/* SECTION must be supplied by the including file; it is applied to
   ".text" to form the name of the section that receives all code
   below.  The section is marked allocatable and executable ("ax") and
   holds program data (@progbits).  */
103	#ifndef SECTION
104	# error SECTION is not defined!
105	#endif
106
107	.section SECTION(.text),"ax",@progbits
/* Checked mempcpy entry, built only for the shared libc.
   Jumps to __chk_fail when %rcx < %rdx (unsigned).  %rdx is the copy
   length used by the code below; %rcx is presumably the destination
   buffer size passed as the fourth argument - TODO confirm against
   the callers.
   There is no ret after the check: when it passes, execution continues
   into the entry that follows END, i.e. __mempcpy.  */
108	#if defined SHARED && IS_IN (libc)
109	ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
110	cmpq %rdx, %rcx
111	jb HIDDEN_JUMPTARGET (__chk_fail)
112	END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
113	#endif
114
/* mempcpy entry.
   In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = %rdi + %rdx (one past the last byte written).
   After setting the return value it joins the shared copy code at
   L(start) inside __memmove, which leaves %rax untouched.  */
115	ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
116	movq %rdi, %rax
117	addq %rdx, %rax
118	jmp L(start)
119	END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
120
/* Checked memmove entry, built only for the shared libc.
   Jumps to __chk_fail when %rcx < %rdx (unsigned).  %rcx is presumably
   the destination buffer size - TODO confirm against the callers.
   There is no ret after the check: when it passes, execution continues
   into the __memmove entry that follows.  */
121	#if defined SHARED && IS_IN (libc)
122	ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
123	cmpq %rdx, %rcx
124	jb HIDDEN_JUMPTARGET (__chk_fail)
125	END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
126	#endif
127
/* memmove entry (vector-only variant, no REP MOVSB).
   In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = destination.
   Dispatch on length:
     length <  VEC_SIZE      -> L(less_vec)
     length >  2 * VEC_SIZE  -> L(more_2x_vec)
     otherwise               -> the two-vector copy right here.  */
128	ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
129	movq %rdi, %rax
/* Shared entry point: __mempcpy jumps here with %rax already set.  */
130	L(start):
131	cmpq $VEC_SIZE, %rdx
132	jb L(less_vec)
133	cmpq $(VEC_SIZE * `2`), %rdx
134	ja L(more_2x_vec)
/* L(last_2x_vec) and L(nop) are defined here only when this file is
   not built as multiarch libc code; in the multiarch build the labels
   of the same name come from the unaligned_erms function below.  */
135	#if !defined USE_MULTIARCH \|\| !IS_IN (libc)
136	L(last_2x_vec):
137	#endif
138	/ From VEC and to 2 * VEC. No branch when size == VEC_SIZE. /
/* Load the first and the last VEC_SIZE bytes, then store both.  The two
   vectors overlap each other when length < 2 * VEC_SIZE, and both loads
   complete before the first store, so overlapping buffers are safe.  */
139	VMOVU (%rsi), %VEC(`0`)
140	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`1`)
141	VMOVU %VEC(`0`), (%rdi)
142	VMOVU %VEC(`1`), -VEC_SIZE(%rdi,%rdx)
143	VZEROUPPER
144	#if !defined USE_MULTIARCH \|\| !IS_IN (libc)
145	L(nop):
146	#endif
147	ret
/* In the multiarch libc build this function ends here and the ERMS
   variants follow.  The conditional opened on the next line is closed
   after L(movsb).  */
148	#if defined USE_MULTIARCH && IS_IN (libc)
149	END (MEMMOVE_SYMBOL (__memmove, unaligned))
150
151	# if VEC_SIZE == 16
/* Checked entry for the pure REP MOVSB mempcpy.  Jumps to __chk_fail
   when %rcx < %rdx (unsigned); otherwise execution continues into
   __mempcpy_erms, which follows.  %rcx is presumably the destination
   buffer size - TODO confirm.  */
152	ENTRY (__mempcpy_chk_erms)
153	cmpq %rdx, %rcx
154	jb HIDDEN_JUMPTARGET (__chk_fail)
155	END (__mempcpy_chk_erms)
156
157	/ Only used to measure performance of REP MOVSB. /
/* In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = %rdi + %rdx.
   Sets the mempcpy return value and joins __memmove_erms at
   L(start_movsb).  */
158	ENTRY (__mempcpy_erms)
159	movq %rdi, %rax
160	addq %rdx, %rax
161	jmp L(start_movsb)
162	END (__mempcpy_erms)
163
/* Checked entry for the pure REP MOVSB memmove.  Jumps to __chk_fail
   when %rcx < %rdx (unsigned); otherwise execution continues into
   __memmove_erms, which follows.  %rcx is presumably the destination
   buffer size - TODO confirm.  */
164	ENTRY (__memmove_chk_erms)
165	cmpq %rdx, %rcx
166	jb HIDDEN_JUMPTARGET (__chk_fail)
167	END (__memmove_chk_erms)
168
/* memmove implemented with REP MOVSB only.
   In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = destination (or destination + length when entered from
	__mempcpy_erms through L(start_movsb)).
   Clobbers %rcx, %rdx, %rsi, %rdi and flags.  */
169	ENTRY (__memmove_erms)
170	movq %rdi, %rax
171	L(start_movsb):
/* %rcx is the repeat count of REP MOVSB.  */
172	movq %rdx, %rcx
/* Destination below source: a forward copy never overwrites source
   bytes that are still to be read.  */
173	cmpq %rsi, %rdi
174	jb `1f`
175	/ Source == destination is less common. /
176	je `2f`
/* Destination above source: copy backward only when the destination
   starts before the end of the source (%rdx = source + length).  */
177	leaq (%rsi,%rcx), %rdx
178	cmpq %rdx, %rdi
179	jb L(movsb_backward)
180	`1`:
181	rep movsb
182	`2`:
183	ret
184	L(movsb_backward):
/* Point both registers at the last byte and copy with the direction
   flag set, so REP MOVSB walks down.  The flag is cleared again before
   returning.  */
185	leaq -`1`(%rdi,%rcx), %rdi
186	leaq -`1`(%rsi,%rcx), %rsi
187	std
188	rep movsb
189	cld
190	ret
191	END (__memmove_erms)
/* The memcpy names are aliases of the memmove implementations.  */
192	strong_alias (__memmove_erms, __memcpy_erms)
193	strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
194	# endif
195
/* Checked mempcpy entry for the vector + REP MOVSB variant, built only
   when SHARED is defined.  Jumps to __chk_fail when %rcx < %rdx
   (unsigned); otherwise execution continues into the __mempcpy
   unaligned_erms entry that follows.  %rcx is presumably the
   destination buffer size - TODO confirm.  */
196	# ifdef SHARED
197	ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
198	cmpq %rdx, %rcx
199	jb HIDDEN_JUMPTARGET (__chk_fail)
200	END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
201	# endif
202
/* mempcpy entry for the vector + REP MOVSB variant.
   In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = %rdi + %rdx.
   Joins the shared copy code at L(start_erms).  */
203	ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
204	movq %rdi, %rax
205	addq %rdx, %rax
206	jmp L(start_erms)
207	END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
208
/* Checked memmove entry for the vector + REP MOVSB variant, built only
   when SHARED is defined.  Jumps to __chk_fail when %rcx < %rdx
   (unsigned); otherwise execution continues into the __memmove
   unaligned_erms entry that follows.  %rcx is presumably the
   destination buffer size - TODO confirm.  */
209	# ifdef SHARED
210	ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
211	cmpq %rdx, %rcx
212	jb HIDDEN_JUMPTARGET (__chk_fail)
213	END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
214	# endif
215
/* memmove entry for the vector + REP MOVSB variant.
   In:  %rdi = destination, %rsi = source, %rdx = length in bytes.
   Out: %rax = destination.
   Dispatch on length:
     length <  VEC_SIZE      -> L(less_vec)
     length >  2 * VEC_SIZE  -> L(movsb_more_2x_vec), which chooses
				between REP MOVSB and the vector paths
     otherwise               -> L(last_2x_vec) right here.  */
216	ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
217	movq %rdi, %rax
/* Shared entry point: the mempcpy variant jumps here with %rax already
   set.  */
218	L(start_erms):
219	cmpq $VEC_SIZE, %rdx
220	jb L(less_vec)
221	cmpq $(VEC_SIZE * `2`), %rdx
222	ja L(movsb_more_2x_vec)
223	L(last_2x_vec):
224	/ From VEC and to 2 * VEC. No branch when size == VEC_SIZE. /
/* First and last VEC_SIZE bytes are both loaded before either store,
   so overlapping buffers are handled; the two vectors overlap each
   other when length < 2 * VEC_SIZE.  */
225	VMOVU (%rsi), %VEC(`0`)
226	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`1`)
227	VMOVU %VEC(`0`), (%rdi)
228	VMOVU %VEC(`1`), -VEC_SIZE(%rdi,%rdx)
/* Common exit: VZEROUPPER followed by ret.  */
229	L(return):
230	VZEROUPPER
231	ret
232
/* REP MOVSB path, reached from L(movsb_more_2x_vec) when the length is
   above REP_MOVSB_THRESHOLD.
   In: %rdi = destination, %rsi = source, %rdx = length; %rax already
   holds the return value.  No vector register has been touched on this
   path, and the returns below do not execute VZEROUPPER.  */
233	L(movsb):
/* Lengths at or above the non-temporal threshold go to the vector
   path instead.  */
234	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
235	jae L(more_8x_vec)
/* Destination below source: forward REP MOVSB is safe.  */
236	cmpq %rsi, %rdi
237	jb `1f`
238	/ Source == destination is less common. /
239	je L(nop)
/* %r9 = end of source.  If the destination starts before it, a
   backward copy is required; that is done with vectors.  The
   preprocessor lines between the compare and the jb emit no code, so
   the jb still tests the flags of this compare.  */
240	leaq (%rsi,%rdx), %r9
241	cmpq %r9, %rdi
242	/ Avoid slow backward REP MOVSB. /
/* Build-time guard: the threshold that leads here must be greater than
   8 * VEC_SIZE - presumably because L(more_8x_vec_backward) is written
   for lengths above 8 * VEC_SIZE.  */
243	# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
244	# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
245	# endif
246	jb L(more_8x_vec_backward)
247	`1`:
/* %rcx is the repeat count of REP MOVSB.  */
248	movq %rdx, %rcx
249	rep movsb
/* Plain return with no VZEROUPPER; also the target for the
   source == destination shortcuts.  */
250	L(nop):
251	ret
/* Closes the USE_MULTIARCH && IS_IN (libc) conditional opened after the
   first __memmove.  */
252	#endif
253
/* Lengths below VEC_SIZE.  Only the low byte of the length (%dl) is
   tested, which is sufficient because length < VEC_SIZE <= 64 here.
   Dispatches to a handler per power-of-two size class and handles
   lengths 0 and 1 inline.  */
254	L(less_vec):
255	/ Less than 1 VEC. /
256	#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
257	# error Unsupported VEC_SIZE!
258	#endif
259	#if VEC_SIZE > 32
260	cmpb $`32`, %dl
261	jae L(between_32_63)
262	#endif
263	#if VEC_SIZE > 16
264	cmpb $`16`, %dl
265	jae L(between_16_31)
266	#endif
267	cmpb $`8`, %dl
268	jae L(between_8_15)
269	cmpb $`4`, %dl
270	jae L(between_4_7)
/* One compare separates the three remaining cases:
   length > 1 -> 2..3 bytes, length < 1 -> nothing to copy,
   otherwise exactly one byte.  */
271	cmpb $`1`, %dl
272	ja L(between_2_3)
273	jb `1f`
274	movzbl (%rsi), %ecx
275	movb %cl, (%rdi)
276	`1`:
277	ret
/* Small-size handlers.  Each one loads the first and the last chunk of
   the buffer (the two may overlap each other) and only then stores
   both, so every length in the range is covered without a branch and
   overlapping source/destination buffers are handled.
   In: %rdi = destination, %rsi = source, %rdx = length.  */
278	#if VEC_SIZE > 32
279	L(between_32_63):
280	/ From 32 to 63. No branch when size == 32. /
281	vmovdqu (%rsi), %ymm0
282	vmovdqu -`32`(%rsi,%rdx), %ymm1
283	vmovdqu %ymm0, (%rdi)
284	vmovdqu %ymm1, -`32`(%rdi,%rdx)
285	VZEROUPPER
286	ret
287	#endif
288	#if VEC_SIZE > 16
289	/ From 16 to 31. No branch when size == 16. /
290	L(between_16_31):
/* Uses only %xmm registers and returns without VZEROUPPER.  */
291	vmovdqu (%rsi), %xmm0
292	vmovdqu -`16`(%rsi,%rdx), %xmm1
293	vmovdqu %xmm0, (%rdi)
294	vmovdqu %xmm1, -`16`(%rdi,%rdx)
295	ret
296	#endif
297	L(between_8_15):
298	/ From 8 to 15. No branch when size == 8. /
/* The tail is loaded first because the second load overwrites %rsi
   (the source pointer) with the head data.  */
299	movq -`8`(%rsi,%rdx), %rcx
300	movq (%rsi), %rsi
301	movq %rcx, -`8`(%rdi,%rdx)
302	movq %rsi, (%rdi)
303	ret
304	L(between_4_7):
305	/ From 4 to 7. No branch when size == 4. /
/* Same scheme with 32-bit accesses; %esi is reused for the head.  */
306	movl -`4`(%rsi,%rdx), %ecx
307	movl (%rsi), %esi
308	movl %ecx, -`4`(%rdi,%rdx)
309	movl %esi, (%rdi)
310	ret
311	L(between_2_3):
312	/ From 2 to 3. No branch when size == 2. /
/* Same scheme with 16-bit accesses; the loads zero-extend into the
   32-bit registers.  */
313	movzwl -`2`(%rsi,%rdx), %ecx
314	movzwl (%rsi), %esi
315	movw %cx, -`2`(%rdi,%rdx)
316	movw %si, (%rdi)
317	ret
318
/* ERMS variant only: lengths above REP_MOVSB_THRESHOLD are sent to
   L(movsb); everything else falls through to L(more_2x_vec).  */
319	#if defined USE_MULTIARCH && IS_IN (libc)
320	L(movsb_more_2x_vec):
321	cmpq $REP_MOVSB_THRESHOLD, %rdx
322	ja L(movsb)
323	#endif
/* Lengths above 2 * VEC_SIZE.
   In: %rdi = destination, %rsi = source, %rdx = length.
     length >  8 * VEC_SIZE -> L(more_8x_vec)
     length <  4 * VEC_SIZE -> L(last_4x_vec)
     otherwise              -> the eight-vector copy below.  */
324	L(more_2x_vec):
325	/ More than 2 * VEC and there may be overlap between destination*
326	and source. /*
327	cmpq $(VEC_SIZE * `8`), %rdx
328	ja L(more_8x_vec)
329	cmpq $(VEC_SIZE * `4`), %rdx
330	jb L(last_4x_vec)
331	/ Copy from 4 * VEC to 8 * VEC, inclusively. /
/* Four vectors from the start and four from the end of the source are
   loaded into eight registers before any store is issued.  The two
   groups overlap each other when length < 8 * VEC_SIZE.  */
332	VMOVU (%rsi), %VEC(`0`)
333	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
334	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
335	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
336	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`4`)
337	VMOVU -(VEC_SIZE * `2`)(%rsi,%rdx), %VEC(`5`)
338	VMOVU -(VEC_SIZE * `3`)(%rsi,%rdx), %VEC(`6`)
339	VMOVU -(VEC_SIZE * `4`)(%rsi,%rdx), %VEC(`7`)
340	VMOVU %VEC(`0`), (%rdi)
341	VMOVU %VEC(`1`), VEC_SIZE(%rdi)
342	VMOVU %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
343	VMOVU %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
344	VMOVU %VEC(`4`), -VEC_SIZE(%rdi,%rdx)
345	VMOVU %VEC(`5`), -(VEC_SIZE * `2`)(%rdi,%rdx)
346	VMOVU %VEC(`6`), -(VEC_SIZE * `3`)(%rdi,%rdx)
347	VMOVU %VEC(`7`), -(VEC_SIZE * `4`)(%rdi,%rdx)
348	VZEROUPPER
349	ret
/* Lengths above 2 * VEC_SIZE and below 4 * VEC_SIZE.
   In: %rdi = destination, %rsi = source, %rdx = length.
   Two vectors from the start and two from the end are loaded before
   any store; the two pairs overlap each other to cover the length.  */
350	L(last_4x_vec):
351	/ Copy from 2 * VEC to 4 * VEC. /
352	VMOVU (%rsi), %VEC(`0`)
353	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
354	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`2`)
355	VMOVU -(VEC_SIZE * `2`)(%rsi,%rdx), %VEC(`3`)
356	VMOVU %VEC(`0`), (%rdi)
357	VMOVU %VEC(`1`), VEC_SIZE(%rdi)
358	VMOVU %VEC(`2`), -VEC_SIZE(%rdi,%rdx)
359	VMOVU %VEC(`3`), -(VEC_SIZE * `2`)(%rdi,%rdx)
360	VZEROUPPER
361	ret
362
/* Large copy.
   In: %rdi = destination, %rsi = source, %rdx = length.
   Destination above source goes to the backward loop, equal pointers
   return immediately, otherwise the forward loop below is used.
   Register roles in the forward path:
     VEC(4)         first vector of the source (stored last)
     VEC(5)..VEC(8) last four vectors of the source
     %r11           original destination
     %rcx           address of the last vector in the destination
     %rsi, %rdi     running source / aligned destination
     %rdx           bytes remaining from the running pointers.  */
363	L(more_8x_vec):
364	cmpq %rsi, %rdi
365	ja L(more_8x_vec_backward)
366	/ Source == destination is less common. /
367	je L(nop)
368	/ Load the first VEC and last 4 * VEC to support overlapping*
369	addresses. /*
370	VMOVU (%rsi), %VEC(`4`)
371	VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(`5`)
372	VMOVU -(VEC_SIZE * `2`)(%rsi, %rdx), %VEC(`6`)
373	VMOVU -(VEC_SIZE * `3`)(%rsi, %rdx), %VEC(`7`)
374	VMOVU -(VEC_SIZE * `4`)(%rsi, %rdx), %VEC(`8`)
375	/ Save start and stop of the destination buffer. /
376	movq %rdi, %r11
377	leaq -VEC_SIZE(%rdi, %rdx), %rcx
378	/ Align destination for aligned stores in the loop. Compute*
379	how much destination is misaligned. /*
380	movq %rdi, %r8
381	andq $(VEC_SIZE - `1`), %r8
382	/ Get the negative of offset for alignment. /
/* %r8 = misalignment - VEC_SIZE, a value in [-VEC_SIZE, -1].
   Subtracting it advances a pointer by VEC_SIZE - misalignment bytes,
   a full VEC_SIZE when the destination was already aligned.  The
   skipped head is covered by VEC(4), stored at the end.  */
383	subq $VEC_SIZE, %r8
384	/ Adjust source. /
385	subq %r8, %rsi
386	/ Adjust destination which should be aligned now. /
387	subq %r8, %rdi
388	/ Adjust length. /
389	addq %r8, %rdx
390	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
391	/ Check non-temporal store threshold. /
392	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
393	ja L(large_forward)
394	#endif
395	L(loop_4x_vec_forward):
396	/ Copy 4 * VEC a time forward. /
/* All four loads of an iteration are issued before its stores.  The
   loop continues while more than 4 * VEC_SIZE bytes remain; the
   remainder is covered by VEC(5)..VEC(8) afterwards.  */
397	VMOVU (%rsi), %VEC(`0`)
398	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
399	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
400	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
401	addq $(VEC_SIZE * `4`), %rsi
402	subq $(VEC_SIZE * `4`), %rdx
403	VMOVA %VEC(`0`), (%rdi)
404	VMOVA %VEC(`1`), VEC_SIZE(%rdi)
405	VMOVA %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
406	VMOVA %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
407	addq $(VEC_SIZE * `4`), %rdi
408	cmpq $(VEC_SIZE * `4`), %rdx
409	ja L(loop_4x_vec_forward)
410	/ Store the last 4 * VEC. /
411	VMOVU %VEC(`5`), (%rcx)
412	VMOVU %VEC(`6`), -VEC_SIZE(%rcx)
413	VMOVU %VEC(`7`), -(VEC_SIZE * `2`)(%rcx)
414	VMOVU %VEC(`8`), -(VEC_SIZE * `3`)(%rcx)
415	/ Store the first VEC. /
416	VMOVU %VEC(`4`), (%r11)
417	VZEROUPPER
418	ret
419
/* Large backward copy, used when the destination is above the source.
   In: %rdi = destination, %rsi = source, %rdx = length.
   Register roles:
     VEC(4)..VEC(7) first four vectors of the source (stored last)
     VEC(8)         last vector of the source
     %r11           address of the last vector in the destination
     %rcx           running source pointer (highest vector of a group)
     %r9            running destination pointer, VEC_SIZE aligned
     %rdx           bytes remaining below the running pointers
     %rdi           unchanged; used for the final head stores.  */
420	L(more_8x_vec_backward):
421	/ Load the first 4 * VEC and last VEC to support overlapping*
422	addresses. /*
423	VMOVU (%rsi), %VEC(`4`)
424	VMOVU VEC_SIZE(%rsi), %VEC(`5`)
425	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`6`)
426	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`7`)
427	VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(`8`)
428	/ Save stop of the destination buffer. /
429	leaq -VEC_SIZE(%rdi, %rdx), %r11
430	/ Align destination end for aligned stores in the loop. Compute*
431	how much destination end is misaligned. /*
432	leaq -VEC_SIZE(%rsi, %rdx), %rcx
433	movq %r11, %r9
434	movq %r11, %r8
/* %r8 = misalignment of the last destination vector.  Source pointer,
   destination pointer and length are all moved down by that amount;
   the bytes skipped at the top are covered by VEC(8).  */
435	andq $(VEC_SIZE - `1`), %r8
436	/ Adjust source. /
437	subq %r8, %rcx
438	/ Adjust the end of destination which should be aligned now. /
439	subq %r8, %r9
440	/ Adjust length. /
441	subq %r8, %rdx
442	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
443	/ Check non-temporal store threshold. /
444	cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
445	ja L(large_backward)
446	#endif
447	L(loop_4x_vec_backward):
448	/ Copy 4 * VEC a time backward. /
/* All four loads of an iteration are issued before its stores.  The
   loop continues while more than 4 * VEC_SIZE bytes remain; the
   remainder is covered by VEC(4)..VEC(7) afterwards.  */
449	VMOVU (%rcx), %VEC(`0`)
450	VMOVU -VEC_SIZE(%rcx), %VEC(`1`)
451	VMOVU -(VEC_SIZE * `2`)(%rcx), %VEC(`2`)
452	VMOVU -(VEC_SIZE * `3`)(%rcx), %VEC(`3`)
453	subq $(VEC_SIZE * `4`), %rcx
454	subq $(VEC_SIZE * `4`), %rdx
455	VMOVA %VEC(`0`), (%r9)
456	VMOVA %VEC(`1`), -VEC_SIZE(%r9)
457	VMOVA %VEC(`2`), -(VEC_SIZE * `2`)(%r9)
458	VMOVA %VEC(`3`), -(VEC_SIZE * `3`)(%r9)
459	subq $(VEC_SIZE * `4`), %r9
460	cmpq $(VEC_SIZE * `4`), %rdx
461	ja L(loop_4x_vec_backward)
462	/ Store the first 4 * VEC. /
463	VMOVU %VEC(`4`), (%rdi)
464	VMOVU %VEC(`5`), VEC_SIZE(%rdi)
465	VMOVU %VEC(`6`), (VEC_SIZE * `2`)(%rdi)
466	VMOVU %VEC(`7`), (VEC_SIZE * `3`)(%rdi)
467	/ Store the last VEC. /
468	VMOVU %VEC(`8`), (%r11)
469	VZEROUPPER
470	ret
471
/* Forward copy with non-temporal stores, entered from L(more_8x_vec)
   when the remaining length exceeds
   __x86_shared_non_temporal_threshold.  Registers are as set up there:
   %rsi / %rdi running source / aligned destination, %rdx bytes
   remaining, %rcx and %r11 the tail and head store addresses,
   VEC(4)..VEC(8) the saved head and tail vectors.  */
472	#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)
473	L(large_forward):
474	/ Don't use non-temporal store if there is overlap between*
475	destination and source since destination may be in cache
476	when source is loaded. /*
/* %r10 = end of the remaining destination.  If the source starts below
   it, the buffers overlap and the regular loop is used instead.  */
477	leaq (%rdi, %rdx), %r10
478	cmpq %r10, %rsi
479	jb L(loop_4x_vec_forward)
480	L(loop_large_forward):
481	/ Copy 4 * VEC a time forward with non-temporal stores. /
/* Prefetch the source regions that lie two and three iterations
   ahead of the current one.  */
482	PREFETCH_ONE_SET (`1`, (%rsi), PREFETCHED_LOAD_SIZE * `2`)
483	PREFETCH_ONE_SET (`1`, (%rsi), PREFETCHED_LOAD_SIZE * `3`)
484	VMOVU (%rsi), %VEC(`0`)
485	VMOVU VEC_SIZE(%rsi), %VEC(`1`)
486	VMOVU (VEC_SIZE * `2`)(%rsi), %VEC(`2`)
487	VMOVU (VEC_SIZE * `3`)(%rsi), %VEC(`3`)
488	addq $PREFETCHED_LOAD_SIZE, %rsi
489	subq $PREFETCHED_LOAD_SIZE, %rdx
490	VMOVNT %VEC(`0`), (%rdi)
491	VMOVNT %VEC(`1`), VEC_SIZE(%rdi)
492	VMOVNT %VEC(`2`), (VEC_SIZE * `2`)(%rdi)
493	VMOVNT %VEC(`3`), (VEC_SIZE * `3`)(%rdi)
494	addq $PREFETCHED_LOAD_SIZE, %rdi
495	cmpq $PREFETCHED_LOAD_SIZE, %rdx
496	ja L(loop_large_forward)
/* Fence after the non-temporal stores and before the ordinary stores
   of the saved tail and head vectors.  */
497	sfence
498	/ Store the last 4 * VEC. /
499	VMOVU %VEC(`5`), (%rcx)
500	VMOVU %VEC(`6`), -VEC_SIZE(%rcx)
501	VMOVU %VEC(`7`), -(VEC_SIZE * `2`)(%rcx)
502	VMOVU %VEC(`8`), -(VEC_SIZE * `3`)(%rcx)
503	/ Store the first VEC. /
504	VMOVU %VEC(`4`), (%r11)
505	VZEROUPPER
506	ret
507
/* Backward copy with non-temporal stores, entered from
   L(more_8x_vec_backward) when the remaining length exceeds
   __x86_shared_non_temporal_threshold.  Registers are as set up there:
   %rcx / %r9 running source / aligned destination (both walking down),
   %rdx bytes remaining, %rdi and %r11 the head and tail store
   addresses, VEC(4)..VEC(8) the saved head and tail vectors.  */
508	L(large_backward):
509	/ Don't use non-temporal store if there is overlap between*
510	destination and source since destination may be in cache
511	when source is loaded. /*
/* Overlap test: falls back to the regular backward loop when
   %r9 < %rcx + %rdx (unsigned).  */
512	leaq (%rcx, %rdx), %r10
513	cmpq %r10, %r9
514	jb L(loop_4x_vec_backward)
515	L(loop_large_backward):
516	/ Copy 4 * VEC a time backward with non-temporal stores. /
/* Prefetch the source regions at two and three PREFETCHED_LOAD_SIZE
   below %rcx, stepping towards lower addresses (dir = -1).  */
517	PREFETCH_ONE_SET (-`1`, (%rcx), -PREFETCHED_LOAD_SIZE * `2`)
518	PREFETCH_ONE_SET (-`1`, (%rcx), -PREFETCHED_LOAD_SIZE * `3`)
519	VMOVU (%rcx), %VEC(`0`)
520	VMOVU -VEC_SIZE(%rcx), %VEC(`1`)
521	VMOVU -(VEC_SIZE * `2`)(%rcx), %VEC(`2`)
522	VMOVU -(VEC_SIZE * `3`)(%rcx), %VEC(`3`)
523	subq $PREFETCHED_LOAD_SIZE, %rcx
524	subq $PREFETCHED_LOAD_SIZE, %rdx
525	VMOVNT %VEC(`0`), (%r9)
526	VMOVNT %VEC(`1`), -VEC_SIZE(%r9)
527	VMOVNT %VEC(`2`), -(VEC_SIZE * `2`)(%r9)
528	VMOVNT %VEC(`3`), -(VEC_SIZE * `3`)(%r9)
529	subq $PREFETCHED_LOAD_SIZE, %r9
530	cmpq $PREFETCHED_LOAD_SIZE, %rdx
531	ja L(loop_large_backward)
/* Fence after the non-temporal stores and before the ordinary stores
   of the saved head and tail vectors.  */
532	sfence
533	/ Store the first 4 * VEC. /
534	VMOVU %VEC(`4`), (%rdi)
535	VMOVU %VEC(`5`), VEC_SIZE(%rdi)
536	VMOVU %VEC(`6`), (VEC_SIZE * `2`)(%rdi)
537	VMOVU %VEC(`7`), (VEC_SIZE * `3`)(%rdi)
538	/ Store the last VEC. /
539	VMOVU %VEC(`8`), (%r11)
540	VZEROUPPER
541	ret
542	#endif
/* End of the function containing L(less_vec) through the large-copy
   loops.  */
543	END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
544
/* memcpy and __memcpy_chk symbols are aliases of the corresponding
   memmove implementations.  The unaligned_erms aliases exist only in
   the multiarch libc build, the _chk aliases only when SHARED is
   defined, and the plain unaligned memcpy alias is always emitted.  */
545	#if IS_IN (libc)
546	# ifdef USE_MULTIARCH
547	strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
548	MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
549	# ifdef SHARED
550	strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
551	MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
552	# endif
553	# endif
554	# ifdef SHARED
555	strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
556	MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
557	# endif
558	#endif
559	strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
560	MEMCPY_SYMBOL (__memcpy, unaligned))
561

Browse the source code of glibc/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S