/* memcpy with AVX
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"
#ifndef MEMCPY
# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
#endif

	.section .text.avx,"ax",@progbits
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif

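/* memcpy/mempcpy/memmove entry point.  The return value (the
   destination, or destination + length for mempcpy) is set up in %rax
   first.  Sizes below 256 bytes are dispatched by size class and
   copied with unaligned loads and stores of the head and the tail of
   the buffer, which may overlap, so no copy loop is needed.  Sizes of
   256 bytes and up fall through to L(256bytesormore).  */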
ENTRY (MEMCPY)
	mov	%rdi, %rax
#ifdef USE_AS_MEMPCPY
	add	%rdx, %rax
#endif
	cmp	$256, %rdx
	jae	L(256bytesormore)
	cmp	$16, %dl
	jb	L(less_16bytes)
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	0x40(%rsi), %xmm4
	vmovdqu	0x50(%rsi), %xmm5
	vmovdqu	0x60(%rsi), %xmm6
	vmovdqu	0x70(%rsi), %xmm7
	vmovdqu	-0x80(%rcx), %xmm8
	vmovdqu	-0x70(%rcx), %xmm9
	vmovdqu	-0x60(%rcx), %xmm10
	vmovdqu	-0x50(%rcx), %xmm11
	vmovdqu	-0x40(%rcx), %xmm12
	vmovdqu	-0x30(%rcx), %xmm13
	vmovdqu	-0x20(%rcx), %xmm14
	vmovdqu	-0x10(%rcx), %xmm15
	lea	(%rdi, %rdx), %rdx
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, 0x40(%rdi)
	vmovdqu	%xmm5, 0x50(%rdi)
	vmovdqu	%xmm6, 0x60(%rdi)
	vmovdqu	%xmm7, 0x70(%rdi)
	vmovdqu	%xmm8, -0x80(%rdx)
	vmovdqu	%xmm9, -0x70(%rdx)
	vmovdqu	%xmm10, -0x60(%rdx)
	vmovdqu	%xmm11, -0x50(%rdx)
	vmovdqu	%xmm12, -0x40(%rdx)
	vmovdqu	%xmm13, -0x30(%rdx)
	vmovdqu	%xmm14, -0x20(%rdx)
	vmovdqu	%xmm15, -0x10(%rdx)
	ret
	.p2align 4
L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %xmm0
	lea	(%rsi, %rdx), %rcx
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	0x20(%rsi), %xmm2
	lea	(%rdi, %rdx), %rdx
	vmovdqu	0x30(%rsi), %xmm3
	vmovdqu	-0x40(%rcx), %xmm4
	vmovdqu	-0x30(%rcx), %xmm5
	vmovdqu	-0x20(%rcx), %xmm6
	vmovdqu	-0x10(%rcx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm2, 0x20(%rdi)
	vmovdqu	%xmm3, 0x30(%rdi)
	vmovdqu	%xmm4, -0x40(%rdx)
	vmovdqu	%xmm5, -0x30(%rdx)
	vmovdqu	%xmm6, -0x20(%rdx)
	vmovdqu	%xmm7, -0x10(%rdx)
	ret

	.p2align 4
L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %xmm0
	vmovdqu	0x10(%rsi), %xmm1
	vmovdqu	-0x20(%rsi, %rdx), %xmm6
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, 0x10(%rdi)
	vmovdqu	%xmm6, -0x20(%rdi, %rdx)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

	.p2align 4
L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rsi, %rdx), %xmm7
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm7, -0x10(%rdi, %rdx)
	ret

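/* Copies of 1 to 15 bytes: move the first and last 8-, 4-, 2- or
   1-byte chunks through general-purpose registers; the two stores may
   overlap, so the whole range is covered without a loop.  */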
	.p2align 4
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	-0x08(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -0x08(%rdi, %rdx)
	ret

	.p2align 4
L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	-0x04(%rsi, %rdx), %ecx
	mov	(%rsi), %esi
	mov	%esi, (%rdi)
	mov	%ecx, -0x04(%rdi, %rdx)
	ret

L(less_4bytes):
	cmp	$1, %dl
	jbe	L(less_2bytes)
	mov	-0x02(%rsi, %rdx), %cx
	mov	(%rsi), %si
	mov	%si, (%rdi)
	mov	%cx, -0x02(%rdi, %rdx)
	ret

L(less_2bytes):
	jb	L(less_0bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_0bytes):
	ret

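/* Copies of 256 bytes or more.  For memmove, switch to a backward copy
   when the destination lies within the source region (dst - src < len,
   unsigned).  Copies of 2048 bytes or more branch to
   L(gobble_data_movsb); otherwise save the first 32 and the last 128
   bytes, advance the destination to the next 32-byte boundary, and
   copy 128 bytes per iteration with aligned stores, writing the saved
   head and tail out last.  */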
	.p2align 4
L(256bytesormore):
#ifdef USE_AS_MEMMOVE
	mov	%rdi, %rcx
	sub	%rsi, %rcx
	cmp	%rdx, %rcx
	jc	L(copy_backward)
#endif
	cmp	$2048, %rdx
	jae	L(gobble_data_movsb)
	mov	%rax, %r8
	lea	(%rsi, %rdx), %rcx
	mov	%rdi, %r10
	vmovdqu	-0x80(%rcx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	mov	$0x80, %rax
	and	$-32, %rdi
	add	$32, %rdi
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	mov	%rdi, %r11
	sub	%r10, %r11
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	sub	%r11, %rdx
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	vmovdqu	(%rsi), %ymm4
	add	%r11, %rsi
	sub	%eax, %edx
L(gobble_128_loop):
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	add	%rax, %rsi
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm1, 0x20(%rdi)
	vmovdqa	%ymm2, 0x40(%rdi)
	vmovdqa	%ymm3, 0x60(%rdi)
	add	%rax, %rdi
	sub	%eax, %edx
	jae	L(gobble_128_loop)
	add	%eax, %edx
	add	%rdi, %rdx
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rdx)
	vmovdqu	%xmm6, -0x70(%rdx)
	vmovdqu	%xmm7, -0x60(%rdx)
	vmovdqu	%xmm8, -0x50(%rdx)
	vmovdqu	%xmm9, -0x40(%rdx)
	vmovdqu	%xmm10, -0x30(%rdx)
	vmovdqu	%xmm11, -0x20(%rdx)
	vmovdqu	%xmm12, -0x10(%rdx)
	mov	%r8, %rax
	ret

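/* Copies of at least 2048 bytes: use rep movsb while the size is below
   8 * __x86_shared_cache_size_half (SHARED_CACHE_SIZE_HALF when that
   is a compile-time constant); larger copies take the non-temporal
   path below.  */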
	.p2align 4
L(gobble_data_movsb):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	cmp	%rcx, %rdx
	jae	L(gobble_big_data_fwd)
	mov	%rdx, %rcx
	rep	movsb
	ret

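/* Non-temporal forward copy for very large sizes: same head/tail
   handling as the 128-byte loop above, but the main loop prefetches
   ahead with prefetchnta and stores with vmovntdq, followed by an
   sfence before the saved head and tail are written back.  */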
	.p2align 4
L(gobble_big_data_fwd):
	lea	(%rsi, %rdx), %rcx
	vmovdqu	(%rsi), %ymm4
	vmovdqu	-0x80(%rsi, %rdx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	mov	%rdi, %r8
	and	$-32, %rdi
	add	$32, %rdi
	mov	%rdi, %r10
	sub	%r8, %r10
	sub	%r10, %rdx
	add	%r10, %rsi
	lea	(%rdi, %rdx), %rcx
	add	$-0x80, %rdx
L(gobble_mem_fwd_loop):
	prefetchnta 0x1c0(%rsi)
	prefetchnta 0x280(%rsi)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	sub	$-0x80, %rsi
	vmovntdq	%ymm0, (%rdi)
	vmovntdq	%ymm1, 0x20(%rdi)
	vmovntdq	%ymm2, 0x40(%rdi)
	vmovntdq	%ymm3, 0x60(%rdi)
	sub	$-0x80, %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_fwd_loop)
	sfence
	vmovdqu	%ymm4, (%r8)
	vzeroupper
	vmovdqu	%xmm5, -0x80(%rcx)
	vmovdqu	%xmm6, -0x70(%rcx)
	vmovdqu	%xmm7, -0x60(%rcx)
	vmovdqu	%xmm8, -0x50(%rcx)
	vmovdqu	%xmm9, -0x40(%rcx)
	vmovdqu	%xmm10, -0x30(%rcx)
	vmovdqu	%xmm11, -0x20(%rcx)
	vmovdqu	%xmm12, -0x10(%rcx)
	ret

#ifdef USE_AS_MEMMOVE
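/* Backward copy for overlapping memmove: save the first 128 bytes and
   the last 32 bytes, round the destination end down to a 32-byte
   boundary, then copy 128 bytes per iteration from the end towards the
   start.  Sizes above 8 * the shared-cache-half value use the
   non-temporal variant at L(gobble_big_data_bwd).  */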
	.p2align 4
L(copy_backward):
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
#else
	mov	__x86_shared_cache_size_half(%rip), %rcx
#endif
	shl	$3, %rcx
	vmovdqu	(%rsi), %xmm5
	vmovdqu	0x10(%rsi), %xmm6
	add	%rdx, %rdi
	vmovdqu	0x20(%rsi), %xmm7
	vmovdqu	0x30(%rsi), %xmm8
	lea	-0x20(%rdi), %r10
	mov	%rdi, %r11
	vmovdqu	0x40(%rsi), %xmm9
	vmovdqu	0x50(%rsi), %xmm10
	and	$0x1f, %r11
	vmovdqu	0x60(%rsi), %xmm11
	vmovdqu	0x70(%rsi), %xmm12
	xor	%r11, %rdi
	add	%rdx, %rsi
	vmovdqu	-0x20(%rsi), %ymm4
	sub	%r11, %rsi
	sub	%r11, %rdx
	cmp	%rcx, %rdx
	ja	L(gobble_big_data_bwd)
	add	$-0x80, %rdx
L(gobble_mem_bwd_llc):
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovdqa	%ymm0, -0x20(%rdi)
	vmovdqa	%ymm1, -0x40(%rdi)
	vmovdqa	%ymm2, -0x60(%rdi)
	vmovdqa	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_llc)
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret

	.p2align 4
L(gobble_big_data_bwd):
	add	$-0x80, %rdx
L(gobble_mem_bwd_loop):
	prefetchnta -0x1c0(%rsi)
	prefetchnta -0x280(%rsi)
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi
	vmovntdq	%ymm0, -0x20(%rdi)
	vmovntdq	%ymm1, -0x40(%rdi)
	vmovntdq	%ymm2, -0x60(%rdi)
	vmovntdq	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi
	add	$-0x80, %rdx
	jb	L(gobble_mem_bwd_loop)
	sfence
	vmovdqu	%ymm4, (%r10)
	vzeroupper
	vmovdqu	%xmm5, (%rax)
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
	ret
#endif
END (MEMCPY)
#endif