1 | /* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. |
2 | Copyright (C) 2016-2018 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | #if IS_IN (libc) |
22 | |
23 | # include "asm-syntax.h" |
24 | |
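/* memmove is implemented with three strategies, selected by size:
   1.  At most 512 bytes: load the head and the tail of the buffer into
       registers before storing either, which handles overlapping source
       and destination in both directions without extra branches.
   2.  512 bytes up to half the shared cache size: unrolled 512-byte
       load/store loops, run forwards or backwards depending on how the
       buffers overlap.
   3.  Half the shared cache size and up: bypass the cache with
       non-temporal (streaming) stores.  */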
	.section .text.avx512,"ax",@progbits
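/* The *_chk entries take the destination buffer size in %rcx, abort via
   __chk_fail if it is smaller than the copy length in %rdx, and
   otherwise fall through to the unchecked entry below.  */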
26 | ENTRY (__mempcpy_chk_avx512_no_vzeroupper) |
27 | cmpq %rdx, %rcx |
28 | jb HIDDEN_JUMPTARGET (__chk_fail) |
29 | END (__mempcpy_chk_avx512_no_vzeroupper) |
30 | |
31 | ENTRY (__mempcpy_avx512_no_vzeroupper) |
32 | movq %rdi, %rax |
33 | addq %rdx, %rax |
34 | jmp L(start) |
35 | END (__mempcpy_avx512_no_vzeroupper) |
36 | |
37 | ENTRY (__memmove_chk_avx512_no_vzeroupper) |
38 | cmpq %rdx, %rcx |
39 | jb HIDDEN_JUMPTARGET (__chk_fail) |
40 | END (__memmove_chk_avx512_no_vzeroupper) |
41 | |
42 | ENTRY (__memmove_avx512_no_vzeroupper) |
43 | mov %rdi, %rax |
44 | # ifdef USE_AS_MEMPCPY |
45 | add %rdx, %rax |
46 | # endif |
47 | L(start): |
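	/* %rcx = one past the end of the source, %r9 = one past the end
	   of the destination; the tail loads and stores below index
	   backwards from them.  */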
48 | lea (%rsi, %rdx), %rcx |
49 | lea (%rdi, %rdx), %r9 |
50 | cmp $512, %rdx |
51 | ja L(512bytesormore) |
52 | |
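/* At most 512 bytes: load the leading and trailing parts of the buffer
   into registers, then store both.  All loads complete before the first
   store, so any overlap is handled correctly.  */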
53 | L(check): |
54 | cmp $16, %rdx |
55 | jbe L(less_16bytes) |
56 | cmp $256, %rdx |
57 | jb L(less_256bytes) |
58 | vmovups (%rsi), %zmm0 |
59 | vmovups 0x40(%rsi), %zmm1 |
60 | vmovups 0x80(%rsi), %zmm2 |
61 | vmovups 0xC0(%rsi), %zmm3 |
62 | vmovups -0x100(%rcx), %zmm4 |
63 | vmovups -0xC0(%rcx), %zmm5 |
64 | vmovups -0x80(%rcx), %zmm6 |
65 | vmovups -0x40(%rcx), %zmm7 |
66 | vmovups %zmm0, (%rdi) |
67 | vmovups %zmm1, 0x40(%rdi) |
68 | vmovups %zmm2, 0x80(%rdi) |
69 | vmovups %zmm3, 0xC0(%rdi) |
70 | vmovups %zmm4, -0x100(%r9) |
71 | vmovups %zmm5, -0xC0(%r9) |
72 | vmovups %zmm6, -0x80(%r9) |
73 | vmovups %zmm7, -0x40(%r9) |
74 | ret |
75 | |
76 | L(less_256bytes): |
77 | cmp $128, %dl |
78 | jb L(less_128bytes) |
79 | vmovups (%rsi), %zmm0 |
80 | vmovups 0x40(%rsi), %zmm1 |
81 | vmovups -0x80(%rcx), %zmm2 |
82 | vmovups -0x40(%rcx), %zmm3 |
83 | vmovups %zmm0, (%rdi) |
84 | vmovups %zmm1, 0x40(%rdi) |
85 | vmovups %zmm2, -0x80(%r9) |
86 | vmovups %zmm3, -0x40(%r9) |
87 | ret |
88 | |
89 | L(less_128bytes): |
90 | cmp $64, %dl |
91 | jb L(less_64bytes) |
92 | vmovdqu (%rsi), %ymm0 |
93 | vmovdqu 0x20(%rsi), %ymm1 |
94 | vmovdqu -0x40(%rcx), %ymm2 |
95 | vmovdqu -0x20(%rcx), %ymm3 |
96 | vmovdqu %ymm0, (%rdi) |
97 | vmovdqu %ymm1, 0x20(%rdi) |
98 | vmovdqu %ymm2, -0x40(%r9) |
99 | vmovdqu %ymm3, -0x20(%r9) |
100 | ret |
101 | |
102 | L(less_64bytes): |
103 | cmp $32, %dl |
104 | jb L(less_32bytes) |
105 | vmovdqu (%rsi), %ymm0 |
106 | vmovdqu -0x20(%rcx), %ymm1 |
107 | vmovdqu %ymm0, (%rdi) |
108 | vmovdqu %ymm1, -0x20(%r9) |
109 | ret |
110 | |
111 | L(less_32bytes): |
112 | vmovdqu (%rsi), %xmm0 |
113 | vmovdqu -0x10(%rcx), %xmm1 |
114 | vmovdqu %xmm0, (%rdi) |
115 | vmovdqu %xmm1, -0x10(%r9) |
116 | ret |
117 | |
118 | L(less_16bytes): |
119 | cmp $8, %dl |
120 | jb L(less_8bytes) |
121 | movq (%rsi), %rsi |
122 | movq -0x8(%rcx), %rcx |
123 | movq %rsi, (%rdi) |
124 | movq %rcx, -0x8(%r9) |
125 | ret |
126 | |
127 | L(less_8bytes): |
128 | cmp $4, %dl |
129 | jb L(less_4bytes) |
130 | mov (%rsi), %esi |
131 | mov -0x4(%rcx), %ecx |
132 | mov %esi, (%rdi) |
133 | mov %ecx, -0x4(%r9) |
134 | ret |
135 | |
136 | L(less_4bytes): |
137 | cmp $2, %dl |
138 | jb L(less_2bytes) |
139 | mov (%rsi), %si |
140 | mov -0x2(%rcx), %cx |
141 | mov %si, (%rdi) |
142 | mov %cx, -0x2(%r9) |
143 | ret |
144 | |
145 | L(less_2bytes): |
146 | cmp $1, %dl |
147 | jb L(less_1bytes) |
148 | mov (%rsi), %cl |
149 | mov %cl, (%rdi) |
150 | L(less_1bytes): |
151 | ret |
152 | |
153 | L(512bytesormore): |
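	/* Copies of at least half the shared cache size go through the
	   non-temporal path to avoid evicting useful cache contents.  */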
154 | # ifdef SHARED_CACHE_SIZE_HALF |
155 | mov $SHARED_CACHE_SIZE_HALF, %r8 |
156 | # else |
157 | mov __x86_shared_cache_size_half(%rip), %r8 |
158 | # endif |
159 | cmp %r8, %rdx |
160 | jae L(preloop_large) |
161 | cmp $1024, %rdx |
162 | ja L(1024bytesormore) |
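	/* 512 < size <= 1024 bytes: prefetch both ends, then copy the
	   first 512 and the last 512 bytes through registers; the two
	   halves may overlap in the middle.  */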
163 | prefetcht1 (%rsi) |
164 | prefetcht1 0x40(%rsi) |
165 | prefetcht1 0x80(%rsi) |
166 | prefetcht1 0xC0(%rsi) |
167 | prefetcht1 0x100(%rsi) |
168 | prefetcht1 0x140(%rsi) |
169 | prefetcht1 0x180(%rsi) |
170 | prefetcht1 0x1C0(%rsi) |
171 | prefetcht1 -0x200(%rcx) |
172 | prefetcht1 -0x1C0(%rcx) |
173 | prefetcht1 -0x180(%rcx) |
174 | prefetcht1 -0x140(%rcx) |
175 | prefetcht1 -0x100(%rcx) |
176 | prefetcht1 -0xC0(%rcx) |
177 | prefetcht1 -0x80(%rcx) |
178 | prefetcht1 -0x40(%rcx) |
179 | vmovups (%rsi), %zmm0 |
180 | vmovups 0x40(%rsi), %zmm1 |
181 | vmovups 0x80(%rsi), %zmm2 |
182 | vmovups 0xC0(%rsi), %zmm3 |
183 | vmovups 0x100(%rsi), %zmm4 |
184 | vmovups 0x140(%rsi), %zmm5 |
185 | vmovups 0x180(%rsi), %zmm6 |
186 | vmovups 0x1C0(%rsi), %zmm7 |
187 | vmovups -0x200(%rcx), %zmm8 |
188 | vmovups -0x1C0(%rcx), %zmm9 |
189 | vmovups -0x180(%rcx), %zmm10 |
190 | vmovups -0x140(%rcx), %zmm11 |
191 | vmovups -0x100(%rcx), %zmm12 |
192 | vmovups -0xC0(%rcx), %zmm13 |
193 | vmovups -0x80(%rcx), %zmm14 |
194 | vmovups -0x40(%rcx), %zmm15 |
195 | vmovups %zmm0, (%rdi) |
196 | vmovups %zmm1, 0x40(%rdi) |
197 | vmovups %zmm2, 0x80(%rdi) |
198 | vmovups %zmm3, 0xC0(%rdi) |
199 | vmovups %zmm4, 0x100(%rdi) |
200 | vmovups %zmm5, 0x140(%rdi) |
201 | vmovups %zmm6, 0x180(%rdi) |
202 | vmovups %zmm7, 0x1C0(%rdi) |
203 | vmovups %zmm8, -0x200(%r9) |
204 | vmovups %zmm9, -0x1C0(%r9) |
205 | vmovups %zmm10, -0x180(%r9) |
206 | vmovups %zmm11, -0x140(%r9) |
207 | vmovups %zmm12, -0x100(%r9) |
208 | vmovups %zmm13, -0xC0(%r9) |
209 | vmovups %zmm14, -0x80(%r9) |
210 | vmovups %zmm15, -0x40(%r9) |
211 | ret |
212 | |
213 | L(1024bytesormore): |
214 | cmp %rsi, %rdi |
215 | ja L(1024bytesormore_bkw) |
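	/* Forward copy (destination at or below the source).  Preload the
	   last 512 bytes so the possibly ragged tail can be stored after
	   the loop, and stop the loop 512 bytes before the end.  */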
216 | sub $512, %r9 |
217 | vmovups -0x200(%rcx), %zmm8 |
218 | vmovups -0x1C0(%rcx), %zmm9 |
219 | vmovups -0x180(%rcx), %zmm10 |
220 | vmovups -0x140(%rcx), %zmm11 |
221 | vmovups -0x100(%rcx), %zmm12 |
222 | vmovups -0xC0(%rcx), %zmm13 |
223 | vmovups -0x80(%rcx), %zmm14 |
224 | vmovups -0x40(%rcx), %zmm15 |
225 | prefetcht1 (%rsi) |
226 | prefetcht1 0x40(%rsi) |
227 | prefetcht1 0x80(%rsi) |
228 | prefetcht1 0xC0(%rsi) |
229 | prefetcht1 0x100(%rsi) |
230 | prefetcht1 0x140(%rsi) |
231 | prefetcht1 0x180(%rsi) |
232 | prefetcht1 0x1C0(%rsi) |
233 | |
234 | /* Loop with unaligned memory access. */ |
235 | L(gobble_512bytes_loop): |
236 | vmovups (%rsi), %zmm0 |
237 | vmovups 0x40(%rsi), %zmm1 |
238 | vmovups 0x80(%rsi), %zmm2 |
239 | vmovups 0xC0(%rsi), %zmm3 |
240 | vmovups 0x100(%rsi), %zmm4 |
241 | vmovups 0x140(%rsi), %zmm5 |
242 | vmovups 0x180(%rsi), %zmm6 |
243 | vmovups 0x1C0(%rsi), %zmm7 |
244 | add $512, %rsi |
245 | prefetcht1 (%rsi) |
246 | prefetcht1 0x40(%rsi) |
247 | prefetcht1 0x80(%rsi) |
248 | prefetcht1 0xC0(%rsi) |
249 | prefetcht1 0x100(%rsi) |
250 | prefetcht1 0x140(%rsi) |
251 | prefetcht1 0x180(%rsi) |
252 | prefetcht1 0x1C0(%rsi) |
253 | vmovups %zmm0, (%rdi) |
254 | vmovups %zmm1, 0x40(%rdi) |
255 | vmovups %zmm2, 0x80(%rdi) |
256 | vmovups %zmm3, 0xC0(%rdi) |
257 | vmovups %zmm4, 0x100(%rdi) |
258 | vmovups %zmm5, 0x140(%rdi) |
259 | vmovups %zmm6, 0x180(%rdi) |
260 | vmovups %zmm7, 0x1C0(%rdi) |
261 | add $512, %rdi |
262 | cmp %r9, %rdi |
263 | jb L(gobble_512bytes_loop) |
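	/* Store the preloaded tail.  */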
264 | vmovups %zmm8, (%r9) |
265 | vmovups %zmm9, 0x40(%r9) |
266 | vmovups %zmm10, 0x80(%r9) |
267 | vmovups %zmm11, 0xC0(%r9) |
268 | vmovups %zmm12, 0x100(%r9) |
269 | vmovups %zmm13, 0x140(%r9) |
270 | vmovups %zmm14, 0x180(%r9) |
271 | vmovups %zmm15, 0x1C0(%r9) |
272 | ret |
273 | |
274 | L(1024bytesormore_bkw): |
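	/* Backward copy (destination above the source): walk down from
	   the end so source bytes are read before they can be
	   overwritten.  Preload the first 512 bytes and stop the loop
	   512 bytes past the start; the head is stored after the loop.  */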
275 | add $512, %rdi |
276 | vmovups 0x1C0(%rsi), %zmm8 |
277 | vmovups 0x180(%rsi), %zmm9 |
278 | vmovups 0x140(%rsi), %zmm10 |
279 | vmovups 0x100(%rsi), %zmm11 |
280 | vmovups 0xC0(%rsi), %zmm12 |
281 | vmovups 0x80(%rsi), %zmm13 |
282 | vmovups 0x40(%rsi), %zmm14 |
283 | vmovups (%rsi), %zmm15 |
284 | prefetcht1 -0x40(%rcx) |
285 | prefetcht1 -0x80(%rcx) |
286 | prefetcht1 -0xC0(%rcx) |
287 | prefetcht1 -0x100(%rcx) |
288 | prefetcht1 -0x140(%rcx) |
289 | prefetcht1 -0x180(%rcx) |
290 | prefetcht1 -0x1C0(%rcx) |
291 | prefetcht1 -0x200(%rcx) |
292 | |
293 | /* Backward loop with unaligned memory access. */ |
294 | L(gobble_512bytes_loop_bkw): |
295 | vmovups -0x40(%rcx), %zmm0 |
296 | vmovups -0x80(%rcx), %zmm1 |
297 | vmovups -0xC0(%rcx), %zmm2 |
298 | vmovups -0x100(%rcx), %zmm3 |
299 | vmovups -0x140(%rcx), %zmm4 |
300 | vmovups -0x180(%rcx), %zmm5 |
301 | vmovups -0x1C0(%rcx), %zmm6 |
302 | vmovups -0x200(%rcx), %zmm7 |
303 | sub $512, %rcx |
304 | prefetcht1 -0x40(%rcx) |
305 | prefetcht1 -0x80(%rcx) |
306 | prefetcht1 -0xC0(%rcx) |
307 | prefetcht1 -0x100(%rcx) |
308 | prefetcht1 -0x140(%rcx) |
309 | prefetcht1 -0x180(%rcx) |
310 | prefetcht1 -0x1C0(%rcx) |
311 | prefetcht1 -0x200(%rcx) |
312 | vmovups %zmm0, -0x40(%r9) |
313 | vmovups %zmm1, -0x80(%r9) |
314 | vmovups %zmm2, -0xC0(%r9) |
315 | vmovups %zmm3, -0x100(%r9) |
316 | vmovups %zmm4, -0x140(%r9) |
317 | vmovups %zmm5, -0x180(%r9) |
318 | vmovups %zmm6, -0x1C0(%r9) |
319 | vmovups %zmm7, -0x200(%r9) |
320 | sub $512, %r9 |
321 | cmp %rdi, %r9 |
322 | ja L(gobble_512bytes_loop_bkw) |
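	/* Store the preloaded head.  */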
323 | vmovups %zmm8, -0x40(%rdi) |
324 | vmovups %zmm9, -0x80(%rdi) |
325 | vmovups %zmm10, -0xC0(%rdi) |
326 | vmovups %zmm11, -0x100(%rdi) |
327 | vmovups %zmm12, -0x140(%rdi) |
328 | vmovups %zmm13, -0x180(%rdi) |
329 | vmovups %zmm14, -0x1C0(%rdi) |
330 | vmovups %zmm15, -0x200(%rdi) |
331 | ret |
332 | |
333 | L(preloop_large): |
334 | cmp %rsi, %rdi |
335 | ja L(preloop_large_bkw) |
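	/* Forward copy with non-temporal stores.  Preload the first 0x80
	   bytes: aligning the destination below can move %rdi past them,
	   and they are stored once the loop is done.  */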
336 | vmovups (%rsi), %zmm4 |
337 | vmovups 0x40(%rsi), %zmm5 |
338 | |
	/* Save the original destination for the final header store.
	   %rax cannot be used: for mempcpy it already holds the return
	   value, destination + length.  */
	mov	%rdi, %r11

	/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
341 | and $-0x80, %rdi |
342 | add $0x80, %rdi |
343 | sub %rdi, %r8 |
344 | sub %r8, %rsi |
345 | add %r8, %rdx |
346 | L(gobble_256bytes_nt_loop): |
347 | prefetcht1 0x200(%rsi) |
348 | prefetcht1 0x240(%rsi) |
349 | prefetcht1 0x280(%rsi) |
350 | prefetcht1 0x2C0(%rsi) |
351 | prefetcht1 0x300(%rsi) |
352 | prefetcht1 0x340(%rsi) |
353 | prefetcht1 0x380(%rsi) |
354 | prefetcht1 0x3C0(%rsi) |
355 | vmovdqu64 (%rsi), %zmm0 |
356 | vmovdqu64 0x40(%rsi), %zmm1 |
357 | vmovdqu64 0x80(%rsi), %zmm2 |
358 | vmovdqu64 0xC0(%rsi), %zmm3 |
359 | vmovntdq %zmm0, (%rdi) |
360 | vmovntdq %zmm1, 0x40(%rdi) |
361 | vmovntdq %zmm2, 0x80(%rdi) |
362 | vmovntdq %zmm3, 0xC0(%rdi) |
363 | sub $256, %rdx |
364 | add $256, %rsi |
365 | add $256, %rdi |
366 | cmp $256, %rdx |
367 | ja L(gobble_256bytes_nt_loop) |
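	/* Order the non-temporal stores before the ordinary stores that
	   follow.  */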
368 | sfence |
	/* Store the 0x80 preloaded bytes at the saved original
	   destination, not at %rax, which is wrong for mempcpy.  */
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
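	/* At most 256 bytes remain; finish via L(check).  */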
371 | jmp L(check) |
372 | |
373 | L(preloop_large_bkw): |
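	/* Backward copy with non-temporal stores.  Preload the last 0x80
	   bytes: aligning the end of the destination below can move %r9
	   below them, and they are stored once the loop is done.  */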
374 | vmovups -0x80(%rcx), %zmm4 |
375 | vmovups -0x40(%rcx), %zmm5 |
376 | |
377 | /* Align end of destination for access with non-temporal stores. */ |
378 | mov %r9, %r8 |
379 | and $-0x80, %r9 |
380 | sub %r9, %r8 |
381 | sub %r8, %rcx |
382 | sub %r8, %rdx |
383 | add %r9, %r8 |
384 | L(gobble_256bytes_nt_loop_bkw): |
385 | prefetcht1 -0x400(%rcx) |
386 | prefetcht1 -0x3C0(%rcx) |
387 | prefetcht1 -0x380(%rcx) |
388 | prefetcht1 -0x340(%rcx) |
389 | prefetcht1 -0x300(%rcx) |
390 | prefetcht1 -0x2C0(%rcx) |
391 | prefetcht1 -0x280(%rcx) |
392 | prefetcht1 -0x240(%rcx) |
393 | vmovdqu64 -0x100(%rcx), %zmm0 |
394 | vmovdqu64 -0xC0(%rcx), %zmm1 |
395 | vmovdqu64 -0x80(%rcx), %zmm2 |
396 | vmovdqu64 -0x40(%rcx), %zmm3 |
397 | vmovntdq %zmm0, -0x100(%r9) |
398 | vmovntdq %zmm1, -0xC0(%r9) |
399 | vmovntdq %zmm2, -0x80(%r9) |
400 | vmovntdq %zmm3, -0x40(%r9) |
401 | sub $256, %rdx |
402 | sub $256, %rcx |
403 | sub $256, %r9 |
404 | cmp $256, %rdx |
405 | ja L(gobble_256bytes_nt_loop_bkw) |
406 | sfence |
407 | vmovups %zmm4, -0x80(%r8) |
408 | vmovups %zmm5, -0x40(%r8) |
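	/* Finish the remaining bytes (at most 256) via L(check).  */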
409 | jmp L(check) |
410 | END (__memmove_avx512_no_vzeroupper) |
411 | |
412 | strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) |
413 | strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) |
414 | #endif |
415 | |