/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

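/* mempcpy returns DST + N, so compute the return value up front and
   fall into the shared copy path.  */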
ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

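/* Sizes of at most 512 bytes (also reached from the large-copy paths
   with at most 256 bytes left) copy the head of the region forward
   and its tail backward through the end pointers in %rcx/%r9, issuing
   every load before any store so overlapping buffers remain correct.
   E.g. for a 300-byte copy the first four ZMM loads cover bytes
   0..255 and the last four cover bytes 44..299; the overlapping
   middle is simply written twice with the same data.  */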
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

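/* 1..16 bytes: the same head/tail scheme using general-purpose
   registers of 8/4/2/1 bytes.  */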
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

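/* More than 512 bytes: copies of at least half the shared cache size
   take the non-temporal path; sizes up to 1024 bytes reuse the
   head/tail trick with eight ZMM registers per end, after prefetching
   both ends into L2.  */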
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

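/* Cacheable copies above 1024 bytes: pick the copy direction by
   comparing DST with SRC so overlap is handled, preload the final 512
   bytes into %zmm8-%zmm15, and store them after the loop; the loop
   itself then runs in whole 512-byte steps with no tail handling.  */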
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

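/* DST > SRC: copy backwards from the end pointers so an overlapping
   source is read before it is overwritten, with the first 512 bytes
   preloaded into %zmm8-%zmm15 and stored once the loop is done.  */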
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

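/* Copies of at least half the shared cache size use non-temporal
   stores so they do not evict the current working set.  vmovntdq
   needs an aligned destination, hence the alignment preamble below;
   the sfence after each loop orders the weakly-ordered NT stores
   ahead of the ordinary trailing stores and the return.  */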
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
	/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
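	/* %rdi is now the next 128-byte boundary above the original DST,
	   so %r8 = DST - %rdi is in [-128, -1]; the skipped head bytes
	   are covered by %zmm4/%zmm5 saved above and stored through %r11
	   after the loop.  */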
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

	/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
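	/* %r9 is now DST_END rounded down to a 128-byte boundary and %r8
	   again holds the original DST_END; the up-to-127 trailing bytes
	   trimmed off here are covered by %zmm4/%zmm5 saved above and
	   stored through %r8 after the loop.  */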
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

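/* memcpy is served by the memmove body: the direction check it
   already performs is harmless for non-overlapping arguments.  */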
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif