/* memset-vec-unaligned-erms.S source code [glibc/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S]  */

/* memset with unaligned store and rep stosb
   Copyright (C) 2016-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */

#include <sysdep.h>

/* Default the names used for the fortified (_chk) entry points to the
   plain symbol-naming macros unless the including file has already
   provided its own.  */
#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

/* VZEROUPPER expands to the vzeroupper instruction only when vectors are
   wider than 16 bytes; for 16-byte vectors it expands to nothing.  The
   including file may predefine it to override this choice.  */
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
# else
#  define VZEROUPPER
# endif
#endif

/* Return sequence used when VZEROUPPER_SHORT_RETURN was not defined above
   or by the including file.  */
#ifndef VZEROUPPER_SHORT_RETURN
# define VZEROUPPER_SHORT_RETURN	rep; ret
#endif

/* MOVQ/MOVD select the v-prefixed mnemonics when VEC_SIZE > 16 and the
   non-prefixed ones otherwise.  */
#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
#  define MOVD				vmovd
# else
#  define MOVQ				movq
#  define MOVD				movd
# endif
#endif

/* Extra displacement applied to every store in L(loop) and
   L(last_4x_vec).  For 64-byte vectors LOOP_REG is not advanced before
   the loop, so the 4 * VEC_SIZE skip is folded into the displacement
   instead.  */
#if VEC_SIZE == 64
# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
#else
# define LOOP_4X_OFFSET	(0)
#endif

/* Register roles.
   END_REG:      end pointer used by L(loop) and L(last_4x_vec).
   LOOP_REG:     running store pointer in L(loop).
   LESS_VEC_REG: destination pointer for the sub-VEC_SIZE stores.  */
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG	rcx
# define LOOP_REG	rdi
# define LESS_VEC_REG	rax
#else
# define END_REG	rdi
# define LOOP_REG	rdx
# define LESS_VEC_REG	rdi
#endif

/* XMM_SMALL is 1 when the small-size paths store straight from the xmm
   register (USE_XMM_LESS_VEC); it only feeds the .p2align padding
   limits of those paths.  */
#ifdef USE_XMM_LESS_VEC
# define XMM_SMALL	1
#else
# define XMM_SMALL	0
#endif

/* General purpose register (and its sub-registers) that receives the low
   8 bytes of the fill pattern for the stores narrower than 16 bytes.  */
#ifdef USE_LESS_VEC_MASK_STORE
# define SET_REG64	rcx
# define SET_REG32	ecx
# define SET_REG16	cx
# define SET_REG8	cl
#else
# define SET_REG64	rsi
# define SET_REG32	esi
# define SET_REG16	si
# define SET_REG8	sil
#endif

/* Page size assumed by the page-cross check in the mask-store path.  */
#define PAGE_SIZE 4096

/* Macro to calculate size of small memset block for aligning
   purposes: two moves of mov_sz bytes each, a return of ret_sz bytes,
   plus one.  The result is used as the maximum padding argument of
   .p2align in front of the L(between_*) targets.  */
#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)


/* The including file must say which section the code goes into.  */
#ifndef SECTION
# error SECTION is not defined!
#endif

/* Section selection followed by the wmemset entry points.
   In:  rdi = destination, esi = 32-bit fill value, rdx = element count
        (rcx = bound checked by the _chk variant).
   Both entries continue in the shared memset code through
   L(less_vec_from_wmemset) or L(entry_from_wmemset).  */
	.section SECTION(.text), "ax", @progbits
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	/* Unsigned check: branch to __chk_fail when rcx < rdx.  There is
	   no return here; on success execution continues with the code
	   that follows.  */
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	/* Multiply the element count by 4 to get a length in bytes.  */
	shl	$2, %RDX_LP
	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
	WMEMSET_VDUP_TO_VEC0_LOW()
	/* Fewer bytes than one vector: use the small-size code.  */
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec_from_wmemset)
	WMEMSET_VDUP_TO_VEC0_HIGH()
	jmp	L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
/* Fortified memset entry (unaligned variant).
   In:  rdx = length, rcx = bound to check it against (presumably the
        destination object size -- confirm against the callers).
   Branches to __chk_fail when rcx < rdx (unsigned).  There is no return
   here; on success execution continues with the code that follows.  */
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

/* memset entry that never uses rep stosb.
   In:  rdi = destination, esi = fill byte, rdx = length in bytes.
   Sizes below VEC_SIZE go to L(less_vec), sizes above 2 * VEC_SIZE go to
   L(more_2x_vec); everything in between is handled right here.  */
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	MEMSET_VDUP_TO_VEC0_HIGH()
L(entry_from_wmemset):
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
	   One store covers the last VEC_SIZE bytes and one the first
	   VEC_SIZE bytes; they overlap when size < 2 * VEC_SIZE.  */
	VMOVU	%VMM(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VMM(0), (%rdi)
	VZEROUPPER_RETURN
/* This conditional stays open across the unaligned_erms entry points
   that follow and is closed after them.  */
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if defined SHARED && IS_IN (libc)
/* Fortified memset entry (unaligned_erms variant).  Branches to
   __chk_fail when rcx < rdx (unsigned).  There is no return here; on
   success execution continues with the code that follows.  */
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

/* memset entry that may use rep stosb for large sizes.
   In:  rdi = destination, esi = fill byte, rdx = length in bytes.
   Sizes below VEC_SIZE go to L(less_vec); sizes above 2 * VEC_SIZE go to
   L(stosb_more_2x_vec), which picks between rep stosb and vector
   stores.  The second ENTRY_P2ALIGN argument is presumably the log2 of
   the entry alignment (6 -> 64 bytes) -- confirm in sysdep.h.  */
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	MEMSET_VDUP_TO_VEC0_HIGH ()
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

/* Store the last 2 * VEC_SIZE bytes of the buffer.  Reached from
   L(more_2x_vec) for sizes in (2 * VEC_SIZE, 4 * VEC_SIZE], after the
   first 2 * VEC_SIZE bytes have already been written.
   In the non-mask-store build rdi has already been advanced to the end
   of the buffer by L(more_2x_vec); the mask-store build addresses the
   end as rdi + rdx instead (this assumes USE_LESS_VEC_MASK_STORE is only
   defined together with USE_WITH_EVEX/USE_WITH_AVX512 -- confirm in the
   including files).  */
	.p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi, %rdx)
	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
	VMOVU	%VMM(0), (VEC_SIZE * -2)(%rdi)
	VMOVU	%VMM(0), (VEC_SIZE * -1)(%rdi)
#endif
	VZEROUPPER_RETURN

/* If have AVX512 mask instructions put L(less_vec) close to
   entry as it doesn't take much space and is likely a hot target.
   This conditional stays open past the end of this block.  */
#ifdef USE_LESS_VEC_MASK_STORE
	.p2align 4,, 10
L(less_vec):
L(less_vec_from_wmemset):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  Note that we are using rax, which the original
	   comment says is set by the *_SET_VEC0_AND_SET_RETURN entry
	   macro, as ptr from here on out.  */
	andl	$(PAGE_SIZE - 1), %edi
	/* Check if VEC_SIZE store cross page.  Mask stores suffer
	   serious performance degradation when it has to fault suppress.
	 */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
	/* This is generally considered a cold target.  */
	ja	L(cross_page)
	/* Build a mask with the low `size' bits set (bzhi clears every
	   bit of the all-ones source from bit index rdx/edx upwards) and
	   use it as the byte write mask of a single vector store.  */
# if VEC_SIZE > 32
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
	kmovq	%rcx, %k1
# else
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k1
# endif
	vmovdqu8 %VMM(0), (%rax){%k1}
	VZEROUPPER_RETURN

/* rep stosb path for the mask-store build.
   In:  rdi = destination, sil = fill byte, rdx = length.
   Out: rax = original destination.  */
# if defined USE_MULTIARCH && IS_IN (libc)
	/* Include L(stosb_local) here if including L(less_vec) between
	   L(stosb_more_2x_vec) and ENTRY.  This is to cache align the
	   L(stosb_more_2x_vec) target.  */
	.p2align 4,, 10
L(stosb_local):
	/* al = fill byte, rcx = count; the destination is kept in rdx
	   across rep stosb (which advances rdi) so that it can be
	   returned in rax.  */
	movzbl	%sil, %eax
	mov	%RDX_LP, %RCX_LP
	mov	%RDI_LP, %RDX_LP
	rep	stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# endif
#endif

#if defined USE_MULTIARCH && IS_IN (libc)
	/* Sizes above 2 * VEC_SIZE from the unaligned_erms entry: use
	   rep stosb when the length is above the value stored in
	   __x86_rep_stosb_threshold, otherwise continue with the vector
	   stores that follow.  */
	.p2align 4
L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb_local)
#endif
/* Sizes above 2 * VEC_SIZE.
   In:  rdi = destination, rdx = length, VMM(0) = fill pattern; rax is
        used as the unmodified destination pointer.
   Falling through every check below reaches L(loop).  Sizes in (2x, 4x]
   VEC_SIZE branch to L(last_2x_vec) and sizes in (4x, 8x] VEC_SIZE
   branch to L(last_4x_vec).  */
L(more_2x_vec):
	/* Store next 2x vec regardless.  */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdi)


	/* Two different methods of setting up pointers / compare.  The two
	   methods are based on the fact that EVEX/AVX512 mov instructions
	   take more bytes than AVX2/SSE2 mov instructions.  As well that
	   EVEX/AVX512 machines also have fast LEA_BID.  Both set up END_REG
	   to avoid complex address mode.  For EVEX/AVX512 this saves code
	   size and keeps a few targets in one fetch block.  For AVX2/SSE2
	   this helps prevent AGU bottlenecks.  */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
	addq	%rdx, %END_REG
#endif

	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET)
	   with LEA_BID.  */

	/* END_REG is rcx for EVEX/AVX512.  */
	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif

	/* Store next 2x vec regardless.  Addressed through rax because rdi
	   was advanced above in the AVX2/SSE2 build.  */
	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rax)
	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rax)


#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
	   extra offset to addresses in loop.  Used for AVX512 to save space
	   as no way to get (VEC_SIZE * 4) in imm8.  */
# if LOOP_4X_OFFSET == 0
	subq	$-(VEC_SIZE * 4), %LOOP_REG
# endif
	/* Avoid imm32 compare here to save code size.  */
	cmpq	%rdi, %rcx
#else
	addq	$-(VEC_SIZE * 4), %END_REG
	cmpq	$(VEC_SIZE * 8), %rdx
#endif
	jbe	L(last_4x_vec)
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
	/* Set LOOP_REG (rdx).  */
	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
	/* Align dst for loop: round LOOP_REG down to a multiple of
	   2 * VEC_SIZE so that L(loop) can use aligned stores.  */
	andq	$(VEC_SIZE * -2), %LOOP_REG
/* Main loop: four aligned VEC stores per iteration.
   In:  LOOP_REG = aligned store pointer, END_REG = end pointer set up by
        L(more_2x_vec).
   Runs until LOOP_REG is no longer below END_REG (unsigned), then falls
   through to the code that follows.  subq with a negative immediate is
   used instead of addq: for 32-byte vectors -(VEC_SIZE * 4) is -128,
   which fits a sign-extended 8-bit immediate, whereas +128 does not.  */
	.p2align 4
L(loop):
	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
	VMOVA	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
	VMOVA	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
	VMOVA	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
	subq	$-(VEC_SIZE * 4), %LOOP_REG
	cmpq	%END_REG, %LOOP_REG
	jb	L(loop)
/* Store the last 4 * VEC_SIZE bytes of the buffer with unaligned
   stores.  END_REG + LOOP_4X_OFFSET is the end of the buffer minus
   4 * VEC_SIZE as set up in L(more_2x_vec), so these stores may overlap
   bytes that were already written.  L(return_vzeroupper) is the common
   return point.  */
	.p2align 4,, MOV_SIZE
L(last_4x_vec):
	VMOVU	%VMM(0), LOOP_4X_OFFSET(%END_REG)
	VMOVU	%VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
	VMOVU	%VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
	VMOVU	%VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
L(return_vzeroupper):
#if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	ret
#endif

/* L(stosb_local) and L(less_vec) for builds without mask stores; the
   mask-store build defines both labels earlier in the file.  */
	.p2align 4,, 10
#ifndef USE_LESS_VEC_MASK_STORE
# if defined USE_MULTIARCH && IS_IN (libc)
	/* If no USE_LESS_VEC_MASK put L(stosb_local) here.  Will be in
	   range for 2-byte jump encoding.  */
L(stosb_local):
	/* al = fill byte, rcx = count; the destination is kept in rdx
	   across rep stosb (which advances rdi) so that it can be
	   returned in rax.  */
	movzbl	%sil, %eax
	mov	%RDX_LP, %RCX_LP
	mov	%RDI_LP, %RDX_LP
	rep	stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# endif
	/* Define L(less_vec) only if not otherwise defined.  */
	.p2align 4
L(less_vec):
	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
	   xmm).  This only does anything for AVX2.  */
	MEMSET_VDUP_TO_VEC0_LOW ()
L(less_vec_from_wmemset):
#endif
/* Size dispatch for lengths below VEC_SIZE.  Entered by fall-through
   from L(less_vec)/L(less_vec_from_wmemset) in the non-mask-store build,
   or by the page-cross branch in the mask-store build.
   In:  edx = length (< VEC_SIZE), LESS_VEC_REG = destination,
        VMM(0) = fill pattern.
   The signed condition codes used below are safe because the length is
   known to be below VEC_SIZE here.  */
L(cross_page):
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jge	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jge	L(between_16_31)
#endif
#ifndef USE_XMM_LESS_VEC
	/* Copy the low 8 bytes of the pattern into a general purpose
	   register for the stores narrower than 16 bytes.  */
	MOVQ	%VMM_128(0), %SET_REG64
#endif
	cmpl	$8, %edx
	jge	L(between_8_15)
	cmpl	$4, %edx
	jge	L(between_4_7)
	/* One compare serves both branches: above 1 means 2 or 3 bytes,
	   below 1 means nothing to store.  */
	cmpl	$1, %edx
	jg	L(between_2_3)
	jl	L(between_0_0)
	/* Exactly one byte.  */
	movb	%SET_REG8, (%LESS_VEC_REG)
L(between_0_0):
	ret

/* Align small targets only if not doing so would cross a fetch line.
 */
#if VEC_SIZE > 32
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
	/* From 32 to 63.  No branch when size == 32.  Two 32-byte stores,
	   one at the start and one ending at the last byte; they overlap
	   when size < 64.  */
L(between_32_63):
	VMOVU	%VMM_256(0), (%LESS_VEC_REG)
	VMOVU	%VMM_256(0), -32(%LESS_VEC_REG, %rdx)
	VZEROUPPER_RETURN
#endif

#if VEC_SIZE >= 32
	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  Two 16-byte stores,
	   one at the start and one ending at the last byte; they overlap
	   when size < 32.  */
	VMOVU	%VMM_128(0), (%LESS_VEC_REG)
	VMOVU	%VMM_128(0), -16(%LESS_VEC_REG, %rdx)
	ret
#endif

/* Move size is 3 for SSE2, EVEX, and AVX512.  Move size is 4 for AVX2.
 */
	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  Two 8-byte stores, one
	   at the start and one ending at the last byte; they overlap when
	   size < 16.  */
#ifdef USE_XMM_LESS_VEC
	MOVQ	%VMM_128(0), (%rdi)
	MOVQ	%VMM_128(0), -8(%rdi, %rdx)
#else
	movq	%SET_REG64, (%LESS_VEC_REG)
	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
#endif
	ret

/* Move size is 2 for SSE2, EVEX, and AVX512.  Move size is 4 for AVX2.
 */
	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  Two 4-byte stores, one
	   at the start and one ending at the last byte; they overlap when
	   size < 8.  */
#ifdef USE_XMM_LESS_VEC
	MOVD	%VMM_128(0), (%rdi)
	MOVD	%VMM_128(0), -4(%rdi, %rdx)
#else
	movl	%SET_REG32, (%LESS_VEC_REG)
	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
#endif
	ret

/* 4 * XMM_SMALL for the third mov for AVX2.  */
	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  The last store always
	   targets the final byte, so for size == 2 it rewrites a byte that
	   an earlier store already set.  */
#ifdef USE_XMM_LESS_VEC
	movb	%SET_REG8, (%rdi)
	movb	%SET_REG8, 1(%rdi)
	movb	%SET_REG8, -1(%rdi, %rdx)
#else
	movw	%SET_REG16, (%LESS_VEC_REG)
	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))


/* Browse the source code of glibc/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S  */