memset-vec-unaligned-erms.S source code [glibc/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S]

1	/* memset/bzero with unaligned store and rep stosb
2	   Copyright (C) 2016-2022 Free Software Foundation, Inc.
3	   This file is part of the GNU C Library.
4
5	   The GNU C Library is free software; you can redistribute it and/or
6	   modify it under the terms of the GNU Lesser General Public
7	   License as published by the Free Software Foundation; either
8	   version 2.1 of the License, or (at your option) any later version.
9
10	   The GNU C Library is distributed in the hope that it will be useful,
11	   but WITHOUT ANY WARRANTY; without even the implied warranty of
12	   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13	   Lesser General Public License for more details.
14
15	   You should have received a copy of the GNU Lesser General Public
16	   License along with the GNU C Library; if not, see
17	   <https://www.gnu.org/licenses/>.  */
18
19	/* memset is implemented as:
20	   1. Use overlapping store to avoid branch.
21	   2. If size is less than VEC, use integer register stores.
22	   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
23	   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
24	   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
25	      4 VEC stores and store 4 * VEC at a time until done.  */
26
27	#include <sysdep.h>
28
/* Default the _chk symbol-name macros to the same MEMSET_SYMBOL and
   WMEMSET_SYMBOL macros that name the plain entry points, unless the
   including file has already provided its own definitions.  */
29	#ifndef MEMSET_CHK_SYMBOL
30	# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
31	#endif
32
33	#ifndef WMEMSET_CHK_SYMBOL
34	# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
35	#endif
36
/* Default register names for the xmm and ymm views of vector register
   0.  Further down they are the source operand of the stores narrower
   than VEC_SIZE (MOVQ and VMOVU of %XMM0, VMOVU of %YMM0).  */
37	#ifndef XMM0
38	# define XMM0 xmm0
39	#endif
40
41	#ifndef YMM0
42	# define YMM0 ymm0
43	#endif
44
/* VZEROUPPER expands to the vzeroupper instruction only when VEC_SIZE
   is larger than 16 and to nothing otherwise.  If the including file
   pre-defines VZEROUPPER this whole group is skipped, and
   VZEROUPPER_SHORT_RETURN then comes from the fallback just below
   (unless it was pre-defined as well).  */
45	#ifndef VZEROUPPER
46	# if VEC_SIZE > 16
47	# define VZEROUPPER vzeroupper
48	# define VZEROUPPER_SHORT_RETURN vzeroupper; ret
49	# else
50	# define VZEROUPPER
51	# endif
52	#endif
53
/* Fallback short return sequence, `rep; ret', used whenever nothing
   above defined VZEROUPPER_SHORT_RETURN.  */
54	#ifndef VZEROUPPER_SHORT_RETURN
55	# define VZEROUPPER_SHORT_RETURN rep; ret
56	#endif
57
/* MOVQ copies the low 64 bits of XMM0 into a general-purpose register
   (see its single use in L(cross_page)).  vmovq is selected when
   VEC_SIZE is larger than 16, plain movq otherwise.  */
58	#ifndef MOVQ
59	# if VEC_SIZE > 16
60	# define MOVQ vmovq
61	# else
62	# define MOVQ movq
63	# endif
64	#endif
65
/* Extra displacement added to the store addresses in L(loop),
   L(last_4x_vec) and the mask-store variant of L(last_2x_vec).  It is
   4 * VEC_SIZE for 64-byte vectors and zero for every other size.  */
66	#if VEC_SIZE == 64
67	# define LOOP_4X_OFFSET (VEC_SIZE * 4)
68	#else
69	# define LOOP_4X_OFFSET (0)
70	#endif
71
/* Registers used for the end marker (END_REG) and the running store
   pointer (LOOP_REG) of the 4x VEC loop: rcx/rdi when USE_WITH_EVEX or
   USE_WITH_AVX512 is defined, rdi/rdx otherwise.
   NOTE(review): the `\|\|' in the condition below looks like an escaped
   `||' left behind by text extraction -- confirm against upstream.  */
72	#if defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512
73	# define END_REG rcx
74	# define LOOP_REG rdi
75	#else
76	# define END_REG rdi
77	# define LOOP_REG rdx
78	#endif
79
/* Page size used by the page-cross check in the mask-store variant of
   L(less_vec).  */
80	#define PAGE_SIZE 4096
81
82	/* Macro to calculate size of small memset block for aligning
83	   purposes.  */
/* SMALL_MEMSET_ALIGN (mov_sz, ret_sz) evaluates to
   2 * mov_sz + ret_sz + 1.  It is passed as the maximum-padding operand
   of the .p2align directives in front of the L(between_N_M) targets,
   each of which consists of two stores followed by a return.  */
84	#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)
85
86
/* The including file has to supply SECTION; the build stops with an
   error otherwise.  All code below is emitted into SECTION(.text).  */
87	#ifndef SECTION
88	# error SECTION is not defined!
89	#endif
90
91	.section SECTION(.text),"ax",@progbits
/* __bzero (first argument in rdi, length in rsi); only assembled for the
   VEC_SIZE == 16 build inside libc.  It rearranges the arguments into
   the state expected at L(entry_from_bzero):
     rax  = return value (copy of rdi),
     rdx  = length (copy of rsi),
     esi  = 0, the fill value used by the one-byte store,
     XMM0 = all zero, the fill vector,
   and then jumps into the shared memset body.  bzero is provided as a
   weak alias of __bzero.  */
92	#if VEC_SIZE == 16 && IS_IN (libc)
93	ENTRY (__bzero)
94	mov %RDI_LP, %RAX_LP / Set return value. /
95	mov %RSI_LP, %RDX_LP / Set n. /
96	xorl %esi, %esi
97	pxor %XMM0, %XMM0
98	jmp L(entry_from_bzero)
99	END (__bzero)
100	weak_alias (__bzero, bzero)
101	#endif
102
/* wmemset entry points (libc only).

   __wmemset_chk (SHARED builds only) compares rcx with rdx and jumps to
   __chk_fail when rcx is below rdx (unsigned).  There is no return
   between END_CHK and the following ENTRY, so the passing case falls
   straight through into __wmemset.

   __wmemset multiplies the count in rdx by 4 (shl by 2), lets
   WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN prepare the fill vector and the
   return value, and joins the memset body at L(entry_from_bzero).
   NOTE(review): the macro is defined outside this file; presumably it
   broadcasts esi into VEC(0) and copies rdi to rax -- confirm.
   NOTE(review): the backticks around the immediate look like text
   extraction residue -- confirm against upstream.  */
103	#if IS_IN (libc)
104	# if defined SHARED
105	ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
106	cmp %RDX_LP, %RCX_LP
107	jb HIDDEN_JUMPTARGET (__chk_fail)
108	END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
109	# endif
110
111	ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
112	shl $`2`, %RDX_LP
113	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
114	jmp L(entry_from_bzero)
115	END (WMEMSET_SYMBOL (__wmemset, unaligned))
116	#endif
117
/* __memset_chk for the unaligned variant (SHARED libc builds only).
   Jumps to __chk_fail when rcx is below rdx (unsigned compare).  There
   is no return here: the passing case falls through into the __memset
   entry point that follows.  */
118	#if defined SHARED && IS_IN (libc)
119	ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
120	cmp %RDX_LP, %RCX_LP
121	jb HIDDEN_JUMPTARGET (__chk_fail)
122	END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
123	#endif
124
/* memset, unaligned variant (rdi = destination, esi = fill value,
   rdx = length).  MEMSET_VDUP_TO_VEC0_AND_SET_RETURN is defined outside
   this file; the code below relies on it leaving the fill pattern in
   VEC(0) and the return value in rax.

   L(entry_from_bzero) is also the join point for __bzero and __wmemset.
   Dispatch on the length:
     length <  VEC_SIZE      -> L(less_vec)
     length >  2 * VEC_SIZE  -> L(more_2x_vec)
     otherwise               -> two VEC stores, one ending at the last
                                byte and one at the first byte; they
                                overlap unless length == 2 * VEC_SIZE.  */
125	ENTRY (MEMSET_SYMBOL (__memset, unaligned))
126	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
127	# ifdef __ILP32__
128	/ Clear the upper 32 bits. /
129	mov %edx, %edx
130	# endif
131	L(entry_from_bzero):
132	cmpq $VEC_SIZE, %rdx
133	jb L(less_vec)
134	cmpq $(VEC_SIZE * `2`), %rdx
135	ja L(more_2x_vec)
136	/ From VEC and to 2 * VEC. No branch when size == VEC_SIZE. /
137	VMOVU %VEC(`0`), -VEC_SIZE(%rdi,%rdx)
138	VMOVU %VEC(`0`), (%rdi)
139	VZEROUPPER_RETURN
/* Multiarch-only part: close the unaligned memset symbol and provide
   the pure `rep stosb' implementation.

   VEC_SIZE == 16 build: __memset_chk_erms jumps to __chk_fail when rcx
   is below rdx and otherwise falls through into __memset_erms, which
   returns the destination immediately for a zero length.
   Other builds: only a hidden MEMSET_SYMBOL (__memset, erms) entry is
   emitted; it runs straight into L(stosb) without the zero-length test.  */
140	#if defined USE_MULTIARCH && IS_IN (libc)
141	END (MEMSET_SYMBOL (__memset, unaligned))
142
143	# if VEC_SIZE == 16
144	ENTRY (__memset_chk_erms)
145	cmp %RDX_LP, %RCX_LP
146	jb HIDDEN_JUMPTARGET (__chk_fail)
147	END (__memset_chk_erms)
148
149	/ Only used to measure performance of REP STOSB. /
150	ENTRY (__memset_erms)
151	/ Skip zero length. /
152	test %RDX_LP, %RDX_LP
153	jnz L(stosb)
154	movq %rdi, %rax
155	ret
156	# else
157	/ Provide a hidden symbol to debugger. /
158	.hidden MEMSET_SYMBOL (__memset, erms)
159	ENTRY (MEMSET_SYMBOL (__memset, erms))
160	# endif
/* rep stosb stores al to (rdi) rcx times and advances rdi, so:
   rcx = length, eax = fill byte zero-extended from sil, and rdx keeps
   the original destination so it can be returned in rax afterwards.  */
161	L(stosb):
162	mov %RDX_LP, %RCX_LP
163	movzbl %sil, %eax
164	mov %RDI_LP, %RDX_LP
165	rep stosb
166	mov %RDX_LP, %RAX_LP
167	VZEROUPPER_RETURN
168	# if VEC_SIZE == 16
169	END (__memset_erms)
170	# else
171	END (MEMSET_SYMBOL (__memset, erms))
172	# endif
173
/* __memset_chk for the unaligned_erms variant (SHARED libc builds
   only).  Jumps to __chk_fail when rcx is below rdx (unsigned compare);
   the passing case falls through into the entry point that follows.  */
174	# if defined SHARED && IS_IN (libc)
175	ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
176	cmp %RDX_LP, %RCX_LP
177	jb HIDDEN_JUMPTARGET (__chk_fail)
178	END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
179	# endif
180
/* memset, unaligned_erms variant (rdi = destination, esi = fill value,
   rdx = length).  Same length dispatch as the plain unaligned entry,
   except that lengths above 2 * VEC_SIZE go to L(stosb_more_2x_vec),
   which may select `rep stosb'.  The stores address the buffer through
   rax, i.e. they rely on MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (defined
   outside this file) having set rax to the destination.
   NOTE(review): the second ENTRY_P2ALIGN argument is presumably the log2
   of the entry alignment -- confirm against its definition.  */
181	ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), `6`)
182	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
183	# ifdef __ILP32__
184	/ Clear the upper 32 bits. /
185	mov %edx, %edx
186	# endif
187	cmp $VEC_SIZE, %RDX_LP
188	jb L(less_vec)
189	cmp $(VEC_SIZE * `2`), %RDX_LP
190	ja L(stosb_more_2x_vec)
191	/ From VEC and to 2 * VEC. No branch when size == VEC_SIZE.*
192	*/
193	VMOVU %VEC(`0`), (%rax)
194	VMOVU %VEC(`0`), -VEC_SIZE(%rax, %rdx)
195	VZEROUPPER_RETURN
196	#endif
197
/* Tail for lengths in (2 * VEC_SIZE, 4 * VEC_SIZE]; reached from
   L(more_2x_vec) after the first two vectors have been stored.  Writes
   the two vectors that end at destination + length.
   Mask-store builds address them relative to rcx, which L(more_2x_vec)
   sets to destination + length - (4 * VEC_SIZE + LOOP_4X_OFFSET) in
   EVEX/AVX512 builds; other builds use rdi, which by then holds
   destination + length.
   NOTE(review): this block keys on USE_LESS_VEC_MASK_STORE while rcx is
   set up under USE_WITH_EVEX / USE_WITH_AVX512; presumably the former
   is only defined together with one of the latter -- confirm.  */
198	.p2align `4`,, `10`
199	L(last_2x_vec):
200	#ifdef USE_LESS_VEC_MASK_STORE
201	VMOVU %VEC(`0`), (VEC_SIZE * `2` + LOOP_4X_OFFSET)(%rcx)
202	VMOVU %VEC(`0`), (VEC_SIZE * `3` + LOOP_4X_OFFSET)(%rcx)
203	#else
204	VMOVU %VEC(`0`), (VEC_SIZE * -`2`)(%rdi)
205	VMOVU %VEC(`0`), (VEC_SIZE * -`1`)(%rdi)
206	#endif
207	VZEROUPPER_RETURN
208
209	/* If we have AVX512 mask instructions, put L(less_vec) close to
210	   entry as it doesn't take much space and is likely a hot target.
211	 */
/* Mask-store variant of L(less_vec), only assembled when
   USE_LESS_VEC_MASK_STORE is defined (the other variant is the bare
   label placed in front of L(cross_page)).  The build is rejected for
   any VEC_SIZE other than 16, 32 or 64.  */
212	#ifdef USE_LESS_VEC_MASK_STORE
213	.p2align `4`,, `10`
214	L(less_vec):
215	/ Less than 1 VEC. /
216	# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
217	# error Unsupported VEC_SIZE!
218	# endif
219	/* Clear high bits from edi.  Only keeping bits relevant to page
220	   cross check.  Note that we are using rax which is set in
221	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
222	andl $(PAGE_SIZE - `1`), %edi
223	/* Check if a VEC_SIZE store would cross a page.  Mask stores suffer
224	   serious performance degradation when they have to suppress faults.
225	 */
/* edi holds the destination's offset within its page at this point.  If
   that offset is above PAGE_SIZE - VEC_SIZE, a full-width access
   starting at the destination would reach into the next page, so the
   branchy L(cross_page) path is used instead.
   Otherwise build a mask with the low `length' bits set: start from all
   ones and let bzhi clear every bit from position length (rdx / edx)
   upwards.  The 64-bit form is used when VEC_SIZE is larger than 32.
   The mask goes into k1 and the masked byte store then writes exactly
   the first `length' bytes at rax.  */
226	cmpl $(PAGE_SIZE - VEC_SIZE), %edi
227	/ This is generally considered a cold target. /
228	ja L(cross_page)
229	# if VEC_SIZE > 32
230	movq $-`1`, %rcx
231	bzhiq %rdx, %rcx, %rcx
232	kmovq %rcx, %k1
233	# else
234	movl $-`1`, %ecx
235	bzhil %edx, %ecx, %ecx
236	kmovd %ecx, %k1
237	# endif
238	vmovdqu8 %VEC(`0`), (%rax){%k1}
239	VZEROUPPER_RETURN
240
241	# if defined USE_MULTIARCH && IS_IN (libc)
242	/* Include L(stosb_local) here if including L(less_vec) between
243	   L(stosb_more_2x_vec) and ENTRY.  This is to cache align the
244	   L(stosb_more_2x_vec) target.  */
/* `rep stosb' path for the mask-store build; target of the `ja' in
   L(stosb_more_2x_vec).  eax = fill byte zero-extended from sil, rcx =
   length, and rdx keeps the original destination because rep stosb
   advances rdi; the saved value is returned in rax.  The two closing
   directives end the USE_MULTIARCH and USE_LESS_VEC_MASK_STORE
   conditionals opened above.  */
245	.p2align `4`,, `10`
246	L(stosb_local):
247	movzbl %sil, %eax
248	mov %RDX_LP, %RCX_LP
249	mov %RDI_LP, %RDX_LP
250	rep stosb
251	mov %RDX_LP, %RAX_LP
252	VZEROUPPER_RETURN
253	# endif
254	#endif
255
/* Entered from the unaligned_erms entry point for lengths above
   2 * VEC_SIZE.  If the length is above the value stored in
   __x86_rep_stosb_threshold, use `rep stosb' (L(stosb_local));
   otherwise fall through into L(more_2x_vec).  */
256	#if defined USE_MULTIARCH && IS_IN (libc)
257	.p2align `4`
258	L(stosb_more_2x_vec):
259	cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
260	ja L(stosb_local)
261	#endif
262	/* Fallthrough goes to L(loop).  Tests for memset (2x, 4x]
263	   and (4x, 8x] jump to target.  */
264	L(more_2x_vec):
265
266	/* Two different methods of setting up pointers / compare.  The
267	   two methods are based on the fact that EVEX/AVX512 mov
268	   instructions take more bytes than AVX2/SSE2 mov instructions, and
269	   that EVEX/AVX512 machines also have fast LEA_BID.  Both methods
270	   set up END_REG to avoid complex address mode.  For EVEX/AVX512
271	   this saves code size and keeps a few targets in one fetch block.
272	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
273	#if defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512
274	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
275	   LOOP_4X_OFFSET) with LEA_BID.  */
276
277	/ END_REG is rcx for EVEX/AVX512. /
278	leaq -(VEC_SIZE * `4` + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
279	#endif
280
281	/* Store the first 2x VEC before the cmp, as any path forward will
282	   require it.  */
/* Head of the > 2 * VEC_SIZE path.  The first two vectors are written
   unconditionally.  Builds without EVEX/AVX512 then turn END_REG (rdi)
   into destination + length.  Lengths up to 4 * VEC_SIZE are finished
   by L(last_2x_vec); anything longer also gets the third and fourth
   vector stored here before the loop / tail decision is made.  */
283	VMOVU %VEC(`0`), (%rax)
284	VMOVU %VEC(`0`), VEC_SIZE(%rax)
285
286
287	#if !(defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512)
288	/ If AVX2/SSE2 compute END_REG (rdi) with ALU. /
289	addq %rdx, %END_REG
290	#endif
291
292	cmpq $(VEC_SIZE * `4`), %rdx
293	jbe L(last_2x_vec)
294
295	/ Store next 2x vec regardless. /
296	VMOVU %VEC(`0`), (VEC_SIZE * `2`)(%rax)
297	VMOVU %VEC(`0`), (VEC_SIZE * `3`)(%rax)
298
299
300	#if defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512
301	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
302	   extra offset to addresses in loop.  Used for AVX512 to save space
303	   as no way to get (VEC_SIZE * 4) in imm8.  */
/* Choose between the 4x VEC loop and the final 4x VEC tail.
   EVEX/AVX512 branch: when LOOP_4X_OFFSET is 0, LOOP_REG (rdi) is first
   advanced by 4 * VEC_SIZE; rcx is then compared against rdi.
   Other branch: END_REG (rdi) is moved back by 4 * VEC_SIZE so that it
   addresses the last 4 * VEC_SIZE bytes, and the length is compared
   with 8 * VEC_SIZE.
   In both cases `jbe' sends lengths up to 8 * VEC_SIZE directly to
   L(last_4x_vec).  For longer buffers LOOP_REG ends up just past the
   four vectors already stored and is rounded down to a multiple of
   2 * VEC_SIZE before entering L(loop).  */
304	# if LOOP_4X_OFFSET == 0
305	subq $-(VEC_SIZE * `4`), %LOOP_REG
306	# endif
307	/ Avoid imm32 compare here to save code size. /
308	cmpq %rdi, %rcx
309	#else
310	addq $-(VEC_SIZE * `4`), %END_REG
311	cmpq $(VEC_SIZE * `8`), %rdx
312	#endif
313	jbe L(last_4x_vec)
314	#if !(defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512)
315	/ Set LOOP_REG (rdx). /
316	leaq (VEC_SIZE * `4`)(%rax), %LOOP_REG
317	#endif
318	/ Align dst for loop. /
319	andq $(VEC_SIZE * -`2`), %LOOP_REG
/* Main loop: four VMOVA stores per iteration at LOOP_REG (plus
   LOOP_4X_OFFSET), then LOOP_REG advances by 4 * VEC_SIZE (written as a
   subtraction of the negated constant).  Iterates while LOOP_REG is
   below END_REG (unsigned compare).
   NOTE(review): VMOVA is defined outside this file; presumably the
   aligned-store counterpart of VMOVU -- confirm.  */
320	.p2align `4`
321	L(loop):
322	VMOVA %VEC(`0`), LOOP_4X_OFFSET(%LOOP_REG)
323	VMOVA %VEC(`0`), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
324	VMOVA %VEC(`0`), (VEC_SIZE * `2` + LOOP_4X_OFFSET)(%LOOP_REG)
325	VMOVA %VEC(`0`), (VEC_SIZE * `3` + LOOP_4X_OFFSET)(%LOOP_REG)
326	subq $-(VEC_SIZE * `4`), %LOOP_REG
327	cmpq %END_REG, %LOOP_REG
328	jb L(loop)
/* Tail: four unaligned stores starting at END_REG + LOOP_4X_OFFSET.
   The loop falls through into it and lengths in
   (4 * VEC_SIZE, 8 * VEC_SIZE] jump here directly.  */
329	.p2align `4`,, MOV_SIZE
330	L(last_4x_vec):
331	VMOVU %VEC(`0`), LOOP_4X_OFFSET(%END_REG)
332	VMOVU %VEC(`0`), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
333	VMOVU %VEC(`0`), (VEC_SIZE * `2` + LOOP_4X_OFFSET)(%END_REG)
334	VMOVU %VEC(`0`), (VEC_SIZE * `3` + LOOP_4X_OFFSET)(%END_REG)
/* Shared return; also the target of the zero-length branch in
   L(cross_page).  */
335	L(return):
336	#if VEC_SIZE > 16
337	ZERO_UPPER_VEC_REGISTERS_RETURN
338	#else
339	ret
340	#endif
341
342	.p2align `4`,, `10`
343	#ifndef USE_LESS_VEC_MASK_STORE
344	# if defined USE_MULTIARCH && IS_IN (libc)
345	/* If no USE_LESS_VEC_MASK put L(stosb_local) here.  Will be in
346	   range for 2-byte jump encoding.  */
/* `rep stosb' path for builds without USE_LESS_VEC_MASK_STORE; target
   of the `ja' in L(stosb_more_2x_vec).  eax = fill byte zero-extended
   from sil, rcx = length, and rdx keeps the original destination
   because rep stosb advances rdi; the saved value is returned in rax.
   In these builds L(less_vec) is just another name for the
   L(cross_page) code that follows.  */
347	L(stosb_local):
348	movzbl %sil, %eax
349	mov %RDX_LP, %RCX_LP
350	mov %RDI_LP, %RDX_LP
351	rep stosb
352	mov %RDX_LP, %RAX_LP
353	VZEROUPPER_RETURN
354	# endif
355	/ Define L(less_vec) only if not otherwise defined. /
356	.p2align `4`
357	L(less_vec):
358	#endif
/* Branchy handling of lengths below VEC_SIZE (rax = destination, edx =
   length).  The length is tested from the largest bucket downwards;
   the 32 and 16 byte buckets only exist for vector sizes that can
   reach them.  Before the integer-register buckets the low 64 bits of
   the fill pattern are copied from XMM0 into rdi.
   The final `cmpl' against 1 feeds two branches: above -> 2 or 3 bytes,
   below -> zero length (plain return).  A length of exactly 1 falls
   through to a single byte store taken from sil.  */
359	L(cross_page):
360	#if VEC_SIZE > 32
361	cmpl $`32`, %edx
362	jae L(between_32_63)
363	#endif
364	#if VEC_SIZE > 16
365	cmpl $`16`, %edx
366	jae L(between_16_31)
367	#endif
368	MOVQ %XMM0, %rdi
369	cmpl $`8`, %edx
370	jae L(between_8_15)
371	cmpl $`4`, %edx
372	jae L(between_4_7)
373	cmpl $`1`, %edx
374	ja L(between_2_3)
375	jb L(return)
376	movb %sil, (%rax)
377	VZEROUPPER_RETURN
378
379	/* Align small targets only if not doing so would cross a fetch
380	   line.  */
/* Handlers for the length buckets selected in L(cross_page).  On entry
   rax = destination and rdx = length; the integer buckets additionally
   expect the low bytes of the fill pattern in rdi.  All buckets except
   the last write one store of the bucket's minimum width at the start
   of the buffer and one of the same width ending at the last byte, so
   the two stores overlap unless the length is exactly twice the width.
   MOV_SIZE and RET_SIZE are defined outside this file.
   NOTE(review): L(between_16_31) is guarded by `VEC_SIZE >= 32' while
   the branch to it is guarded by `VEC_SIZE > 16'; the two agree for
   vector sizes of 16, 32 and 64.  */
381	#if VEC_SIZE > 32
382	.p2align `4`,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
383	/ From 32 to 63. No branch when size == 32. /
384	L(between_32_63):
385	VMOVU %YMM0, (%rax)
386	VMOVU %YMM0, -`32`(%rax, %rdx)
387	VZEROUPPER_RETURN
388	#endif
389
390	#if VEC_SIZE >= 32
391	.p2align `4`,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
392	L(between_16_31):
393	/ From 16 to 31. No branch when size == 16. /
394	VMOVU %XMM0, (%rax)
395	VMOVU %XMM0, -`16`(%rax, %rdx)
396	VZEROUPPER_RETURN
397	#endif
398
399	.p2align `4`,, SMALL_MEMSET_ALIGN(`3`, RET_SIZE)
400	L(between_8_15):
401	/ From 8 to 15. No branch when size == 8. /
402	movq %rdi, (%rax)
403	movq %rdi, -`8`(%rax, %rdx)
404	VZEROUPPER_RETURN
405
406	.p2align `4`,, SMALL_MEMSET_ALIGN(`2`, RET_SIZE)
407	L(between_4_7):
408	/ From 4 to 7. No branch when size == 4. /
409	movl %edi, (%rax)
410	movl %edi, -`4`(%rax, %rdx)
411	VZEROUPPER_RETURN
412
/* 2 or 3 bytes: a 16-bit store at the start plus a single byte store
   at the last byte.  */
413	.p2align `4`,, SMALL_MEMSET_ALIGN(`3`, RET_SIZE)
414	L(between_2_3):
415	/ From 2 to 3. No branch when size == 2. /
416	movw %di, (%rax)
417	movb %dil, -`1`(%rax, %rdx)
418	VZEROUPPER_RETURN
419	END (MEMSET_SYMBOL (__memset, unaligned_erms))
420

Browse the source code of glibc/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S