/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and then store 4 * VEC at a time until done.
   An illustrative C sketch of this dispatch follows below.  */

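/* Illustrative only, not part of the build: a minimal C sketch of the
   dispatch described above.  It assumes a hypothetical VEC_SIZE-wide helper
   vec_store () standing in for VMOVU/VMOVA, and it omits the ERMS (rep
   stosb) path, the __chk variants and cache-line details.  The pairs of
   stores anchored at both ends of the buffer overlap in the middle, which
   is the branch-avoidance trick named in item 1.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   #define VEC_SIZE 32  // assumption for the sketch; the real value
                        // (16, 32 or 64) comes from the ifunc variant

   // Stands in for one unaligned VEC-sized store of the splatted byte.
   static void
   vec_store (unsigned char *p, int c)
   {
     memset (p, c, VEC_SIZE);
   }

   static void *
   memset_sketch (void *dst, int c, size_t n)
   {
     unsigned char *p = dst;
     if (n < VEC_SIZE)
       {
         // 2. Small sizes: integer register stores (a byte loop here).
         for (size_t i = 0; i < n; i++)
           p[i] = (unsigned char) c;
         return dst;
       }
     if (n <= 2 * VEC_SIZE)
       {
         // 3. Two overlapping stores cover any n in [VEC_SIZE, 2 * VEC_SIZE].
         vec_store (p, c);
         vec_store (p + n - VEC_SIZE, c);
         return dst;
       }
     if (n <= 4 * VEC_SIZE)
       {
         // 4. Four overlapping stores, two from each end.
         vec_store (p, c);
         vec_store (p + VEC_SIZE, c);
         vec_store (p + n - VEC_SIZE, c);
         vec_store (p + n - 2 * VEC_SIZE, c);
         return dst;
       }
     // 5. Store the first and last 4 * VEC_SIZE bytes unconditionally,
     // then fill the 4 * VEC_SIZE-aligned interior in a loop, just as the
     // assembly rounds %rcx and %rdx down to a 4 * VEC_SIZE boundary.
     for (size_t i = 0; i < 4 * VEC_SIZE; i += VEC_SIZE)
       {
         vec_store (p + i, c);
         vec_store (p + n - VEC_SIZE - i, c);
       }
     uintptr_t beg = ((uintptr_t) p + 4 * VEC_SIZE) & -(uintptr_t) (4 * VEC_SIZE);
     uintptr_t end = ((uintptr_t) p + n) & -(uintptr_t) (4 * VEC_SIZE);
     for (; beg != end; beg += 4 * VEC_SIZE)
       for (size_t i = 0; i < 4 * VEC_SIZE; i += VEC_SIZE)
         vec_store ((unsigned char *) (beg + i), c);
     return dst;
   }
*/
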
#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)	MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to the debugger.  */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	mov	%RDX_LP, %RCX_LP
	movzbl	%sil, %eax
	mov	%RDI_LP, %RDX_LP
	rep stosb
	mov	%RDX_LP, %RAX_LP
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(loop_start):
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))