/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */

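/* Equivalently, a rough C-style sketch of the size dispatch; the helper
   names are purely illustrative and do not exist in this file:

     if (n < VEC_SIZE)
       small_set (dst, c, n);          // integer stores, L(less_vec)
     else if (n <= 2 * VEC_SIZE)
       two_vec_overlapping_stores ();  // L(entry_from_bzero)
     else if (n <= 4 * VEC_SIZE)
       four_vec_overlapping_stores (); // L(more_2x_vec)
     else
       head_tail_plus_aligned_loop (); // L(loop_start)/L(loop), or
                                       // REP STOSB in the ERMS variants
                                       // above REP_STOSB_THRESHOLD.  */
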
#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
/* When no vzeroupper is needed, expand to the REP prefix so that the
   RET which follows assembles as a 2-byte "rep ret", avoiding the
   branch-prediction penalty some AMD processors incur for a one-byte
   RET that directly follows a conditional branch.  */
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, larger register
   size has minimal impact on threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif
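
/* The threshold is only consulted in the *_erms entry points below: at
   L(stosb_more_2x_vec) the byte count in %rdx is compared against it,
   and larger requests branch to L(stosb).  Because of the #ifndef
   guard above, a file that includes this one may predefine
   REP_STOSB_THRESHOLD to tune that crossover point.  */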

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
/* __bzero (dst, n) behaves like memset (dst, 0, n): zero the pattern
   register and join the common memset path below.  */
ENTRY (__bzero)
	movq	%rdi, %rax	/* Set return value.  */
	movq	%rsi, %rdx	/* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

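/* __wmemset fills a wchar_t array.  It reuses the byte-memset code: the
   4-byte value is broadcast into VEC(0) and the element count in %rdx
   is converted to a byte count.  Since a valid wchar_t pointer is
   4-byte aligned and the byte length is a multiple of 4, every store
   offset used by the shared code is a multiple of 4, which keeps the
   replicated 4-byte pattern in phase.  */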
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shlq	$2, %rdx	/* Convert the wchar_t count to bytes.  */
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
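
/* The _chk entry above implements the _FORTIFY_SOURCE check and then
   continues into the following __memset entry.  A rough C equivalent
   (a sketch; the C fallback lives in glibc's debug code):

     void *
     __memset_chk (void *dst, int c, size_t len, size_t dstlen)
     {
       if (dstlen < len)
         __chk_fail ();
       return memset (dst, c, len);
     }

   In the assembly, %rdx is LEN and %rcx is DSTLEN.  */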

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
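
/* The two VMOVU stores above handle every VEC_SIZE <= n <= 2 * VEC_SIZE
   without a length-dependent branch: one store is anchored at the last
   byte, the other at the first byte, and they simply overlap when
   n < 2 * VEC_SIZE.  A sketch in C, where vec_store_unaligned is an
   illustrative stand-in for VMOVU:

     vec_store_unaligned (dst + n - VEC_SIZE, pattern);  // tail
     vec_store_unaligned (dst, pattern);                 // head
     // Every byte of [dst, dst + n) is covered by at least one store.
*/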
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
# else
/* Provide a symbol to the debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	movq	%rdx, %rcx	/* Byte count for rep stosb.  */
	movzbl	%sil, %eax	/* Fill byte.  */
	movq	%rdi, %rdx	/* Save destination; rep stosb advances %rdi.  */
	rep stosb
	movq	%rdx, %rax	/* Return the original destination.  */
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif
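
/* L(stosb) above is the ERMS path.  With %rdi = destination, %al = fill
   byte and %rcx = count, "rep stosb" behaves like this C loop (a sketch
   of the architectural effect, not of the microcode):

     unsigned char *p = dst;
     size_t i;
     for (i = 0; i < n; i++)
       p[i] = (unsigned char) c;

   Because rep stosb advances %rdi past the filled region, the original
   destination is saved in %rdx and restored into %rax for the return
   value.  */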

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	/* Four overlapping VEC stores cover 2 * VEC_SIZE < n <= 4 * VEC_SIZE:
	   two anchored at the start, two anchored at the end.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(loop_start):
	/* Fill 4 unaligned VECs at each end of the buffer; they may
	   overlap the aligned interior filled by L(loop) below.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	/* %rcx = first 4 * VEC_SIZE-aligned address strictly above the
	   start of the buffer.  */
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	/* %rdx = end of the buffer rounded down to a 4 * VEC_SIZE
	   boundary; the aligned loop stops there.  */
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
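
/* Sketch of the > 4 * VEC_SIZE strategy implemented by L(loop_start)
   and L(loop), in C-like form.  vec_store_unaligned and
   vec_store_aligned are illustrative stand-ins for VMOVU and VMOVA:

     size_t i;
     char *p, *start, *end;

     // Head and tail: 4 unaligned VECs anchored at each end.
     for (i = 0; i < 4; i++)
       {
         vec_store_unaligned (dst + i * VEC_SIZE, pattern);
         vec_store_unaligned (dst + n - (i + 1) * VEC_SIZE, pattern);
       }
     // Interior: aligned 4 * VEC_SIZE blocks between the first and last
     // 4 * VEC_SIZE boundaries inside the buffer.
     start = (char *) (((uintptr_t) dst + 4 * VEC_SIZE)
                       & ~(uintptr_t) (4 * VEC_SIZE - 1));
     end = (char *) (((uintptr_t) dst + n)
                     & ~(uintptr_t) (4 * VEC_SIZE - 1));
     for (p = start; p != end; p += 4 * VEC_SIZE)
       for (i = 0; i < 4; i++)
         vec_store_aligned (p + i * VEC_SIZE, pattern);
*/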
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
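	/* MOVQ below copies the low 8 bytes of the replicated pattern in
	   VEC(0) into %rcx.  Because the pattern is a repeated byte (or a
	   repeated 4-byte wchar_t for __wmemset), the 8-, 4-, 2- and
	   1-byte prefixes of %rcx are themselves valid fill patterns for
	   the branches that can reach them, so the same overlap trick
	   works with plain integer stores.  */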
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)	/* Size 2 or 3.  */
	jb	1f		/* Size 0: nothing to store.  */
	movb	%cl, (%rdi)	/* Size 1.  */
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))