/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
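
/* A rough C-level sketch of the dispatch above, for orientation only;
   vec_store and vec_store_4x are hypothetical stand-ins for the VMOVU
   and VMOVA stores below, not real helpers:

     if (n < VEC_SIZE)
       ...                              // integer stores of shrinking width
     else if (n <= 2 * VEC_SIZE)
       {
         vec_store (dst + n - VEC_SIZE);  // tail; overlaps the head store
         vec_store (dst);                 // unless n == 2 * VEC_SIZE
       }
     else if (n <= 4 * VEC_SIZE)
       ...                              // 2 stores from each end
     else
       {
         // Unaligned stores cover both ends; an aligned loop fills the
         // middle 4 * VEC_SIZE bytes at a time.
         for (p = round_up (dst, 4 * VEC_SIZE);
              p < round_down (dst + n, 4 * VEC_SIZE);
              p += 4 * VEC_SIZE)
           vec_store_4x (p);
       }
 */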

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif
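
/* The vzeroupper above zeroes the upper halves of the ymm/zmm
   registers.  Returning to SSE code with dirty upper halves would
   incur an AVX-SSE transition penalty on many processors, so every
   return path issues it when VEC_SIZE > 16.  */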

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif
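
/* When no vzeroupper is required, VZEROUPPER_SHORT_RETURN expands to
   a bare "rep" prefix so that the following ret assembles to the
   two-byte "rep ret", the usual workaround for a branch-prediction
   penalty on some AMD processors when ret is a branch target.  */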

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif
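
/* vmovq is the VEX-encoded form of movq; when VEC_SIZE > 16 it keeps
   the xmm-to-GPR transfer from mixing a legacy SSE encoding with
   possibly dirty AVX state.  */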

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up a REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro-benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster than the vector loop on
   processors with Enhanced REP STOSB.  Since the stored value is
   fixed, larger register size has minimal impact on the threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif
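
/* For example, with VEC_SIZE == 32 the unaligned_erms variant below
   uses two overlapping VEC stores for 32..64 bytes, four for 65..128
   bytes, the aligned 4 * VEC loop for 129..2048 bytes, and rep stosb
   only beyond REP_STOSB_THRESHOLD.  */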

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
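/* bzero (s, n) is memset (s, 0, n): move the length from %rsi to
   %rdx where memset expects it, zero the vector register and fall
   into the common path.  */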
ENTRY (__bzero)
	movq	%rdi, %rax /* Set return value.  */
	movq	%rsi, %rdx /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if defined SHARED && IS_IN (libc)
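/* __memset_chk receives the destination buffer size as a fourth
   argument in %rcx; fail if it is smaller than the length in %rdx,
   otherwise fall through to the unfortified entry point.  */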
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(memset_entry):
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
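	/* E.g. with VEC_SIZE == 32 and size == 40 the tail store below
	   covers bytes 8..39 and the head store bytes 0..31; the
	   overlap is harmless since both write the same value.  */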
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
# else
/* Provide a symbol for the debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
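/* rep stosb stores %al at (%rdi) %rcx times, advancing %rdi, and
   clobbers both registers, so the destination is parked in %rdx to
   serve as the return value.  */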
L(stosb):
	/* Issue vzeroupper before rep stosb; the return below then
	   needs no vzeroupper of its own.  */
	VZEROUPPER
	movq	%rdx, %rcx	/* Byte count for rep stosb.  */
	movzbl	%sil, %eax	/* Fill byte in %al.  */
	movq	%rdi, %rdx	/* Save destination for return value.  */
	rep stosb
	movq	%rdx, %rax	/* Return the destination.  */
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
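	/* From 2 * VEC_SIZE + 1 to 4 * VEC_SIZE: two stores from the
	   start and two from the end; they overlap unless
	   size == 4 * VEC_SIZE.  */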
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

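/* More than 4 * VEC_SIZE: store the first and last 4 * VEC_SIZE bytes
   with unaligned stores, then fill the 4 * VEC_SIZE-aligned middle
   (from %rcx, the start rounded up, to %rdx, the end rounded down to
   a 4 * VEC_SIZE boundary) with aligned stores.  */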
L(loop_start):
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
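	/* Dispatch on the low size byte, falling through to narrower
	   and narrower overlapping stores: vector stores down to 16
	   bytes, then 8-, 4-, 2- and 1-byte integer stores from the
	   fill pattern in %rcx.  */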
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx	/* Low 8 bytes of the fill pattern.  */
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f		/* Size 0: store nothing.  */
	movb	%cl, (%rdi)	/* Size 1: store one byte.  */
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))