/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
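
/* Illustrative sketch only, not glibc code: the dispatch above is
   roughly the following C, where VEC_SIZE is the vector width in
   bytes and the helpers are hypothetical names for the labeled
   paths below.

     void *memset_sketch (void *dst, int c, size_t n)
     {
       if (n < VEC_SIZE)
         return small_sizes (dst, c, n);
       if (n <= 2 * VEC_SIZE)
         return two_vec_stores (dst, c, n);
       if (n <= 4 * VEC_SIZE)
         return four_vec_stores (dst, c, n);
       return aligned_loop (dst, c, n);
     }  */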

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)	MEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER	vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
/* Without AVX, expand to a REP prefix on the following RET ("rep ret"),
   which avoids a branch-prediction penalty on some AMD processors.  */
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ	vmovq
# else
#  define MOVQ	movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up a REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, larger register
   size has minimal impact on the threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD	2048
#endif
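
/* Illustrative sketch only: in the unaligned_erms entry point below,
   the threshold is consulted only after the size is known to exceed
   2 * VEC_SIZE, i.e. roughly

     if (n > 2 * VEC_SIZE && n > REP_STOSB_THRESHOLD)
       rep_stosb_path (dst, c, n);
     else
       vector_store_paths (dst, c, n);

   where both helper names are hypothetical.  */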

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	movq	%rdi, %rax	/* Set return value.  */
	movq	%rsi, %rdx	/* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif


#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(memset_entry):
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
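
/* The pair of VMOVU stores above is the overlapping-store trick from
   step 1 of the header comment: one VEC at dst plus one VEC ending at
   dst + n cover every n in [VEC_SIZE, 2 * VEC_SIZE] with no further
   length branch.  For example, with VEC_SIZE == 32 and n == 40, the
   stores write [dst, dst + 32) and [dst + 8, dst + 40); the overlap
   is harmless because both stores write the same value.  */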
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
# else
/* Provide a symbol for the debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	movq	%rdx, %rcx	/* Byte count for REP STOSB.  */
	movzbl	%sil, %eax	/* Byte to store.  */
	movq	%rdi, %rdx	/* Save destination for return value.  */
	rep stosb
	movq	%rdx, %rax	/* Return the original destination.  */
	ret
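
/* For reference, REP STOSB stores AL into (%rdi) RCX times while
   advancing RDI, i.e. roughly

     while (rcx--)
       *rdi++ = al;

   RDI and RCX are clobbered, which is why the code above parks the
   return value in RDX and copies it back to RAX afterwards.  */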
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret
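
/* The four stores above extend the overlapping-store trick: two VECs
   from dst plus two VECs ending at dst + n cover every n in
   (2 * VEC_SIZE, 4 * VEC_SIZE] branch-free.  E.g. with VEC_SIZE == 32
   and n == 100, they write [dst, dst + 64) and [dst + 36, dst + 100).  */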

L(loop_start):
	/* Eight unaligned stores cover the first and last 4 * VEC_SIZE
	   bytes; the aligned loop below fills everything in between.  */
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
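
/* Sketch of the loop bounds computed above, in C-like terms:

     start = (dst + 4 * VEC_SIZE) & -(4 * VEC_SIZE);
     end   = (dst + n) & -(4 * VEC_SIZE);

   START is 4 * VEC_SIZE aligned and lies in (dst, dst + 4 * VEC_SIZE];
   END is the last such boundary inside the buffer.  The eight VMOVU
   stores already covered 4 * VEC_SIZE bytes at each end, so the VMOVA
   loop only has to fill [start, end).  E.g. with VEC_SIZE == 32,
   dst == 0x1010 and n == 0x300, start == 0x1080 and end == 0x1300.  */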
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
	/* From 4 to 7.  No branch when size == 4.  */
L(between_4_7):
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
	/* From 2 to 3.  No branch when size == 2.  */
L(between_2_3):
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
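
/* Sketch of the small-size ladder above: MOVQ copies the byte pattern
   from %xmm0 into %rcx once, then each bucket issues one store from
   dst and one ending at dst + n using the widest type that fits.
   For n in [4, 7] this is roughly

     *(uint32_t *) dst = pattern;
     *(uint32_t *) ((char *) dst + n - 4) = pattern;

   The overlap for n < 8 is harmless because both stores write the
   same bytes.  */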
END (MEMSET_SYMBOL (__memset, unaligned_erms))