/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
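
/* For orientation only, a rough C sketch of the dispatch above.  It is not
   part of glibc: vec_store () is a hypothetical stand-in for the broadcast
   plus VMOVU/VMOVA stores, VECSZ plays the role of VEC_SIZE (fixed at 32
   here), and the size < VECSZ case is reduced to a byte loop (the real code
   uses overlapping integer-register stores, see L(less_vec) below).

   #include <stddef.h>
   #include <stdint.h>

   #define VECSZ 32

   // Stand-in for one VECSZ-byte vector store of the splatted byte.
   static void vec_store (unsigned char *p, unsigned char c)
   {
     for (int i = 0; i < VECSZ; i++)
       p[i] = c;
   }

   void *memset_sketch (void *dst, int c, size_t n)
   {
     unsigned char *p = dst;
     unsigned char b = (unsigned char) c;
     if (n < VECSZ)
       {
         // Simplified; the real code stores the widest integer register
         // that fits at both ends of the buffer so the stores overlap.
         for (size_t i = 0; i < n; i++)
           p[i] = b;
       }
     else if (n <= 2 * VECSZ)
       {
         // One store at each end; they overlap unless n == 2 * VECSZ.
         vec_store (p + n - VECSZ, b);
         vec_store (p, b);
       }
     else if (n <= 4 * VECSZ)
       {
         // Two stores at each end, again overlapping in the middle.
         vec_store (p, b);
         vec_store (p + VECSZ, b);
         vec_store (p + n - 2 * VECSZ, b);
         vec_store (p + n - VECSZ, b);
       }
     else
       {
         // Four unaligned stores at the start, an aligned 4 * VECSZ loop
         // in the middle, and four unaligned stores at the end; the
         // regions overlap instead of branching on the exact remainder.
         vec_store (p, b);
         vec_store (p + VECSZ, b);
         vec_store (p + 2 * VECSZ, b);
         vec_store (p + 3 * VECSZ, b);
         unsigned char *end = p + n - 4 * VECSZ;
         unsigned char *q
           = (unsigned char *) (((uintptr_t) p & -(uintptr_t) (2 * VECSZ))
                                + 4 * VECSZ);
         for (; q < end; q += 4 * VECSZ)
           {
             vec_store (q, b);
             vec_store (q + VECSZ, b);
             vec_store (q + 2 * VECSZ, b);
             vec_store (q + 3 * VECSZ, b);
           }
         vec_store (end, b);
         vec_store (end + VECSZ, b);
         vec_store (end + 2 * VECSZ, b);
         vec_store (end + 3 * VECSZ, b);
       }
     return dst;
   }  */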

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0 xmm0
#endif

#ifndef YMM0
# define YMM0 ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
#  define VZEROUPPER_SHORT_RETURN vzeroupper; ret
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# define VZEROUPPER_SHORT_RETURN rep; ret
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ vmovq
# else
#  define MOVQ movq
# endif
#endif

#define PAGE_SIZE 4096

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	pxor	%XMM0, %XMM0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to the debugger.  */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	mov	%RDX_LP, %RCX_LP
	movzbl	%sil, %eax
	mov	%RDI_LP, %RDX_LP
	rep stosb
	mov	%RDX_LP, %RAX_LP
	VZEROUPPER_RETURN
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif
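
/* Illustrative only, not part of the build: in GNU C extended asm the core
   of the ERMS path boils down to the following, with RDI, RCX and AL set up
   just as L(stosb) does before the REP STOSB; memset_erms_sketch is a
   hypothetical name.

   #include <stddef.h>

   static void *memset_erms_sketch (void *dst, int c, size_t n)
   {
     void *d = dst;
     // REP STOSB stores AL to (RDI) RCX times, advancing RDI as it goes.
     __asm__ __volatile__ ("rep stosb"
                           : "+D" (d), "+c" (n)
                           : "a" (c)
                           : "memory");
     return dst;
   }  */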

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(stosb_more_2x_vec):
	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
	ja	L(stosb)
#else
	.p2align 4
#endif
L(more_2x_vec):
	/* Store to the first 2x VEC before the cmp, as any path forward
	   will require it.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
L(return):
#if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	ret
#endif

L(loop_start):
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	cmpq	$(VEC_SIZE * 8), %rdx
	jbe	L(loop_end)
	andq	$-(VEC_SIZE * 2), %rdi
	subq	$-(VEC_SIZE * 4), %rdi
	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
	.p2align 4
L(loop):
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(0), VEC_SIZE(%rdi)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rcx, %rdi
	jb	L(loop)
L(loop_end):
	/* NB: rax was set to the destination pointer by
	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN; rdx (the length) is also
	   unchanged.  */
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
	VZEROUPPER_SHORT_RETURN

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# ifdef USE_LESS_VEC_MASK_STORE
	/* Clear the high bits of edi, keeping only the bits relevant to the
	   page-cross check.  Note that from here on we use rax, which
	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN set to the destination
	   pointer.  */
	andl	$(PAGE_SIZE - 1), %edi
	/* Check whether a VEC_SIZE store would cross a page.  Mask stores
	   suffer serious performance degradation when they have to
	   suppress a fault.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
	ja	L(cross_page)
#  if VEC_SIZE > 32
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
	kmovq	%rcx, %k1
#  else
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k1
#  endif
	vmovdqu8 %VEC(0), (%rax) {%k1}
	VZEROUPPER_RETURN

	.p2align 4
L(cross_page):
# endif
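
/* Illustrative only, not part of the build: with AVX512VL/AVX512BW
   intrinsics and VEC_SIZE == 32, the masked-store fast path above amounts
   to the following.  memset_small_mask_sketch is a hypothetical name and
   the page-cross fallback is reduced to a byte loop here; the real fallback
   is the integer-store ladder below.

   #include <immintrin.h>
   #include <stddef.h>
   #include <stdint.h>

   static void memset_small_mask_sketch (unsigned char *p, int c, size_t n)
   {
     // n < 32 here.  Take the masked store only when a full 32-byte access
     // at p stays within the current 4 KiB page; a mask store that has to
     // suppress a fault on the masked-off tail is very slow.
     if (((uintptr_t) p & 4095) <= 4096 - 32)
       {
         // BZHI of an all-ones source builds the low-n-bits mask.
         __mmask32 k = (__mmask32) ((1u << n) - 1);
         _mm256_mask_storeu_epi8 (p, k, _mm256_set1_epi8 ((char) c));
       }
     else
       for (size_t i = 0; i < n; i++)
         p[i] = (unsigned char) c;
   }  */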
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%XMM0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rax)
1:
	VZEROUPPER_RETURN
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	VMOVU	%YMM0, -32(%rax,%rdx)
	VMOVU	%YMM0, (%rax)
	VZEROUPPER_RETURN
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	VMOVU	%XMM0, -16(%rax,%rdx)
	VMOVU	%XMM0, (%rax)
	VZEROUPPER_RETURN
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rax,%rdx)
	movq	%rcx, (%rax)
	VZEROUPPER_RETURN
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rax,%rdx)
	movl	%ecx, (%rax)
	VZEROUPPER_RETURN
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rax,%rdx)
	movw	%cx, (%rax)
	VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))
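
/* Illustrative only, not part of the build: the small-size ladder above
   relies on overlapping stores so that no exact-length branch is needed.  A
   C sketch of the 8/4/2/1-byte cases, with memset_small_sketch as a
   hypothetical name, memcpy standing in for the unaligned integer stores,
   and the 64-bit multiply standing in for the MOVQ of the splatted XMM0.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   static void memset_small_sketch (unsigned char *p, unsigned char c,
                                    size_t n)
   {
     // n is 0..15 here (0..VEC_SIZE - 1 in general).  Pick the widest
     // integer store not larger than n and issue it at both ends of the
     // buffer; the two stores overlap (or coincide when n equals the
     // store width), so no branch on the exact length is needed.
     uint64_t v = 0x0101010101010101ull * c;
     if (n >= 8)
       {
         memcpy (p, &v, 8);
         memcpy (p + n - 8, &v, 8);
       }
     else if (n >= 4)
       {
         uint32_t w = (uint32_t) v;
         memcpy (p, &w, 4);
         memcpy (p + n - 4, &w, 4);
       }
     else if (n >= 2)
       {
         uint16_t h = (uint16_t) v;
         memcpy (p, &h, 2);
         memcpy (p + n - 2, &h, 2);
       }
     else if (n == 1)
       p[0] = c;
     // n == 0: nothing to store.
   }  */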