/* memset optimized with AVX512 for KNL hardware.
   Copyright (C) 2015-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


#include "asm-syntax.h"
#ifndef MEMSET
# define MEMSET __memset_avx512_no_vzeroupper
# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
#endif

	.section .text.avx512,"ax",@progbits
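/* __memset_chk verifies that the destination buffer size in %rcx is at
   least the requested length in %rdx and jumps to __chk_fail on overflow;
   otherwise it falls through into memset.  */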
#if defined PIC
ENTRY (MEMSET_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif

ENTRY (MEMSET)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
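	/* Broadcast the fill byte: zero %xmm0, move the byte into %xmm1,
	   then use %xmm0 as an all-zero shuffle control so that vpshufb
	   copies byte 0 of %xmm1 into every byte of %xmm0.  %rsi is set
	   to one past the end of the destination and %rax keeps the
	   return value (the original destination).  */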
	vpxor	%xmm0, %xmm0, %xmm0
	vmovd	%esi, %xmm1
	lea	(%rdi, %rdx), %rsi
	mov	%rdi, %rax
	vpshufb	%xmm0, %xmm1, %xmm0
	cmp	$16, %rdx
	jb	L(less_16bytes)
	cmp	$512, %rdx
	vbroadcastss	%xmm0, %zmm2
	ja	L(512bytesormore)
	cmp	$256, %rdx
	jb	L(less_256bytes)
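	/* 256 to 512 bytes: vbroadcastss replicated the low dword of
	   %xmm0, so %zmm2 holds 64 copies of the fill byte.  Store the
	   first 256 bytes from %rdi and the last 256 bytes relative to
	   the end pointer in %rsi; the two groups overlap for lengths
	   below 512.  */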
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm2, 0xC0(%rdi)
	vmovups	%zmm2, -0x100(%rsi)
	vmovups	%zmm2, -0xC0(%rsi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

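/* Below 256 bytes each size class writes one block at the start of the
   buffer and one block at the end; the two ranges overlap in the middle,
   so every length in the class is covered without a loop.  */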
L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm2, -0x20(%rsi)
	ret

L(less_32bytes):
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	vmovq	%xmm0, (%rdi)
	vmovq	%xmm0, -0x08(%rsi)
	ret

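/* Below 8 bytes the low dword of the pattern is moved to %ecx and the
   remaining cases use scalar stores at the start and end of the buffer.  */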
L(less_8bytes):
	vmovd	%xmm0, %ecx
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	%ecx, (%rdi)
	mov	%ecx, -0x04(%rsi)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	%cx, (%rdi)
	mov	%cx, -0x02(%rsi)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

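/* Over 512 bytes: lengths above half of the shared cache size take the
   non-temporal path; lengths above 1024 use the 64-byte aligned store
   loop; 512 to 1024 bytes fall through to the unrolled stores below.  */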
L(512bytesormore):
	mov	__x86_shared_cache_size_half(%rip), %rcx
	cmp	%rcx, %rdx
	ja	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)

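	/* 512 to 1024 bytes: store the first 512 bytes from %rdi and the
	   last 512 bytes relative to %rsi; the two halves overlap for
	   lengths below 1024.  */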
	vmovups	%zmm2, (%rdi)
	vmovups	%zmm2, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm2, 0xC0(%rdi)
	vmovups	%zmm2, 0x100(%rdi)
	vmovups	%zmm2, 0x140(%rdi)
	vmovups	%zmm2, 0x180(%rdi)
	vmovups	%zmm2, 0x1C0(%rdi)
	vmovups	%zmm2, -0x200(%rsi)
	vmovups	%zmm2, -0x1C0(%rsi)
	vmovups	%zmm2, -0x180(%rsi)
	vmovups	%zmm2, -0x140(%rsi)
	vmovups	%zmm2, -0x100(%rsi)
	vmovups	%zmm2, -0xC0(%rsi)
	vmovups	%zmm2, -0x80(%rsi)
	vmovups	%zmm2, -0x40(%rsi)
	ret

/* Align on 64 and loop with aligned stores.  */
L(1024bytesormore):
	sub	$0x100, %rsi
	vmovups	%zmm2, (%rax)
	and	$-0x40, %rdi
	add	$0x40, %rdi

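	/* The first 64 bytes were stored unaligned above and %rdi was
	   rounded up to the next 64-byte boundary.  %rsi was pulled back
	   by 0x100, so when the loop exits at most 256 bytes remain and
	   the four unaligned tail stores cover them.  */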
L(gobble_256bytes_loop):
	vmovaps	%zmm2, (%rdi)
	vmovaps	%zmm2, 0x40(%rdi)
	vmovaps	%zmm2, 0x80(%rdi)
	vmovaps	%zmm2, 0xC0(%rdi)
	add	$0x100, %rdi
	cmp	%rsi, %rdi
	jb	L(gobble_256bytes_loop)
	vmovups	%zmm2, (%rsi)
	vmovups	%zmm2, 0x40(%rsi)
	vmovups	%zmm2, 0x80(%rsi)
	vmovups	%zmm2, 0xC0(%rsi)
	ret

/* Align on 128 and loop with non-temporal stores.  */
L(preloop_large):
	and	$-0x80, %rdi
	add	$0x80, %rdi
	vmovups	%zmm2, (%rax)
	vmovups	%zmm2, 0x40(%rax)
	sub	$0x200, %rsi

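	/* The first 128 bytes were stored unaligned above and %rdi is now
	   128-byte aligned.  Each iteration writes 512 bytes with
	   non-temporal stores; %rsi was pulled back by 0x200, so at most
	   512 bytes remain on exit.  The sfence orders the non-temporal
	   stores before the regular stores that fill the tail.  */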
L(gobble_512bytes_nt_loop):
	vmovntdq %zmm2, (%rdi)
	vmovntdq %zmm2, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm2, 0xC0(%rdi)
	vmovntdq %zmm2, 0x100(%rdi)
	vmovntdq %zmm2, 0x140(%rdi)
	vmovntdq %zmm2, 0x180(%rdi)
	vmovntdq %zmm2, 0x1C0(%rdi)
	add	$0x200, %rdi
	cmp	%rsi, %rdi
	jb	L(gobble_512bytes_nt_loop)
	sfence
	vmovups	%zmm2, (%rsi)
	vmovups	%zmm2, 0x40(%rsi)
	vmovups	%zmm2, 0x80(%rsi)
	vmovups	%zmm2, 0xC0(%rsi)
	vmovups	%zmm2, 0x100(%rsi)
	vmovups	%zmm2, 0x140(%rsi)
	vmovups	%zmm2, 0x180(%rsi)
	vmovups	%zmm2, 0x1C0(%rsi)
	ret
END (MEMSET)
#endif