/* Wrapper implementations of vector math functions.
   Copyright (C) 2014-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* SSE2 ISA version as wrapper to scalar.  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	cfi_adjust_cfa_offset (40)
	/* Spill the four input lanes, then run each one through the
	   scalar callee, collecting the results at 16(%rsp)..28(%rsp).
	   Lane 0 is already in the low element of %xmm0.  */
	movaps	%xmm0, (%rsp)
	call	JUMPTARGET(\callee)
	movss	%xmm0, 16(%rsp)
	movss	4(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 20(%rsp)
	movss	8(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 24(%rsp)
	movss	12(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	/* Repack the four scalar results into %xmm0.  */
	movss	16(%rsp), %xmm3
	movss	20(%rsp), %xmm2
	movss	24(%rsp), %xmm1
	movss	%xmm0, 28(%rsp)
	unpcklps %xmm1, %xmm3
	unpcklps %xmm0, %xmm2
	unpcklps %xmm2, %xmm3
	movaps	%xmm3, %xmm0
	addq	$40, %rsp
	cfi_adjust_cfa_offset (-40)
	ret
.endm
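
/* Example use, for illustration only (glibc's actual call sites are
   the svml_s_*4_core.S files; e.g. svml_s_sinf4_core.S builds the
   4-lane sinf on top of the scalar sinf roughly like this):

	ENTRY (_ZGVbN4v_sinf_sse2)
	WRAPPER_IMPL_SSE2 sinf
	END (_ZGVbN4v_sinf_sse2)
*/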

/* 2 argument SSE2 ISA version as wrapper to scalar.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	cfi_adjust_cfa_offset (56)
	/* Spill both vector arguments, then call the scalar callee on
	   each pair of lanes, collecting results at 32(%rsp)..44(%rsp).  */
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)
	call	JUMPTARGET(\callee)
	movss	%xmm0, 32(%rsp)
	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	movss	%xmm0, 36(%rsp)
	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	movss	%xmm0, 40(%rsp)
	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	/* Repack the four scalar results into %xmm0.  */
	movss	32(%rsp), %xmm3
	movss	36(%rsp), %xmm2
	movss	40(%rsp), %xmm1
	movss	%xmm0, 44(%rsp)
	unpcklps %xmm1, %xmm3
	unpcklps %xmm0, %xmm2
	unpcklps %xmm2, %xmm3
	movaps	%xmm3, %xmm0
	addq	$56, %rsp
	cfi_adjust_cfa_offset (-56)
	ret
.endm
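
/* Example use, for illustration only: a 4-lane powf wrapper over the
   scalar powf, in the style of svml_s_powf4_core.S:

	ENTRY (_ZGVbN4vv_powf_sse2)
	WRAPPER_IMPL_SSE2_ff powf
	END (_ZGVbN4vv_powf_sse2)
*/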

/* 3 argument SSE2 ISA version as wrapper to scalar.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	/* Preserve the two output pointers across the calls.  */
	movq	%rdi, %rbp
	movq	%rsi, %rbx
	subq	$40, %rsp
	cfi_adjust_cfa_offset (40)
	/* Lane 0: point the callee at the scratch slots 28(%rsp) (via
	   %rdi) and 24(%rsp) (via %rsi) and save the vector input.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movaps	%xmm0, (%rsp)
	call	JUMPTARGET(\callee)
	/* Copy out the lane-0 results; set up lane 1 by broadcasting
	   element 1 of the saved input.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, (%rbp)
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0
	call	JUMPTARGET(\callee)
	/* Copy out the lane-1 results; move element 2 of the saved input
	   into the low slot for lane 2.  */
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)
	movaps	%xmm1, %xmm0
	unpckhps %xmm1, %xmm0
	call	JUMPTARGET(\callee)
	/* Copy out the lane-2 results; broadcast element 3 of the saved
	   input for the final call.  */
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1
	movss	%xmm0, 8(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)
	movaps	%xmm1, %xmm0
	call	JUMPTARGET(\callee)
	/* Copy out the lane-3 results and unwind.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)
	addq	$40, %rsp
	cfi_adjust_cfa_offset (-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
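
/* Example use, for illustration only: a 4-lane sincosf wrapper over
   the scalar sincosf, in the style of svml_s_sincosf4_core.S (the
   exact entry name varies across glibc versions):

	ENTRY (_ZGVbN4vvv_sincosf_sse2)
	WRAPPER_IMPL_SSE2_fFF sincosf
	END (_ZGVbN4vvv_sincosf_sse2)
*/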

/* AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$32, %rsp
	/* Save the upper half of %ymm0, run the 4-lane callee on the
	   lower half and then on the upper half, and reassemble.  */
	vextractf128 $1, %ymm0, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 16(%rsp)
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
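
/* Example use, for illustration only: an 8-lane sinf built on the
   4-lane kernel, in the style of svml_s_sinf8_core.S:

	ENTRY (_ZGVdN8v_sinf)
	WRAPPER_IMPL_AVX _ZGVbN4v_sinf
	END (_ZGVdN8v_sinf)
*/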

/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$64, %rsp
	/* Save the upper halves of both arguments, run the 4-lane callee
	   on the lower halves and then on the upper halves, and merge.  */
	vextractf128 $1, %ymm0, 16(%rsp)
	vextractf128 $1, %ymm1, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
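
/* Example use, for illustration only: an 8-lane powf built on the
   4-lane kernel, in the style of svml_s_powf8_core.S:

	ENTRY (_ZGVdN8vv_powf)
	WRAPPER_IMPL_AVX_ff _ZGVbN4vv_powf
	END (_ZGVdN8vv_powf)
*/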

/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	/* The CFA is now %rbp based and %rsp has just been realigned, so
	   the save slots of %r13/%r14 are not at a fixed offset from the
	   CFA; as in the AVX512 variant below, no CFI is emitted for
	   them (and the CFA offset must not be adjusted here).  */
	pushq	%r13
	pushq	%r14
	subq	$48, %rsp
	/* Preserve the two output pointers across the calls.  */
	movq	%rsi, %r14
	movq	%rdi, %r13
	/* Save the full input and copy its upper half to 32(%rsp), since
	   (%rsp)..31(%rsp) double as the scratch output area below.  */
	vmovaps	%ymm0, (%rsp)
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm1, 32(%rsp)
	vzeroupper
	/* Lower half: the callee stores directly through the original
	   %rdi and %rsi.  */
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* Upper half: store to the scratch area, then copy out.  */
	vmovaps	32(%rsp), %xmm0
	leaq	(%rsp), %rdi
	leaq	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm0, 16(%r13)
	vmovaps	%xmm1, 16(%r14)
	addq	$48, %rsp
	popq	%r14
	cfi_restore (%r14)
	popq	%r13
	cfi_restore (%r13)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
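
/* Example use, for illustration only: an 8-lane sincosf built on the
   4-lane kernel, in the style of svml_s_sincosf8_core.S (entry names
   vary across glibc versions):

	ENTRY (_ZGVdN8vvv_sincosf)
	WRAPPER_IMPL_AVX_fFF _ZGVbN4vvv_sincosf
	END (_ZGVdN8vvv_sincosf)
*/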

/* AVX512 ISA version as wrapper to AVX2 ISA version.  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$128, %rsp
	/* Save the full %zmm0, run the 8-lane callee on each 256-bit
	   half, and reload the combined results from 64(%rsp).  */
	vmovups	%zmm0, (%rsp)
	vmovups	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 64(%rsp)
	vmovups	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 96(%rsp)
	vmovups	64(%rsp), %zmm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
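
/* Example use, for illustration only: a 16-lane sinf built on the
   8-lane kernel, in the style of svml_s_sinf16_core.S:

	ENTRY (_ZGVeN16v_sinf)
	WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
	END (_ZGVeN16v_sinf)
*/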

/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$192, %rsp
	/* Save both %zmm arguments, run the 8-lane callee on the lower
	   256-bit halves and then on the upper halves, and merge.  */
	vmovups	%zmm0, (%rsp)
	vmovups	%zmm1, 64(%rsp)
	vmovups	(%rsp), %ymm0
	vmovups	64(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 128(%rsp)
	vmovups	32(%rsp), %ymm0
	vmovups	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 160(%rsp)
	vmovups	128(%rsp), %zmm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
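
/* Example use, for illustration only: a 16-lane powf built on the
   8-lane kernel, in the style of svml_s_powf16_core.S:

	ENTRY (_ZGVeN16vv_powf)
	WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
	END (_ZGVeN16vv_powf)
*/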

/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	/* As in the AVX variant, the realigned save slots of %r12/%r13
	   cannot be described relative to the CFA, so they carry no
	   CFI.  */
	pushq	%r12
	pushq	%r13
	subq	$176, %rsp
	/* Preserve the two output pointers across the calls.  */
	movq	%rsi, %r13
	movq	%rdi, %r12
	vmovaps	%zmm0, (%rsp)
	/* Lower half: the callee stores directly through the original
	   %rdi and %rsi.  */
	vmovaps	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* Upper half: store to the scratch area, then copy out.  */
	vmovaps	32(%rsp), %ymm0
	leaq	64(%rsp), %rdi
	leaq	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	vmovaps	%ymm0, 32(%r12)
	vmovaps	%ymm1, 32(%r13)
	addq	$176, %rsp
	popq	%r13
	cfi_restore (%r13)
	popq	%r12
	cfi_restore (%r12)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
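
/* Example use, for illustration only: a 16-lane sincosf built on the
   8-lane kernel, in the style of svml_s_sincosf16_core.S (entry names
   vary across glibc versions):

	ENTRY (_ZGVeN16vvv_sincosf)
	WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
	END (_ZGVeN16vvv_sincosf)
*/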