/* Wrapper implementations of vector math functions.
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* SSE2 ISA version as wrapper to scalar.  */
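/* In C terms, each one-argument wrapper below behaves roughly like the
   following sketch (an illustration only, assuming GCC vector extensions;
   "callee" stands for whatever scalar routine the macro is given):

     typedef float v4sf __attribute__ ((vector_size (16)));
     extern float callee (float);

     v4sf
     wrapper_sse2 (v4sf x)
     {
       v4sf r;
       for (int i = 0; i < 4; i++)
	 r[i] = callee (x[i]);
       return r;
     }

   The input vector is spilled to the stack, the scalar routine is called
   once per lane, and the four results are packed back together.  */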
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	cfi_adjust_cfa_offset (40)
	movaps	%xmm0, (%rsp)
	call	\callee@PLT
	movss	%xmm0, 16(%rsp)
	movss	4(%rsp), %xmm0
	call	\callee@PLT
	movss	%xmm0, 20(%rsp)
	movss	8(%rsp), %xmm0
	call	\callee@PLT
	movss	%xmm0, 24(%rsp)
	movss	12(%rsp), %xmm0
	call	\callee@PLT
	movss	16(%rsp), %xmm3
	movss	20(%rsp), %xmm2
	movss	24(%rsp), %xmm1
	movss	%xmm0, 28(%rsp)
	/* Interleave the four scalar results back into one vector:
	   %xmm3 = {r0,r2,..}, %xmm2 = {r1,r3,..}, then {r0,r1,r2,r3}.  */
	unpcklps %xmm1, %xmm3
	unpcklps %xmm0, %xmm2
	unpcklps %xmm2, %xmm3
	movaps	%xmm3, %xmm0
	addq	$40, %rsp
	cfi_adjust_cfa_offset (-40)
	ret
.endm

/* 2-argument SSE2 ISA version as wrapper to scalar.  */
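/* As above, but for a two-argument scalar callee (sketch only, same
   assumptions; "callee" is again a placeholder):

     extern float callee (float, float);

     v4sf
     wrapper_sse2_ff (v4sf x, v4sf y)
     {
       v4sf r;
       for (int i = 0; i < 4; i++)
	 r[i] = callee (x[i], y[i]);
       return r;
     }
 */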
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	cfi_adjust_cfa_offset (56)
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)
	call	\callee@PLT
	movss	%xmm0, 32(%rsp)
	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	\callee@PLT
	movss	%xmm0, 36(%rsp)
	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	\callee@PLT
	movss	%xmm0, 40(%rsp)
	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	\callee@PLT
	movss	32(%rsp), %xmm3
	movss	36(%rsp), %xmm2
	movss	40(%rsp), %xmm1
	movss	%xmm0, 44(%rsp)
	/* Interleave the four scalar results, as above.  */
	unpcklps %xmm1, %xmm3
	unpcklps %xmm0, %xmm2
	unpcklps %xmm2, %xmm3
	movaps	%xmm3, %xmm0
	addq	$56, %rsp
	cfi_adjust_cfa_offset (-56)
	ret
.endm

/* 3-argument (one input, two output pointers) SSE2 ISA version as
   wrapper to scalar.  */
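/* Here the scalar callee returns two results through pointers, in the
   style of sincosf.  A sketch (illustration only):

     extern void callee (float, float *, float *);

     void
     wrapper_sse2_fFF (v4sf x, float *out1, float *out2)
     {
       for (int i = 0; i < 4; i++)
	 callee (x[i], &out1[i], &out2[i]);
     }

   The two destination pointers are kept in %rbp and %rbx across the
   calls; each scalar call writes into the scratch slots 28(%rsp) and
   24(%rsp), which are then copied out to the caller's arrays.  */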
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	movq	%rdi, %rbp
	movq	%rsi, %rbx
	subq	$40, %rsp
	cfi_adjust_cfa_offset (40)
	/* Pass stack scratch slots as the callee's two output pointers.  */
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movaps	%xmm0, (%rsp)
	call	\callee@PLT
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, 0(%rbp)
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0	/* 0x55: broadcast lane 1.  */
	call	\callee@PLT
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)
	movaps	%xmm1, %xmm0
	unpckhps %xmm1, %xmm0	/* Move lane 2 into element 0.  */
	call	\callee@PLT
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1	/* 0xff: broadcast lane 3.  */
	movss	%xmm0, 8(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)
	movaps	%xmm1, %xmm0
	call	\callee@PLT
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)
	addq	$40, %rsp
	cfi_adjust_cfa_offset (-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
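/* The AVX wrappers do not fall back to scalar code: they split the
   256-bit vector into two 128-bit halves and invoke the SSE
   implementation on each half.  A sketch with intrinsics (illustration
   only, assuming <immintrin.h>; "sse_callee" is a placeholder for the
   128-bit variant):

     #include <immintrin.h>
     extern __m128 sse_callee (__m128);

     __m256
     wrapper_avx (__m256 x)
     {
       __m128 lo = sse_callee (_mm256_castps256_ps128 (x));
       __m128 hi = sse_callee (_mm256_extractf128_ps (x, 1));
       return _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
     }

   The vzeroupper before the first call avoids AVX-to-SSE transition
   penalties inside the legacy-SSE callee.  */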
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$32, %rsp
	/* Save the high half; the low half is still in %xmm0.  */
	vextractf128 $1, %ymm0, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 16(%rsp)
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 2-argument AVX/AVX2 ISA version as wrapper to SSE ISA version.  */
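/* Two-argument form of the same halving scheme: the high halves of both
   %ymm inputs are stashed on the stack, the SSE callee runs first on the
   low halves and then on the high halves, and the two 128-bit results
   are recombined with vinsertf128.  */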
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm0, 16(%rsp)
	vextractf128 $1, %ymm1, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 3-argument (one input, two output pointers) AVX/AVX2 ISA version as
   wrapper to SSE ISA version.  */
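/* sincosf-style form of the halving scheme.  The first SSE call stores
   the low halves of both results directly through the caller's pointers
   (preserved in %r13/%r14); the second call writes to stack scratch,
   which is then copied to offset 16 of each destination array.  */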
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	pushq	%r13
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r13, 0)
	pushq	%r14
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r14, 0)
	subq	$48, %rsp
	movq	%rsi, %r14
	vmovaps	%ymm0, (%rsp)
	movq	%rdi, %r13
	/* Copy the high half of the input to 32(%rsp).  */
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm1, 32(%rsp)
	vzeroupper
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi
	lea	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm0, 16(%r13)
	vmovaps	%xmm1, 16(%r14)
	addq	$48, %rsp
	popq	%r14
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r14)
	popq	%r13
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r13)
	ret
.endm

/* AVX512 ISA version as wrapper to AVX2 ISA version.  */
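/* The AVX512 wrappers apply the same splitting one level up: the 512-bit
   input is halved and the AVX2 implementation handles each 256-bit part.
   The 512-bit moves are hand-encoded as .byte sequences so that the file
   still assembles with assemblers that lack AVX512 support; the intended
   instruction is spelled out in a comment next to each sequence.  In C
   terms, roughly (sketch only; "avx2_callee" is a placeholder for the
   256-bit variant):

     #include <immintrin.h>
     extern __m256 avx2_callee (__m256);

     __m512
     wrapper_avx512 (__m512 x)
     {
       union { __m512 z; __m256 y[2]; } u = { .z = x };
       u.y[0] = avx2_callee (u.y[0]);
       u.y[1] = avx2_callee (u.y[1]);
       return u.z;
     }
 */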
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$128, %rsp
	/* Below is encoding for vmovups %zmm0, (%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x04
	.byte	0x24
	vmovups	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 64(%rsp)
	vmovups	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 96(%rsp)
	/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x10
	.byte	0x44
	.byte	0x24
	.byte	0x01
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 2-argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
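/* Two-argument AVX512 form: both %zmm inputs are spilled to the stack,
   the AVX2 callee runs on the two low 256-bit halves and then on the two
   high halves, and the pair of %ymm results is reloaded as one %zmm.  */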
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$192, %rsp
	/* Below is encoding for vmovups %zmm0, (%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x04
	.byte	0x24
	/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x4c
	.byte	0x24
	.byte	0x01
	vmovups	(%rsp), %ymm0
	vmovups	64(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 128(%rsp)
	vmovups	32(%rsp), %ymm0
	vmovups	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 160(%rsp)
	/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x10
	.byte	0x44
	.byte	0x24
	.byte	0x02
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm

/* 3-argument (one input, two output pointers) AVX512 ISA version as
   wrapper to AVX2 ISA version.  */
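/* sincosf-style AVX512 form, mirroring WRAPPER_IMPL_AVX_fFF: the first
   AVX2 call writes the low halves through the caller's pointers (saved
   in %r12/%r13); the second call writes to stack scratch, which is then
   copied to offset 32 of each destination array.  */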
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	pushq	%r12
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r12, 0)
	pushq	%r13
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r13, 0)
	subq	$176, %rsp
	movq	%rsi, %r13
	/* Below is encoding for vmovaps %zmm0, (%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x29
	.byte	0x04
	.byte	0x24
	movq	%rdi, %r12
	vmovaps	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	32(%rsp), %ymm0
	lea	64(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	vmovaps	%ymm0, 32(%r12)
	vmovaps	%ymm1, 32(%r13)
	addq	$176, %rsp
	popq	%r13
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r13)
	popq	%r12
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r12)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm