/* Wrapper implementations of vector math functions.
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
18 | |
/* SSE2 ISA version as wrapper to scalar.
   Apply the scalar double routine CALLEE to each of the two lanes of
   the vector argument in %xmm0; return the packed results in %xmm0.
   Stack layout (40 bytes; entry %rsp % 16 == 8, so the frame keeps
   %rsp 16-byte aligned at each call):
     (%rsp)    incoming vector argument (movaps needs the alignment)
     16(%rsp)  result of the first (low-lane) scalar call.
   Fix vs. previous revision: dropped the dead store of the second
   result to 24(%rsp) — it was never reloaded; the second result is
   merged straight from %xmm0 by unpcklpd.  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	movaps	%xmm0, (%rsp)
	call	\callee@PLT
	movsd	%xmm0, 16(%rsp)
	/* Second call operates on the high lane of the saved argument.  */
	movsd	8(%rsp), %xmm0
	call	\callee@PLT
	movsd	16(%rsp), %xmm1
	/* xmm1 = {low-lane result, high-lane result}.  */
	unpcklpd %xmm0, %xmm1
	movaps	%xmm1, %xmm0
	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	ret
.endm
36 | |
/* 2 argument SSE2 ISA version as wrapper to scalar.
   Apply the scalar routine CALLEE (taking two doubles in %xmm0/%xmm1)
   lane-wise to the two vector arguments; return packed results in %xmm0.
   Stack layout (56 bytes; keeps %rsp 16-byte aligned at each call):
     (%rsp)    first vector argument
     16(%rsp)  second vector argument
     32(%rsp)  result of the first (low-lane) scalar call.
   Fix vs. previous revision: dropped the dead store of the second
   result to 40(%rsp) — it was never reloaded; the second result is
   merged straight from %xmm0 by unpcklpd.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	cfi_adjust_cfa_offset(56)
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)
	call	\callee@PLT
	movsd	%xmm0, 32(%rsp)
	/* High lanes of both saved arguments for the second call.  */
	movsd	8(%rsp), %xmm0
	movsd	24(%rsp), %xmm1
	call	\callee@PLT
	movsd	32(%rsp), %xmm1
	/* xmm1 = {low-lane result, high-lane result}.  */
	unpcklpd %xmm0, %xmm1
	movaps	%xmm1, %xmm0
	addq	$56, %rsp
	cfi_adjust_cfa_offset(-56)
	ret
.endm
56 | |
/* 3 argument SSE2 ISA version as wrapper to scalar.
   For a scalar CALLEE of the form callee (x, ptr1, ptr2) — presumably
   sincos-style, writing two doubles through pointers (TODO confirm
   against callers).  Applies CALLEE to each lane of the vector argument
   in %xmm0 and stores lane results through the caller's two array
   pointers %rdi and %rsi (two doubles each).
   Stack layout (40 bytes; with the two pushes, %rsp stays 16-byte
   aligned at each call):
     (%rsp)    incoming vector argument
     16(%rsp)  scratch slot passed to CALLEE in %rsi
     24(%rsp)  scratch slot passed to CALLEE in %rdi.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	/* Keep the caller's output pointers in callee-saved registers so
	   they survive both calls.  */
	movq	%rdi, %rbp
	movq	%rsi, %rbx
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	leaq	16(%rsp), %rsi
	leaq	24(%rsp), %rdi
	movaps	%xmm0, (%rsp)
	call	\callee@PLT
	/* %rdi/%rsi are caller-saved; recompute the scratch pointers for
	   the second call.  */
	leaq	16(%rsp), %rsi
	leaq	24(%rsp), %rdi
	/* Copy the low-lane results out to the caller's arrays.  */
	movsd	24(%rsp), %xmm0
	movapd	(%rsp), %xmm1
	movsd	%xmm0, 0(%rbp)
	/* Bring the high lane of the saved argument into the low lane.  */
	unpckhpd %xmm1, %xmm1
	movsd	16(%rsp), %xmm0
	movsd	%xmm0, (%rbx)
	movapd	%xmm1, %xmm0
	call	\callee@PLT
	/* Copy the high-lane results out.  */
	movsd	24(%rsp), %xmm0
	movsd	%xmm0, 8(%rbp)
	movsd	16(%rsp), %xmm0
	movsd	%xmm0, 8(%rbx)
	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
97 | |
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.
   Splits the 256-bit argument in %ymm0 into two 128-bit halves, runs
   the 128-bit CALLEE on each, and reassembles the result in %ymm0.
   Frame: %rbp-based so %rsp can be realigned to 32 bytes for the
   aligned vmovaps/vmovapd spills.  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$32, %rsp
	/* Spill the high 128 bits; the low 128 bits stay in %xmm0 as the
	   first call's argument.  */
	vextractf128 $1, %ymm0, (%rsp)
	/* Clear the dirty upper YMM state before entering SSE code.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovapd	%xmm0, 16(%rsp)
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovapd	%xmm0, %xmm1
	vmovapd	16(%rsp), %xmm0
	/* ymm0 = {low-half result, high-half result}.  */
	vinsertf128 $1, %xmm1, %ymm0, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
123 | |
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.
   Splits both 256-bit arguments (%ymm0, %ymm1) into 128-bit halves,
   runs the 128-bit CALLEE on each pair, and reassembles the result in
   %ymm0.
   Stack layout (64 bytes, 32-byte aligned):
     (%rsp)    high half of second argument
     16(%rsp)  high half of first argument
     32(%rsp)  result of the first (low-half) call.  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm0, 16(%rsp)
	vextractf128 $1, %ymm1, (%rsp)
	/* Clear the dirty upper YMM state before entering SSE code.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)
	/* High halves of both arguments for the second call.  */
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	/* ymm0 = {low-half result, high-half result}.  */
	vinsertf128 $1, %xmm1, %ymm0, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
151 | |
/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.
   CALLEE here is the 128-bit fFF wrapper: it takes an %xmm0 argument
   plus two output pointers in %rdi/%rsi and writes 16 bytes through
   each.  The caller's output pointers (%rdi, %rsi) each receive
   32 bytes in total.
   Stack layout (48 bytes, 32-byte aligned after two pushes):
     (%rsp)/16(%rsp)  scratch outputs for the high-half call
     32(%rsp)         spilled high half of the argument.  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	pushq	%r13
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r13, 0)
	pushq	%r14
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r14, 0)
	subq	$48, %rsp
	/* Preserve the caller's output pointers across both calls.  */
	movq	%rsi, %r14
	movq	%rdi, %r13
	vextractf128 $1, %ymm0, 32(%rsp)
	/* Clear the dirty upper YMM state before entering SSE code.  */
	vzeroupper
	/* First call keeps the caller's original %rdi/%rsi, so the
	   low-half results land directly in the caller's arrays.  */
	call	HIDDEN_JUMPTARGET(\callee)
	/* High half of the argument, written to stack scratch slots.  */
	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi
	lea	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	/* Copy the high-half results to the upper 16 bytes of the
	   caller's arrays.  */
	vmovapd	(%rsp), %xmm0
	vmovapd	16(%rsp), %xmm1
	vmovapd	%xmm0, 16(%r13)
	vmovapd	%xmm1, 16(%r14)
	addq	$48, %rsp
	popq	%r14
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r14)
	popq	%r13
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r13)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
194 | |
/* AVX512 ISA version as wrapper to AVX2 ISA version.
   Splits the 512-bit argument in %zmm0 into two 256-bit halves, runs
   the 256-bit CALLEE on each, and reassembles the result in %zmm0.
   The AVX512 moves are hand-encoded as .byte sequences — presumably so
   the file assembles with tools lacking AVX512 support (TODO confirm).
   Stack layout (128 bytes, 64-byte aligned):
     (%rsp)    incoming 512-bit argument
     64(%rsp)  low-half result; 96(%rsp) high-half result.  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$128, %rsp
	/* Below is encoding for vmovups %zmm0, (%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x04
	.byte	0x24
	/* Low 256 bits of the argument (already in %ymm0) for the first
	   call.  */
	vmovupd	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 64(%rsp)
	/* High 256 bits for the second call.  */
	vmovupd	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 96(%rsp)
	/* Reload both result halves as one 512-bit value.
	   Below is encoding for vmovups 64(%rsp), %zmm0.  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x10
	.byte	0x44
	.byte	0x24
	.byte	0x01
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
234 | |
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.
   Splits both 512-bit arguments (%zmm0, %zmm1) into 256-bit halves,
   runs the 256-bit CALLEE on each pair, and reassembles the result in
   %zmm0.  AVX512 moves are hand-encoded as .byte sequences (see
   WRAPPER-style note: presumably for assemblers without AVX512).
   Stack layout (192 bytes, 64-byte aligned):
     (%rsp)     first argument;  64(%rsp)  second argument
     128(%rsp)  low-half result; 160(%rsp) high-half result.  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$192, %rsp
	/* Below is encoding for vmovups %zmm0, (%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x04
	.byte	0x24
	/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x4c
	.byte	0x24
	.byte	0x01
	/* Low 256 bits of both arguments for the first call.  */
	vmovupd	(%rsp), %ymm0
	vmovupd	64(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 128(%rsp)
	/* High 256 bits of both arguments for the second call.  */
	vmovupd	32(%rsp), %ymm0
	vmovupd	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 160(%rsp)
	/* Reload both result halves as one 512-bit value.
	   Below is encoding for vmovups 128(%rsp), %zmm0.  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x10
	.byte	0x44
	.byte	0x24
	.byte	0x02
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
285 | |
/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.
   CALLEE here is the 256-bit fFF wrapper: it takes a %ymm0 argument
   plus two output pointers in %rdi/%rsi and writes 32 bytes through
   each.  The caller's output pointers (%rdi, %rsi) each receive
   64 bytes in total.
   Stack layout (176 bytes below the two pushes; total keeps the
   64-byte alignment from andq):
     (%rsp)            spilled 512-bit argument
     64(%rsp)/96(%rsp) scratch outputs for the high-half call.  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	pushq	%r12
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r12, 0)
	pushq	%r13
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%r13, 0)
	subq	$176, %rsp
	/* Preserve the caller's output pointers across both calls.  */
	movq	%rsi, %r13
	/* Below is encoding for vmovups %zmm0, (%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x04
	.byte	0x24
	movq	%rdi, %r12
	/* NOTE(review): %ymm0 already holds the low 256 bits of %zmm0 at
	   this point (nothing above modifies it), so this reload appears
	   redundant but is harmless.  */
	vmovupd	(%rsp), %ymm0
	/* First call keeps the caller's original %rdi/%rsi, so the
	   low-half results land directly in the caller's arrays.  */
	call	HIDDEN_JUMPTARGET(\callee)
	/* High 256 bits of the argument, written to stack scratch.  */
	vmovupd	32(%rsp), %ymm0
	lea	64(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	/* Copy the high-half results to the upper 32 bytes of the
	   caller's arrays.  */
	vmovupd	64(%rsp), %ymm0
	vmovupd	96(%rsp), %ymm1
	vmovupd	%ymm0, 32(%r12)
	vmovupd	%ymm1, 32(%r13)
	/* No vector return value: clear upper state before returning to
	   (possibly SSE) caller code.  */
	vzeroupper
	addq	$176, %rsp
	popq	%r13
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r13)
	popq	%r12
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%r12)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
336 | |