1 | /* Common float/double wrapper implementations of vector math |
2 | functions. |
3 | Copyright (C) 2022-2023 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
/* One-vector-argument AVX/AVX2 entry point built on the SSE ISA version.

   The 256-bit input in ymm0 is split into two 128-bit halves.  The
   callee is invoked once per half, with the argument and the result
   both in xmm0, and the two results are rejoined in ymm0.

   Parameter:
     callee - symbol handed to HIDDEN_JUMPTARGET; called twice.

   Stack: one 32-byte slot at a 32-byte aligned rsp.  Bytes 0-15 hold
   the low input half and later the first result; bytes 16-31 hold the
   high input half.
   Clobbers: ymm1, plus whatever the callee clobbers.  */
.macro WRAPPER_IMPL_AVX callee
	/* Frame setup.  The CFA is tracked through rbp so that rsp can be
	   realigned below without further CFI adjustments.  */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Reserve the 32-byte aligned spill slot.  */
	andq	$-32, %rsp
	addq	$-32, %rsp
	/* Spill the complete input.  xmm0 still holds the low half, which
	   is the argument of the first call.  */
	vmovaps	%ymm0, (%rsp)
	/* Clear the upper ymm halves before entering the SSE callee; the
	   low 128 bits of every register are left as they are.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	/* Park the first result over the already consumed low input half,
	   then fetch the high input half for the second call.  */
	vmovaps	%xmm0, (%rsp)
	vmovaps	16(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* ymm0 = first result in the low lane, second result (currently
	   in xmm0) in the high lane.  Possible future tweak, not tried
	   here: a `vpblend` based merge that reads ymm1 from -16(rsp).  */
	vmovaps	(%rsp), %xmm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	/* Frame teardown: drop the aligned area and restore rbp.  */
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
47 | |
/* Two-vector-argument AVX/AVX2 entry point built on the SSE ISA version.

   The 256-bit inputs in ymm0 and ymm1 are each split into two 128-bit
   halves.  The callee is invoked once per pair of halves, with the
   arguments in xmm0 and xmm1 and the result in xmm0, and the two
   results are rejoined in ymm0.

   Parameter:
     callee - symbol handed to HIDDEN_JUMPTARGET; called twice.

   Stack: 64 bytes at a 32-byte aligned rsp.
     0(%rsp)  - low half of the first input, later the first result
     16(%rsp) - high half of the first input
     32(%rsp) - low half of the second input
     48(%rsp) - high half of the second input
   Clobbers: ymm1, plus whatever the callee clobbers.  */
.macro WRAPPER_IMPL_AVX_ff callee
	/* Frame setup.  The CFA is tracked through rbp so that rsp can be
	   realigned below without further CFI adjustments.  */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Reserve two 32-byte aligned spill slots.  */
	andq	$-32, %rsp
	addq	$-64, %rsp
	/* Spill both inputs.  xmm0 and xmm1 still hold the low halves,
	   which are the arguments of the first call.  */
	vmovaps	%ymm0, (%rsp)
	vmovaps	%ymm1, 32(%rsp)
	/* Clear the upper ymm halves before entering the SSE callee; the
	   low 128 bits of every register are left as they are.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	/* Park the first result over the already consumed low half of the
	   first input, then load the high halves of both inputs as the
	   arguments of the second call.  */
	vmovaps	%xmm0, (%rsp)
	vmovaps	16(%rsp), %xmm0
	vmovaps	48(%rsp), %xmm1
	call	HIDDEN_JUMPTARGET(\callee)
	/* ymm0 = first result in the low lane, second result (currently
	   in xmm0) in the high lane.  Possible future tweak, not tried
	   here: a `vpblend` based merge that reads ymm1 from -16(rsp).  */
	vmovaps	(%rsp), %xmm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	/* Frame teardown: drop the aligned area and restore rbp.  */
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
77 | |
/* Three-argument AVX/AVX2 entry point built on the SSE ISA version.

   The 256-bit input in ymm0 is split into two 128-bit halves and the
   callee is invoked once per half.  The first call receives the low
   half in xmm0 together with the incoming rdi and rsi.  The second
   call receives the high half in xmm0 and both pointers advanced by
   16 bytes.  Nothing is merged into a vector register afterwards
   (presumably the callee writes its results through the two pointers;
   confirm against the callers).

   Parameter:
     callee - symbol handed to HIDDEN_JUMPTARGET; called twice.

   Register roles:
     rbx - copy of the incoming rdi, live across the first call
     r14 - copy of the incoming rsi, live across the first call

   Stack: a 32-byte slot at a 32-byte aligned address holding the
   input, with the saved rbx and r14 pushed directly below it.  After
   the two pushes rsp is 16 bytes below the slot, so the high input
   half is at 32(%rsp) and rsp is still 16-byte aligned at each call.

   NOTE(review): the saves of rbx and r14 are not described by CFI, so
   an unwinder cannot recover them from this frame; confirm whether
   that matters for the callers.  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	/* From here on rsp is realigned and pushed to without any further
	   CFI adjustment, so the CFA has to be tracked through rbp.  */
	cfi_def_cfa_register (%rbp)
	/* Reserve the 32-byte aligned spill slot and save the input.  */
	andq	$-32, %rsp
	subq	$32, %rsp
	vmovaps	%ymm0, (%rsp)
	/* Keep the two pointer arguments in callee-saved registers so
	   that they survive the first call.  */
	pushq	%rbx
	pushq	%r14
	movq	%rdi, %rbx
	movq	%rsi, %r14
	/* Clear the upper ymm halves before entering the SSE callee; the
	   low 128 bits of ymm0 (the first argument) are left as they
	   are.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	/* Second call: high input half (slot + 16, which is 32(%rsp)
	   because of the two pushes) and both pointers moved on by the
	   16 bytes covered by the first call.  */
	vmovaps	32(%rsp), %xmm0
	leaq	16(%rbx), %rdi
	leaq	16(%r14), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	/* Restore the callee-saved registers, then tear down the frame.  */
	popq	%r14
	popq	%rbx
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
106 | |
/* One-vector-argument AVX512 entry point built on the AVX2 ISA version.

   The 512-bit input in zmm0 is split into two 256-bit halves.  The
   callee is invoked once per half, with the argument and the result
   both in ymm0, and the two results are rejoined in zmm0.

   Parameter:
     callee - symbol handed to HIDDEN_JUMPTARGET; called twice.

   Stack: one 64-byte slot at a 64-byte aligned rsp.  Bytes 0-31 hold
   the low input half and later the first result; bytes 32-63 hold the
   high input half.  Every access below is naturally aligned, so the
   aligned move form is used throughout.
   Clobbers: zmm1, plus whatever the callee clobbers.  */
.macro WRAPPER_IMPL_AVX512 callee
	/* Frame setup.  The CFA is tracked through rbp so that rsp can be
	   realigned below without further CFI adjustments.  */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Reserve the 64-byte aligned spill slot.  */
	andq	$-64, %rsp
	subq	$64, %rsp
	/* Spill the complete input.  ymm0 is the low half of zmm0, so the
	   argument of the first call is already in place.  */
	vmovaps	%zmm0, (%rsp)
	call	HIDDEN_JUMPTARGET(\callee)
	/* Park the first result over the already consumed low input half,
	   then fetch the high input half for the second call.  */
	vmovaps	%ymm0, (%rsp)
	vmovaps	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* zmm0 = first result in the low 256 bits, second result
	   (currently in ymm0) in the high 256 bits.  */
	vmovaps	(%rsp), %ymm1
	vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
	/* Frame teardown: drop the aligned area and restore rbp.  */
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
132 | |
/* Two-vector-argument AVX512 entry point built on the AVX2 ISA version.

   The 512-bit inputs in zmm0 and zmm1 are each split into two 256-bit
   halves.  The callee is invoked once per pair of halves, with the
   arguments in ymm0 and ymm1 and the result in ymm0, and the two
   results are rejoined in zmm0.

   Parameter:
     callee - symbol handed to HIDDEN_JUMPTARGET; called twice.

   Stack: 128 bytes at a 64-byte aligned rsp.
     0(%rsp)  - low half of the first input, later the first result
     32(%rsp) - high half of the first input
     64(%rsp) - low half of the second input
     96(%rsp) - high half of the second input
   Every access below is naturally aligned, so the aligned move form
   is used throughout.
   Clobbers: zmm1, plus whatever the callee clobbers.  */
.macro WRAPPER_IMPL_AVX512_ff callee
	/* Frame setup.  The CFA is tracked through rbp so that rsp can be
	   realigned below without further CFI adjustments.  */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Reserve two 64-byte aligned spill slots.  Adding -128 rather
	   than subtracting 128 lets the constant fit a sign-extended
	   8-bit immediate.  */
	andq	$-64, %rsp
	addq	$-128, %rsp
	/* Spill both inputs.  ymm0 and ymm1 are the low halves of zmm0
	   and zmm1, so the arguments of the first call are already in
	   place.  */
	vmovaps	%zmm0, (%rsp)
	vmovaps	%zmm1, 64(%rsp)
	call	HIDDEN_JUMPTARGET(\callee)
	/* Park the first result over the already consumed low half of the
	   first input, then load the high halves of both inputs as the
	   arguments of the second call.  */
	vmovaps	%ymm0, (%rsp)
	vmovaps	32(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	/* zmm0 = first result in the low 256 bits, second result
	   (currently in ymm0) in the high 256 bits.  */
	vmovaps	(%rsp), %ymm1
	vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
	/* Frame teardown: drop the aligned area and restore rbp.  */
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
161 | |
/* Three-argument AVX512 entry point built on the AVX2 ISA version.

   The 512-bit input in zmm0 is split into two 256-bit halves and the
   callee is invoked once per half.  The first call receives the low
   half in ymm0 together with the incoming rdi and rsi.  The second
   call receives the high half in ymm0 and both pointers advanced by
   32 bytes.  Nothing is merged into a vector register afterwards
   (presumably the callee writes its results through the two pointers;
   confirm against the callers).

   Parameter:
     callee - symbol handed to HIDDEN_JUMPTARGET; called twice.

   Register roles:
     rbx - copy of the incoming rdi, live across the first call
     r14 - copy of the incoming rsi, live across the first call

   Stack: a 64-byte slot at a 64-byte aligned address holding the
   input, with the saved rbx and r14 pushed directly below it.  After
   the two pushes rsp is 16 bytes below the slot, so the high input
   half is at 48(%rsp) and rsp is still 16-byte aligned at each call.

   NOTE(review): the saves of rbx and r14 are not described by CFI, so
   an unwinder cannot recover them from this frame; confirm whether
   that matters for the callers.  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	/* Frame setup.  The CFA is tracked through rbp so that rsp can be
	   realigned and pushed to without further CFI adjustments.  */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Reserve the 64-byte aligned spill slot and save the input.  */
	andq	$-64, %rsp
	subq	$64, %rsp
	vmovaps	%zmm0, (%rsp)
	/* Keep the two pointer arguments in callee-saved registers so
	   that they survive the first call.  */
	pushq	%rbx
	movq	%rdi, %rbx
	pushq	%r14
	movq	%rsi, %r14
	/* ymm0 is the low half of zmm0, and rdi and rsi are untouched, so
	   the arguments of the first call are already in place.  */
	call	HIDDEN_JUMPTARGET(\callee)
	/* Second call: both pointers moved on by the 32 bytes covered by
	   the first call, and the high input half (slot + 32, which is
	   48(%rsp) because of the two pushes).  */
	leaq	32(%rbx), %rdi
	leaq	32(%r14), %rsi
	vmovaps	48(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* Restore the callee-saved registers, then tear down the frame.  */
	popq	%r14
	popq	%rbx
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
191 | |