1/* Common float/double wrapper implementations of vector math
2 functions.
3 Copyright (C) 2022-2023 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   The 256-bit vector argument arrives in ymm0.  The SSE callee
   handles only 128 bits at a time, so it is invoked twice -- once on
   the low half, once on the high half -- and the two 128-bit results
   are recombined into ymm0.  No callee-saved registers are
   clobbered.  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Align the stack to 32 bytes and reserve room to spill the full
	   ymm argument.  */
	andq	$-32, %rsp
	subq	$32, %rsp
	vmovaps	%ymm0, (%rsp)
	/* Zero the upper ymm state before entering legacy-SSE code to
	   avoid AVX/SSE transition penalties; xmm0 still holds the low
	   half of the argument for the first call.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	/* Save the low-half result over the spilled low half of the
	   argument (already consumed), then load the high half of the
	   argument for the second call.  */
	vmovaps	%xmm0, (%rsp)
	vmovaps	16(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* combine xmm0 (return of second call) with result of first
	   call (saved on stack). Might be worth exploring logic that
	   uses `vpblend` and reads in ymm1 using -16(rsp). */
	vmovaps	(%rsp), %xmm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
47
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Both 256-bit arguments (ymm0, ymm1) are split into 128-bit halves;
   the SSE callee runs once per half and the two results are
   recombined into ymm0.  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Align the stack to 32 bytes and reserve room to spill both ymm
	   arguments.  */
	andq	$-32, %rsp
	subq	$64, %rsp
	vmovaps	%ymm0, (%rsp)
	vmovaps	%ymm1, 32(%rsp)
	/* xmm0/xmm1 keep the low halves across vzeroupper for the first
	   call.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	/* High half of the second argument.  */
	vmovaps	48(%rsp), %xmm1
	/* Save the low-half result over the consumed low half of the
	   first argument, then load that argument's high half.  */
	vmovaps	%xmm0, (%rsp)
	vmovaps	16(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* combine xmm0 (return of second call) with result of first
	   call (saved on stack). Might be worth exploring logic that
	   uses `vpblend` and reads in ymm1 using -16(rsp). */
	vmovaps	(%rsp), %xmm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
77
/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   ymm0 is the vector argument; rdi and rsi point to the two output
   arrays.  The SSE callee is invoked once per 128-bit half, with the
   output pointers advanced by 16 bytes for the second call.  Results
   are delivered only through the pointers, so nothing is recombined
   in registers.  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	/* Switch the CFA base to rbp before rsp is clobbered by the
	   alignment below.  This directive was missing (unlike every
	   other wrapper in this file), leaving the CFA rsp-relative
	   across the andq -- breaking unwinding through this wrapper --
	   and making the later cfi_def_cfa_register (%rsp) restore a
	   base that had never been established.  */
	cfi_def_cfa_register (%rbp)
	/* Align the stack to 32 bytes and reserve room to spill the full
	   ymm argument.  */
	andq	$-32, %rsp
	subq	$32, %rsp
	vmovaps	%ymm0, (%rsp)
	/* Preserve the two output pointers across the calls in
	   callee-saved registers.  */
	pushq	%rbx
	pushq	%r14
	movq	%rdi, %rbx
	movq	%rsi, %r14
	/* Zero the upper ymm state before entering legacy-SSE code;
	   xmm0 still holds the low half for the first call.  */
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	/* The two pushes moved rsp down 16 bytes, so the high half of
	   the spilled argument is now at 32(%rsp).  */
	vmovaps	32(%rsp), %xmm0
	leaq	16(%rbx), %rdi
	leaq	16(%r14), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	popq	%r14
	popq	%rbx
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
106
/* AVX512 ISA version as wrapper to AVX2 ISA version.

   The 512-bit argument in zmm0 is split into two 256-bit halves; the
   AVX2 callee is invoked on each half and the results are recombined
   into zmm0.  No vzeroupper is emitted here since the callee is
   VEX-encoded AVX2 code, not legacy SSE.  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Align the stack to 64 bytes and reserve room to spill the full
	   zmm argument.  */
	andq	$-64, %rsp
	subq	$64, %rsp
	vmovups	%zmm0, (%rsp)
	/* ymm0 already holds the low half of zmm0 for the first call.  */
	call	HIDDEN_JUMPTARGET(\callee)
	/* Save the low-half result over the consumed low half of the
	   argument, then load the argument's high half.  */
	vmovupd	%ymm0, (%rsp)
	vmovupd	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* combine ymm0 (return of second call) with result of first
	   call (saved on stack). */
	vmovaps	(%rsp), %ymm1
	vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
132
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   Both 512-bit arguments (zmm0, zmm1) are split into 256-bit halves;
   the AVX2 callee runs once per half and the results are recombined
   into zmm0.  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Align the stack to 64 bytes and reserve room to spill both zmm
	   arguments.  */
	andq	$-64, %rsp
	addq	$-128, %rsp
	vmovups	%zmm0, (%rsp)
	vmovups	%zmm1, 64(%rsp)
	/* ymm0 and ymm1 are already set. */
	call	HIDDEN_JUMPTARGET(\callee)
	/* High half of the second argument.  */
	vmovups	96(%rsp), %ymm1
	/* Save the low-half result over the consumed low half of the
	   first argument, then load that argument's high half.  */
	vmovaps	%ymm0, (%rsp)
	vmovups	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* combine ymm0 (return of second call) with result of first
	   call (saved on stack). */
	vmovaps	(%rsp), %ymm1
	vinserti64x4 $0x1, %ymm0, %zmm1, %zmm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
161
/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   zmm0 is the vector argument; rdi and rsi point to the two output
   arrays.  The AVX2 callee runs on each 256-bit half, with the output
   pointers advanced by 32 bytes for the second call.  Results are
   delivered only through the pointers, so nothing is recombined in
   registers.  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Align the stack to 64 bytes and reserve room to spill the full
	   zmm argument.  */
	andq	$-64, %rsp
	subq	$64, %rsp
	vmovaps	%zmm0, (%rsp)
	/* Preserve the two output pointers across the calls in
	   callee-saved registers.  */
	pushq	%rbx
	pushq	%r14
	movq	%rdi, %rbx
	movq	%rsi, %r14
	/* ymm0 is already set. */
	call	HIDDEN_JUMPTARGET(\callee)
	/* The two pushes moved rsp down 16 bytes, so the high half of
	   the spilled argument is now at 48(%rsp).  */
	vmovaps	48(%rsp), %ymm0
	leaq	32(%rbx), %rdi
	leaq	32(%r14), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	popq	%r14
	popq	%rbx
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
191