/* Function sincosf vectorized with SSE2.
   Copyright (C) 2014-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_wrapper_impl.h"

        .text
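/* The 'vl4l4' variant receives the four inputs in %xmm0 and two linear
   output arrays in %rdi (sine) and %rsi (cosine); the generic
   WRAPPER_IMPL_SSE2_fFF macro from svml_s_wrapper_impl.h expands to a
   sequence of scalar sincosf calls, one per lane, writing into those
   arrays.  */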
ENTRY (_ZGVbN4vl4l4_sincosf)
WRAPPER_IMPL_SSE2_fFF sincosf
END (_ZGVbN4vl4l4_sincosf)
libmvec_hidden_def (_ZGVbN4vl4l4_sincosf)

/* SSE2 ISA version as wrapper to scalar (for vector
   function declared with #pragma omp declare simd notinbranch).  */
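/* The 'vvv' variant receives the four inputs in %xmm0 and, instead of
   linear arrays, a vector of result pointers per output: with 64-bit
   pointers they arrive in two register pairs (%xmm1/%xmm2 for sine,
   %xmm3/%xmm4 for cosine); with 32-bit (x32) pointers, one register
   each (%xmm1 sine, %xmm2 cosine).  Conceptually the wrapper performs,
   for each lane i:  sincosf (x[i], sinptr[i], cosptr[i]);  */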
.macro WRAPPER_IMPL_SSE2_fFF_vvv callee
#ifndef __ILP32__
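        /* Local frame layout (LP64):
             0(%rsp)  .. 15(%rsp)    scalar sine results
             16(%rsp) .. 31(%rsp)    scalar cosine results
             32(%rsp) .. 63(%rsp)    sine result pointers (%xmm1, %xmm2)
             64(%rsp) .. 95(%rsp)    cosine result pointers (%xmm3, %xmm4)
             96(%rsp) .. 111(%rsp)   input vector (%xmm0).  */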
        subq      $120, %rsp
        cfi_adjust_cfa_offset(120)
        movaps    %xmm0, 96(%rsp)
        lea       (%rsp), %rdi
        movdqa    %xmm1, 32(%rdi)
        lea       16(%rsp), %rsi
        movdqa    %xmm2, 32(%rsi)
        movdqa    %xmm3, 48(%rsi)
        movdqa    %xmm4, 64(%rsi)
        call      JUMPTARGET(\callee)
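        /* Lanes 1-3: reload the next input element and advance both
           scalar output slots by 4 bytes before each call.  */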
        movss     100(%rsp), %xmm0
        lea       4(%rsp), %rdi
        lea       20(%rsp), %rsi
        call      JUMPTARGET(\callee)
        movss     104(%rsp), %xmm0
        lea       8(%rsp), %rdi
        lea       24(%rsp), %rsi
        call      JUMPTARGET(\callee)
        movss     108(%rsp), %xmm0
        lea       12(%rsp), %rdi
        lea       28(%rsp), %rsi
        call      JUMPTARGET(\callee)
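        /* Scatter the scalar results through the saved pointers: sine
           values from 0(%rsp)..12(%rsp) via the pointers at
           32(%rsp)..56(%rsp), cosine values from 16(%rsp)..28(%rsp)
           via the pointers at 64(%rsp)..88(%rsp).  */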
        movq      32(%rsp), %rdx
        movq      40(%rsp), %rsi
        movq      48(%rsp), %r8
        movq      56(%rsp), %r10
        movl      (%rsp), %eax
        movl      4(%rsp), %ecx
        movl      8(%rsp), %edi
        movl      12(%rsp), %r9d
        movl      %eax, (%rdx)
        movl      %ecx, (%rsi)
        movq      64(%rsp), %rax
        movq      72(%rsp), %rcx
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movq      80(%rsp), %rdi
        movq      88(%rsp), %r9
        movl      16(%rsp), %r11d
        movl      20(%rsp), %edx
        movl      24(%rsp), %esi
        movl      28(%rsp), %r8d
        movl      %r11d, (%rax)
        movl      %edx, (%rcx)
        movl      %esi, (%rdi)
        movl      %r8d, (%r9)
        addq      $120, %rsp
        cfi_adjust_cfa_offset(-120)
        ret
#else
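        /* x32 (__ILP32__): pointers are 32 bits wide, so all four sine
           result pointers arrive in %xmm1 and all four cosine result
           pointers in %xmm2.  */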
        pushq     %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        pushq     %rbx
        .cfi_def_cfa_offset 24
        .cfi_offset 3, -24
        subl      $88, %esp
        .cfi_def_cfa_offset 112
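        /* Local frame layout (x32):
             0(%esp)  .. 15(%esp)   sine result pointers (%xmm1)
             16(%esp) .. 31(%esp)   cosine result pointers (%xmm2)
             32(%esp) .. 47(%esp)   input vector (%xmm0)
             48(%esp) .. 63(%esp)   scalar sine results
             64(%esp) .. 79(%esp)   scalar cosine results.  */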
        leal      64(%rsp), %esi
        movaps    %xmm1, (%esp)
        leal      48(%rsp), %edi
        movaps    %xmm2, 16(%esp)
        movq      %rsi, %rbp
        movq      %rdi, %rbx
        movaps    %xmm0, 32(%esp)
        call      JUMPTARGET(\callee)
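        /* Lanes 1-3: reload the next input element and advance the
           preserved sine (%rbx) and cosine (%rbp) output slots by 4
           bytes before each call.  */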
        movss     36(%esp), %xmm0
        leal      4(%rbp), %esi
        leal      4(%rbx), %edi
        call      JUMPTARGET(\callee)
        movss     40(%esp), %xmm0
        leal      8(%rbp), %esi
        leal      8(%rbx), %edi
        call      JUMPTARGET(\callee)
        movss     44(%esp), %xmm0
        leal      12(%rbp), %esi
        leal      12(%rbx), %edi
        call      JUMPTARGET(\callee)
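        /* Scatter the results: reload the 32-bit pointer vectors into
           %xmm4 (sine) and %xmm7 (cosine) and store each scalar result
           through the corresponding lane's pointer.  */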
        movq      (%esp), %rax
        movss     48(%esp), %xmm0
        movdqa    (%esp), %xmm4
        movdqa    16(%esp), %xmm7
        movss     %xmm0, (%eax)
        movss     52(%esp), %xmm0
        pextrd    $1, %xmm4, %eax
        movss     %xmm0, (%eax)
        movq      8(%esp), %rax
        movss     56(%esp), %xmm0
        movss     %xmm0, (%eax)
        movss     60(%esp), %xmm0
        pextrd    $3, %xmm4, %eax
        movss     %xmm0, (%eax)
        movq      16(%esp), %rax
        movss     64(%esp), %xmm0
        movss     %xmm0, (%eax)
        movss     68(%esp), %xmm0
        pextrd    $1, %xmm7, %eax
        movss     %xmm0, (%eax)
        movq      24(%esp), %rax
        movss     72(%esp), %xmm0
        movss     %xmm0, (%eax)
        movss     76(%esp), %xmm0
        pextrd    $3, %xmm7, %eax
        movss     %xmm0, (%eax)
        addl      $88, %esp
        .cfi_def_cfa_offset 24
        popq      %rbx
        .cfi_def_cfa_offset 16
        popq      %rbp
        .cfi_def_cfa_offset 8
        ret
#endif
.endm

ENTRY (_ZGVbN4vvv_sincosf)
WRAPPER_IMPL_SSE2_fFF_vvv sincosf
END (_ZGVbN4vvv_sincosf)

#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4vvv_sincosf)
#endif