1 | /* Implement powl for x86 using extra-precision log. |
2 | Copyright (C) 2012-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <math.h> |
20 | #include <math_private.h> |
21 | #include <math-underflow.h> |
22 | #include <stdbool.h> |
23 | |
24 | /* High parts and low parts of -log (k/16), for integer k from 12 to |
25 | 24. */ |
26 | |
27 | static const long double powl_log_table[] = |
28 | { |
29 | 0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L, |
30 | 0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L, |
31 | 0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L, |
32 | 0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L, |
33 | -0x0p+0L, 0x0p+0L, |
34 | -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L, |
35 | -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L, |
36 | -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L, |
37 | -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L, |
38 | -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L, |
39 | -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L, |
40 | -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L, |
41 | -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L, |
42 | }; |
43 | |
44 | /* High 32 bits of log2 (e), and remainder rounded to 64 bits. */ |
45 | static const long double log2e_hi = 0x1.71547652p+0L; |
46 | static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L; |
47 | |
48 | /* Given a number with high part HI and low part LO, add the number X |
49 | to it and store the result in *RHI and *RLO. It is given that |
50 | either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are |
51 | small enough that no overflow occurs. The result does not need to |
52 | be exact to 128 bits; 78-bit accuracy of the final accumulated |
53 | result suffices. */ |
54 | |
55 | static inline void |
56 | acc_split (long double *rhi, long double *rlo, long double hi, long double lo, |
57 | long double x) |
58 | { |
59 | long double thi = hi + x; |
60 | long double tlo = (hi - thi) + x + lo; |
61 | *rhi = thi + tlo; |
62 | *rlo = (thi - *rhi) + tlo; |
63 | } |
64 | |
65 | extern long double __powl_helper (long double x, long double y); |
66 | libm_hidden_proto (__powl_helper) |
67 | |
68 | /* Given X a value that is finite and nonzero, or a NaN, and Y a |
69 | finite nonzero value with 0x1p-79 <= |Y| <= 0x1p78, compute X to |
70 | the power Y. */ |
71 | |
72 | long double |
73 | __powl_helper (long double x, long double y) |
74 | { |
75 | if (isnan (x)) |
76 | return __ieee754_expl (y * __ieee754_logl (x)); |
77 | bool negate; |
78 | if (x < 0) |
79 | { |
80 | long double absy = fabsl (y); |
81 | if (absy >= 0x1p64L) |
82 | negate = false; |
83 | else |
84 | { |
85 | unsigned long long yll = absy; |
86 | if (yll != absy) |
87 | return __ieee754_expl (y * __ieee754_logl (x)); |
88 | negate = (yll & 1) != 0; |
89 | } |
90 | x = fabsl (x); |
91 | } |
92 | else |
93 | negate = false; |
94 | |
95 | /* We need to compute Y * log2 (X) to at least 64 bits after the |
96 | point for normal results (that is, to at least 78 bits |
97 | precision). */ |
98 | int x_int_exponent; |
99 | long double x_frac; |
100 | x_frac = __frexpl (x, &x_int_exponent); |
101 | if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */ |
102 | { |
103 | x_frac *= 2.0; |
104 | x_int_exponent--; |
105 | } |
106 | |
107 | long double log_x_frac_hi, log_x_frac_lo; |
108 | /* Determine an initial approximation to log (X_FRAC) using |
109 | POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an |
110 | interval (24/25, 26/25). */ |
111 | int k = (int) ((16.0L / x_frac) + 0.5L); |
112 | log_x_frac_hi = powl_log_table[2 * k - 24]; |
113 | log_x_frac_lo = powl_log_table[2 * k - 23]; |
114 | long double x_frac_low; |
115 | if (k == 16) |
116 | x_frac_low = 0.0L; |
117 | else |
118 | { |
119 | /* Mask off low 5 bits of X_FRAC so the multiplication by K/16 |
120 | is exact. These bits are small enough that they can be |
121 | corrected for by adding log2 (e) * X_FRAC_LOW to the final |
122 | result. */ |
123 | int32_t se; |
124 | uint32_t i0, i1; |
125 | GET_LDOUBLE_WORDS (se, i0, i1, x_frac); |
126 | x_frac_low = x_frac; |
127 | i1 &= 0xffffffe0; |
128 | SET_LDOUBLE_WORDS (x_frac, se, i0, i1); |
129 | x_frac_low -= x_frac; |
130 | x_frac_low /= x_frac; |
131 | x_frac *= k / 16.0L; |
132 | } |
133 | |
134 | /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25). Separate |
135 | W = X_FRAC - 1 into high 16 bits and remaining bits, so that |
136 | multiplications for low-order power series terms are exact. The |
137 | remaining bits are small enough that adding a 64-bit value of |
138 | log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for |
139 | them. */ |
140 | long double w = x_frac - 1; |
141 | long double w_hi, w_lo; |
142 | int32_t se; |
143 | uint32_t i0, i1; |
144 | GET_LDOUBLE_WORDS (se, i0, i1, w); |
145 | i0 &= 0xffff0000; |
146 | i1 = 0; |
147 | SET_LDOUBLE_WORDS (w_hi, se, i0, i1); |
148 | w_lo = w - w_hi; |
149 | long double wp = w_hi; |
150 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp); |
151 | wp *= -w_hi; |
152 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
153 | wp / 2.0L); |
154 | wp *= -w_hi; |
155 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
156 | wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part. */ |
157 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
158 | wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part. */ |
159 | wp *= -w_hi; |
160 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
161 | wp / 4.0L); |
162 | /* Subsequent terms are small enough that they only need be computed |
163 | to 64 bits. */ |
164 | for (int i = 5; i <= 17; i++) |
165 | { |
166 | wp *= -w_hi; |
167 | acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, |
168 | wp / i); |
169 | } |
170 | |
171 | /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm. */ |
172 | long double log2_x_frac_hi, log2_x_frac_lo; |
173 | long double log_x_frac_hi32, log_x_frac_lo64; |
174 | GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi); |
175 | i1 = 0; |
176 | SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1); |
177 | log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo; |
178 | long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi; |
179 | long double log2_x_frac_lo1 |
180 | = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo; |
181 | log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1; |
182 | log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1; |
183 | |
184 | /* Correct for the masking off of W_LO. */ |
185 | long double log2_1p_w_lo; |
186 | asm ("fyl2xp1" |
187 | : "=t" (log2_1p_w_lo) |
188 | : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L) |
189 | : "st(1)" ); |
190 | acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo, |
191 | log2_1p_w_lo); |
192 | |
193 | /* Correct for the masking off of X_FRAC_LOW. */ |
194 | acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo, |
195 | x_frac_low * M_LOG2El); |
196 | |
197 | /* Add the integer and fractional parts of the base-2 logarithm. */ |
198 | long double log2_x_hi, log2_x_lo; |
199 | log2_x_hi = x_int_exponent + log2_x_frac_hi; |
200 | log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo; |
201 | |
202 | /* Compute the base-2 logarithm of the result. */ |
203 | long double log2_res_hi, log2_res_lo; |
204 | long double log2_x_hi32, log2_x_lo64; |
205 | GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi); |
206 | i1 = 0; |
207 | SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1); |
208 | log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo; |
209 | long double y_hi32, y_lo32; |
210 | GET_LDOUBLE_WORDS (se, i0, i1, y); |
211 | i1 = 0; |
212 | SET_LDOUBLE_WORDS (y_hi32, se, i0, i1); |
213 | y_lo32 = y - y_hi32; |
214 | log2_res_hi = log2_x_hi32 * y_hi32; |
215 | log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y; |
216 | |
217 | /* Split the base-2 logarithm of the result into integer and |
218 | fractional parts. */ |
219 | long double log2_res_int = roundl (log2_res_hi); |
220 | long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo; |
221 | /* If the integer part is very large, the computed fractional part |
222 | may be outside the valid range for f2xm1. */ |
223 | if (fabsl (log2_res_int) > 16500) |
224 | log2_res_frac = 0; |
225 | |
226 | /* Compute the final result. */ |
227 | long double res; |
228 | asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac)); |
229 | res += 1.0L; |
230 | if (negate) |
231 | res = -res; |
232 | asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int)); |
233 | math_check_force_underflow (res); |
234 | return res; |
235 | } |
236 | |
237 | libm_hidden_def (__powl_helper) |
238 | |