1 | /* |
2 | * IBM Accurate Mathematical Library |
3 | * written by International Business Machines Corp. |
4 | * Copyright (C) 2001-2021 Free Software Foundation, Inc. |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU Lesser General Public License as published by |
8 | * the Free Software Foundation; either version 2.1 of the License, or |
9 | * (at your option) any later version. |
10 | * |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public License |
17 | * along with this program; if not, see <https://www.gnu.org/licenses/>. |
18 | */ |
19 | /************************************************************************/ |
20 | /* MODULE_NAME: mpa.c */ |
21 | /* */ |
22 | /* FUNCTIONS: */ |
23 | /* mcr */ |
24 | /* acr */ |
25 | /* cpy */ |
26 | /* norm */ |
27 | /* denorm */ |
28 | /* mp_dbl */ |
29 | /* dbl_mp */ |
30 | /* add_magnitudes */ |
31 | /* sub_magnitudes */ |
32 | /* add */ |
33 | /* sub */ |
34 | /* mul */ |
35 | /* inv */ |
36 | /* dvd */ |
37 | /* */ |
38 | /* Arithmetic functions for multiple precision numbers. */ |
39 | /* Relative errors are bounded */ |
40 | /************************************************************************/ |
41 | |
42 | |
43 | #include "endian.h" |
44 | #include "mpa.h" |
45 | #include <sys/param.h> |
46 | #include <alloca.h> |
47 | |
48 | #ifndef SECTION |
49 | # define SECTION |
50 | #endif |
51 | |
52 | #ifndef NO__CONST |
53 | const mp_no __mpone = { 1, { 1.0, 1.0 } }; |
54 | const mp_no __mptwo = { 1, { 1.0, 2.0 } }; |
55 | #endif |
56 | |
57 | #ifndef NO___ACR |
58 | /* Compare mantissa of two multiple precision numbers regardless of the sign |
59 | and exponent of the numbers. */ |
60 | static int |
61 | mcr (const mp_no *x, const mp_no *y, int p) |
62 | { |
63 | long i; |
64 | long p2 = p; |
65 | for (i = 1; i <= p2; i++) |
66 | { |
67 | if (X[i] == Y[i]) |
68 | continue; |
69 | else if (X[i] > Y[i]) |
70 | return 1; |
71 | else |
72 | return -1; |
73 | } |
74 | return 0; |
75 | } |
76 | |
77 | /* Compare the absolute values of two multiple precision numbers. */ |
78 | int |
79 | __acr (const mp_no *x, const mp_no *y, int p) |
80 | { |
81 | long i; |
82 | |
83 | if (X[0] == 0) |
84 | { |
85 | if (Y[0] == 0) |
86 | i = 0; |
87 | else |
88 | i = -1; |
89 | } |
90 | else if (Y[0] == 0) |
91 | i = 1; |
92 | else |
93 | { |
94 | if (EX > EY) |
95 | i = 1; |
96 | else if (EX < EY) |
97 | i = -1; |
98 | else |
99 | i = mcr (x, y, p); |
100 | } |
101 | |
102 | return i; |
103 | } |
104 | #endif |
105 | |
106 | #ifndef NO___CPY |
107 | /* Copy multiple precision number X into Y. They could be the same |
108 | number. */ |
109 | void |
110 | __cpy (const mp_no *x, mp_no *y, int p) |
111 | { |
112 | long i; |
113 | |
114 | EY = EX; |
115 | for (i = 0; i <= p; i++) |
116 | Y[i] = X[i]; |
117 | } |
118 | #endif |
119 | |
120 | #ifndef NO___MP_DBL |
121 | /* Convert a multiple precision number *X into a double precision |
122 | number *Y, normalized case (|x| >= 2**(-1022))). X has precision |
123 | P, which is positive. */ |
124 | static void |
125 | norm (const mp_no *x, double *y, int p) |
126 | { |
127 | # define R RADIXI |
128 | long i; |
129 | double c; |
130 | mantissa_t a, u, v, z[5]; |
131 | if (p < 5) |
132 | { |
133 | if (p == 1) |
134 | c = X[1]; |
135 | else if (p == 2) |
136 | c = X[1] + R * X[2]; |
137 | else if (p == 3) |
138 | c = X[1] + R * (X[2] + R * X[3]); |
139 | else /* p == 4. */ |
140 | c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]); |
141 | } |
142 | else |
143 | { |
144 | for (a = 1, z[1] = X[1]; z[1] < TWO23; ) |
145 | { |
146 | a *= 2; |
147 | z[1] *= 2; |
148 | } |
149 | |
150 | for (i = 2; i < 5; i++) |
151 | { |
152 | mantissa_store_t d, r; |
153 | d = X[i] * (mantissa_store_t) a; |
154 | DIV_RADIX (d, r); |
155 | z[i] = r; |
156 | z[i - 1] += d; |
157 | } |
158 | |
159 | u = ALIGN_DOWN_TO (z[3], TWO19); |
160 | v = z[3] - u; |
161 | |
162 | if (v == TWO18) |
163 | { |
164 | if (z[4] == 0) |
165 | { |
166 | for (i = 5; i <= p; i++) |
167 | { |
168 | if (X[i] == 0) |
169 | continue; |
170 | else |
171 | { |
172 | z[3] += 1; |
173 | break; |
174 | } |
175 | } |
176 | } |
177 | else |
178 | z[3] += 1; |
179 | } |
180 | |
181 | c = (z[1] + R * (z[2] + R * z[3])) / a; |
182 | } |
183 | |
184 | c *= X[0]; |
185 | |
186 | for (i = 1; i < EX; i++) |
187 | c *= RADIX; |
188 | for (i = 1; i > EX; i--) |
189 | c *= RADIXI; |
190 | |
191 | *y = c; |
192 | # undef R |
193 | } |
194 | |
195 | /* Convert a multiple precision number *X into a double precision |
196 | number *Y, Denormal case (|x| < 2**(-1022))). */ |
197 | static void |
198 | denorm (const mp_no *x, double *y, int p) |
199 | { |
200 | long i, k; |
201 | long p2 = p; |
202 | double c; |
203 | mantissa_t u, z[5]; |
204 | |
205 | # define R RADIXI |
206 | if (EX < -44 || (EX == -44 && X[1] < TWO5)) |
207 | { |
208 | *y = 0; |
209 | return; |
210 | } |
211 | |
212 | if (p2 == 1) |
213 | { |
214 | if (EX == -42) |
215 | { |
216 | z[1] = X[1] + TWO10; |
217 | z[2] = 0; |
218 | z[3] = 0; |
219 | k = 3; |
220 | } |
221 | else if (EX == -43) |
222 | { |
223 | z[1] = TWO10; |
224 | z[2] = X[1]; |
225 | z[3] = 0; |
226 | k = 2; |
227 | } |
228 | else |
229 | { |
230 | z[1] = TWO10; |
231 | z[2] = 0; |
232 | z[3] = X[1]; |
233 | k = 1; |
234 | } |
235 | } |
236 | else if (p2 == 2) |
237 | { |
238 | if (EX == -42) |
239 | { |
240 | z[1] = X[1] + TWO10; |
241 | z[2] = X[2]; |
242 | z[3] = 0; |
243 | k = 3; |
244 | } |
245 | else if (EX == -43) |
246 | { |
247 | z[1] = TWO10; |
248 | z[2] = X[1]; |
249 | z[3] = X[2]; |
250 | k = 2; |
251 | } |
252 | else |
253 | { |
254 | z[1] = TWO10; |
255 | z[2] = 0; |
256 | z[3] = X[1]; |
257 | k = 1; |
258 | } |
259 | } |
260 | else |
261 | { |
262 | if (EX == -42) |
263 | { |
264 | z[1] = X[1] + TWO10; |
265 | z[2] = X[2]; |
266 | k = 3; |
267 | } |
268 | else if (EX == -43) |
269 | { |
270 | z[1] = TWO10; |
271 | z[2] = X[1]; |
272 | k = 2; |
273 | } |
274 | else |
275 | { |
276 | z[1] = TWO10; |
277 | z[2] = 0; |
278 | k = 1; |
279 | } |
280 | z[3] = X[k]; |
281 | } |
282 | |
283 | u = ALIGN_DOWN_TO (z[3], TWO5); |
284 | |
285 | if (u == z[3]) |
286 | { |
287 | for (i = k + 1; i <= p2; i++) |
288 | { |
289 | if (X[i] == 0) |
290 | continue; |
291 | else |
292 | { |
293 | z[3] += 1; |
294 | break; |
295 | } |
296 | } |
297 | } |
298 | |
299 | c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10); |
300 | |
301 | *y = c * TWOM1032; |
302 | # undef R |
303 | } |
304 | |
305 | /* Convert multiple precision number *X into double precision number *Y. The |
306 | result is correctly rounded to the nearest/even. */ |
307 | void |
308 | __mp_dbl (const mp_no *x, double *y, int p) |
309 | { |
310 | if (X[0] == 0) |
311 | { |
312 | *y = 0; |
313 | return; |
314 | } |
315 | |
316 | if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10))) |
317 | norm (x, y, p); |
318 | else |
319 | denorm (x, y, p); |
320 | } |
321 | #endif |
322 | |
323 | /* Get the multiple precision equivalent of X into *Y. If the precision is too |
324 | small, the result is truncated. */ |
325 | void |
326 | SECTION |
327 | __dbl_mp (double x, mp_no *y, int p) |
328 | { |
329 | long i, n; |
330 | long p2 = p; |
331 | |
332 | /* Sign. */ |
333 | if (x == 0) |
334 | { |
335 | Y[0] = 0; |
336 | return; |
337 | } |
338 | else if (x > 0) |
339 | Y[0] = 1; |
340 | else |
341 | { |
342 | Y[0] = -1; |
343 | x = -x; |
344 | } |
345 | |
346 | /* Exponent. */ |
347 | for (EY = 1; x >= RADIX; EY += 1) |
348 | x *= RADIXI; |
349 | for (; x < 1; EY -= 1) |
350 | x *= RADIX; |
351 | |
352 | /* Digits. */ |
353 | n = MIN (p2, 4); |
354 | for (i = 1; i <= n; i++) |
355 | { |
356 | INTEGER_OF (x, Y[i]); |
357 | x *= RADIX; |
358 | } |
359 | for (; i <= p2; i++) |
360 | Y[i] = 0; |
361 | } |
362 | |
363 | /* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The |
364 | sign of the sum *Z is not changed. X and Y may overlap but not X and Z or |
365 | Y and Z. No guard digit is used. The result equals the exact sum, |
366 | truncated. */ |
367 | static void |
368 | SECTION |
369 | add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
370 | { |
371 | long i, j, k; |
372 | long p2 = p; |
373 | mantissa_t zk; |
374 | |
375 | EZ = EX; |
376 | |
377 | i = p2; |
378 | j = p2 + EY - EX; |
379 | k = p2 + 1; |
380 | |
381 | if (__glibc_unlikely (j < 1)) |
382 | { |
383 | __cpy (x, z, p); |
384 | return; |
385 | } |
386 | |
387 | zk = 0; |
388 | |
389 | for (; j > 0; i--, j--) |
390 | { |
391 | zk += X[i] + Y[j]; |
392 | if (zk >= RADIX) |
393 | { |
394 | Z[k--] = zk - RADIX; |
395 | zk = 1; |
396 | } |
397 | else |
398 | { |
399 | Z[k--] = zk; |
400 | zk = 0; |
401 | } |
402 | } |
403 | |
404 | for (; i > 0; i--) |
405 | { |
406 | zk += X[i]; |
407 | if (zk >= RADIX) |
408 | { |
409 | Z[k--] = zk - RADIX; |
410 | zk = 1; |
411 | } |
412 | else |
413 | { |
414 | Z[k--] = zk; |
415 | zk = 0; |
416 | } |
417 | } |
418 | |
419 | if (zk == 0) |
420 | { |
421 | for (i = 1; i <= p2; i++) |
422 | Z[i] = Z[i + 1]; |
423 | } |
424 | else |
425 | { |
426 | Z[1] = zk; |
427 | EZ += 1; |
428 | } |
429 | } |
430 | |
431 | /* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0. |
432 | The sign of the difference *Z is not changed. X and Y may overlap but not X |
433 | and Z or Y and Z. One guard digit is used. The error is less than one |
434 | ULP. */ |
435 | static void |
436 | SECTION |
437 | sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p) |
438 | { |
439 | long i, j, k; |
440 | long p2 = p; |
441 | mantissa_t zk; |
442 | |
443 | EZ = EX; |
444 | i = p2; |
445 | j = p2 + EY - EX; |
446 | k = p2; |
447 | |
448 | /* Y is too small compared to X, copy X over to the result. */ |
449 | if (__glibc_unlikely (j < 1)) |
450 | { |
451 | __cpy (x, z, p); |
452 | return; |
453 | } |
454 | |
455 | /* The relevant least significant digit in Y is non-zero, so we factor it in |
456 | to enhance accuracy. */ |
457 | if (j < p2 && Y[j + 1] > 0) |
458 | { |
459 | Z[k + 1] = RADIX - Y[j + 1]; |
460 | zk = -1; |
461 | } |
462 | else |
463 | zk = Z[k + 1] = 0; |
464 | |
465 | /* Subtract and borrow. */ |
466 | for (; j > 0; i--, j--) |
467 | { |
468 | zk += (X[i] - Y[j]); |
469 | if (zk < 0) |
470 | { |
471 | Z[k--] = zk + RADIX; |
472 | zk = -1; |
473 | } |
474 | else |
475 | { |
476 | Z[k--] = zk; |
477 | zk = 0; |
478 | } |
479 | } |
480 | |
481 | /* We're done with digits from Y, so it's just digits in X. */ |
482 | for (; i > 0; i--) |
483 | { |
484 | zk += X[i]; |
485 | if (zk < 0) |
486 | { |
487 | Z[k--] = zk + RADIX; |
488 | zk = -1; |
489 | } |
490 | else |
491 | { |
492 | Z[k--] = zk; |
493 | zk = 0; |
494 | } |
495 | } |
496 | |
497 | /* Normalize. */ |
498 | for (i = 1; Z[i] == 0; i++) |
499 | ; |
500 | EZ = EZ - i + 1; |
501 | for (k = 1; i <= p2 + 1; ) |
502 | Z[k++] = Z[i++]; |
503 | for (; k <= p2; ) |
504 | Z[k++] = 0; |
505 | } |
506 | |
507 | /* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X |
508 | and Z or Y and Z. One guard digit is used. The error is less than one |
509 | ULP. */ |
510 | void |
511 | SECTION |
512 | __add (const mp_no *x, const mp_no *y, mp_no *z, int p) |
513 | { |
514 | int n; |
515 | |
516 | if (X[0] == 0) |
517 | { |
518 | __cpy (y, z, p); |
519 | return; |
520 | } |
521 | else if (Y[0] == 0) |
522 | { |
523 | __cpy (x, z, p); |
524 | return; |
525 | } |
526 | |
527 | if (X[0] == Y[0]) |
528 | { |
529 | if (__acr (x, y, p) > 0) |
530 | { |
531 | add_magnitudes (x, y, z, p); |
532 | Z[0] = X[0]; |
533 | } |
534 | else |
535 | { |
536 | add_magnitudes (y, x, z, p); |
537 | Z[0] = Y[0]; |
538 | } |
539 | } |
540 | else |
541 | { |
542 | if ((n = __acr (x, y, p)) == 1) |
543 | { |
544 | sub_magnitudes (x, y, z, p); |
545 | Z[0] = X[0]; |
546 | } |
547 | else if (n == -1) |
548 | { |
549 | sub_magnitudes (y, x, z, p); |
550 | Z[0] = Y[0]; |
551 | } |
552 | else |
553 | Z[0] = 0; |
554 | } |
555 | } |
556 | |
557 | /* Subtract *Y from *X and return the result in *Z. X and Y may overlap but |
558 | not X and Z or Y and Z. One guard digit is used. The error is less than |
559 | one ULP. */ |
560 | void |
561 | SECTION |
562 | __sub (const mp_no *x, const mp_no *y, mp_no *z, int p) |
563 | { |
564 | int n; |
565 | |
566 | if (X[0] == 0) |
567 | { |
568 | __cpy (y, z, p); |
569 | Z[0] = -Z[0]; |
570 | return; |
571 | } |
572 | else if (Y[0] == 0) |
573 | { |
574 | __cpy (x, z, p); |
575 | return; |
576 | } |
577 | |
578 | if (X[0] != Y[0]) |
579 | { |
580 | if (__acr (x, y, p) > 0) |
581 | { |
582 | add_magnitudes (x, y, z, p); |
583 | Z[0] = X[0]; |
584 | } |
585 | else |
586 | { |
587 | add_magnitudes (y, x, z, p); |
588 | Z[0] = -Y[0]; |
589 | } |
590 | } |
591 | else |
592 | { |
593 | if ((n = __acr (x, y, p)) == 1) |
594 | { |
595 | sub_magnitudes (x, y, z, p); |
596 | Z[0] = X[0]; |
597 | } |
598 | else if (n == -1) |
599 | { |
600 | sub_magnitudes (y, x, z, p); |
601 | Z[0] = -Y[0]; |
602 | } |
603 | else |
604 | Z[0] = 0; |
605 | } |
606 | } |
607 | |
608 | #ifndef NO__MUL |
609 | /* Multiply *X and *Y and store result in *Z. X and Y may overlap but not X |
610 | and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P |
611 | digits. In case P > 3 the error is bounded by 1.001 ULP. */ |
612 | void |
613 | SECTION |
614 | __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) |
615 | { |
616 | long i, j, k, ip, ip2; |
617 | long p2 = p; |
618 | mantissa_store_t zk; |
619 | const mp_no *a; |
620 | mantissa_store_t *diag; |
621 | |
622 | /* Is z=0? */ |
623 | if (__glibc_unlikely (X[0] * Y[0] == 0)) |
624 | { |
625 | Z[0] = 0; |
626 | return; |
627 | } |
628 | |
629 | /* We need not iterate through all X's and Y's since it's pointless to |
630 | multiply zeroes. Here, both are zero... */ |
631 | for (ip2 = p2; ip2 > 0; ip2--) |
632 | if (X[ip2] != 0 || Y[ip2] != 0) |
633 | break; |
634 | |
635 | a = X[ip2] != 0 ? y : x; |
636 | |
637 | /* ... and here, at least one of them is still zero. */ |
638 | for (ip = ip2; ip > 0; ip--) |
639 | if (a->d[ip] != 0) |
640 | break; |
641 | |
642 | /* The product looks like this for p = 3 (as an example): |
643 | |
644 | |
645 | a1 a2 a3 |
646 | x b1 b2 b3 |
647 | ----------------------------- |
648 | a1*b3 a2*b3 a3*b3 |
649 | a1*b2 a2*b2 a3*b2 |
650 | a1*b1 a2*b1 a3*b1 |
651 | |
652 | So our K needs to ideally be P*2, but we're limiting ourselves to P + 3 |
653 | for P >= 3. We compute the above digits in two parts; the last P-1 |
654 | digits and then the first P digits. The last P-1 digits are a sum of |
655 | products of the input digits from P to P-k where K is 0 for the least |
656 | significant digit and increases as we go towards the left. The product |
657 | term is of the form X[k]*X[P-k] as can be seen in the above example. |
658 | |
659 | The first P digits are also a sum of products with the same product term, |
660 | except that the sum is from 1 to k. This is also evident from the above |
661 | example. |
662 | |
663 | Another thing that becomes evident is that only the most significant |
664 | ip+ip2 digits of the result are non-zero, where ip and ip2 are the |
665 | 'internal precision' of the input numbers, i.e. digits after ip and ip2 |
666 | are all 0. */ |
667 | |
668 | k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3; |
669 | |
670 | while (k > ip + ip2 + 1) |
671 | Z[k--] = 0; |
672 | |
673 | zk = 0; |
674 | |
675 | /* Precompute sums of diagonal elements so that we can directly use them |
676 | later. See the next comment to know we why need them. */ |
677 | diag = alloca (k * sizeof (mantissa_store_t)); |
678 | mantissa_store_t d = 0; |
679 | for (i = 1; i <= ip; i++) |
680 | { |
681 | d += X[i] * (mantissa_store_t) Y[i]; |
682 | diag[i] = d; |
683 | } |
684 | while (i < k) |
685 | diag[i++] = d; |
686 | |
687 | while (k > p2) |
688 | { |
689 | long lim = k / 2; |
690 | |
691 | if (k % 2 == 0) |
692 | /* We want to add this only once, but since we subtract it in the sum |
693 | of products above, we add twice. */ |
694 | zk += 2 * X[lim] * (mantissa_store_t) Y[lim]; |
695 | |
696 | for (i = k - p2, j = p2; i < j; i++, j--) |
697 | zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]); |
698 | |
699 | zk -= diag[k - 1]; |
700 | |
701 | DIV_RADIX (zk, Z[k]); |
702 | k--; |
703 | } |
704 | |
705 | /* The real deal. Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i |
706 | goes from 1 -> k - 1 and j goes the same range in reverse. To reduce the |
707 | number of multiplications, we halve the range and if k is an even number, |
708 | add the diagonal element X[k/2]Y[k/2]. Through the half range, we compute |
709 | X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j]. |
710 | |
711 | This reduction tells us that we're summing two things, the first term |
712 | through the half range and the negative of the sum of the product of all |
713 | terms of X and Y in the full range. i.e. |
714 | |
715 | SUM(X[i] * Y[i]) for k terms. This is precalculated above for each k in |
716 | a single loop so that it completes in O(n) time and can hence be directly |
717 | used in the loop below. */ |
718 | while (k > 1) |
719 | { |
720 | long lim = k / 2; |
721 | |
722 | if (k % 2 == 0) |
723 | /* We want to add this only once, but since we subtract it in the sum |
724 | of products above, we add twice. */ |
725 | zk += 2 * X[lim] * (mantissa_store_t) Y[lim]; |
726 | |
727 | for (i = 1, j = k - 1; i < j; i++, j--) |
728 | zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]); |
729 | |
730 | zk -= diag[k - 1]; |
731 | |
732 | DIV_RADIX (zk, Z[k]); |
733 | k--; |
734 | } |
735 | Z[k] = zk; |
736 | |
737 | /* Get the exponent sum into an intermediate variable. This is a subtle |
738 | optimization, where given enough registers, all operations on the exponent |
739 | happen in registers and the result is written out only once into EZ. */ |
740 | int e = EX + EY; |
741 | |
742 | /* Is there a carry beyond the most significant digit? */ |
743 | if (__glibc_unlikely (Z[1] == 0)) |
744 | { |
745 | for (i = 1; i <= p2; i++) |
746 | Z[i] = Z[i + 1]; |
747 | e--; |
748 | } |
749 | |
750 | EZ = e; |
751 | Z[0] = X[0] * Y[0]; |
752 | } |
753 | #endif |
754 | |
755 | #ifndef NO__SQR |
756 | /* Square *X and store result in *Y. X and Y may not overlap. For P in |
757 | [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the |
758 | error is bounded by 1.001 ULP. This is a faster special case of |
759 | multiplication. */ |
760 | void |
761 | SECTION |
762 | __sqr (const mp_no *x, mp_no *y, int p) |
763 | { |
764 | long i, j, k, ip; |
765 | mantissa_store_t yk; |
766 | |
767 | /* Is z=0? */ |
768 | if (__glibc_unlikely (X[0] == 0)) |
769 | { |
770 | Y[0] = 0; |
771 | return; |
772 | } |
773 | |
774 | /* We need not iterate through all X's since it's pointless to |
775 | multiply zeroes. */ |
776 | for (ip = p; ip > 0; ip--) |
777 | if (X[ip] != 0) |
778 | break; |
779 | |
780 | k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; |
781 | |
782 | while (k > 2 * ip + 1) |
783 | Y[k--] = 0; |
784 | |
785 | yk = 0; |
786 | |
787 | while (k > p) |
788 | { |
789 | mantissa_store_t yk2 = 0; |
790 | long lim = k / 2; |
791 | |
792 | if (k % 2 == 0) |
793 | yk += X[lim] * (mantissa_store_t) X[lim]; |
794 | |
795 | /* In __mul, this loop (and the one within the next while loop) run |
796 | between a range to calculate the mantissa as follows: |
797 | |
798 | Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] |
799 | + X[n] * Y[k] |
800 | |
801 | For X == Y, we can get away with summing halfway and doubling the |
802 | result. For cases where the range size is even, the mid-point needs |
803 | to be added separately (above). */ |
804 | for (i = k - p, j = p; i < j; i++, j--) |
805 | yk2 += X[i] * (mantissa_store_t) X[j]; |
806 | |
807 | yk += 2 * yk2; |
808 | |
809 | DIV_RADIX (yk, Y[k]); |
810 | k--; |
811 | } |
812 | |
813 | while (k > 1) |
814 | { |
815 | mantissa_store_t yk2 = 0; |
816 | long lim = k / 2; |
817 | |
818 | if (k % 2 == 0) |
819 | yk += X[lim] * (mantissa_store_t) X[lim]; |
820 | |
821 | /* Likewise for this loop. */ |
822 | for (i = 1, j = k - 1; i < j; i++, j--) |
823 | yk2 += X[i] * (mantissa_store_t) X[j]; |
824 | |
825 | yk += 2 * yk2; |
826 | |
827 | DIV_RADIX (yk, Y[k]); |
828 | k--; |
829 | } |
830 | Y[k] = yk; |
831 | |
832 | /* Squares are always positive. */ |
833 | Y[0] = 1; |
834 | |
835 | /* Get the exponent sum into an intermediate variable. This is a subtle |
836 | optimization, where given enough registers, all operations on the exponent |
837 | happen in registers and the result is written out only once into EZ. */ |
838 | int e = EX * 2; |
839 | |
840 | /* Is there a carry beyond the most significant digit? */ |
841 | if (__glibc_unlikely (Y[1] == 0)) |
842 | { |
843 | for (i = 1; i <= p; i++) |
844 | Y[i] = Y[i + 1]; |
845 | e--; |
846 | } |
847 | |
848 | EY = e; |
849 | } |
850 | #endif |
851 | |
852 | /* Invert *X and store in *Y. Relative error bound: |
853 | - For P = 2: 1.001 * R ^ (1 - P) |
854 | - For P = 3: 1.063 * R ^ (1 - P) |
855 | - For P > 3: 2.001 * R ^ (1 - P) |
856 | |
857 | *X = 0 is not permissible. */ |
858 | static void |
859 | SECTION |
860 | __inv (const mp_no *x, mp_no *y, int p) |
861 | { |
862 | long i; |
863 | double t; |
864 | mp_no z, w; |
865 | static const int np1[] = |
866 | { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
867 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 |
868 | }; |
869 | |
870 | __cpy (x, &z, p); |
871 | z.e = 0; |
872 | __mp_dbl (&z, &t, p); |
873 | t = 1 / t; |
874 | |
875 | /* t == 0 will never happen at this point, since 1/t can only be 0 if t is |
876 | infinity, but before the division t == mantissa of x (exponent is 0). We |
877 | can instruct the compiler to ignore this case. */ |
878 | if (t == 0) |
879 | __builtin_unreachable (); |
880 | |
881 | __dbl_mp (t, y, p); |
882 | EY -= EX; |
883 | |
884 | for (i = 0; i < np1[p]; i++) |
885 | { |
886 | __cpy (y, &w, p); |
887 | __mul (x, &w, y, p); |
888 | __sub (&__mptwo, y, &z, p); |
889 | __mul (&w, &z, y, p); |
890 | } |
891 | } |
892 | |
893 | /* Divide *X by *Y and store result in *Z. X and Y may overlap but not X and Z |
894 | or Y and Z. Relative error bound: |
895 | - For P = 2: 2.001 * R ^ (1 - P) |
896 | - For P = 3: 2.063 * R ^ (1 - P) |
897 | - For P > 3: 3.001 * R ^ (1 - P) |
898 | |
899 | *X = 0 is not permissible. */ |
900 | void |
901 | SECTION |
902 | __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p) |
903 | { |
904 | mp_no w; |
905 | |
906 | if (X[0] == 0) |
907 | Z[0] = 0; |
908 | else |
909 | { |
910 | __inv (y, &w, p); |
911 | __mul (x, &w, z, p); |
912 | } |
913 | } |
914 | |