Annotation of OpenXM_contrib/gmp/mpn/generic/mul_n.c, Revision 1.1.1.2
1.1.1.2 ! maekawa 1: /* mpn_mul_n and helper function -- Multiply/square natural numbers.
1.1 maekawa 2:
1.1.1.2 ! maekawa 3: THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
! 4: ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
! 5: THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
! 6: THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
! 7:
! 8:
! 9: Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
! 10: Foundation, Inc.
1.1 maekawa 11:
12: This file is part of the GNU MP Library.
13:
14: The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2 ! maekawa 15: it under the terms of the GNU Lesser General Public License as published by
! 16: the Free Software Foundation; either version 2.1 of the License, or (at your
1.1 maekawa 17: option) any later version.
18:
19: The GNU MP Library is distributed in the hope that it will be useful, but
20: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2 ! maekawa 21: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1.1 maekawa 22: License for more details.
23:
1.1.1.2 ! maekawa 24: You should have received a copy of the GNU Lesser General Public License
1.1 maekawa 25: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
26: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
27: MA 02111-1307, USA. */
28:
29: #include "gmp.h"
30: #include "gmp-impl.h"
1.1.1.2 ! maekawa 31: #include "longlong.h"
1.1 maekawa 32:
33:
1.1.1.2 ! maekawa 34: /* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
! 35: 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
! 36: #define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
1.1 maekawa 37:
1.1.1.2 ! maekawa 38: #if !defined (__alpha) && !defined (__mips)
! 39: /* For all other machines, we want to call mpn functions for the compund
! 40: operations instead of open-coding them. */
! 41: #define USE_MORE_MPN
1.1 maekawa 42: #endif
43:
1.1.1.2 ! maekawa 44: /*== Function declarations =================================================*/
1.1 maekawa 45:
1.1.1.2 ! maekawa 46: static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
! 47: mp_ptr, mp_ptr, mp_ptr,
! 48: mp_srcptr, mp_srcptr, mp_srcptr,
! 49: mp_size_t, mp_size_t));
! 50: static void interpolate3 _PROTO ((mp_srcptr,
! 51: mp_ptr, mp_ptr, mp_ptr,
! 52: mp_srcptr,
! 53: mp_ptr, mp_ptr, mp_ptr,
! 54: mp_size_t, mp_size_t));
! 55: static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
! 56:
! 57:
! 58: /*-- mpn_kara_mul_n ---------------------------------------------------------------*/
! 59:
! 60: /* Multiplies using 3 half-sized mults and so on recursively.
! 61: * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
! 62: * No overlap of p[...] with a[...] or b[...].
! 63: * ws is workspace.
! 64: */
1.1 maekawa 65:
66: void
67: #if __STDC__
1.1.1.2 ! maekawa 68: mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
1.1 maekawa 69: #else
1.1.1.2 ! maekawa 70: mpn_kara_mul_n(p, a, b, n, ws)
! 71: mp_ptr p;
! 72: mp_srcptr a;
! 73: mp_srcptr b;
! 74: mp_size_t n;
! 75: mp_ptr ws;
1.1 maekawa 76: #endif
77: {
1.1.1.2 ! maekawa 78: mp_limb_t i, sign, w, w0, w1;
! 79: mp_size_t n2;
! 80: mp_srcptr x, y;
1.1 maekawa 81:
1.1.1.2 ! maekawa 82: n2 = n >> 1;
! 83: ASSERT (n2 > 0);
! 84:
! 85: if (n & 1)
1.1 maekawa 86: {
1.1.1.2 ! maekawa 87: /* Odd length. */
! 88: mp_size_t n1, n3, nm1;
! 89:
! 90: n3 = n - n2;
! 91:
! 92: sign = 0;
! 93: w = a[n2];
! 94: if (w != 0)
! 95: w -= mpn_sub_n (p, a, a + n3, n2);
! 96: else
! 97: {
! 98: i = n2;
! 99: do
! 100: {
! 101: --i;
! 102: w0 = a[i];
! 103: w1 = a[n3+i];
! 104: }
! 105: while (w0 == w1 && i != 0);
! 106: if (w0 < w1)
! 107: {
! 108: x = a + n3;
! 109: y = a;
! 110: sign = 1;
! 111: }
! 112: else
! 113: {
! 114: x = a;
! 115: y = a + n3;
! 116: }
! 117: mpn_sub_n (p, x, y, n2);
! 118: }
! 119: p[n2] = w;
! 120:
! 121: w = b[n2];
! 122: if (w != 0)
! 123: w -= mpn_sub_n (p + n3, b, b + n3, n2);
! 124: else
! 125: {
! 126: i = n2;
! 127: do
! 128: {
! 129: --i;
! 130: w0 = b[i];
! 131: w1 = b[n3+i];
! 132: }
! 133: while (w0 == w1 && i != 0);
! 134: if (w0 < w1)
! 135: {
! 136: x = b + n3;
! 137: y = b;
! 138: sign ^= 1;
! 139: }
! 140: else
! 141: {
! 142: x = b;
! 143: y = b + n3;
! 144: }
! 145: mpn_sub_n (p + n3, x, y, n2);
! 146: }
! 147: p[n] = w;
! 148:
! 149: n1 = n + 1;
! 150: if (n2 < KARATSUBA_MUL_THRESHOLD)
! 151: {
! 152: if (n3 < KARATSUBA_MUL_THRESHOLD)
! 153: {
! 154: mpn_mul_basecase (ws, p, n3, p + n3, n3);
! 155: mpn_mul_basecase (p, a, n3, b, n3);
! 156: }
! 157: else
! 158: {
! 159: mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
! 160: mpn_kara_mul_n (p, a, b, n3, ws + n1);
! 161: }
! 162: mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
! 163: }
! 164: else
! 165: {
! 166: mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
! 167: mpn_kara_mul_n (p, a, b, n3, ws + n1);
! 168: mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
! 169: }
! 170:
! 171: if (sign)
! 172: mpn_add_n (ws, p, ws, n1);
1.1 maekawa 173: else
1.1.1.2 ! maekawa 174: mpn_sub_n (ws, p, ws, n1);
! 175:
! 176: nm1 = n - 1;
! 177: if (mpn_add_n (ws, p + n1, ws, nm1))
! 178: {
! 179: mp_limb_t x = ws[nm1] + 1;
! 180: ws[nm1] = x;
! 181: if (x == 0)
! 182: ++ws[n];
! 183: }
! 184: if (mpn_add_n (p + n3, p + n3, ws, n1))
! 185: {
! 186: mp_limb_t x;
! 187: i = n1 + n3;
! 188: do
! 189: {
! 190: x = p[i] + 1;
! 191: p[i] = x;
! 192: ++i;
! 193: } while (x == 0);
! 194: }
1.1 maekawa 195: }
196: else
1.1.1.2 ! maekawa 197: {
! 198: /* Even length. */
! 199: mp_limb_t t;
! 200:
! 201: i = n2;
! 202: do
! 203: {
! 204: --i;
! 205: w0 = a[i];
! 206: w1 = a[n2+i];
! 207: }
! 208: while (w0 == w1 && i != 0);
! 209: sign = 0;
! 210: if (w0 < w1)
! 211: {
! 212: x = a + n2;
! 213: y = a;
! 214: sign = 1;
! 215: }
! 216: else
! 217: {
! 218: x = a;
! 219: y = a + n2;
! 220: }
! 221: mpn_sub_n (p, x, y, n2);
1.1 maekawa 222:
1.1.1.2 ! maekawa 223: i = n2;
! 224: do
! 225: {
! 226: --i;
! 227: w0 = b[i];
! 228: w1 = b[n2+i];
! 229: }
! 230: while (w0 == w1 && i != 0);
! 231: if (w0 < w1)
! 232: {
! 233: x = b + n2;
! 234: y = b;
! 235: sign ^= 1;
! 236: }
! 237: else
! 238: {
! 239: x = b;
! 240: y = b + n2;
! 241: }
! 242: mpn_sub_n (p + n2, x, y, n2);
1.1 maekawa 243:
1.1.1.2 ! maekawa 244: /* Pointwise products. */
! 245: if (n2 < KARATSUBA_MUL_THRESHOLD)
1.1 maekawa 246: {
1.1.1.2 ! maekawa 247: mpn_mul_basecase (ws, p, n2, p + n2, n2);
! 248: mpn_mul_basecase (p, a, n2, b, n2);
! 249: mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
1.1 maekawa 250: }
251: else
1.1.1.2 ! maekawa 252: {
! 253: mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
! 254: mpn_kara_mul_n (p, a, b, n2, ws + n);
! 255: mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
! 256: }
1.1 maekawa 257:
1.1.1.2 ! maekawa 258: /* Interpolate. */
! 259: if (sign)
! 260: w = mpn_add_n (ws, p, ws, n);
! 261: else
! 262: w = -mpn_sub_n (ws, p, ws, n);
! 263: w += mpn_add_n (ws, p + n, ws, n);
! 264: w += mpn_add_n (p + n2, p + n2, ws, n);
! 265: /* TO DO: could put "if (w) { ... }" here.
! 266: * Less work but badly predicted branch.
! 267: * No measurable difference in speed on Alpha.
! 268: */
! 269: i = n + n2;
! 270: t = p[i] + w;
! 271: p[i] = t;
! 272: if (t < w)
! 273: {
! 274: do
! 275: {
! 276: ++i;
! 277: w = p[i] + 1;
! 278: p[i] = w;
! 279: }
! 280: while (w == 0);
! 281: }
1.1 maekawa 282: }
283: }
284:
285: void
286: #if __STDC__
1.1.1.2 ! maekawa 287: mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1.1 maekawa 288: #else
1.1.1.2 ! maekawa 289: mpn_kara_sqr_n (p, a, n, ws)
! 290: mp_ptr p;
! 291: mp_srcptr a;
! 292: mp_size_t n;
! 293: mp_ptr ws;
! 294: #endif
! 295: {
! 296: mp_limb_t i, sign, w, w0, w1;
! 297: mp_size_t n2;
! 298: mp_srcptr x, y;
1.1 maekawa 299:
1.1.1.2 ! maekawa 300: n2 = n >> 1;
! 301: ASSERT (n2 > 0);
! 302:
! 303: if (n & 1)
1.1 maekawa 304: {
1.1.1.2 ! maekawa 305: /* Odd length. */
! 306: mp_size_t n1, n3, nm1;
1.1 maekawa 307:
1.1.1.2 ! maekawa 308: n3 = n - n2;
1.1 maekawa 309:
1.1.1.2 ! maekawa 310: sign = 0;
! 311: w = a[n2];
! 312: if (w != 0)
! 313: w -= mpn_sub_n (p, a, a + n3, n2);
! 314: else
! 315: {
! 316: i = n2;
! 317: do
! 318: {
! 319: --i;
! 320: w0 = a[i];
! 321: w1 = a[n3+i];
! 322: }
! 323: while (w0 == w1 && i != 0);
! 324: if (w0 < w1)
! 325: {
! 326: x = a + n3;
! 327: y = a;
! 328: sign = 1;
! 329: }
! 330: else
! 331: {
! 332: x = a;
! 333: y = a + n3;
! 334: }
! 335: mpn_sub_n (p, x, y, n2);
! 336: }
! 337: p[n2] = w;
1.1 maekawa 338:
1.1.1.2 ! maekawa 339: w = a[n2];
! 340: if (w != 0)
! 341: w -= mpn_sub_n (p + n3, a, a + n3, n2);
! 342: else
! 343: {
! 344: i = n2;
! 345: do
! 346: {
! 347: --i;
! 348: w0 = a[i];
! 349: w1 = a[n3+i];
! 350: }
! 351: while (w0 == w1 && i != 0);
! 352: if (w0 < w1)
! 353: {
! 354: x = a + n3;
! 355: y = a;
! 356: sign ^= 1;
! 357: }
! 358: else
! 359: {
! 360: x = a;
! 361: y = a + n3;
! 362: }
! 363: mpn_sub_n (p + n3, x, y, n2);
! 364: }
! 365: p[n] = w;
1.1 maekawa 366:
1.1.1.2 ! maekawa 367: n1 = n + 1;
! 368: if (n2 < KARATSUBA_SQR_THRESHOLD)
! 369: {
! 370: if (n3 < KARATSUBA_SQR_THRESHOLD)
! 371: {
! 372: mpn_sqr_basecase (ws, p, n3);
! 373: mpn_sqr_basecase (p, a, n3);
! 374: }
! 375: else
! 376: {
! 377: mpn_kara_sqr_n (ws, p, n3, ws + n1);
! 378: mpn_kara_sqr_n (p, a, n3, ws + n1);
! 379: }
! 380: mpn_sqr_basecase (p + n1, a + n3, n2);
! 381: }
! 382: else
! 383: {
! 384: mpn_kara_sqr_n (ws, p, n3, ws + n1);
! 385: mpn_kara_sqr_n (p, a, n3, ws + n1);
! 386: mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
! 387: }
1.1 maekawa 388:
1.1.1.2 ! maekawa 389: if (sign)
! 390: mpn_add_n (ws, p, ws, n1);
! 391: else
! 392: mpn_sub_n (ws, p, ws, n1);
1.1 maekawa 393:
1.1.1.2 ! maekawa 394: nm1 = n - 1;
! 395: if (mpn_add_n (ws, p + n1, ws, nm1))
! 396: {
! 397: mp_limb_t x = ws[nm1] + 1;
! 398: ws[nm1] = x;
! 399: if (x == 0)
! 400: ++ws[n];
! 401: }
! 402: if (mpn_add_n (p + n3, p + n3, ws, n1))
! 403: {
! 404: mp_limb_t x;
! 405: i = n1 + n3;
! 406: do
! 407: {
! 408: x = p[i] + 1;
! 409: p[i] = x;
! 410: ++i;
! 411: } while (x == 0);
! 412: }
! 413: }
! 414: else
! 415: {
! 416: /* Even length. */
! 417: mp_limb_t t;
1.1 maekawa 418:
1.1.1.2 ! maekawa 419: i = n2;
! 420: do
! 421: {
! 422: --i;
! 423: w0 = a[i];
! 424: w1 = a[n2+i];
! 425: }
! 426: while (w0 == w1 && i != 0);
! 427: sign = 0;
! 428: if (w0 < w1)
1.1 maekawa 429: {
1.1.1.2 ! maekawa 430: x = a + n2;
! 431: y = a;
! 432: sign = 1;
1.1 maekawa 433: }
434: else
435: {
1.1.1.2 ! maekawa 436: x = a;
! 437: y = a + n2;
! 438: }
! 439: mpn_sub_n (p, x, y, n2);
! 440:
! 441: i = n2;
! 442: do
! 443: {
! 444: --i;
! 445: w0 = a[i];
! 446: w1 = a[n2+i];
1.1 maekawa 447: }
1.1.1.2 ! maekawa 448: while (w0 == w1 && i != 0);
! 449: if (w0 < w1)
1.1 maekawa 450: {
1.1.1.2 ! maekawa 451: x = a + n2;
! 452: y = a;
! 453: sign ^= 1;
1.1 maekawa 454: }
455: else
456: {
1.1.1.2 ! maekawa 457: x = a;
! 458: y = a + n2;
1.1 maekawa 459: }
1.1.1.2 ! maekawa 460: mpn_sub_n (p + n2, x, y, n2);
1.1 maekawa 461:
1.1.1.2 ! maekawa 462: /* Pointwise products. */
! 463: if (n2 < KARATSUBA_SQR_THRESHOLD)
! 464: {
! 465: mpn_sqr_basecase (ws, p, n2);
! 466: mpn_sqr_basecase (p, a, n2);
! 467: mpn_sqr_basecase (p + n, a + n2, n2);
! 468: }
! 469: else
! 470: {
! 471: mpn_kara_sqr_n (ws, p, n2, ws + n);
! 472: mpn_kara_sqr_n (p, a, n2, ws + n);
! 473: mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
! 474: }
1.1 maekawa 475:
1.1.1.2 ! maekawa 476: /* Interpolate. */
! 477: if (sign)
! 478: w = mpn_add_n (ws, p, ws, n);
1.1 maekawa 479: else
1.1.1.2 ! maekawa 480: w = -mpn_sub_n (ws, p, ws, n);
! 481: w += mpn_add_n (ws, p + n, ws, n);
! 482: w += mpn_add_n (p + n2, p + n2, ws, n);
! 483: /* TO DO: could put "if (w) { ... }" here.
! 484: * Less work but badly predicted branch.
! 485: * No measurable difference in speed on Alpha.
! 486: */
! 487: i = n + n2;
! 488: t = p[i] + w;
! 489: p[i] = t;
! 490: if (t < w)
! 491: {
! 492: do
! 493: {
! 494: ++i;
! 495: w = p[i] + 1;
! 496: p[i] = w;
! 497: }
! 498: while (w == 0);
! 499: }
! 500: }
! 501: }
1.1 maekawa 502:
1.1.1.2 ! maekawa 503: /*-- add2Times -------------------------------------------------------------*/
1.1 maekawa 504:
1.1.1.2 ! maekawa 505: /* z[] = x[] + 2 * y[]
! 506: Note that z and x might point to the same vectors. */
! 507: #ifdef USE_MORE_MPN
! 508: static inline mp_limb_t
! 509: #if __STDC__
! 510: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
! 511: #else
! 512: add2Times (z, x, y, n)
! 513: mp_ptr z;
! 514: mp_srcptr x;
! 515: mp_srcptr y;
! 516: mp_size_t n;
! 517: #endif
! 518: {
! 519: mp_ptr t;
! 520: mp_limb_t c;
! 521: TMP_DECL (marker);
! 522: TMP_MARK (marker);
! 523: t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
! 524: c = mpn_lshift (t, y, n, 1);
! 525: c += mpn_add_n (z, x, t, n);
! 526: TMP_FREE (marker);
! 527: return c;
! 528: }
! 529: #else
1.1 maekawa 530:
1.1.1.2 ! maekawa 531: static mp_limb_t
! 532: #if __STDC__
! 533: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
! 534: #else
! 535: add2Times (z, x, y, n)
! 536: mp_ptr z;
! 537: mp_srcptr x;
! 538: mp_srcptr y;
! 539: mp_size_t n;
! 540: #endif
! 541: {
! 542: mp_limb_t c, v, w;
1.1 maekawa 543:
1.1.1.2 ! maekawa 544: ASSERT (n > 0);
! 545: v = *x; w = *y;
! 546: c = w >> (BITS_PER_MP_LIMB - 1);
! 547: w <<= 1;
! 548: v += w;
! 549: c += v < w;
! 550: *z = v;
! 551: ++x; ++y; ++z;
! 552: while (--n)
! 553: {
! 554: v = *x;
! 555: w = *y;
! 556: v += c;
! 557: c = v < c;
! 558: c += w >> (BITS_PER_MP_LIMB - 1);
! 559: w <<= 1;
! 560: v += w;
! 561: c += v < w;
! 562: *z = v;
! 563: ++x; ++y; ++z;
1.1 maekawa 564: }
1.1.1.2 ! maekawa 565:
! 566: return c;
1.1 maekawa 567: }
1.1.1.2 ! maekawa 568: #endif
1.1 maekawa 569:
1.1.1.2 ! maekawa 570: /*-- evaluate3 -------------------------------------------------------------*/
! 571:
! 572: /* Evaluates:
! 573: * ph := 4*A+2*B+C
! 574: * p1 := A+B+C
! 575: * p2 := A+2*B+4*C
! 576: * where:
! 577: * ph[], p1[], p2[], A[] and B[] all have length len,
! 578: * C[] has length len2 with len-len2 = 0, 1 or 2.
! 579: * Returns top words (overflow) at pth, pt1 and pt2 respectively.
! 580: */
! 581: #ifdef USE_MORE_MPN
! 582: static void
1.1 maekawa 583: #if __STDC__
1.1.1.2 ! maekawa 584: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
! 585: mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
1.1 maekawa 586: #else
1.1.1.2 ! maekawa 587: evaluate3 (ph, p1, p2, pth, pt1, pt2,
! 588: A, B, C, len, len2)
! 589: mp_ptr ph;
! 590: mp_ptr p1;
! 591: mp_ptr p2;
! 592: mp_ptr pth;
! 593: mp_ptr pt1;
! 594: mp_ptr pt2;
! 595: mp_srcptr A;
! 596: mp_srcptr B;
! 597: mp_srcptr C;
! 598: mp_size_t len;
! 599: mp_size_t len2;
1.1 maekawa 600: #endif
601: {
1.1.1.2 ! maekawa 602: mp_limb_t c, d, e;
! 603:
! 604: ASSERT (len - len2 <= 2);
! 605:
! 606: e = mpn_lshift (p1, B, len, 1);
! 607:
! 608: c = mpn_lshift (ph, A, len, 2);
! 609: c += e + mpn_add_n (ph, ph, p1, len);
! 610: d = mpn_add_n (ph, ph, C, len2);
! 611: if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
! 612: ASSERT (c < 7);
! 613: *pth = c;
! 614:
! 615: c = mpn_lshift (p2, C, len2, 2);
! 616: #if 1
! 617: if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
! 618: c += e + mpn_add_n (p2, p2, p1, len);
! 619: #else
! 620: d = mpn_add_n (p2, p2, p1, len2);
! 621: c += d;
! 622: if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
! 623: c += e;
! 624: #endif
! 625: c += mpn_add_n (p2, p2, A, len);
! 626: ASSERT (c < 7);
! 627: *pt2 = c;
! 628:
! 629: c = mpn_add_n (p1, A, B, len);
! 630: d = mpn_add_n (p1, p1, C, len2);
! 631: if (len2 == len) c += d;
! 632: else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
! 633: ASSERT (c < 3);
! 634: *pt1 = c;
1.1 maekawa 635:
1.1.1.2 ! maekawa 636: }
! 637:
! 638: #else
1.1 maekawa 639:
1.1.1.2 ! maekawa 640: static void
! 641: #if __STDC__
! 642: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
! 643: mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
! 644: #else
! 645: evaluate3 (ph, p1, p2, pth, pt1, pt2,
! 646: A, B, C, l, ls)
! 647: mp_ptr ph;
! 648: mp_ptr p1;
! 649: mp_ptr p2;
! 650: mp_ptr pth;
! 651: mp_ptr pt1;
! 652: mp_ptr pt2;
! 653: mp_srcptr A;
! 654: mp_srcptr B;
! 655: mp_srcptr C;
! 656: mp_size_t l;
! 657: mp_size_t ls;
! 658: #endif
! 659: {
! 660: mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
! 661:
! 662: ASSERT (l - ls <= 2);
1.1 maekawa 663:
1.1.1.2 ! maekawa 664: th = t1 = t2 = 0;
! 665: for (i = 0; i < l; ++i)
1.1 maekawa 666: {
1.1.1.2 ! maekawa 667: a = *A;
! 668: b = *B;
! 669: c = i < ls ? *C : 0;
! 670:
! 671: /* TO DO: choose one of the following alternatives. */
! 672: #if 0
! 673: t = a << 2;
! 674: vh = th + t;
! 675: th = vh < t;
! 676: th += a >> (BITS_PER_MP_LIMB - 2);
! 677: t = b << 1;
! 678: vh += t;
! 679: th += vh < t;
! 680: th += b >> (BITS_PER_MP_LIMB - 1);
! 681: vh += c;
! 682: th += vh < c;
! 683: #else
! 684: vh = th + c;
! 685: th = vh < c;
! 686: t = b << 1;
! 687: vh += t;
! 688: th += vh < t;
! 689: th += b >> (BITS_PER_MP_LIMB - 1);
! 690: t = a << 2;
! 691: vh += t;
! 692: th += vh < t;
! 693: th += a >> (BITS_PER_MP_LIMB - 2);
! 694: #endif
1.1 maekawa 695:
1.1.1.2 ! maekawa 696: v1 = t1 + a;
! 697: t1 = v1 < a;
! 698: v1 += b;
! 699: t1 += v1 < b;
! 700: v1 += c;
! 701: t1 += v1 < c;
! 702:
! 703: v2 = t2 + a;
! 704: t2 = v2 < a;
! 705: t = b << 1;
! 706: v2 += t;
! 707: t2 += v2 < t;
! 708: t2 += b >> (BITS_PER_MP_LIMB - 1);
! 709: t = c << 2;
! 710: v2 += t;
! 711: t2 += v2 < t;
! 712: t2 += c >> (BITS_PER_MP_LIMB - 2);
! 713:
! 714: *ph = vh;
! 715: *p1 = v1;
! 716: *p2 = v2;
! 717:
! 718: ++A; ++B; ++C;
! 719: ++ph; ++p1; ++p2;
1.1 maekawa 720: }
1.1.1.2 ! maekawa 721:
! 722: ASSERT (th < 7);
! 723: ASSERT (t1 < 3);
! 724: ASSERT (t2 < 7);
! 725:
! 726: *pth = th;
! 727: *pt1 = t1;
! 728: *pt2 = t2;
1.1 maekawa 729: }
1.1.1.2 ! maekawa 730: #endif
1.1 maekawa 731:
1.1.1.2 ! maekawa 732:
! 733: /*-- interpolate3 ----------------------------------------------------------*/
! 734:
! 735: /* Interpolates B, C, D (in-place) from:
! 736: * 16*A+8*B+4*C+2*D+E
! 737: * A+B+C+D+E
! 738: * A+2*B+4*C+8*D+16*E
! 739: * where:
! 740: * A[], B[], C[] and D[] all have length l,
! 741: * E[] has length ls with l-ls = 0, 2 or 4.
! 742: *
! 743: * Reads top words (from earlier overflow) from ptb, ptc and ptd,
! 744: * and returns new top words there.
! 745: */
! 746:
! 747: #ifdef USE_MORE_MPN
! 748: static void
1.1 maekawa 749: #if __STDC__
1.1.1.2 ! maekawa 750: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
! 751: mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
! 752: #else
! 753: interpolate3 (A, B, C, D, E,
! 754: ptb, ptc, ptd, len, len2)
! 755: mp_srcptr A;
! 756: mp_ptr B;
! 757: mp_ptr C;
! 758: mp_ptr D;
! 759: mp_srcptr E;
! 760: mp_ptr ptb;
! 761: mp_ptr ptc;
! 762: mp_ptr ptd;
! 763: mp_size_t len;
! 764: mp_size_t len2;
! 765: #endif
! 766: {
! 767: mp_ptr ws;
! 768: mp_limb_t t, tb,tc,td;
! 769: TMP_DECL (marker);
! 770: TMP_MARK (marker);
! 771:
! 772: ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
! 773:
! 774: /* Let x1, x2, x3 be the values to interpolate. We have:
! 775: * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
! 776: * c = a + x1 + x2 + x3 + e
! 777: * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
! 778: */
! 779:
! 780: ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
! 781:
! 782: tb = *ptb; tc = *ptc; td = *ptd;
! 783:
! 784:
! 785: /* b := b - 16*a - e
! 786: * c := c - a - e
! 787: * d := d - a - 16*e
! 788: */
! 789:
! 790: t = mpn_lshift (ws, A, len, 4);
! 791: tb -= t + mpn_sub_n (B, B, ws, len);
! 792: t = mpn_sub_n (B, B, E, len2);
! 793: if (len2 == len) tb -= t;
! 794: else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
! 795:
! 796: tc -= mpn_sub_n (C, C, A, len);
! 797: t = mpn_sub_n (C, C, E, len2);
! 798: if (len2 == len) tc -= t;
! 799: else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
! 800:
! 801: t = mpn_lshift (ws, E, len2, 4);
! 802: t += mpn_add_n (ws, ws, A, len2);
! 803: #if 1
! 804: if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
! 805: td -= t + mpn_sub_n (D, D, ws, len);
! 806: #else
! 807: t += mpn_sub_n (D, D, ws, len2);
! 808: if (len2 != len) {
! 809: t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
! 810: t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
! 811: } /* end if/else */
! 812: td -= t;
! 813: #endif
! 814:
! 815:
! 816: /* b, d := b + d, b - d */
! 817:
! 818: #ifdef HAVE_MPN_ADD_SUB_N
! 819: /* #error TO DO ... */
! 820: #else
! 821: t = tb + td + mpn_add_n (ws, B, D, len);
! 822: td = tb - td - mpn_sub_n (D, B, D, len);
! 823: tb = t;
! 824: MPN_COPY (B, ws, len);
! 825: #endif
! 826:
! 827: /* b := b-8*c */
! 828: t = 8 * tc + mpn_lshift (ws, C, len, 3);
! 829: tb -= t + mpn_sub_n (B, B, ws, len);
! 830:
! 831: /* c := 2*c - b */
! 832: tc = 2 * tc + mpn_lshift (C, C, len, 1);
! 833: tc -= tb + mpn_sub_n (C, C, B, len);
! 834:
! 835: /* d := d/3 */
! 836: td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
! 837:
! 838: /* b, d := b + d, b - d */
! 839: #ifdef HAVE_MPN_ADD_SUB_N
! 840: /* #error TO DO ... */
1.1 maekawa 841: #else
1.1.1.2 ! maekawa 842: t = tb + td + mpn_add_n (ws, B, D, len);
! 843: td = tb - td - mpn_sub_n (D, B, D, len);
! 844: tb = t;
! 845: MPN_COPY (B, ws, len);
! 846: #endif
1.1 maekawa 847:
1.1.1.2 ! maekawa 848: /* Now:
! 849: * b = 4*x1
! 850: * c = 2*x2
! 851: * d = 4*x3
! 852: */
! 853:
! 854: ASSERT(!(*B & 3));
! 855: mpn_rshift (B, B, len, 2);
! 856: B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
! 857: ASSERT((long)tb >= 0);
! 858: tb >>= 2;
! 859:
! 860: ASSERT(!(*C & 1));
! 861: mpn_rshift (C, C, len, 1);
! 862: C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
! 863: ASSERT((long)tc >= 0);
! 864: tc >>= 1;
! 865:
! 866: ASSERT(!(*D & 3));
! 867: mpn_rshift (D, D, len, 2);
! 868: D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
! 869: ASSERT((long)td >= 0);
! 870: td >>= 2;
! 871:
! 872: #if WANT_ASSERT
! 873: ASSERT (tb < 2);
! 874: if (len == len2)
! 875: {
! 876: ASSERT (tc < 3);
! 877: ASSERT (td < 2);
1.1 maekawa 878: }
879: else
880: {
1.1.1.2 ! maekawa 881: ASSERT (tc < 2);
! 882: ASSERT (!td);
! 883: }
! 884: #endif
1.1 maekawa 885:
1.1.1.2 ! maekawa 886: *ptb = tb;
! 887: *ptc = tc;
! 888: *ptd = td;
1.1 maekawa 889:
1.1.1.2 ! maekawa 890: TMP_FREE (marker);
! 891: }
! 892:
! 893: #else
! 894:
! 895: static void
! 896: #if __STDC__
! 897: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
! 898: mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
! 899: #else
! 900: interpolate3 (A, B, C, D, E,
! 901: ptb, ptc, ptd, l, ls)
! 902: mp_srcptr A;
! 903: mp_ptr B;
! 904: mp_ptr C;
! 905: mp_ptr D;
! 906: mp_srcptr E;
! 907: mp_ptr ptb;
! 908: mp_ptr ptc;
! 909: mp_ptr ptd;
! 910: mp_size_t l;
! 911: mp_size_t ls;
! 912: #endif
! 913: {
! 914: mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
! 915: const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
! 916:
! 917: #if WANT_ASSERT
! 918: t = l - ls;
! 919: ASSERT (t == 0 || t == 2 || t == 4);
! 920: #endif
! 921:
! 922: sb = sc = sd = 0;
! 923: for (i = 0; i < l; ++i)
! 924: {
! 925: mp_limb_t tb, tc, td, tt;
! 926:
! 927: a = *A;
! 928: b = *B;
! 929: c = *C;
! 930: d = *D;
! 931: e = i < ls ? *E : 0;
! 932:
! 933: /* Let x1, x2, x3 be the values to interpolate. We have:
! 934: * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
! 935: * c = a + x1 + x2 + x3 + e
! 936: * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
! 937: */
! 938:
! 939: /* b := b - 16*a - e
! 940: * c := c - a - e
! 941: * d := d - a - 16*e
! 942: */
! 943: t = a << 4;
! 944: tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
! 945: b -= t;
! 946: tb -= b < e;
! 947: b -= e;
! 948: tc = -(c < a);
! 949: c -= a;
! 950: tc -= c < e;
! 951: c -= e;
! 952: td = -(d < a);
! 953: d -= a;
! 954: t = e << 4;
! 955: td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
! 956: d -= t;
! 957:
! 958: /* b, d := b + d, b - d */
! 959: t = b + d;
! 960: tt = tb + td + (t < b);
! 961: td = tb - td - (b < d);
! 962: d = b - d;
! 963: b = t;
! 964: tb = tt;
! 965:
! 966: /* b := b-8*c */
! 967: t = c << 3;
! 968: tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
! 969: b -= t;
! 970:
! 971: /* c := 2*c - b */
! 972: t = c << 1;
! 973: tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
! 974: c = t - b;
! 975:
! 976: /* d := d/3 */
! 977: d *= INVERSE_3;
! 978: td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
! 979: td *= INVERSE_3;
! 980:
! 981: /* b, d := b + d, b - d */
! 982: t = b + d;
! 983: tt = tb + td + (t < b);
! 984: td = tb - td - (b < d);
! 985: d = b - d;
! 986: b = t;
! 987: tb = tt;
! 988:
! 989: /* Now:
! 990: * b = 4*x1
! 991: * c = 2*x2
! 992: * d = 4*x3
! 993: */
! 994:
! 995: /* sb has period 2. */
! 996: b += sb;
! 997: tb += b < sb;
! 998: sb &= maskOffHalf;
! 999: sb |= sb >> (BITS_PER_MP_LIMB >> 1);
! 1000: sb += tb;
! 1001:
! 1002: /* sc has period 1. */
! 1003: c += sc;
! 1004: tc += c < sc;
! 1005: /* TO DO: choose one of the following alternatives. */
! 1006: #if 1
! 1007: sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
! 1008: sc += tc;
! 1009: #else
! 1010: sc = tc - ((long)sc < 0L);
! 1011: #endif
! 1012:
! 1013: /* sd has period 2. */
! 1014: d += sd;
! 1015: td += d < sd;
! 1016: sd &= maskOffHalf;
! 1017: sd |= sd >> (BITS_PER_MP_LIMB >> 1);
! 1018: sd += td;
! 1019:
! 1020: if (i != 0)
1.1 maekawa 1021: {
1.1.1.2 ! maekawa 1022: B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
! 1023: C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
! 1024: D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1.1 maekawa 1025: }
1.1.1.2 ! maekawa 1026: ob = b >> 2;
! 1027: oc = c >> 1;
! 1028: od = d >> 2;
1.1 maekawa 1029:
1.1.1.2 ! maekawa 1030: ++A; ++B; ++C; ++D; ++E;
! 1031: }
1.1 maekawa 1032:
1.1.1.2 ! maekawa 1033: /* Handle top words. */
! 1034: b = *ptb;
! 1035: c = *ptc;
! 1036: d = *ptd;
! 1037:
! 1038: t = b + d;
! 1039: d = b - d;
! 1040: b = t;
! 1041: b -= c << 3;
! 1042: c = (c << 1) - b;
! 1043: d *= INVERSE_3;
! 1044: t = b + d;
! 1045: d = b - d;
! 1046: b = t;
! 1047:
! 1048: b += sb;
! 1049: c += sc;
! 1050: d += sd;
! 1051:
! 1052: B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
! 1053: C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
! 1054: D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
! 1055:
! 1056: b >>= 2;
! 1057: c >>= 1;
! 1058: d >>= 2;
! 1059:
! 1060: #if WANT_ASSERT
! 1061: ASSERT (b < 2);
! 1062: if (l == ls)
! 1063: {
! 1064: ASSERT (c < 3);
! 1065: ASSERT (d < 2);
! 1066: }
! 1067: else
! 1068: {
! 1069: ASSERT (c < 2);
! 1070: ASSERT (!d);
! 1071: }
! 1072: #endif
1.1 maekawa 1073:
1.1.1.2 ! maekawa 1074: *ptb = b;
! 1075: *ptc = c;
! 1076: *ptd = d;
! 1077: }
! 1078: #endif
1.1 maekawa 1079:
1080:
1.1.1.2 ! maekawa 1081: /*-- mpn_toom3_mul_n --------------------------------------------------------------*/
1.1 maekawa 1082:
1.1.1.2 ! maekawa 1083: /* Multiplies using 5 mults of one third size and so on recursively.
! 1084: * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
! 1085: * No overlap of p[...] with a[...] or b[...].
! 1086: * ws is workspace.
! 1087: */
! 1088:
! 1089: /* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
! 1090: * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
! 1091: * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
! 1092: */
! 1093:
! 1094: #define TOOM3_MUL_REC(p, a, b, n, ws) \
! 1095: do { \
! 1096: if (n < KARATSUBA_MUL_THRESHOLD) \
! 1097: mpn_mul_basecase (p, a, n, b, n); \
! 1098: else if (n < TOOM3_MUL_THRESHOLD) \
! 1099: mpn_kara_mul_n (p, a, b, n, ws); \
! 1100: else \
! 1101: mpn_toom3_mul_n (p, a, b, n, ws); \
! 1102: } while (0)
1.1 maekawa 1103:
1.1.1.2 ! maekawa 1104: void
! 1105: #if __STDC__
! 1106: mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
! 1107: #else
! 1108: mpn_toom3_mul_n (p, a, b, n, ws)
! 1109: mp_ptr p;
! 1110: mp_srcptr a;
! 1111: mp_srcptr b;
! 1112: mp_size_t n;
! 1113: mp_ptr ws;
! 1114: #endif
! 1115: {
! 1116: mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
! 1117: mp_limb_t *A,*B,*C,*D,*E, *W;
! 1118: mp_size_t l,l2,l3,l4,l5,ls;
! 1119:
! 1120: /* Break n words into chunks of size l, l and ls.
! 1121: * n = 3*k => l = k, ls = k
! 1122: * n = 3*k+1 => l = k+1, ls = k-1
! 1123: * n = 3*k+2 => l = k+1, ls = k
! 1124: */
! 1125: {
! 1126: mp_limb_t m;
! 1127:
! 1128: ASSERT (n >= TOOM3_MUL_THRESHOLD);
! 1129: l = ls = n / 3;
! 1130: m = n - l * 3;
! 1131: if (m != 0)
! 1132: ++l;
! 1133: if (m == 1)
! 1134: --ls;
! 1135:
! 1136: l2 = l * 2;
! 1137: l3 = l * 3;
! 1138: l4 = l * 4;
! 1139: l5 = l * 5;
! 1140: A = p;
! 1141: B = ws;
! 1142: C = p + l2;
! 1143: D = ws + l2;
! 1144: E = p + l4;
! 1145: W = ws + l4;
! 1146: }
! 1147:
! 1148: /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
! 1149: evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
! 1150: evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
! 1151:
! 1152: /** Second stage: pointwise multiplies. **/
! 1153: TOOM3_MUL_REC(D, C, C + l, l, W);
! 1154: tD = cD*dD;
! 1155: if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
! 1156: if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
! 1157: ASSERT (tD < 49);
! 1158: TOOM3_MUL_REC(C, B, B + l, l, W);
! 1159: tC = cC*dC;
! 1160: /* TO DO: choose one of the following alternatives. */
! 1161: #if 0
! 1162: if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
! 1163: if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
! 1164: #else
! 1165: if (cC)
! 1166: {
! 1167: if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
! 1168: else tC += add2Times (C + l, C + l, B + l, l);
1.1 maekawa 1169: }
1.1.1.2 ! maekawa 1170: if (dC)
! 1171: {
! 1172: if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
! 1173: else tC += add2Times (C + l, C + l, B, l);
! 1174: }
! 1175: #endif
! 1176: ASSERT (tC < 9);
! 1177: TOOM3_MUL_REC(B, A, A + l, l, W);
! 1178: tB = cB*dB;
! 1179: if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
! 1180: if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
! 1181: ASSERT (tB < 49);
! 1182: TOOM3_MUL_REC(A, a, b, l, W);
! 1183: TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
! 1184:
! 1185: /** Third stage: interpolation. **/
! 1186: interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
! 1187:
! 1188: /** Final stage: add up the coefficients. **/
! 1189: {
! 1190: mp_limb_t i, x, y;
! 1191: tB += mpn_add_n (p + l, p + l, B, l2);
! 1192: tD += mpn_add_n (p + l3, p + l3, D, l2);
! 1193: mpn_incr_u (p + l3, tB);
! 1194: mpn_incr_u (p + l4, tC);
! 1195: mpn_incr_u (p + l5, tD);
! 1196: }
1.1 maekawa 1197: }
1198:
1.1.1.2 ! maekawa 1199: /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
! 1200:
! 1201: /* Like previous function but for squaring */
! 1202:
! 1203: #define TOOM3_SQR_REC(p, a, n, ws) \
! 1204: do { \
! 1205: if (n < KARATSUBA_SQR_THRESHOLD) \
! 1206: mpn_sqr_basecase (p, a, n); \
! 1207: else if (n < TOOM3_SQR_THRESHOLD) \
! 1208: mpn_kara_sqr_n (p, a, n, ws); \
! 1209: else \
! 1210: mpn_toom3_sqr_n (p, a, n, ws); \
! 1211: } while (0)
! 1212:
! 1213: void
1.1 maekawa 1214: #if __STDC__
1.1.1.2 ! maekawa 1215: mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1.1 maekawa 1216: #else
1.1.1.2 ! maekawa 1217: mpn_toom3_sqr_n (p, a, n, ws)
! 1218: mp_ptr p;
! 1219: mp_srcptr a;
! 1220: mp_size_t n;
! 1221: mp_ptr ws;
1.1 maekawa 1222: #endif
1223: {
1.1.1.2 ! maekawa 1224: mp_limb_t cB,cC,cD, tB,tC,tD;
! 1225: mp_limb_t *A,*B,*C,*D,*E, *W;
! 1226: mp_size_t l,l2,l3,l4,l5,ls;
! 1227:
! 1228: /* Break n words into chunks of size l, l and ls.
! 1229: * n = 3*k => l = k, ls = k
! 1230: * n = 3*k+1 => l = k+1, ls = k-1
! 1231: * n = 3*k+2 => l = k+1, ls = k
! 1232: */
! 1233: {
! 1234: mp_limb_t m;
! 1235:
! 1236: ASSERT (n >= TOOM3_MUL_THRESHOLD);
! 1237: l = ls = n / 3;
! 1238: m = n - l * 3;
! 1239: if (m != 0)
! 1240: ++l;
! 1241: if (m == 1)
! 1242: --ls;
! 1243:
! 1244: l2 = l * 2;
! 1245: l3 = l * 3;
! 1246: l4 = l * 4;
! 1247: l5 = l * 5;
! 1248: A = p;
! 1249: B = ws;
! 1250: C = p + l2;
! 1251: D = ws + l2;
! 1252: E = p + l4;
! 1253: W = ws + l4;
! 1254: }
! 1255:
! 1256: /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
! 1257: evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
! 1258:
! 1259: /** Second stage: pointwise multiplies. **/
! 1260: TOOM3_SQR_REC(D, C, l, W);
! 1261: tD = cD*cD;
! 1262: if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
! 1263: ASSERT (tD < 49);
! 1264: TOOM3_SQR_REC(C, B, l, W);
! 1265: tC = cC*cC;
! 1266: /* TO DO: choose one of the following alternatives. */
! 1267: #if 0
! 1268: if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
! 1269: #else
! 1270: if (cC >= 1)
1.1 maekawa 1271: {
1.1.1.2 ! maekawa 1272: tC += add2Times (C + l, C + l, B, l);
! 1273: if (cC == 2)
! 1274: tC += add2Times (C + l, C + l, B, l);
! 1275: }
! 1276: #endif
! 1277: ASSERT (tC < 9);
! 1278: TOOM3_SQR_REC(B, A, l, W);
! 1279: tB = cB*cB;
! 1280: if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
! 1281: ASSERT (tB < 49);
! 1282: TOOM3_SQR_REC(A, a, l, W);
! 1283: TOOM3_SQR_REC(E, a + l2, ls, W);
! 1284:
! 1285: /** Third stage: interpolation. **/
! 1286: interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
! 1287:
! 1288: /** Final stage: add up the coefficients. **/
! 1289: {
! 1290: mp_limb_t i, x, y;
! 1291: tB += mpn_add_n (p + l, p + l, B, l2);
! 1292: tD += mpn_add_n (p + l3, p + l3, D, l2);
! 1293: mpn_incr_u (p + l3, tB);
! 1294: mpn_incr_u (p + l4, tC);
! 1295: mpn_incr_u (p + l5, tD);
! 1296: }
! 1297: }
! 1298:
! 1299: void
! 1300: #if __STDC__
! 1301: mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
! 1302: #else
! 1303: mpn_mul_n (p, a, b, n)
! 1304: mp_ptr p;
! 1305: mp_srcptr a;
! 1306: mp_srcptr b;
! 1307: mp_size_t n;
! 1308: #endif
! 1309: {
! 1310: if (n < KARATSUBA_MUL_THRESHOLD)
! 1311: mpn_mul_basecase (p, a, n, b, n);
! 1312: else if (n < TOOM3_MUL_THRESHOLD)
! 1313: {
! 1314: /* Allocate workspace of fixed size on stack: fast! */
! 1315: #if TUNE_PROGRAM_BUILD
! 1316: mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
! 1317: #else
! 1318: mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
! 1319: #endif
! 1320: mpn_kara_mul_n (p, a, b, n, ws);
1.1 maekawa 1321: }
1.1.1.2 ! maekawa 1322: #if WANT_FFT || TUNE_PROGRAM_BUILD
! 1323: else if (n < FFT_MUL_THRESHOLD)
! 1324: #else
1.1 maekawa 1325: else
1.1.1.2 ! maekawa 1326: #endif
1.1 maekawa 1327: {
1.1.1.2 ! maekawa 1328: /* Use workspace of unknown size in heap, as stack space may
! 1329: * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
! 1330: * multiplication will take much longer than malloc()/free(). */
! 1331: mp_limb_t wsLen, *ws;
! 1332: wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
! 1333: ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
! 1334: mpn_toom3_mul_n (p, a, b, n, ws);
! 1335: (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
1.1 maekawa 1336: }
1.1.1.2 ! maekawa 1337: #if WANT_FFT || TUNE_PROGRAM_BUILD
! 1338: else
! 1339: {
! 1340: mpn_mul_fft_full (p, a, n, b, n);
! 1341: }
! 1342: #endif
1.1 maekawa 1343: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>