Annotation of OpenXM_contrib/gmp/mpn/generic/mul_n.c, Revision 1.1.1.3
1.1.1.2 maekawa 1: /* mpn_mul_n and helper function -- Multiply/square natural numbers.
1.1 maekawa 2:
1.1.1.2 maekawa 3: THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
4: ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
5: THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
6: THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
7:
8:
1.1.1.3 ! ohara 9: Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free
! 10: Software Foundation, Inc.
1.1 maekawa 11:
12: This file is part of the GNU MP Library.
13:
14: The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2 maekawa 15: it under the terms of the GNU Lesser General Public License as published by
16: the Free Software Foundation; either version 2.1 of the License, or (at your
1.1 maekawa 17: option) any later version.
18:
19: The GNU MP Library is distributed in the hope that it will be useful, but
20: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2 maekawa 21: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1.1 maekawa 22: License for more details.
23:
1.1.1.2 maekawa 24: You should have received a copy of the GNU Lesser General Public License
1.1 maekawa 25: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
26: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
27: MA 02111-1307, USA. */
28:
29: #include "gmp.h"
30: #include "gmp-impl.h"
1.1.1.2 maekawa 31: #include "longlong.h"
1.1 maekawa 32:
33:
1.1.1.3 ! ohara 34: #if GMP_NAIL_BITS != 0
! 35: /* The open-coded interpolate3 stuff has not been generalized for nails. */
! 36: #define USE_MORE_MPN 1
! 37: #endif
1.1 maekawa 38:
1.1.1.3 ! ohara 39: #ifndef USE_MORE_MPN
1.1.1.2 maekawa 40: #if !defined (__alpha) && !defined (__mips)
41: /* For all other machines, we want to call mpn functions for the compund
42: operations instead of open-coding them. */
1.1.1.3 ! ohara 43: #define USE_MORE_MPN 1
! 44: #endif
1.1 maekawa 45: #endif
46:
1.1.1.2 maekawa 47: /*== Function declarations =================================================*/
1.1 maekawa 48:
1.1.1.2 maekawa 49: static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
1.1.1.3 ! ohara 50: mp_ptr, mp_ptr, mp_ptr,
! 51: mp_srcptr, mp_srcptr, mp_srcptr,
! 52: mp_size_t, mp_size_t));
1.1.1.2 maekawa 53: static void interpolate3 _PROTO ((mp_srcptr,
1.1.1.3 ! ohara 54: mp_ptr, mp_ptr, mp_ptr,
! 55: mp_srcptr,
! 56: mp_ptr, mp_ptr, mp_ptr,
! 57: mp_size_t, mp_size_t));
1.1.1.2 maekawa 58: static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
59:
60:
61: /*-- mpn_kara_mul_n ---------------------------------------------------------------*/
62:
63: /* Multiplies using 3 half-sized mults and so on recursively.
64: * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
65: * No overlap of p[...] with a[...] or b[...].
66: * ws is workspace.
67: */
1.1 maekawa 68:
69: void
1.1.1.2 maekawa 70: mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
1.1 maekawa 71: {
1.1.1.3 ! ohara 72: mp_limb_t w, w0, w1;
1.1.1.2 maekawa 73: mp_size_t n2;
74: mp_srcptr x, y;
1.1.1.3 ! ohara 75: mp_size_t i;
! 76: int sign;
1.1 maekawa 77:
1.1.1.2 maekawa 78: n2 = n >> 1;
79: ASSERT (n2 > 0);
80:
1.1.1.3 ! ohara 81: if ((n & 1) != 0)
1.1 maekawa 82: {
1.1.1.2 maekawa 83: /* Odd length. */
84: mp_size_t n1, n3, nm1;
85:
86: n3 = n - n2;
87:
88: sign = 0;
89: w = a[n2];
90: if (w != 0)
91: w -= mpn_sub_n (p, a, a + n3, n2);
92: else
93: {
94: i = n2;
95: do
96: {
97: --i;
98: w0 = a[i];
1.1.1.3 ! ohara 99: w1 = a[n3 + i];
1.1.1.2 maekawa 100: }
101: while (w0 == w1 && i != 0);
102: if (w0 < w1)
103: {
104: x = a + n3;
105: y = a;
1.1.1.3 ! ohara 106: sign = ~0;
1.1.1.2 maekawa 107: }
108: else
109: {
110: x = a;
111: y = a + n3;
112: }
113: mpn_sub_n (p, x, y, n2);
114: }
115: p[n2] = w;
116:
117: w = b[n2];
118: if (w != 0)
119: w -= mpn_sub_n (p + n3, b, b + n3, n2);
120: else
121: {
122: i = n2;
1.1.1.3 ! ohara 123: do
1.1.1.2 maekawa 124: {
125: --i;
1.1.1.3 ! ohara 126: w0 = b[i];
! 127: w1 = b[n3 + i];
1.1.1.2 maekawa 128: }
129: while (w0 == w1 && i != 0);
130: if (w0 < w1)
131: {
132: x = b + n3;
133: y = b;
1.1.1.3 ! ohara 134: sign = ~sign;
1.1.1.2 maekawa 135: }
136: else
137: {
138: x = b;
139: y = b + n3;
140: }
141: mpn_sub_n (p + n3, x, y, n2);
142: }
143: p[n] = w;
144:
145: n1 = n + 1;
1.1.1.3 ! ohara 146: if (n2 < MUL_KARATSUBA_THRESHOLD)
1.1.1.2 maekawa 147: {
1.1.1.3 ! ohara 148: if (n3 < MUL_KARATSUBA_THRESHOLD)
1.1.1.2 maekawa 149: {
150: mpn_mul_basecase (ws, p, n3, p + n3, n3);
151: mpn_mul_basecase (p, a, n3, b, n3);
152: }
153: else
154: {
155: mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
156: mpn_kara_mul_n (p, a, b, n3, ws + n1);
157: }
158: mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
159: }
160: else
161: {
162: mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
163: mpn_kara_mul_n (p, a, b, n3, ws + n1);
164: mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
165: }
166:
167: if (sign)
168: mpn_add_n (ws, p, ws, n1);
1.1 maekawa 169: else
1.1.1.2 maekawa 170: mpn_sub_n (ws, p, ws, n1);
171:
172: nm1 = n - 1;
173: if (mpn_add_n (ws, p + n1, ws, nm1))
174: {
1.1.1.3 ! ohara 175: mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
1.1.1.2 maekawa 176: ws[nm1] = x;
177: if (x == 0)
1.1.1.3 ! ohara 178: ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
1.1.1.2 maekawa 179: }
180: if (mpn_add_n (p + n3, p + n3, ws, n1))
181: {
1.1.1.3 ! ohara 182: mpn_incr_u (p + n1 + n3, 1);
1.1.1.2 maekawa 183: }
1.1 maekawa 184: }
185: else
1.1.1.2 maekawa 186: {
187: /* Even length. */
188: i = n2;
189: do
190: {
191: --i;
192: w0 = a[i];
1.1.1.3 ! ohara 193: w1 = a[n2 + i];
1.1.1.2 maekawa 194: }
195: while (w0 == w1 && i != 0);
196: sign = 0;
197: if (w0 < w1)
198: {
199: x = a + n2;
200: y = a;
1.1.1.3 ! ohara 201: sign = ~0;
1.1.1.2 maekawa 202: }
203: else
204: {
205: x = a;
206: y = a + n2;
207: }
208: mpn_sub_n (p, x, y, n2);
1.1 maekawa 209:
1.1.1.2 maekawa 210: i = n2;
1.1.1.3 ! ohara 211: do
1.1.1.2 maekawa 212: {
213: --i;
214: w0 = b[i];
1.1.1.3 ! ohara 215: w1 = b[n2 + i];
1.1.1.2 maekawa 216: }
217: while (w0 == w1 && i != 0);
218: if (w0 < w1)
219: {
220: x = b + n2;
221: y = b;
1.1.1.3 ! ohara 222: sign = ~sign;
1.1.1.2 maekawa 223: }
224: else
225: {
226: x = b;
227: y = b + n2;
228: }
229: mpn_sub_n (p + n2, x, y, n2);
1.1 maekawa 230:
1.1.1.2 maekawa 231: /* Pointwise products. */
1.1.1.3 ! ohara 232: if (n2 < MUL_KARATSUBA_THRESHOLD)
1.1 maekawa 233: {
1.1.1.2 maekawa 234: mpn_mul_basecase (ws, p, n2, p + n2, n2);
235: mpn_mul_basecase (p, a, n2, b, n2);
236: mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
1.1 maekawa 237: }
238: else
1.1.1.2 maekawa 239: {
240: mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
241: mpn_kara_mul_n (p, a, b, n2, ws + n);
242: mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
243: }
1.1 maekawa 244:
1.1.1.2 maekawa 245: /* Interpolate. */
246: if (sign)
247: w = mpn_add_n (ws, p, ws, n);
248: else
249: w = -mpn_sub_n (ws, p, ws, n);
250: w += mpn_add_n (ws, p + n, ws, n);
251: w += mpn_add_n (p + n2, p + n2, ws, n);
1.1.1.3 ! ohara 252: MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
1.1 maekawa 253: }
254: }
255:
256: void
1.1.1.2 maekawa 257: mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
258: {
1.1.1.3 ! ohara 259: mp_limb_t w, w0, w1;
1.1.1.2 maekawa 260: mp_size_t n2;
261: mp_srcptr x, y;
1.1.1.3 ! ohara 262: mp_size_t i;
1.1 maekawa 263:
1.1.1.2 maekawa 264: n2 = n >> 1;
265: ASSERT (n2 > 0);
266:
1.1.1.3 ! ohara 267: if ((n & 1) != 0)
1.1 maekawa 268: {
1.1.1.2 maekawa 269: /* Odd length. */
270: mp_size_t n1, n3, nm1;
1.1 maekawa 271:
1.1.1.2 maekawa 272: n3 = n - n2;
1.1 maekawa 273:
1.1.1.2 maekawa 274: w = a[n2];
275: if (w != 0)
276: w -= mpn_sub_n (p, a, a + n3, n2);
277: else
278: {
279: i = n2;
280: do
281: {
282: --i;
283: w0 = a[i];
1.1.1.3 ! ohara 284: w1 = a[n3 + i];
1.1.1.2 maekawa 285: }
286: while (w0 == w1 && i != 0);
287: if (w0 < w1)
288: {
289: x = a + n3;
290: y = a;
291: }
292: else
293: {
294: x = a;
295: y = a + n3;
296: }
297: mpn_sub_n (p, x, y, n2);
298: }
299: p[n2] = w;
1.1 maekawa 300:
1.1.1.3 ! ohara 301: n1 = n + 1;
! 302:
! 303: /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
! 304: could be combined. But that's not important, since the tests will
! 305: take a miniscule amount of time compared to the function calls. */
! 306: if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
1.1.1.2 maekawa 307: {
1.1.1.3 ! ohara 308: mpn_mul_basecase (ws, p, n3, p, n3);
! 309: mpn_mul_basecase (p, a, n3, a, n3);
1.1.1.2 maekawa 310: }
1.1.1.3 ! ohara 311: else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
1.1.1.2 maekawa 312: {
1.1.1.3 ! ohara 313: mpn_sqr_basecase (ws, p, n3);
! 314: mpn_sqr_basecase (p, a, n3);
1.1.1.2 maekawa 315: }
316: else
317: {
1.1.1.3 ! ohara 318: mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */
! 319: mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */
1.1.1.2 maekawa 320: }
1.1.1.3 ! ohara 321: if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
! 322: mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
! 323: else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
! 324: mpn_sqr_basecase (p + n1, a + n3, n2);
1.1.1.2 maekawa 325: else
1.1.1.3 ! ohara 326: mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */
! 327:
! 328: /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
! 329: borrow from mpn_sub_n. If it occurs then it'll be cancelled by a
! 330: carry from ws[n]. Further, since 2xy fits in n1 limbs there won't
! 331: be any carry out of ws[n] other than cancelling that borrow. */
! 332:
! 333: mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */
1.1 maekawa 334:
1.1.1.2 maekawa 335: nm1 = n - 1;
1.1.1.3 ! ohara 336: if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */
1.1.1.2 maekawa 337: {
1.1.1.3 ! ohara 338: mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
1.1.1.2 maekawa 339: ws[nm1] = x;
340: if (x == 0)
1.1.1.3 ! ohara 341: ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
1.1.1.2 maekawa 342: }
343: if (mpn_add_n (p + n3, p + n3, ws, n1))
344: {
1.1.1.3 ! ohara 345: mpn_incr_u (p + n1 + n3, 1);
1.1.1.2 maekawa 346: }
347: }
348: else
349: {
350: /* Even length. */
351: i = n2;
352: do
353: {
354: --i;
355: w0 = a[i];
1.1.1.3 ! ohara 356: w1 = a[n2 + i];
1.1.1.2 maekawa 357: }
358: while (w0 == w1 && i != 0);
359: if (w0 < w1)
1.1 maekawa 360: {
1.1.1.2 maekawa 361: x = a + n2;
362: y = a;
1.1 maekawa 363: }
364: else
365: {
1.1.1.2 maekawa 366: x = a;
367: y = a + n2;
368: }
369: mpn_sub_n (p, x, y, n2);
370:
1.1.1.3 ! ohara 371: /* Pointwise products. */
! 372: if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
1.1 maekawa 373: {
1.1.1.3 ! ohara 374: mpn_mul_basecase (ws, p, n2, p, n2);
! 375: mpn_mul_basecase (p, a, n2, a, n2);
! 376: mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
1.1 maekawa 377: }
1.1.1.3 ! ohara 378: else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
1.1 maekawa 379: {
1.1.1.3 ! ohara 380: mpn_sqr_basecase (ws, p, n2);
! 381: mpn_sqr_basecase (p, a, n2);
1.1.1.2 maekawa 382: mpn_sqr_basecase (p + n, a + n2, n2);
383: }
384: else
385: {
1.1.1.3 ! ohara 386: mpn_kara_sqr_n (ws, p, n2, ws + n);
! 387: mpn_kara_sqr_n (p, a, n2, ws + n);
1.1.1.2 maekawa 388: mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
389: }
1.1 maekawa 390:
1.1.1.2 maekawa 391: /* Interpolate. */
1.1.1.3 ! ohara 392: w = -mpn_sub_n (ws, p, ws, n);
1.1.1.2 maekawa 393: w += mpn_add_n (ws, p + n, ws, n);
394: w += mpn_add_n (p + n2, p + n2, ws, n);
1.1.1.3 ! ohara 395: MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
1.1.1.2 maekawa 396: }
397: }
1.1 maekawa 398:
1.1.1.2 maekawa 399: /*-- add2Times -------------------------------------------------------------*/
1.1 maekawa 400:
1.1.1.2 maekawa 401: /* z[] = x[] + 2 * y[]
1.1.1.3 ! ohara 402: Note that z and x might point to the same vectors.
! 403: FIXME: gcc won't inline this because it uses alloca. */
! 404: #if USE_MORE_MPN
! 405:
1.1.1.2 maekawa 406: static inline mp_limb_t
407: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
408: {
409: mp_ptr t;
410: mp_limb_t c;
411: TMP_DECL (marker);
412: TMP_MARK (marker);
413: t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
414: c = mpn_lshift (t, y, n, 1);
415: c += mpn_add_n (z, x, t, n);
416: TMP_FREE (marker);
417: return c;
418: }
1.1.1.3 ! ohara 419:
1.1.1.2 maekawa 420: #else
1.1 maekawa 421:
1.1.1.2 maekawa 422: static mp_limb_t
423: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
424: {
425: mp_limb_t c, v, w;
1.1 maekawa 426:
1.1.1.2 maekawa 427: ASSERT (n > 0);
1.1.1.3 ! ohara 428: v = *x;
! 429: w = *y;
1.1.1.2 maekawa 430: c = w >> (BITS_PER_MP_LIMB - 1);
431: w <<= 1;
432: v += w;
433: c += v < w;
434: *z = v;
435: ++x; ++y; ++z;
436: while (--n)
437: {
438: v = *x;
439: w = *y;
440: v += c;
441: c = v < c;
442: c += w >> (BITS_PER_MP_LIMB - 1);
443: w <<= 1;
444: v += w;
445: c += v < w;
446: *z = v;
447: ++x; ++y; ++z;
1.1 maekawa 448: }
1.1.1.2 maekawa 449:
450: return c;
1.1 maekawa 451: }
1.1.1.2 maekawa 452: #endif
1.1 maekawa 453:
1.1.1.2 maekawa 454: /*-- evaluate3 -------------------------------------------------------------*/
455:
456: /* Evaluates:
457: * ph := 4*A+2*B+C
458: * p1 := A+B+C
459: * p2 := A+2*B+4*C
460: * where:
461: * ph[], p1[], p2[], A[] and B[] all have length len,
462: * C[] has length len2 with len-len2 = 0, 1 or 2.
463: * Returns top words (overflow) at pth, pt1 and pt2 respectively.
464: */
1.1.1.3 ! ohara 465: #if USE_MORE_MPN
! 466:
1.1.1.2 maekawa 467: static void
468: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
1.1.1.3 ! ohara 469: mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len,mp_size_t len2)
1.1 maekawa 470: {
1.1.1.2 maekawa 471: mp_limb_t c, d, e;
1.1.1.3 ! ohara 472:
1.1.1.2 maekawa 473: ASSERT (len - len2 <= 2);
474:
475: e = mpn_lshift (p1, B, len, 1);
476:
477: c = mpn_lshift (ph, A, len, 2);
478: c += e + mpn_add_n (ph, ph, p1, len);
479: d = mpn_add_n (ph, ph, C, len2);
1.1.1.3 ! ohara 480: if (len2 == len)
! 481: c += d;
! 482: else
! 483: c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
1.1.1.2 maekawa 484: ASSERT (c < 7);
485: *pth = c;
486:
487: c = mpn_lshift (p2, C, len2, 2);
488: #if 1
1.1.1.3 ! ohara 489: if (len2 != len)
! 490: {
! 491: p2[len-1] = 0;
! 492: p2[len2] = c;
! 493: c = 0;
! 494: }
1.1.1.2 maekawa 495: c += e + mpn_add_n (p2, p2, p1, len);
496: #else
497: d = mpn_add_n (p2, p2, p1, len2);
498: c += d;
1.1.1.3 ! ohara 499: if (len2 != len)
! 500: c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
1.1.1.2 maekawa 501: c += e;
502: #endif
503: c += mpn_add_n (p2, p2, A, len);
504: ASSERT (c < 7);
505: *pt2 = c;
506:
507: c = mpn_add_n (p1, A, B, len);
508: d = mpn_add_n (p1, p1, C, len2);
1.1.1.3 ! ohara 509: if (len2 == len)
! 510: c += d;
! 511: else
! 512: c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
1.1.1.2 maekawa 513: ASSERT (c < 3);
514: *pt1 = c;
515: }
516:
517: #else
1.1 maekawa 518:
1.1.1.2 maekawa 519: static void
520: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
521: mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
522: {
523: mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
524:
525: ASSERT (l - ls <= 2);
1.1 maekawa 526:
1.1.1.2 maekawa 527: th = t1 = t2 = 0;
528: for (i = 0; i < l; ++i)
1.1 maekawa 529: {
1.1.1.2 maekawa 530: a = *A;
531: b = *B;
532: c = i < ls ? *C : 0;
533:
534: /* TO DO: choose one of the following alternatives. */
535: #if 0
536: t = a << 2;
537: vh = th + t;
538: th = vh < t;
539: th += a >> (BITS_PER_MP_LIMB - 2);
540: t = b << 1;
541: vh += t;
542: th += vh < t;
543: th += b >> (BITS_PER_MP_LIMB - 1);
544: vh += c;
545: th += vh < c;
546: #else
547: vh = th + c;
548: th = vh < c;
549: t = b << 1;
550: vh += t;
551: th += vh < t;
552: th += b >> (BITS_PER_MP_LIMB - 1);
553: t = a << 2;
554: vh += t;
555: th += vh < t;
556: th += a >> (BITS_PER_MP_LIMB - 2);
557: #endif
1.1 maekawa 558:
1.1.1.2 maekawa 559: v1 = t1 + a;
560: t1 = v1 < a;
561: v1 += b;
562: t1 += v1 < b;
563: v1 += c;
564: t1 += v1 < c;
565:
566: v2 = t2 + a;
567: t2 = v2 < a;
568: t = b << 1;
569: v2 += t;
570: t2 += v2 < t;
571: t2 += b >> (BITS_PER_MP_LIMB - 1);
572: t = c << 2;
573: v2 += t;
574: t2 += v2 < t;
575: t2 += c >> (BITS_PER_MP_LIMB - 2);
576:
577: *ph = vh;
578: *p1 = v1;
579: *p2 = v2;
580:
581: ++A; ++B; ++C;
582: ++ph; ++p1; ++p2;
1.1 maekawa 583: }
1.1.1.2 maekawa 584:
585: ASSERT (th < 7);
586: ASSERT (t1 < 3);
587: ASSERT (t2 < 7);
588:
589: *pth = th;
590: *pt1 = t1;
591: *pt2 = t2;
1.1 maekawa 592: }
1.1.1.2 maekawa 593: #endif
1.1 maekawa 594:
1.1.1.2 maekawa 595:
596: /*-- interpolate3 ----------------------------------------------------------*/
597:
598: /* Interpolates B, C, D (in-place) from:
599: * 16*A+8*B+4*C+2*D+E
600: * A+B+C+D+E
601: * A+2*B+4*C+8*D+16*E
602: * where:
603: * A[], B[], C[] and D[] all have length l,
604: * E[] has length ls with l-ls = 0, 2 or 4.
605: *
606: * Reads top words (from earlier overflow) from ptb, ptc and ptd,
607: * and returns new top words there.
608: */
609:
1.1.1.3 ! ohara 610: #if USE_MORE_MPN
! 611:
1.1.1.2 maekawa 612: static void
613: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
1.1.1.3 ! ohara 614: mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2)
1.1.1.2 maekawa 615: {
616: mp_ptr ws;
617: mp_limb_t t, tb,tc,td;
618: TMP_DECL (marker);
619: TMP_MARK (marker);
620:
621: ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
622:
623: /* Let x1, x2, x3 be the values to interpolate. We have:
1.1.1.3 ! ohara 624: * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
! 625: * c = a + x1 + x2 + x3 + e
! 626: * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
1.1.1.2 maekawa 627: */
628:
629: ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
630:
631: tb = *ptb; tc = *ptc; td = *ptd;
632:
633:
1.1.1.3 ! ohara 634: /* b := b - 16*a - e
! 635: * c := c - a - e
! 636: * d := d - a - 16*e
1.1.1.2 maekawa 637: */
638:
639: t = mpn_lshift (ws, A, len, 4);
640: tb -= t + mpn_sub_n (B, B, ws, len);
641: t = mpn_sub_n (B, B, E, len2);
1.1.1.3 ! ohara 642: if (len2 == len)
! 643: tb -= t;
! 644: else
! 645: tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
1.1.1.2 maekawa 646:
647: tc -= mpn_sub_n (C, C, A, len);
648: t = mpn_sub_n (C, C, E, len2);
1.1.1.3 ! ohara 649: if (len2 == len)
! 650: tc -= t;
! 651: else
! 652: tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
1.1.1.2 maekawa 653:
654: t = mpn_lshift (ws, E, len2, 4);
655: t += mpn_add_n (ws, ws, A, len2);
656: #if 1
1.1.1.3 ! ohara 657: if (len2 != len)
! 658: t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
1.1.1.2 maekawa 659: td -= t + mpn_sub_n (D, D, ws, len);
660: #else
661: t += mpn_sub_n (D, D, ws, len2);
1.1.1.3 ! ohara 662: if (len2 != len)
! 663: {
! 664: t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
! 665: t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
! 666: }
1.1.1.2 maekawa 667: td -= t;
668: #endif
669:
670:
671: /* b, d := b + d, b - d */
672:
673: #ifdef HAVE_MPN_ADD_SUB_N
674: /* #error TO DO ... */
675: #else
1.1.1.3 ! ohara 676: t = tb + td + mpn_add_n (ws, B, D, len);
! 677: td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK;
1.1.1.2 maekawa 678: tb = t;
679: MPN_COPY (B, ws, len);
680: #endif
1.1.1.3 ! ohara 681:
1.1.1.2 maekawa 682: /* b := b-8*c */
683: t = 8 * tc + mpn_lshift (ws, C, len, 3);
684: tb -= t + mpn_sub_n (B, B, ws, len);
685:
686: /* c := 2*c - b */
687: tc = 2 * tc + mpn_lshift (C, C, len, 1);
688: tc -= tb + mpn_sub_n (C, C, B, len);
689:
690: /* d := d/3 */
1.1.1.3 ! ohara 691: td = ((td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
1.1.1.2 maekawa 692:
693: /* b, d := b + d, b - d */
694: #ifdef HAVE_MPN_ADD_SUB_N
695: /* #error TO DO ... */
1.1 maekawa 696: #else
1.1.1.3 ! ohara 697: t = (tb + td + mpn_add_n (ws, B, D, len)) & GMP_NUMB_MASK;
! 698: td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK;
1.1.1.2 maekawa 699: tb = t;
700: MPN_COPY (B, ws, len);
701: #endif
1.1 maekawa 702:
1.1.1.2 maekawa 703: /* Now:
704: * b = 4*x1
705: * c = 2*x2
706: * d = 4*x3
707: */
708:
709: ASSERT(!(*B & 3));
710: mpn_rshift (B, B, len, 2);
1.1.1.3 ! ohara 711: B[len-1] |= (tb << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK;
! 712: ASSERT((mp_limb_signed_t)tb >= 0);
1.1.1.2 maekawa 713: tb >>= 2;
714:
715: ASSERT(!(*C & 1));
716: mpn_rshift (C, C, len, 1);
1.1.1.3 ! ohara 717: C[len-1] |= (tc << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
! 718: ASSERT((mp_limb_signed_t)tc >= 0);
1.1.1.2 maekawa 719: tc >>= 1;
720:
721: ASSERT(!(*D & 3));
722: mpn_rshift (D, D, len, 2);
1.1.1.3 ! ohara 723: D[len-1] |= (td << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK;
! 724: ASSERT((mp_limb_signed_t)td >= 0);
1.1.1.2 maekawa 725: td >>= 2;
726:
727: #if WANT_ASSERT
728: ASSERT (tb < 2);
729: if (len == len2)
730: {
731: ASSERT (tc < 3);
732: ASSERT (td < 2);
1.1 maekawa 733: }
734: else
735: {
1.1.1.2 maekawa 736: ASSERT (tc < 2);
737: ASSERT (!td);
738: }
739: #endif
1.1 maekawa 740:
1.1.1.2 maekawa 741: *ptb = tb;
742: *ptc = tc;
743: *ptd = td;
1.1 maekawa 744:
1.1.1.2 maekawa 745: TMP_FREE (marker);
746: }
747:
748: #else
749:
750: static void
751: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
752: mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
753: {
754: mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
755: const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
756:
757: #if WANT_ASSERT
758: t = l - ls;
759: ASSERT (t == 0 || t == 2 || t == 4);
760: #endif
761:
762: sb = sc = sd = 0;
763: for (i = 0; i < l; ++i)
764: {
765: mp_limb_t tb, tc, td, tt;
766:
767: a = *A;
768: b = *B;
769: c = *C;
770: d = *D;
771: e = i < ls ? *E : 0;
772:
773: /* Let x1, x2, x3 be the values to interpolate. We have:
774: * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
775: * c = a + x1 + x2 + x3 + e
776: * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
777: */
778:
779: /* b := b - 16*a - e
780: * c := c - a - e
781: * d := d - a - 16*e
782: */
783: t = a << 4;
784: tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
785: b -= t;
786: tb -= b < e;
787: b -= e;
788: tc = -(c < a);
789: c -= a;
790: tc -= c < e;
791: c -= e;
792: td = -(d < a);
793: d -= a;
794: t = e << 4;
795: td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
796: d -= t;
797:
798: /* b, d := b + d, b - d */
799: t = b + d;
800: tt = tb + td + (t < b);
801: td = tb - td - (b < d);
802: d = b - d;
803: b = t;
804: tb = tt;
805:
806: /* b := b-8*c */
807: t = c << 3;
808: tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
809: b -= t;
810:
811: /* c := 2*c - b */
812: t = c << 1;
813: tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
814: c = t - b;
815:
816: /* d := d/3 */
1.1.1.3 ! ohara 817: d *= MODLIMB_INVERSE_3;
1.1.1.2 maekawa 818: td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
1.1.1.3 ! ohara 819: td *= MODLIMB_INVERSE_3;
1.1.1.2 maekawa 820:
821: /* b, d := b + d, b - d */
822: t = b + d;
823: tt = tb + td + (t < b);
824: td = tb - td - (b < d);
825: d = b - d;
826: b = t;
827: tb = tt;
828:
829: /* Now:
830: * b = 4*x1
831: * c = 2*x2
832: * d = 4*x3
833: */
834:
835: /* sb has period 2. */
836: b += sb;
837: tb += b < sb;
838: sb &= maskOffHalf;
839: sb |= sb >> (BITS_PER_MP_LIMB >> 1);
840: sb += tb;
841:
842: /* sc has period 1. */
843: c += sc;
844: tc += c < sc;
845: /* TO DO: choose one of the following alternatives. */
846: #if 1
1.1.1.3 ! ohara 847: sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1);
1.1.1.2 maekawa 848: sc += tc;
849: #else
1.1.1.3 ! ohara 850: sc = tc - ((mp_limb_signed_t) sc < 0L);
1.1.1.2 maekawa 851: #endif
852:
853: /* sd has period 2. */
854: d += sd;
855: td += d < sd;
856: sd &= maskOffHalf;
857: sd |= sd >> (BITS_PER_MP_LIMB >> 1);
858: sd += td;
859:
860: if (i != 0)
1.1 maekawa 861: {
1.1.1.2 maekawa 862: B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
863: C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
864: D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1.1 maekawa 865: }
1.1.1.2 maekawa 866: ob = b >> 2;
867: oc = c >> 1;
868: od = d >> 2;
1.1 maekawa 869:
1.1.1.2 maekawa 870: ++A; ++B; ++C; ++D; ++E;
871: }
1.1 maekawa 872:
1.1.1.2 maekawa 873: /* Handle top words. */
874: b = *ptb;
875: c = *ptc;
876: d = *ptd;
877:
878: t = b + d;
879: d = b - d;
880: b = t;
881: b -= c << 3;
882: c = (c << 1) - b;
1.1.1.3 ! ohara 883: d *= MODLIMB_INVERSE_3;
1.1.1.2 maekawa 884: t = b + d;
885: d = b - d;
886: b = t;
887:
888: b += sb;
889: c += sc;
890: d += sd;
891:
892: B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
893: C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
894: D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
895:
896: b >>= 2;
897: c >>= 1;
898: d >>= 2;
899:
900: #if WANT_ASSERT
901: ASSERT (b < 2);
902: if (l == ls)
903: {
904: ASSERT (c < 3);
905: ASSERT (d < 2);
906: }
907: else
908: {
909: ASSERT (c < 2);
910: ASSERT (!d);
911: }
912: #endif
1.1 maekawa 913:
1.1.1.2 maekawa 914: *ptb = b;
915: *ptc = c;
916: *ptd = d;
917: }
918: #endif
1.1 maekawa 919:
920:
1.1.1.2 maekawa 921: /*-- mpn_toom3_mul_n --------------------------------------------------------------*/
1.1 maekawa 922:
1.1.1.2 maekawa 923: /* Multiplies using 5 mults of one third size and so on recursively.
924: * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
925: * No overlap of p[...] with a[...] or b[...].
926: * ws is workspace.
927: */
928:
1.1.1.3 ! ohara 929: /* TO DO: If MUL_TOOM3_THRESHOLD is much bigger than MUL_KARATSUBA_THRESHOLD then the
! 930: * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
! 931: * because the "n < MUL_KARATSUBA_THRESHOLD" test here will always be false.
1.1.1.2 maekawa 932: */
933:
934: #define TOOM3_MUL_REC(p, a, b, n, ws) \
935: do { \
1.1.1.3 ! ohara 936: if (n < MUL_KARATSUBA_THRESHOLD) \
1.1.1.2 maekawa 937: mpn_mul_basecase (p, a, n, b, n); \
1.1.1.3 ! ohara 938: else if (n < MUL_TOOM3_THRESHOLD) \
1.1.1.2 maekawa 939: mpn_kara_mul_n (p, a, b, n, ws); \
940: else \
941: mpn_toom3_mul_n (p, a, b, n, ws); \
942: } while (0)
1.1 maekawa 943:
1.1.1.2 maekawa 944: void
945: mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
946: {
947: mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
948: mp_limb_t *A,*B,*C,*D,*E, *W;
949: mp_size_t l,l2,l3,l4,l5,ls;
950:
951: /* Break n words into chunks of size l, l and ls.
952: * n = 3*k => l = k, ls = k
953: * n = 3*k+1 => l = k+1, ls = k-1
954: * n = 3*k+2 => l = k+1, ls = k
955: */
956: {
957: mp_limb_t m;
958:
1.1.1.3 ! ohara 959: /* this is probably unnecessarily strict */
! 960: ASSERT (n >= MUL_TOOM3_THRESHOLD);
! 961:
1.1.1.2 maekawa 962: l = ls = n / 3;
963: m = n - l * 3;
964: if (m != 0)
965: ++l;
966: if (m == 1)
967: --ls;
968:
969: l2 = l * 2;
970: l3 = l * 3;
971: l4 = l * 4;
972: l5 = l * 5;
973: A = p;
974: B = ws;
975: C = p + l2;
976: D = ws + l2;
977: E = p + l4;
978: W = ws + l4;
979: }
980:
1.1.1.3 ! ohara 981: ASSERT (l >= 1);
! 982: ASSERT (ls >= 1);
! 983:
1.1.1.2 maekawa 984: /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
985: evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
986: evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
987:
988: /** Second stage: pointwise multiplies. **/
989: TOOM3_MUL_REC(D, C, C + l, l, W);
990: tD = cD*dD;
991: if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
992: if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
993: ASSERT (tD < 49);
994: TOOM3_MUL_REC(C, B, B + l, l, W);
995: tC = cC*dC;
996: /* TO DO: choose one of the following alternatives. */
997: #if 0
998: if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
999: if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
1000: #else
1001: if (cC)
1002: {
1003: if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
1004: else tC += add2Times (C + l, C + l, B + l, l);
1.1 maekawa 1005: }
1.1.1.2 maekawa 1006: if (dC)
1007: {
1008: if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
1009: else tC += add2Times (C + l, C + l, B, l);
1010: }
1011: #endif
1012: ASSERT (tC < 9);
1013: TOOM3_MUL_REC(B, A, A + l, l, W);
1014: tB = cB*dB;
1015: if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
1016: if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
1017: ASSERT (tB < 49);
1018: TOOM3_MUL_REC(A, a, b, l, W);
1019: TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
1020:
1021: /** Third stage: interpolation. **/
1022: interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1023:
1024: /** Final stage: add up the coefficients. **/
1.1.1.3 ! ohara 1025: tB += mpn_add_n (p + l, p + l, B, l2);
! 1026: tD += mpn_add_n (p + l3, p + l3, D, l2);
! 1027: MPN_INCR_U (p + l3, 2 * n - l3, tB);
! 1028: MPN_INCR_U (p + l4, 2 * n - l4, tC);
! 1029: MPN_INCR_U (p + l5, 2 * n - l5, tD);
1.1 maekawa 1030: }
1031:
1.1.1.2 maekawa 1032: /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
1033:
1034: /* Like previous function but for squaring */
1035:
1.1.1.3 ! ohara 1036: /* FIXME: If SQR_TOOM3_THRESHOLD is big enough it might never get into the
! 1037: basecase range. Try to arrange those conditonals go dead. */
! 1038: #define TOOM3_SQR_REC(p, a, n, ws) \
! 1039: do { \
! 1040: if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \
! 1041: mpn_mul_basecase (p, a, n, a, n); \
! 1042: else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \
! 1043: mpn_sqr_basecase (p, a, n); \
! 1044: else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
! 1045: mpn_kara_sqr_n (p, a, n, ws); \
! 1046: else \
! 1047: mpn_toom3_sqr_n (p, a, n, ws); \
1.1.1.2 maekawa 1048: } while (0)
1049:
1050: void
1051: mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1.1 maekawa 1052: {
1.1.1.2 maekawa 1053: mp_limb_t cB,cC,cD, tB,tC,tD;
1054: mp_limb_t *A,*B,*C,*D,*E, *W;
1055: mp_size_t l,l2,l3,l4,l5,ls;
1056:
1057: /* Break n words into chunks of size l, l and ls.
1058: * n = 3*k => l = k, ls = k
1059: * n = 3*k+1 => l = k+1, ls = k-1
1060: * n = 3*k+2 => l = k+1, ls = k
1061: */
1062: {
1063: mp_limb_t m;
1064:
1.1.1.3 ! ohara 1065: /* this is probably unnecessarily strict */
! 1066: ASSERT (n >= SQR_TOOM3_THRESHOLD);
! 1067:
1.1.1.2 maekawa 1068: l = ls = n / 3;
1069: m = n - l * 3;
1070: if (m != 0)
1071: ++l;
1072: if (m == 1)
1073: --ls;
1074:
1075: l2 = l * 2;
1076: l3 = l * 3;
1077: l4 = l * 4;
1078: l5 = l * 5;
1079: A = p;
1080: B = ws;
1081: C = p + l2;
1082: D = ws + l2;
1083: E = p + l4;
1084: W = ws + l4;
1085: }
1086:
1.1.1.3 ! ohara 1087: ASSERT (l >= 1);
! 1088: ASSERT (ls >= 1);
! 1089:
1.1.1.2 maekawa 1090: /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
1091: evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
1092:
1093: /** Second stage: pointwise multiplies. **/
1094: TOOM3_SQR_REC(D, C, l, W);
1095: tD = cD*cD;
1096: if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
1097: ASSERT (tD < 49);
1098: TOOM3_SQR_REC(C, B, l, W);
1099: tC = cC*cC;
1100: /* TO DO: choose one of the following alternatives. */
1101: #if 0
1102: if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
1103: #else
1104: if (cC >= 1)
1.1 maekawa 1105: {
1.1.1.2 maekawa 1106: tC += add2Times (C + l, C + l, B, l);
1107: if (cC == 2)
1.1.1.3 ! ohara 1108: tC += add2Times (C + l, C + l, B, l);
1.1.1.2 maekawa 1109: }
1110: #endif
1111: ASSERT (tC < 9);
1112: TOOM3_SQR_REC(B, A, l, W);
1113: tB = cB*cB;
1114: if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
1115: ASSERT (tB < 49);
1116: TOOM3_SQR_REC(A, a, l, W);
1117: TOOM3_SQR_REC(E, a + l2, ls, W);
1118:
1119: /** Third stage: interpolation. **/
1120: interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
1121:
1122: /** Final stage: add up the coefficients. **/
1.1.1.3 ! ohara 1123: tB += mpn_add_n (p + l, p + l, B, l2);
! 1124: tD += mpn_add_n (p + l3, p + l3, D, l2);
! 1125: MPN_INCR_U (p + l3, 2 * n - l3, tB);
! 1126: MPN_INCR_U (p + l4, 2 * n - l4, tC);
! 1127: MPN_INCR_U (p + l5, 2 * n - l5, tD);
1.1.1.2 maekawa 1128: }
1129:
1130: void
1131: mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
1132: {
1.1.1.3 ! ohara 1133: ASSERT (n >= 1);
! 1134: ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
! 1135: ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));
! 1136:
! 1137: if (n < MUL_KARATSUBA_THRESHOLD)
1.1.1.2 maekawa 1138: mpn_mul_basecase (p, a, n, b, n);
1.1.1.3 ! ohara 1139: else if (n < MUL_TOOM3_THRESHOLD)
1.1.1.2 maekawa 1140: {
1141: /* Allocate workspace of fixed size on stack: fast! */
1142: #if TUNE_PROGRAM_BUILD
1.1.1.3 ! ohara 1143: mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
1.1.1.2 maekawa 1144: #else
1.1.1.3 ! ohara 1145: mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD-1)];
1.1.1.2 maekawa 1146: #endif
1147: mpn_kara_mul_n (p, a, b, n, ws);
1.1 maekawa 1148: }
1.1.1.2 maekawa 1149: #if WANT_FFT || TUNE_PROGRAM_BUILD
1.1.1.3 ! ohara 1150: else if (n < MUL_FFT_THRESHOLD)
1.1.1.2 maekawa 1151: #else
1.1 maekawa 1152: else
1.1.1.2 maekawa 1153: #endif
1.1 maekawa 1154: {
1.1.1.2 maekawa 1155: /* Use workspace of unknown size in heap, as stack space may
1.1.1.3 ! ohara 1156: * be limited. Since n is at least MUL_TOOM3_THRESHOLD, the
1.1.1.2 maekawa 1157: * multiplication will take much longer than malloc()/free(). */
1158: mp_limb_t wsLen, *ws;
1.1.1.3 ! ohara 1159: wsLen = MPN_TOOM3_MUL_N_TSIZE (n);
! 1160: ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen);
1.1.1.2 maekawa 1161: mpn_toom3_mul_n (p, a, b, n, ws);
1.1.1.3 ! ohara 1162: __GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen);
1.1 maekawa 1163: }
1.1.1.2 maekawa 1164: #if WANT_FFT || TUNE_PROGRAM_BUILD
1165: else
1166: {
1.1.1.3 ! ohara 1167: mpn_mul_fft_full (p, a, n, b, n);
1.1.1.2 maekawa 1168: }
1169: #endif
1.1 maekawa 1170: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>