=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/generic/Attic/mul_n.c,v retrieving revision 1.1.1.2 retrieving revision 1.1.1.3 diff -u -p -r1.1.1.2 -r1.1.1.3 --- OpenXM_contrib/gmp/mpn/generic/Attic/mul_n.c 2000/09/09 14:12:26 1.1.1.2 +++ OpenXM_contrib/gmp/mpn/generic/Attic/mul_n.c 2003/08/25 16:06:20 1.1.1.3 @@ -6,8 +6,8 @@ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. -Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software -Foundation, Inc. +Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free +Software Foundation, Inc. This file is part of the GNU MP Library. @@ -31,27 +31,30 @@ MA 02111-1307, USA. */ #include "longlong.h" -/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. - 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ -#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) +#if GMP_NAIL_BITS != 0 +/* The open-coded interpolate3 stuff has not been generalized for nails. */ +#define USE_MORE_MPN 1 +#endif +#ifndef USE_MORE_MPN #if !defined (__alpha) && !defined (__mips) /* For all other machines, we want to call mpn functions for the compund operations instead of open-coding them. */ -#define USE_MORE_MPN +#define USE_MORE_MPN 1 #endif +#endif /*== Function declarations =================================================*/ static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, - mp_ptr, mp_ptr, mp_ptr, - mp_srcptr, mp_srcptr, mp_srcptr, - mp_size_t, mp_size_t)); + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, mp_srcptr, mp_srcptr, + mp_size_t, mp_size_t)); static void interpolate3 _PROTO ((mp_srcptr, - mp_ptr, mp_ptr, mp_ptr, - mp_srcptr, - mp_ptr, mp_ptr, mp_ptr, - mp_size_t, mp_size_t)); + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_size_t, mp_size_t)); static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); @@ -64,25 +67,18 @@ static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, */ void -#if __STDC__ mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) -#else -mpn_kara_mul_n(p, a, b, n, ws) - mp_ptr p; - mp_srcptr a; - mp_srcptr b; - mp_size_t n; - mp_ptr ws; -#endif { - mp_limb_t i, sign, w, w0, w1; + mp_limb_t w, w0, w1; mp_size_t n2; mp_srcptr x, y; + mp_size_t i; + int sign; n2 = n >> 1; ASSERT (n2 > 0); - if (n & 1) + if ((n & 1) != 0) { /* Odd length. 
*/ mp_size_t n1, n3, nm1; @@ -100,14 +96,14 @@ mpn_kara_mul_n(p, a, b, n, ws) { --i; w0 = a[i]; - w1 = a[n3+i]; + w1 = a[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n3; y = a; - sign = 1; + sign = ~0; } else { @@ -124,18 +120,18 @@ mpn_kara_mul_n(p, a, b, n, ws) else { i = n2; - do + do { --i; - w0 = b[i]; - w1 = b[n3+i]; + w0 = b[i]; + w1 = b[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = b + n3; y = b; - sign ^= 1; + sign = ~sign; } else { @@ -147,9 +143,9 @@ mpn_kara_mul_n(p, a, b, n, ws) p[n] = w; n1 = n + 1; - if (n2 < KARATSUBA_MUL_THRESHOLD) + if (n2 < MUL_KARATSUBA_THRESHOLD) { - if (n3 < KARATSUBA_MUL_THRESHOLD) + if (n3 < MUL_KARATSUBA_THRESHOLD) { mpn_mul_basecase (ws, p, n3, p + n3, n3); mpn_mul_basecase (p, a, n3, b, n3); @@ -176,34 +172,25 @@ mpn_kara_mul_n(p, a, b, n, ws) nm1 = n - 1; if (mpn_add_n (ws, p + n1, ws, nm1)) { - mp_limb_t x = ws[nm1] + 1; + mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; ws[nm1] = x; if (x == 0) - ++ws[n]; + ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; } if (mpn_add_n (p + n3, p + n3, ws, n1)) { - mp_limb_t x; - i = n1 + n3; - do - { - x = p[i] + 1; - p[i] = x; - ++i; - } while (x == 0); + mpn_incr_u (p + n1 + n3, 1); } } else { /* Even length. */ - mp_limb_t t; - i = n2; do { --i; w0 = a[i]; - w1 = a[n2+i]; + w1 = a[n2 + i]; } while (w0 == w1 && i != 0); sign = 0; @@ -211,7 +198,7 @@ mpn_kara_mul_n(p, a, b, n, ws) { x = a + n2; y = a; - sign = 1; + sign = ~0; } else { @@ -221,18 +208,18 @@ mpn_kara_mul_n(p, a, b, n, ws) mpn_sub_n (p, x, y, n2); i = n2; - do + do { --i; w0 = b[i]; - w1 = b[n2+i]; + w1 = b[n2 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = b + n2; y = b; - sign ^= 1; + sign = ~sign; } else { @@ -242,7 +229,7 @@ mpn_kara_mul_n(p, a, b, n, ws) mpn_sub_n (p + n2, x, y, n2); /* Pointwise products. */ - if (n2 < KARATSUBA_MUL_THRESHOLD) + if (n2 < MUL_KARATSUBA_THRESHOLD) { mpn_mul_basecase (ws, p, n2, p + n2, n2); mpn_mul_basecase (p, a, n2, b, n2); @@ -262,52 +249,28 @@ mpn_kara_mul_n(p, a, b, n, ws) w = -mpn_sub_n (ws, p, ws, n); w += mpn_add_n (ws, p + n, ws, n); w += mpn_add_n (p + n2, p + n2, ws, n); - /* TO DO: could put "if (w) { ... }" here. - * Less work but badly predicted branch. - * No measurable difference in speed on Alpha. - */ - i = n + n2; - t = p[i] + w; - p[i] = t; - if (t < w) - { - do - { - ++i; - w = p[i] + 1; - p[i] = w; - } - while (w == 0); - } + MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); } } void -#if __STDC__ mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) -#else -mpn_kara_sqr_n (p, a, n, ws) - mp_ptr p; - mp_srcptr a; - mp_size_t n; - mp_ptr ws; -#endif { - mp_limb_t i, sign, w, w0, w1; + mp_limb_t w, w0, w1; mp_size_t n2; mp_srcptr x, y; + mp_size_t i; n2 = n >> 1; ASSERT (n2 > 0); - if (n & 1) + if ((n & 1) != 0) { /* Odd length. */ mp_size_t n1, n3, nm1; n3 = n - n2; - sign = 0; w = a[n2]; if (w != 0) w -= mpn_sub_n (p, a, a + n3, n2); @@ -318,14 +281,13 @@ mpn_kara_sqr_n (p, a, n, ws) { --i; w0 = a[i]; - w1 = a[n3+i]; + w1 = a[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n3; y = a; - sign = 1; } else { @@ -336,100 +298,68 @@ mpn_kara_sqr_n (p, a, n, ws) } p[n2] = w; - w = a[n2]; - if (w != 0) - w -= mpn_sub_n (p + n3, a, a + n3, n2); - else + n1 = n + 1; + + /* n2 is always either n3 or n3-1 so maybe the two sets of tests here + could be combined. But that's not important, since the tests will + take a miniscule amount of time compared to the function calls. 
*/ + if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD)) { - i = n2; - do - { - --i; - w0 = a[i]; - w1 = a[n3+i]; - } - while (w0 == w1 && i != 0); - if (w0 < w1) - { - x = a + n3; - y = a; - sign ^= 1; - } - else - { - x = a; - y = a + n3; - } - mpn_sub_n (p + n3, x, y, n2); + mpn_mul_basecase (ws, p, n3, p, n3); + mpn_mul_basecase (p, a, n3, a, n3); } - p[n] = w; - - n1 = n + 1; - if (n2 < KARATSUBA_SQR_THRESHOLD) + else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD)) { - if (n3 < KARATSUBA_SQR_THRESHOLD) - { - mpn_sqr_basecase (ws, p, n3); - mpn_sqr_basecase (p, a, n3); - } - else - { - mpn_kara_sqr_n (ws, p, n3, ws + n1); - mpn_kara_sqr_n (p, a, n3, ws + n1); - } - mpn_sqr_basecase (p + n1, a + n3, n2); + mpn_sqr_basecase (ws, p, n3); + mpn_sqr_basecase (p, a, n3); } else { - mpn_kara_sqr_n (ws, p, n3, ws + n1); - mpn_kara_sqr_n (p, a, n3, ws + n1); - mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); + mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */ + mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */ } - - if (sign) - mpn_add_n (ws, p, ws, n1); + if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) + mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2); + else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) + mpn_sqr_basecase (p + n1, a + n3, n2); else - mpn_sub_n (ws, p, ws, n1); + mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */ + /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the + borrow from mpn_sub_n. If it occurs then it'll be cancelled by a + carry from ws[n]. Further, since 2xy fits in n1 limbs there won't + be any carry out of ws[n] other than cancelling that borrow. */ + + mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */ + nm1 = n - 1; - if (mpn_add_n (ws, p + n1, ws, nm1)) + if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */ { - mp_limb_t x = ws[nm1] + 1; + mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; ws[nm1] = x; if (x == 0) - ++ws[n]; + ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; } if (mpn_add_n (p + n3, p + n3, ws, n1)) { - mp_limb_t x; - i = n1 + n3; - do - { - x = p[i] + 1; - p[i] = x; - ++i; - } while (x == 0); + mpn_incr_u (p + n1 + n3, 1); } } else { /* Even length. */ - mp_limb_t t; - i = n2; do { --i; w0 = a[i]; - w1 = a[n2+i]; + w1 = a[n2 + i]; } while (w0 == w1 && i != 0); - sign = 0; if (w0 < w1) { x = a + n2; y = a; - sign = 1; } else { @@ -438,83 +368,43 @@ mpn_kara_sqr_n (p, a, n, ws) } mpn_sub_n (p, x, y, n2); - i = n2; - do + /* Pointwise products. */ + if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) { - --i; - w0 = a[i]; - w1 = a[n2+i]; + mpn_mul_basecase (ws, p, n2, p, n2); + mpn_mul_basecase (p, a, n2, a, n2); + mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2); } - while (w0 == w1 && i != 0); - if (w0 < w1) + else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) { - x = a + n2; - y = a; - sign ^= 1; - } - else - { - x = a; - y = a + n2; - } - mpn_sub_n (p + n2, x, y, n2); - - /* Pointwise products. */ - if (n2 < KARATSUBA_SQR_THRESHOLD) - { - mpn_sqr_basecase (ws, p, n2); - mpn_sqr_basecase (p, a, n2); + mpn_sqr_basecase (ws, p, n2); + mpn_sqr_basecase (p, a, n2); mpn_sqr_basecase (p + n, a + n2, n2); } else { - mpn_kara_sqr_n (ws, p, n2, ws + n); - mpn_kara_sqr_n (p, a, n2, ws + n); + mpn_kara_sqr_n (ws, p, n2, ws + n); + mpn_kara_sqr_n (p, a, n2, ws + n); mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); } /* Interpolate. 
*/ - if (sign) - w = mpn_add_n (ws, p, ws, n); - else - w = -mpn_sub_n (ws, p, ws, n); + w = -mpn_sub_n (ws, p, ws, n); w += mpn_add_n (ws, p + n, ws, n); w += mpn_add_n (p + n2, p + n2, ws, n); - /* TO DO: could put "if (w) { ... }" here. - * Less work but badly predicted branch. - * No measurable difference in speed on Alpha. - */ - i = n + n2; - t = p[i] + w; - p[i] = t; - if (t < w) - { - do - { - ++i; - w = p[i] + 1; - p[i] = w; - } - while (w == 0); - } + MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); } } /*-- add2Times -------------------------------------------------------------*/ /* z[] = x[] + 2 * y[] - Note that z and x might point to the same vectors. */ -#ifdef USE_MORE_MPN + Note that z and x might point to the same vectors. + FIXME: gcc won't inline this because it uses alloca. */ +#if USE_MORE_MPN + static inline mp_limb_t -#if __STDC__ add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) -#else -add2Times (z, x, y, n) - mp_ptr z; - mp_srcptr x; - mp_srcptr y; - mp_size_t n; -#endif { mp_ptr t; mp_limb_t c; @@ -526,23 +416,17 @@ add2Times (z, x, y, n) TMP_FREE (marker); return c; } + #else static mp_limb_t -#if __STDC__ add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) -#else -add2Times (z, x, y, n) - mp_ptr z; - mp_srcptr x; - mp_srcptr y; - mp_size_t n; -#endif { mp_limb_t c, v, w; ASSERT (n > 0); - v = *x; w = *y; + v = *x; + w = *y; c = w >> (BITS_PER_MP_LIMB - 1); w <<= 1; v += w; @@ -578,29 +462,14 @@ add2Times (z, x, y, n) * C[] has length len2 with len-len2 = 0, 1 or 2. * Returns top words (overflow) at pth, pt1 and pt2 respectively. */ -#ifdef USE_MORE_MPN +#if USE_MORE_MPN + static void -#if __STDC__ evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, - mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) -#else -evaluate3 (ph, p1, p2, pth, pt1, pt2, - A, B, C, len, len2) - mp_ptr ph; - mp_ptr p1; - mp_ptr p2; - mp_ptr pth; - mp_ptr pt1; - mp_ptr pt2; - mp_srcptr A; - mp_srcptr B; - mp_srcptr C; - mp_size_t len; - mp_size_t len2; -#endif + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len,mp_size_t len2) { mp_limb_t c, d, e; - + ASSERT (len - len2 <= 2); e = mpn_lshift (p1, B, len, 1); @@ -608,18 +477,27 @@ evaluate3 (ph, p1, p2, pth, pt1, pt2, c = mpn_lshift (ph, A, len, 2); c += e + mpn_add_n (ph, ph, p1, len); d = mpn_add_n (ph, ph, C, len2); - if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); + if (len2 == len) + c += d; + else + c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); ASSERT (c < 7); *pth = c; c = mpn_lshift (p2, C, len2, 2); #if 1 - if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } + if (len2 != len) + { + p2[len-1] = 0; + p2[len2] = c; + c = 0; + } c += e + mpn_add_n (p2, p2, p1, len); #else d = mpn_add_n (p2, p2, p1, len2); c += d; - if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); + if (len2 != len) + c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); c += e; #endif c += mpn_add_n (p2, p2, A, len); @@ -628,34 +506,19 @@ evaluate3 (ph, p1, p2, pth, pt1, pt2, c = mpn_add_n (p1, A, B, len); d = mpn_add_n (p1, p1, C, len2); - if (len2 == len) c += d; - else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); + if (len2 == len) + c += d; + else + c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); ASSERT (c < 3); *pt1 = c; - } #else static void -#if __STDC__ evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) -#else -evaluate3 (ph, p1, p2, pth, pt1, 
pt2, - A, B, C, l, ls) - mp_ptr ph; - mp_ptr p1; - mp_ptr p2; - mp_ptr pth; - mp_ptr pt1; - mp_ptr pt2; - mp_srcptr A; - mp_srcptr B; - mp_srcptr C; - mp_size_t l; - mp_size_t ls; -#endif { mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; @@ -744,25 +607,11 @@ evaluate3 (ph, p1, p2, pth, pt1, pt2, * and returns new top words there. */ -#ifdef USE_MORE_MPN +#if USE_MORE_MPN + static void -#if __STDC__ interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, - mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) -#else -interpolate3 (A, B, C, D, E, - ptb, ptc, ptd, len, len2) - mp_srcptr A; - mp_ptr B; - mp_ptr C; - mp_ptr D; - mp_srcptr E; - mp_ptr ptb; - mp_ptr ptc; - mp_ptr ptd; - mp_size_t len; - mp_size_t len2; -#endif + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2) { mp_ptr ws; mp_limb_t t, tb,tc,td; @@ -772,9 +621,9 @@ interpolate3 (A, B, C, D, E, ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); /* Let x1, x2, x3 be the values to interpolate. We have: - * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e - * c = a + x1 + x2 + x3 + e - * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e */ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); @@ -782,33 +631,39 @@ interpolate3 (A, B, C, D, E, tb = *ptb; tc = *ptc; td = *ptd; - /* b := b - 16*a - e - * c := c - a - e - * d := d - a - 16*e + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e */ t = mpn_lshift (ws, A, len, 4); tb -= t + mpn_sub_n (B, B, ws, len); t = mpn_sub_n (B, B, E, len2); - if (len2 == len) tb -= t; - else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); + if (len2 == len) + tb -= t; + else + tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); tc -= mpn_sub_n (C, C, A, len); t = mpn_sub_n (C, C, E, len2); - if (len2 == len) tc -= t; - else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); + if (len2 == len) + tc -= t; + else + tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); t = mpn_lshift (ws, E, len2, 4); t += mpn_add_n (ws, ws, A, len2); #if 1 - if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); + if (len2 != len) + t = mpn_add_1 (ws+len2, A+len2, len-len2, t); td -= t + mpn_sub_n (D, D, ws, len); #else t += mpn_sub_n (D, D, ws, len2); - if (len2 != len) { - t = mpn_sub_1 (D+len2, D+len2, len-len2, t); - t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); - } /* end if/else */ + if (len2 != len) + { + t = mpn_sub_1 (D+len2, D+len2, len-len2, t); + t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); + } td -= t; #endif @@ -818,12 +673,12 @@ interpolate3 (A, B, C, D, E, #ifdef HAVE_MPN_ADD_SUB_N /* #error TO DO ... */ #else - t = tb + td + mpn_add_n (ws, B, D, len); - td = tb - td - mpn_sub_n (D, B, D, len); + t = tb + td + mpn_add_n (ws, B, D, len); + td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; tb = t; MPN_COPY (B, ws, len); #endif - + /* b := b-8*c */ t = 8 * tc + mpn_lshift (ws, C, len, 3); tb -= t + mpn_sub_n (B, B, ws, len); @@ -833,14 +688,14 @@ interpolate3 (A, B, C, D, E, tc -= tb + mpn_sub_n (C, C, B, len); /* d := d/3 */ - td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; + td = ((td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; /* b, d := b + d, b - d */ #ifdef HAVE_MPN_ADD_SUB_N /* #error TO DO ... 
*/ #else - t = tb + td + mpn_add_n (ws, B, D, len); - td = tb - td - mpn_sub_n (D, B, D, len); + t = (tb + td + mpn_add_n (ws, B, D, len)) & GMP_NUMB_MASK; + td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; tb = t; MPN_COPY (B, ws, len); #endif @@ -853,20 +708,20 @@ interpolate3 (A, B, C, D, E, ASSERT(!(*B & 3)); mpn_rshift (B, B, len, 2); - B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); - ASSERT((long)tb >= 0); + B[len-1] |= (tb << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; + ASSERT((mp_limb_signed_t)tb >= 0); tb >>= 2; ASSERT(!(*C & 1)); mpn_rshift (C, C, len, 1); - C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); - ASSERT((long)tc >= 0); + C[len-1] |= (tc << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; + ASSERT((mp_limb_signed_t)tc >= 0); tc >>= 1; ASSERT(!(*D & 3)); mpn_rshift (D, D, len, 2); - D[len-1] |= td<<(BITS_PER_MP_LIMB-2); - ASSERT((long)td >= 0); + D[len-1] |= (td << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; + ASSERT((mp_limb_signed_t)td >= 0); td >>= 2; #if WANT_ASSERT @@ -893,23 +748,8 @@ interpolate3 (A, B, C, D, E, #else static void -#if __STDC__ interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) -#else -interpolate3 (A, B, C, D, E, - ptb, ptc, ptd, l, ls) - mp_srcptr A; - mp_ptr B; - mp_ptr C; - mp_ptr D; - mp_srcptr E; - mp_ptr ptb; - mp_ptr ptc; - mp_ptr ptd; - mp_size_t l; - mp_size_t ls; -#endif { mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); @@ -974,9 +814,9 @@ interpolate3 (A, B, C, D, E, c = t - b; /* d := d/3 */ - d *= INVERSE_3; + d *= MODLIMB_INVERSE_3; td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); - td *= INVERSE_3; + td *= MODLIMB_INVERSE_3; /* b, d := b + d, b - d */ t = b + d; @@ -1004,10 +844,10 @@ interpolate3 (A, B, C, D, E, tc += c < sc; /* TO DO: choose one of the following alternatives. */ #if 1 - sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); + sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1); sc += tc; #else - sc = tc - ((long)sc < 0L); + sc = tc - ((mp_limb_signed_t) sc < 0L); #endif /* sd has period 2. */ @@ -1040,7 +880,7 @@ interpolate3 (A, B, C, D, E, b = t; b -= c << 3; c = (c << 1) - b; - d *= INVERSE_3; + d *= MODLIMB_INVERSE_3; t = b + d; d = b - d; b = t; @@ -1086,32 +926,23 @@ interpolate3 (A, B, C, D, E, * ws is workspace. */ -/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the - * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() - * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. +/* TO DO: If MUL_TOOM3_THRESHOLD is much bigger than MUL_KARATSUBA_THRESHOLD then the + * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() + * because the "n < MUL_KARATSUBA_THRESHOLD" test here will always be false. 
*/ #define TOOM3_MUL_REC(p, a, b, n, ws) \ do { \ - if (n < KARATSUBA_MUL_THRESHOLD) \ + if (n < MUL_KARATSUBA_THRESHOLD) \ mpn_mul_basecase (p, a, n, b, n); \ - else if (n < TOOM3_MUL_THRESHOLD) \ + else if (n < MUL_TOOM3_THRESHOLD) \ mpn_kara_mul_n (p, a, b, n, ws); \ else \ mpn_toom3_mul_n (p, a, b, n, ws); \ } while (0) void -#if __STDC__ mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) -#else -mpn_toom3_mul_n (p, a, b, n, ws) - mp_ptr p; - mp_srcptr a; - mp_srcptr b; - mp_size_t n; - mp_ptr ws; -#endif { mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; mp_limb_t *A,*B,*C,*D,*E, *W; @@ -1125,7 +956,9 @@ mpn_toom3_mul_n (p, a, b, n, ws) { mp_limb_t m; - ASSERT (n >= TOOM3_MUL_THRESHOLD); + /* this is probably unnecessarily strict */ + ASSERT (n >= MUL_TOOM3_THRESHOLD); + l = ls = n / 3; m = n - l * 3; if (m != 0) @@ -1145,6 +978,9 @@ mpn_toom3_mul_n (p, a, b, n, ws) W = ws + l4; } + ASSERT (l >= 1); + ASSERT (ls >= 1); + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); @@ -1186,40 +1022,33 @@ mpn_toom3_mul_n (p, a, b, n, ws) interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); /** Final stage: add up the coefficients. **/ - { - mp_limb_t i, x, y; - tB += mpn_add_n (p + l, p + l, B, l2); - tD += mpn_add_n (p + l3, p + l3, D, l2); - mpn_incr_u (p + l3, tB); - mpn_incr_u (p + l4, tC); - mpn_incr_u (p + l5, tD); - } + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + MPN_INCR_U (p + l3, 2 * n - l3, tB); + MPN_INCR_U (p + l4, 2 * n - l4, tC); + MPN_INCR_U (p + l5, 2 * n - l5, tD); } /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ /* Like previous function but for squaring */ -#define TOOM3_SQR_REC(p, a, n, ws) \ - do { \ - if (n < KARATSUBA_SQR_THRESHOLD) \ - mpn_sqr_basecase (p, a, n); \ - else if (n < TOOM3_SQR_THRESHOLD) \ - mpn_kara_sqr_n (p, a, n, ws); \ - else \ - mpn_toom3_sqr_n (p, a, n, ws); \ +/* FIXME: If SQR_TOOM3_THRESHOLD is big enough it might never get into the + basecase range. Try to arrange those conditonals go dead. */ +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, a, n); \ + else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ + mpn_kara_sqr_n (p, a, n, ws); \ + else \ + mpn_toom3_sqr_n (p, a, n, ws); \ } while (0) void -#if __STDC__ mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) -#else -mpn_toom3_sqr_n (p, a, n, ws) - mp_ptr p; - mp_srcptr a; - mp_size_t n; - mp_ptr ws; -#endif { mp_limb_t cB,cC,cD, tB,tC,tD; mp_limb_t *A,*B,*C,*D,*E, *W; @@ -1233,7 +1062,9 @@ mpn_toom3_sqr_n (p, a, n, ws) { mp_limb_t m; - ASSERT (n >= TOOM3_MUL_THRESHOLD); + /* this is probably unnecessarily strict */ + ASSERT (n >= SQR_TOOM3_THRESHOLD); + l = ls = n / 3; m = n - l * 3; if (m != 0) @@ -1253,6 +1084,9 @@ mpn_toom3_sqr_n (p, a, n, ws) W = ws + l4; } + ASSERT (l >= 1); + ASSERT (ls >= 1); + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. 
**/ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); @@ -1271,7 +1105,7 @@ mpn_toom3_sqr_n (p, a, n, ws) { tC += add2Times (C + l, C + l, B, l); if (cC == 2) - tC += add2Times (C + l, C + l, B, l); + tC += add2Times (C + l, C + l, B, l); } #endif ASSERT (tC < 9); @@ -1286,58 +1120,51 @@ mpn_toom3_sqr_n (p, a, n, ws) interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); /** Final stage: add up the coefficients. **/ - { - mp_limb_t i, x, y; - tB += mpn_add_n (p + l, p + l, B, l2); - tD += mpn_add_n (p + l3, p + l3, D, l2); - mpn_incr_u (p + l3, tB); - mpn_incr_u (p + l4, tC); - mpn_incr_u (p + l5, tD); - } + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + MPN_INCR_U (p + l3, 2 * n - l3, tB); + MPN_INCR_U (p + l4, 2 * n - l4, tC); + MPN_INCR_U (p + l5, 2 * n - l5, tD); } void -#if __STDC__ mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) -#else -mpn_mul_n (p, a, b, n) - mp_ptr p; - mp_srcptr a; - mp_srcptr b; - mp_size_t n; -#endif { - if (n < KARATSUBA_MUL_THRESHOLD) + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); + + if (n < MUL_KARATSUBA_THRESHOLD) mpn_mul_basecase (p, a, n, b, n); - else if (n < TOOM3_MUL_THRESHOLD) + else if (n < MUL_TOOM3_THRESHOLD) { /* Allocate workspace of fixed size on stack: fast! */ #if TUNE_PROGRAM_BUILD - mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; + mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)]; #else - mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; + mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD-1)]; #endif mpn_kara_mul_n (p, a, b, n, ws); } #if WANT_FFT || TUNE_PROGRAM_BUILD - else if (n < FFT_MUL_THRESHOLD) + else if (n < MUL_FFT_THRESHOLD) #else else #endif { /* Use workspace of unknown size in heap, as stack space may - * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the + * be limited. Since n is at least MUL_TOOM3_THRESHOLD, the * multiplication will take much longer than malloc()/free(). */ mp_limb_t wsLen, *ws; - wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; - ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); + wsLen = MPN_TOOM3_MUL_N_TSIZE (n); + ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen); mpn_toom3_mul_n (p, a, b, n, ws); - (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); + __GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen); } #if WANT_FFT || TUNE_PROGRAM_BUILD else { - mpn_mul_fft_full (p, a, n, b, n); + mpn_mul_fft_full (p, a, n, b, n); } #endif }
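Note (illustrative only, not part of the patch): mpn_kara_mul_n above splits each n-limb operand into a high half x1 and a low half x0, forms the middle Karatsuba term from the absolute differences |x1-x0| and |y1-y0|, and keeps the sign of (x1-x0)*(y1-y0) separately (the sign variable, now set with ~0 / ~sign instead of 1 / ^1). A minimal single-word sketch of the same scheme, assuming 16-bit halves stand in for the n/2-limb halves and with purely illustrative names:

#include <stdint.h>
#include <stdio.h>

/* Multiply two 32-bit numbers, each split into two 16-bit "limbs",
   using three 16x16->32 multiplies (absolute-difference Karatsuba).  */
static uint64_t kara32 (uint32_t x, uint32_t y)
{
  uint32_t x1 = x >> 16, x0 = x & 0xFFFF;
  uint32_t y1 = y >> 16, y0 = y & 0xFFFF;

  uint32_t hh = x1 * y1;                      /* high product x1*y1 */
  uint32_t ll = x0 * y0;                      /* low product  x0*y0 */

  /* |x1-x0| * |y1-y0|, sign of (x1-x0)*(y1-y0) kept aside, as the
     mpn code does with mpn_sub_n plus the sign variable.  */
  uint32_t dx = x1 >= x0 ? x1 - x0 : x0 - x1;
  uint32_t dy = y1 >= y0 ? y1 - y0 : y0 - y1;
  int negative = (x1 >= x0) != (y1 >= y0);
  uint32_t mm = dx * dy;

  /* x*y = hh*(B^2+B) + ll*(B+1) - sign*mm*B  with B = 2^16.  */
  uint64_t r = ((uint64_t) hh << 32) + ((uint64_t) hh << 16)
             + ((uint64_t) ll << 16) + ll;
  return negative ? r + ((uint64_t) mm << 16)
                  : r - ((uint64_t) mm << 16);
}

int main (void)
{
  uint32_t x = 0xDEADBEEF, y = 0x12345678;
  printf ("karatsuba %llu, direct %llu\n",
          (unsigned long long) kara32 (x, y),
          (unsigned long long) ((uint64_t) x * y));
  return 0;
}

For squaring the middle term is (x1-x0)^2, which is never negative, and since x^2 + y^2 - (x-y)^2 = 2*x*y >= 0 (the new comment in mpn_kara_sqr_n) the sign variable can be dropped from the squaring routine entirely, which is what this revision does.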
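Note (illustrative only, not part of the patch): the removed INVERSE_3 was the multiplicative inverse of 3 modulo 2^BITS_PER_MP_LIMB (0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits); the code now uses the library-wide MODLIMB_INVERSE_3 and masks with GMP_NUMB_MASK so the same trick also works with nails. The point is that an exact division by 3 costs a single multiplication. A minimal sketch assuming a plain 64-bit word and no nails:

#include <stdint.h>
#include <stdio.h>

/* 3 * 0xAAAAAAAAAAAAAAAB = 2^65 + 1 == 1 (mod 2^64), so this constant
   is the inverse of 3 modulo 2^64.  */
#define INV3 UINT64_C(0xAAAAAAAAAAAAAAAB)

int main (void)
{
  uint64_t n = UINT64_C(123456789012345678) * 3;  /* known multiple of 3 */
  uint64_t q = n * INV3;                          /* wraps modulo 2^64   */

  /* q == n/3 exactly: n = 3*(n/3), so n*INV3 == (n/3)*(3*INV3) == n/3.  */
  printf ("%llu / 3 = %llu\n", (unsigned long long) n, (unsigned long long) q);
  return 0;
}

mpn_divexact_by3 applies the same identity limb by limb, and interpolate3 uses it again for the top word td, where the GMP_NUMB_MASK added in this revision keeps the result within the numb range.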
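Note (illustrative only, not part of the patch): interpolate3 recovers the three middle coefficients x1, x2, x3 of the degree-4 product W(t) = a + x1*t + x2*t^2 + x3*t^3 + e*t^4 from the five pointwise products a = W(0), b = 16*W(1/2), c = W(1), d = W(2), e = W(inf), using only shifts, additions and one exact division by 3, as its comment states. A stand-alone sketch of the same step sequence, assuming small signed words in place of the limb vectors with separate top words tb, tc, td:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main (void)
{
  /* (3 + 5t + 7t^2) * (2 + 4t + 6t^2) = 6 + 22t + 52t^2 + 58t^3 + 42t^4 */
  int64_t a = 6, x1 = 22, x2 = 52, x3 = 58, e = 42;

  /* The five evaluations the pointwise products would have produced.  */
  int64_t b = 16*a + 8*x1 + 4*x2 + 2*x3 + e;     /* 16 * W(1/2) */
  int64_t c = a + x1 + x2 + x3 + e;              /* W(1)        */
  int64_t d = a + 2*x1 + 4*x2 + 8*x3 + 16*e;     /* W(2)        */
  int64_t t;

  /* Same step sequence as interpolate3.  */
  b -= 16*a + e;                        /* b := b - 16a - e  */
  c -= a + e;                           /* c := c - a - e    */
  d -= a + 16*e;                        /* d := d - a - 16e  */
  t = b + d; d = b - d; b = t;          /* b, d := b+d, b-d  */
  b -= 8*c;                             /* b := b - 8c       */
  c = 2*c - b;                          /* c := 2c - b       */
  d /= 3;                               /* d := d/3, exact   */
  t = b + d; d = b - d; b = t;          /* b, d := b+d, b-d  */

  /* Now b = 4*x1, c = 2*x2, d = 4*x3, matching the final right shifts.  */
  assert (b == 4*x1 && c == 2*x2 && d == 4*x3);
  printf ("x1 = %lld, x2 = %lld, x3 = %lld\n",
          (long long) (b / 4), (long long) (c / 2), (long long) (d / 4));
  return 0;
}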
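Note (illustrative only, not part of the patch): mpn_mul_n is the entry point whose dispatch this revision reworks; the new ASSERTs spell out its contract (n >= 1 and a 2n-limb product area that does not overlap either operand), and the renamed thresholds (MUL_KARATSUBA_THRESHOLD, MUL_TOOM3_THRESHOLD, MUL_FFT_THRESHOLD) only change which internal routine it selects. A minimal usage sketch through the documented mpn interface, assuming a link against libgmp (cc prog.c -lgmp):

#include <gmp.h>

int main (void)
{
  mp_limb_t a[4], b[4], p[8];   /* product needs 2n limbs */
  mp_size_t n = 4, i;

  mpn_random (a, n);            /* fill the operands with random limbs */
  mpn_random (b, n);

  mpn_mul_n (p, a, b, n);       /* {p,2n} = {a,n} * {b,n} */

  for (i = 2 * n; i-- > 0;)
    gmp_printf ("%Mx ", p[i]);  /* most significant limb first */
  gmp_printf ("\n");
  return 0;
}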