=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpz/Attic/powm.c,v retrieving revision 1.1.1.2 retrieving revision 1.1.1.3 diff -u -p -r1.1.1.2 -r1.1.1.3 --- OpenXM_contrib/gmp/mpz/Attic/powm.c 2000/09/09 14:12:56 1.1.1.2 +++ OpenXM_contrib/gmp/mpz/Attic/powm.c 2003/08/25 16:06:33 1.1.1.3 @@ -1,7 +1,7 @@ /* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD. -Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, Inc. -Contributed by Paul Zimmermann. +Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002 Free Software +Foundation, Inc. Contributed by Paul Zimmermann. This file is part of the GNU MP Library. @@ -20,6 +20,7 @@ along with the GNU MP Library; see the file COPYING.LI the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" @@ -28,49 +29,91 @@ MA 02111-1307, USA. */ #endif -/* set c <- (a*b)/R^n mod m c has to have at least (2n) allocated limbs */ -static void -#if __STDC__ -mpz_redc (mpz_ptr c, mpz_srcptr a, mpz_srcptr b, mpz_srcptr m, mp_limb_t Nprim) -#else -mpz_redc (c, a, b, m, Nprim) - mpz_ptr c; - mpz_srcptr a; - mpz_srcptr b; - mpz_srcptr m; - mp_limb_t Nprim; +/* Set c <- tp/R^n mod m. + tp should have space for 2*n+1 limbs; clobber its most significant limb. */ +#if ! WANT_REDC_GLOBAL +static #endif +void +redc (mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim, mp_ptr tp) { - mp_ptr cp, mp = PTR (m); - mp_limb_t cy, cout = 0; + mp_limb_t cy; mp_limb_t q; - size_t j, n = ABSIZ (m); + mp_size_t j; - ASSERT (ALLOC (c) >= 2 * n); + tp[2 * n] = 0; /* carry guard */ - mpz_mul (c, a, b); - cp = PTR (c); - j = ABSIZ (c); - MPN_ZERO (cp + j, 2 * n - j); for (j = 0; j < n; j++) { - q = cp[0] * Nprim; - cy = mpn_addmul_1 (cp, mp, n, q); - cout += mpn_add_1 (cp + n, cp + n, n - j, cy); - cp++; + q = tp[0] * Nprim; + cy = mpn_addmul_1 (tp, mp, n, q); + mpn_incr_u (tp + n, cy); + tp++; } - cp -= n; - if (cout) + + if (tp[n] != 0) + mpn_sub_n (cp, tp, mp, n); + else + MPN_COPY (cp, tp, n); +} + +/* Compute t = a mod m, a is defined by (ap,an), m is defined by (mp,mn), and + t is defined by (tp,mn). */ +static void +reduce (mp_ptr tp, mp_srcptr ap, mp_size_t an, mp_srcptr mp, mp_size_t mn) +{ + mp_ptr qp; + TMP_DECL (marker); + + TMP_MARK (marker); + qp = TMP_ALLOC_LIMBS (an - mn + 1); + + mpn_tdiv_qr (qp, tp, 0L, ap, an, mp, mn); + + TMP_FREE (marker); +} + +#if REDUCE_EXPONENT +/* Return the group order of the ring mod m. */ +static mp_limb_t +phi (mp_limb_t t) +{ + mp_limb_t d, m, go; + + go = 1; + + if (t % 2 == 0) { - cy = cout - mpn_sub_n (cp, cp + n, mp, n); - while (cy) - cy -= mpn_sub_n (cp, cp, mp, n); + t = t / 2; + while (t % 2 == 0) + { + go *= 2; + t = t / 2; + } } - else - MPN_COPY (cp, cp + n, n); - MPN_NORMALIZE (cp, n); - SIZ (c) = SIZ (c) < 0 ? 
-n : n; + for (d = 3;; d += 2) + { + m = d - 1; + for (;;) + { + unsigned long int q = t / d; + if (q < d) + { + if (t <= 1) + return go; + if (t == d) + return go * m; + return go * (t - 1); + } + if (t != q * d) + break; + go *= m; + m = d; + t = q; + } + } } +#endif /* average number of calls to redc for an exponent of n bits with the sliding window algorithm of base 2^k: the optimal is @@ -85,280 +128,335 @@ mpz_redc (c, a, b, m, Nprim) 4096 4918 4787 4707 4665* 4670 */ -#ifndef BERKELEY_MP -void -#if __STDC__ -mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod) -#else -mpz_powm (res, base, e, mod) - mpz_ptr res; - mpz_srcptr base; - mpz_srcptr e; - mpz_srcptr mod; + +/* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD. In REDC + each modular multiplication costs about 2*n^2 limbs operations, whereas + using usual reduction it costs 3*K(n), where K(n) is the cost of a + multiplication using Karatsuba, and a division is assumed to cost 2*K(n), + for example using Burnikel-Ziegler's algorithm. This gives a theoretical + threshold of a*SQR_KARATSUBA_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~ + 2.66. */ +/* For now, also disable REDC when MOD is even, as the inverse can't handle + that. At some point, we might want to make the code faster for that case, + perhaps using CRR. */ + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD ((8 * SQR_KARATSUBA_THRESHOLD) / 3) #endif -#else /* BERKELEY_MP */ + +#define HANDLE_NEGATIVE_EXPONENT 1 +#undef REDUCE_EXPONENT + void -#if __STDC__ -pow (mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod, mpz_ptr res) -#else -pow (base, e, mod, res) - mpz_srcptr base; - mpz_srcptr e; - mpz_srcptr mod; - mpz_ptr res; -#endif +#ifndef BERKELEY_MP +mpz_powm (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m) +#else /* BERKELEY_MP */ +pow (mpz_srcptr b, mpz_srcptr e, mpz_srcptr m, mpz_ptr r) #endif /* BERKELEY_MP */ { - mp_limb_t invm, *ep, c, mask; - mpz_t xx, *g; - mp_size_t n, i, K, j, l, k; + mp_ptr xp, tp, qp, gp, this_gp; + mp_srcptr bp, ep, mp; + mp_size_t bn, es, en, mn, xn; + mp_limb_t invm, c; + unsigned long int enb; + mp_size_t i, K, j, l, k; + int m_zero_cnt, e_zero_cnt; int sh; int use_redc; - -#ifdef POWM_DEBUG - mpz_t exp; - mpz_init (exp); +#if HANDLE_NEGATIVE_EXPONENT + mpz_t new_b; #endif +#if REDUCE_EXPONENT + mpz_t new_e; +#endif + TMP_DECL (marker); - n = ABSIZ (mod); - - if (n == 0) + mp = PTR(m); + mn = ABSIZ (m); + if (mn == 0) DIVIDE_BY_ZERO; - if (SIZ (e) == 0) + TMP_MARK (marker); + + es = SIZ (e); + if (es <= 0) { - /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 - depending on if MOD equals 1. */ - SIZ(res) = (ABSIZ (mod) == 1 && (PTR(mod))[0] == 1) ? 0 : 1; - PTR(res)[0] = 1; - return; + if (es == 0) + { + /* Exponent is zero, result is 1 mod m, i.e., 1 or 0 depending on if + m equals 1. */ + SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1; + PTR(r)[0] = 1; + TMP_FREE (marker); /* we haven't really allocated anything here */ + return; + } +#if HANDLE_NEGATIVE_EXPONENT + MPZ_TMP_INIT (new_b, mn + 1); + + if (! mpz_invert (new_b, b, m)) + DIVIDE_BY_ZERO; + b = new_b; + es = -es; +#else + DIVIDE_BY_ZERO; +#endif } + en = es; - /* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD. - In REDC each modular multiplication costs about 2*n^2 limbs operations, - whereas using usual reduction it costs 3*K(n), where K(n) is the cost of a - multiplication using Karatsuba, and a division is assumed to cost 2*K(n), - for example using Burnikel-Ziegler's algorithm. 
This gives a theoretical - threshold of a*KARATSUBA_SQR_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~ - 2.66. */ - /* For now, also disable REDC when MOD is even, as the inverse can't - handle that. */ - -#ifndef POWM_THRESHOLD -#define POWM_THRESHOLD ((8 * KARATSUBA_SQR_THRESHOLD) / 3) +#if REDUCE_EXPONENT + /* Reduce exponent by dividing it by phi(m) when m small. */ + if (mn == 1 && mp[0] < 0x7fffffffL && en * GMP_NUMB_BITS > 150) + { + MPZ_TMP_INIT (new_e, 2); + mpz_mod_ui (new_e, e, phi (mp[0])); + e = new_e; + } #endif - use_redc = (n < POWM_THRESHOLD && PTR(mod)[0] % 2 != 0); + use_redc = mn < POWM_THRESHOLD && mp[0] % 2 != 0; if (use_redc) { /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */ - modlimb_invert (invm, PTR(mod)[0]); + modlimb_invert (invm, mp[0]); invm = -invm; } + else + { + /* Normalize m (i.e. make its most significant bit set) as required by + division functions below. */ + count_leading_zeros (m_zero_cnt, mp[mn - 1]); + m_zero_cnt -= GMP_NAIL_BITS; + if (m_zero_cnt != 0) + { + mp_ptr new_mp; + new_mp = TMP_ALLOC_LIMBS (mn); + mpn_lshift (new_mp, mp, mn, m_zero_cnt); + mp = new_mp; + } + } - /* determines optimal value of k */ - l = ABSIZ (e) * BITS_PER_MP_LIMB; /* number of bits of exponent */ + /* Determine optimal value of k, the number of exponent bits we look at + at a time. */ + count_leading_zeros (e_zero_cnt, PTR(e)[en - 1]); + e_zero_cnt -= GMP_NAIL_BITS; + enb = en * GMP_NUMB_BITS - e_zero_cnt; /* number of bits of exponent */ k = 1; K = 2; - while (2 * l > K * (2 + k * (3 + k))) + while (2 * enb > K * (2 + k * (3 + k))) { k++; K *= 2; } - g = (mpz_t *) (*_mp_allocate_func) (K / 2 * sizeof (mpz_t)); - /* compute x*R^n where R=2^BITS_PER_MP_LIMB */ - mpz_init (g[0]); - if (use_redc) - { - mpz_mul_2exp (g[0], base, n * BITS_PER_MP_LIMB); - mpz_mod (g[0], g[0], mod); - } - else - mpz_mod (g[0], base, mod); + tp = TMP_ALLOC_LIMBS (2 * mn + 1); + qp = TMP_ALLOC_LIMBS (mn + 1); - /* compute xx^g for odd g < 2^k */ - mpz_init (xx); - if (use_redc) + gp = __GMP_ALLOCATE_FUNC_LIMBS (K / 2 * mn); + + /* Compute x*R^n where R=2^BITS_PER_MP_LIMB. */ + bn = ABSIZ (b); + bp = PTR(b); + /* Handle |b| >= m by computing b mod m. FIXME: It is not strictly necessary + for speed or correctness to do this when b and m have the same number of + limbs, perhaps remove mpn_cmp call. */ + if (bn > mn || (bn == mn && mpn_cmp (bp, mp, mn) >= 0)) { - _mpz_realloc (xx, 2 * n); - mpz_redc (xx, g[0], g[0], mod, invm); /* xx = x^2*R^n */ + /* Reduce possibly huge base while moving it to gp[0]. Use a function + call to reduce, since we don't want the quotient allocation to + live until function return. */ + if (use_redc) + { + reduce (tp + mn, bp, bn, mp, mn); /* b mod m */ + MPN_ZERO (tp, mn); + mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); /* unnormnalized! */ + } + else + { + reduce (gp, bp, bn, mp, mn); + } } else { - mpz_mul (xx, g[0], g[0]); - mpz_mod (xx, xx, mod); - } - for (i = 1; i < K / 2; i++) - { - mpz_init (g[i]); + /* |b| < m. We pad out operands to become mn limbs, which simplifies + the rest of the function, but slows things down when the |b| << m. 
*/ if (use_redc) { - _mpz_realloc (g[i], 2 * n); - mpz_redc (g[i], g[i - 1], xx, mod, invm); /* g[i] = x^(2i+1)*R^n */ + MPN_ZERO (tp, mn); + MPN_COPY (tp + mn, bp, bn); + MPN_ZERO (tp + mn + bn, mn - bn); + mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); } else { - mpz_mul (g[i], g[i - 1], xx); - mpz_mod (g[i], g[i], mod); + MPN_COPY (gp, bp, bn); + MPN_ZERO (gp + bn, mn - bn); } } - /* now starts the real stuff */ - mask = (mp_limb_t) ((1< 0) { i--; - c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh)); - sh += BITS_PER_MP_LIMB; + c <<= (-sh); + sh += GMP_NUMB_BITS; + c |= ep[i] >> sh; } } else - c = c >> sh; -#ifdef POWM_DEBUG - printf ("-1/m mod 2^%u = %lu\n", BITS_PER_MP_LIMB, invm); - mpz_set_ui (exp, c); -#endif - j=0; - while (c % 2 == 0) + c >>= sh; + + for (j = 0; c % 2 == 0; j++) + c >>= 1; + + MPN_COPY (xp, gp + mn * (c >> 1), mn); + while (--j >= 0) { - j++; - c = (c >> 1); - } - mpz_set (xx, g[c >> 1]); - while (j--) - { + mpn_sqr_n (tp, xp, mn); if (use_redc) - mpz_redc (xx, xx, xx, mod, invm); + redc (xp, mp, mn, invm, tp); else - { - mpz_mul (xx, xx, xx); - mpz_mod (xx, xx, mod); - } + mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn); } -#ifdef POWM_DEBUG - printf ("x^"); mpz_out_str (0, 10, exp); - printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx); - putchar ('\n'); -#endif - while (i > 0 || sh > 0) { c = ep[i]; - sh -= k; l = k; /* number of bits treated */ + sh -= l; if (sh < 0) { if (i > 0) { i--; - c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh)); - sh += BITS_PER_MP_LIMB; + c <<= (-sh); + sh += GMP_NUMB_BITS; + c |= ep[i] >> sh; } else { - l += sh; /* may be less bits than k here */ - c = c & ((1<> sh; - c = c & mask; + c >>= sh; + c &= ((mp_limb_t) 1 << l) - 1; - /* this while loop implements the sliding window improvement */ - while ((c & (1 << (k - 1))) == 0 && (i > 0 || sh > 0)) + /* This while loop implements the sliding window improvement--loop while + the most significant bit of c is zero, squaring xx as we go. */ + while ((c >> (l - 1)) == 0 && (i > 0 || sh > 0)) { - if (use_redc) mpz_redc (xx, xx, xx, mod, invm); + mpn_sqr_n (tp, xp, mn); + if (use_redc) + redc (xp, mp, mn, invm, tp); else + mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn); + if (sh != 0) { - mpz_mul (xx, xx, xx); - mpz_mod (xx, xx, mod); - } - if (sh) - { sh--; - c = (c<<1) + ((ep[i]>>sh) & 1); + c = (c << 1) + ((ep[i] >> sh) & 1); } else { i--; - sh = BITS_PER_MP_LIMB - 1; - c = (c<<1) + (ep[i]>>sh); + sh = GMP_NUMB_BITS - 1; + c = (c << 1) + (ep[i] >> sh); } } -#ifdef POWM_DEBUG - printf ("l=%u c=%lu\n", l, c); - mpz_mul_2exp (exp, exp, k); - mpz_add_ui (exp, exp, c); -#endif - - /* now replace xx by xx^(2^k)*x^c */ + /* Replace xx by xx^(2^l)*x^c. */ if (c != 0) { - j = 0; - while (c % 2 == 0) + for (j = 0; c % 2 == 0; j++) + c >>= 1; + + /* c0 = c * 2^j, i.e. xx^(2^l)*x^c = (A^(2^(l - j))*c)^(2^j) */ + l -= j; + while (--l >= 0) { - j++; - c = c >> 1; + mpn_sqr_n (tp, xp, mn); + if (use_redc) + redc (xp, mp, mn, invm, tp); + else + mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn); } - /* c0 = c * 2^j, i.e. 
xx^(2^k)*x^c = (A^(2^(k - j))*c)^(2^j) */ - l -= j; - while (l--) - if (use_redc) mpz_redc (xx, xx, xx, mod, invm); - else - { - mpz_mul (xx, xx, xx); - mpz_mod (xx, xx, mod); - } + mpn_mul_n (tp, xp, gp + mn * (c >> 1), mn); if (use_redc) - mpz_redc (xx, xx, g[c >> 1], mod, invm); + redc (xp, mp, mn, invm, tp); else - { - mpz_mul (xx, xx, g[c >> 1]); - mpz_mod (xx, xx, mod); - } + mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn); } else j = l; /* case c=0 */ - while (j--) + while (--j >= 0) { + mpn_sqr_n (tp, xp, mn); if (use_redc) - mpz_redc (xx, xx, xx, mod, invm); + redc (xp, mp, mn, invm, tp); else - { - mpz_mul (xx, xx, xx); - mpz_mod (xx, xx, mod); - } + mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn); } -#ifdef POWM_DEBUG - printf ("x^"); mpz_out_str (0, 10, exp); - printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx); - putchar ('\n'); -#endif } - /* now convert back xx to xx/R^n */ if (use_redc) { - mpz_set_ui (g[0], 1); - mpz_redc (xx, xx, g[0], mod, invm); - if (mpz_cmp (xx, mod) >= 0) - mpz_sub (xx, xx, mod); + /* Convert back xx to xx/R^n. */ + MPN_COPY (tp, xp, mn); + MPN_ZERO (tp + mn, mn); + redc (xp, mp, mn, invm, tp); + if (mpn_cmp (xp, mp, mn) >= 0) + mpn_sub_n (xp, xp, mp, mn); } - mpz_set (res, xx); + else + { + if (m_zero_cnt != 0) + { + mp_limb_t cy; + cy = mpn_lshift (tp, xp, mn, m_zero_cnt); + tp[mn] = cy; + mpn_tdiv_qr (qp, xp, 0L, tp, mn + (cy != 0), mp, mn); + mpn_rshift (xp, xp, mn, m_zero_cnt); + } + } + xn = mn; + MPN_NORMALIZE (xp, xn); - mpz_clear (xx); - for (i = 0; i < K / 2; i++) - mpz_clear (g[i]); - (*_mp_free_func) (g, K / 2 * sizeof (mpz_t)); + if ((ep[0] & 1) && SIZ(b) < 0 && xn != 0) + { + mp = PTR(m); /* want original, unnormalized m */ + mpn_sub (xp, mp, mn, xp, xn); + xn = mn; + MPN_NORMALIZE (xp, xn); + } + MPZ_REALLOC (r, xn); + SIZ (r) = xn; + MPN_COPY (PTR(r), xp, xn); + + __GMP_FREE_FUNC_LIMBS (gp, K / 2 * mn); + TMP_FREE (marker); }
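
For reference, here is a minimal standalone sketch of the Montgomery reduction (REDC) step that the rewritten redc() performs, shrunk to a single 32-bit limb so it compiles without GMP internals. neg_inv32 mirrors what modlimb_invert does for one limb, and redc1 corresponds to one pass of the q = tp[0]*Nprim / addmul / shift loop followed by the final conditional subtract. The helper names, the 2^31 bound on the modulus, and the example values in main() are illustrative assumptions for this sketch only; they are not part of the patch or of GMP's interface.

#include <stdint.h>
#include <stdio.h>

/* Compute -1/m mod 2^32 for odd m by Hensel/Newton lifting; each step
   doubles the number of correct low bits (3 -> 6 -> 12 -> 24 -> 48). */
static uint32_t
neg_inv32 (uint32_t m)
{
  uint32_t inv = m;             /* correct to 3 bits for odd m */
  int i;
  for (i = 0; i < 4; i++)
    inv *= 2 - m * inv;
  return (uint32_t) (0u - inv); /* want -1/m, not 1/m */
}

/* One-limb REDC: return t / 2^32 mod m, assuming odd m < 2^31 and
   t < m * 2^32, so the intermediate sum cannot overflow 64 bits. */
static uint32_t
redc1 (uint64_t t, uint32_t m, uint32_t minv)
{
  uint32_t q = (uint32_t) t * minv;   /* q = t * (-1/m) mod 2^32 */
  uint64_t s = t + (uint64_t) q * m;  /* low 32 bits are now zero */
  uint32_t c = (uint32_t) (s >> 32);  /* exact division by 2^32 */
  return c >= m ? c - m : c;          /* at most one subtraction needed */
}

int
main (void)
{
  uint32_t m = 1000003;               /* arbitrary odd modulus < 2^31 */
  uint32_t minv = neg_inv32 (m);
  uint32_t a = 123456, b = 654321;

  /* Montgomery form: a*R mod m and b*R mod m, with R = 2^32. */
  uint32_t aR = (uint32_t) (((uint64_t) a << 32) % m);
  uint32_t bR = (uint32_t) (((uint64_t) b << 32) % m);

  uint32_t abR = redc1 ((uint64_t) aR * bR, m, minv); /* (a*b)*R mod m */
  uint32_t ab  = redc1 (abR, m, minv);                /* strip the factor R */

  printf ("%u (expect %u)\n", ab, (uint32_t) (((uint64_t) a * b) % m));
  return 0;
}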
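
The sliding-window scan over the exponent can likewise be illustrated on machine words. The table g[] of 2^(k-1) odd powers and the "shrink the window until its low bit is 1" step correspond to the gp table of K/2 entries and the c % 2 == 0 loops in the patch. powmod_slide, mulmod, the fixed k <= 8 table size and the test values in main() are assumptions made for the sake of a self-contained example, not GMP interfaces.

#include <stdint.h>
#include <stdio.h>

/* (a*b) mod m using a 64-bit intermediate product. */
static uint32_t
mulmod (uint32_t a, uint32_t b, uint32_t m)
{
  return (uint32_t) (((uint64_t) a * b) % m);
}

/* Left-to-right sliding-window modular exponentiation with k-bit windows.
   Only the odd powers x^1, x^3, ..., x^(2^k-1) are precomputed. */
static uint32_t
powmod_slide (uint32_t x, uint32_t e, uint32_t m, int k)
{
  uint32_t g[128];                      /* 2^(k-1) entries; enough for k <= 8 */
  uint32_t x2 = mulmod (x, x, m);
  uint32_t r = 1;
  int i, j, l;

  g[0] = x % m;                         /* x^1 */
  for (i = 1; i < 1 << (k - 1); i++)
    g[i] = mulmod (g[i - 1], x2, m);    /* x^(2*i+1) */

  i = 31;
  while (i >= 0 && ((e >> i) & 1) == 0) /* skip leading zero bits */
    i--;
  while (i >= 0)
    {
      if (((e >> i) & 1) == 0)
        {
          r = mulmod (r, r, m);         /* zero bit: square only */
          i--;
          continue;
        }
      /* Take up to k bits, then shrink until the window value is odd. */
      l = (i + 1 < k) ? i + 1 : k;
      while (((e >> (i - l + 1)) & 1) == 0)
        l--;
      {
        uint32_t w = (e >> (i - l + 1)) & (((uint32_t) 1 << l) - 1);
        for (j = 0; j < l; j++)
          r = mulmod (r, r, m);         /* r = r^(2^l) */
        r = mulmod (r, g[w >> 1], m);   /* multiply by x^w, w odd */
      }
      i -= l;
    }
  return r;
}

int
main (void)
{
  uint32_t m = 1000003;
  /* 4-bit windows; compare against plain square-and-multiply if desired. */
  printf ("%u\n", powmod_slide (12345, 65537, m, 4));
  return 0;
}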