Annotation of OpenXM_contrib/gmp/mpz/n_pow_ui.c, Revision 1.1
1.1 ! ohara 1: /* mpz_n_pow_ui -- mpn raised to ulong.
! 2:
! 3: Copyright 2001, 2002 Free Software Foundation, Inc.
! 4:
! 5: This file is part of the GNU MP Library.
! 6:
! 7: The GNU MP Library is free software; you can redistribute it and/or modify
! 8: it under the terms of the GNU Lesser General Public License as published by
! 9: the Free Software Foundation; either version 2.1 of the License, or (at your
! 10: option) any later version.
! 11:
! 12: The GNU MP Library is distributed in the hope that it will be useful, but
! 13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 15: License for more details.
! 16:
! 17: You should have received a copy of the GNU Lesser General Public License
! 18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 20: MA 02111-1307, USA. */
! 21:
! 22: #include "gmp.h"
! 23: #include "gmp-impl.h"
! 24: #include "longlong.h"
! 25:
! 26:
/* Debug tracing hook: expands to nothing in a normal build, so all the
   TRACE(...) calls below are compiled out.
   Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)
! 29:
! 30:
/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  (refmpn_mul_2 is presumably the reference implementation from
   the test support code -- confirm before enabling.) */
#if 0
#define mpn_mul_2 refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2 1
#endif
! 37:
! 38:
! 39: /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
! 40: ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
! 41: bsize==2 or >2, but separating that isn't easy because there's shared
! 42: code both before and after (the size calculations and the powers of 2
! 43: handling).
! 44:
! 45: Alternatives:
! 46:
! 47: It would work to just use the mpn_mul powering loop for 1 and 2 limb
! 48: bases, but the current separate loop allows mul_1 and mul_2 to be done
! 49: in-place, which might help cache locality a bit. If mpn_mul was relaxed
! 50: to allow source==dest when vn==1 or 2 then some pointer twiddling might
! 51: let us get the same effect in one loop.
! 52:
! 53: The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
! 54: form the biggest possible power of b that fits, only the biggest power of
! 55: 2 power, ie. b^(2^n). It'd be possible to choose a bigger power, perhaps
! 56: using __mp_bases[b].big_base for small b, and thereby get better value
! 57: from mpn_mul_1 or mpn_mul_2 in the bignum powering. It's felt that doing
! 58: so would be more complicated than it's worth, and could well end up being
! 59: a slowdown for small e. For big e on the other hand the algorithm is
! 60: dominated by mpn_sqr_n so there wouldn't be much of a saving. The current
! 61: code can be viewed as simply doing the first few steps of the powering in
! 62: a single or double limb where possible.
! 63:
! 64: If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
! 65: copy made of b is unnecessary. We could just use the old alloc'ed block
! 66: and free it at the end. But arranging this seems like a lot more trouble
! 67: than it's worth. */
! 68:
! 69:
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd,
   since the integer division GMP_NUMB_BITS/2 discards half a bit.  An
   underestimate is safe here (it only makes the small-power loop in
   mpz_n_pow_ui stop one iteration early). */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
! 75:
! 76:
! 77: /* The following are for convenience, they update the size and check the
! 78: alloc. */
! 79:
/* Square {src,size} into dst and update size to the size of the result.
   The 2*size-limb product may have a zero most significant limb (when
   the result really needs only 2*size-1 limbs), hence the conditional
   decrement after doubling. */
#define MPN_SQR_N(dst, alloc, src, size)        \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr_n (dst, src, size);                 \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
! 87:
/* Multiply {src,size} by {src2,size2} into dst and update size to the
   size of the product.  mpn_mul returns the most significant limb it
   produced; when that limb is zero the product occupies one limb fewer
   than size+size2, so the count is adjusted down accordingly. */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  msl;                                     \
    ASSERT ((size) + (size2) <= (alloc));               \
    msl = mpn_mul (dst, src, size, src2, size2);        \
    (size) += (size2);                                  \
    (size) -= (msl == 0);                               \
  } while (0)
! 95:
/* Multiply {ptr,size} in place by the two-limb value mult[1]:mult[0],
   updating size.  The size is incremented first so that the returned
   high limb is stored at the new top position, then incremented again
   only if that limb is non-zero -- the result grows by 1 or 2 limbs.
   (NOTE(review): this assumes mpn_mul_2 stores the low size+1 limbs at
   ptr and returns the limb above them -- confirm against the mpn_mul_2
   specification.) */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
! 105:
/* Multiply {ptr,size} in place by the single limb "limb", updating size.
   The carry out of mpn_mul_1 is stored as a new high limb, and size
   grows by one only when that carry is non-zero. */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  carry;                           \
    ASSERT ((size)+1 <= (alloc));               \
    carry = mpn_mul_1 (ptr, ptr, size, limb);   \
    (ptr)[size] = carry;                        \
    (size) += (carry != 0);                     \
  } while (0)
! 114:
/* Left shift {ptr,size} in place by "shift" bits, updating size.  The
   shift must be non-zero (the caller below only invokes this when
   rtwos_bits != 0); bits shifted out the top are stored as a new high
   limb, extending size by one when non-zero. */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  out;                             \
    ASSERT ((size)+1 <= (alloc));               \
    out = mpn_lshift (ptr, ptr, size, shift);   \
    (ptr)[size] = out;                          \
    (size) += (out != 0);                       \
  } while (0)
! 123:
/* Copy {src,size} to dst, right shifting by "shift" bits when shift is
   non-zero, and updating size.  The shift can shrink the operand by at
   most one limb (the high limb becoming zero), hence the single
   conditional decrement.  A zero shift is handled with MPN_COPY,
   presumably because mpn_rshift requires a non-zero shift count --
   confirm against the mpn_rshift specification. */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
! 134:
! 135:
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually. */

/* Exchange the roles of the result and temporary buffers (rp/tp), and --
   in an assertion-checking build only -- their recorded allocations, so
   the ASSERTs in the macros above keep matching the right buffer. */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
! 146:
! 147:
/* Set r to the mpn operand {bp,bsize} raised to the power e.  bsize may
   be negative to indicate a negative base, in which case the result is
   negative exactly when e is odd.  r and {bp,bsize} may overlap; a
   temporary copy of b is then taken where necessary.

   Outline (all visible below): factors of 2 in b -- whole zero limbs and
   low zero bits -- are stripped first and re-applied at the very end as
   rtwos_limbs zero limbs and an rtwos_bits-bit left shift.  A one-limb
   base is first powered up within a limb (and, with a native mpn_mul_2,
   within two limbs), accumulating odd-bit factors in rl (rl_high:rl).
   The remaining power is computed by binary powering from the high bit
   of e downwards: square each step, multiply in b when the bit is 1,
   ping-ponging between rp and tp so the final result lands in r.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  unsigned long rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL (marker);

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each stripped limb contributes e zero
     limbs to the final result. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  The e*btwos result bits of twos are
     split into whole limbs and a sub-limb shift count.
     NOTE(review): e * btwos is unsigned long arithmetic and could wrap
     for astronomically large e; presumably such an e could never fit in
     memory anyway -- confirm against the callers' limits. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK (marker);

  /* rl (and rl_high with native mul_2) accumulates the odd-bit factors
     from the small powering below; it's multiplied into the result at
     the end, or is itself the result if e reaches 0. */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      /* Shift the stripped low bits of b down across the limb boundary;
         the two limbs may collapse to one, in which case the bsize==1
         code above applies. */
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* A temporary copy of b is needed if r overlaps it (r is about to
         be realloc'ed) or if the stripped low bits require a shift. */
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  /* NOTE(review): the multiply by e here can overflow mp_size_t for huge
     operands, which would under-allocate -- presumably such results could
     never fit in memory anyway; confirm callers bound bsize*e.  */
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start.  The whole b^e is in rl (rl_high:rl). */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which not needed by tp.  In the mpn_mul loop when the low bit of e
         is 1, tp must hold nearly the full result, so just size it the same
         as rp. */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1). */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space.
             Each loop iteration swaps once (in MPN_SQR_N+SWAP_RP_TP), so
             the parity of the remaining iterations decides. */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          /* Multiply in the accumulated small factors rl_high:rl. */
          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          /* Multiply in the accumulated small factor rl. */
          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space.
             Here a 1 bit of e costs an extra swap (MPN_MUL into tp then
             swap back), so the popcount parity of e enters the choice. */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE (marker);

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>