Annotation of OpenXM_contrib/gmp/mpz/n_pow_ui.c, Revision 1.1.1.1
1.1 ohara 1: /* mpz_n_pow_ui -- mpn raised to ulong.
2:
3: Copyright 2001, 2002 Free Software Foundation, Inc.
4:
5: This file is part of the GNU MP Library.
6:
7: The GNU MP Library is free software; you can redistribute it and/or modify
8: it under the terms of the GNU Lesser General Public License as published by
9: the Free Software Foundation; either version 2.1 of the License, or (at your
10: option) any later version.
11:
12: The GNU MP Library is distributed in the hope that it will be useful, but
13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: License for more details.
16:
17: You should have received a copy of the GNU Lesser General Public License
18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: MA 02111-1307, USA. */
21:
22: #include "gmp.h"
23: #include "gmp-impl.h"
24: #include "longlong.h"
25:
26:
/* Debug tracing hook: change this to "#define TRACE(x) x" to enable the
   printf/mpn_trace calls scattered through the code below.  */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine (forces the HAVE_NATIVE_mpn_mul_2 paths using the reference
   implementation).  */
#if 0
#define mpn_mul_2 refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2 1
#endif
37:
38:
39: /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
40: ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
41: bsize==2 or >2, but separating that isn't easy because there's shared
42: code both before and after (the size calculations and the powers of 2
43: handling).
44:
45: Alternatives:
46:
47: It would work to just use the mpn_mul powering loop for 1 and 2 limb
48: bases, but the current separate loop allows mul_1 and mul_2 to be done
49: in-place, which might help cache locality a bit. If mpn_mul was relaxed
50: to allow source==dest when vn==1 or 2 then some pointer twiddling might
51: let us get the same effect in one loop.
52:
53: The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
54: form the biggest possible power of b that fits, only the biggest power of
55: power-of-2 power, ie. b^(2^n). It'd be possible to choose a bigger power, perhaps
56: using __mp_bases[b].big_base for small b, and thereby get better value
57: from mpn_mul_1 or mpn_mul_2 in the bignum powering. It's felt that doing
58: so would be more complicated than it's worth, and could well end up being
59: a slowdown for small e. For big e on the other hand the algorithm is
60: dominated by mpn_sqr_n so there wouldn't be much of a saving. The current
61: code can be viewed as simply doing the first few steps of the powering in
62: a single or double limb where possible.
63:
64: If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
65: copy made of b is unnecessary. We could just use the old alloc'ed block
66: and free it at the end. But arranging this seems like a lot more trouble
67: than it's worth. */
68:
69:
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.
   (An underestimate is harmless here: its only use is as the bound on the
   single-limb powering loop below, which then merely stops a step early.)  */

#define GMP_NUMB_HALFMAX (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
75:
76:
/* The following are for convenience, they update the size and check the
   alloc.  */

/* {dst,2*size} = {src,size}^2, then update size to the normalized result
   size: the square is 2*size limbs, one less when its high limb is zero.  */
#define MPN_SQR_N(dst, alloc, src, size)        \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr_n (dst, src, size);                 \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
87:
/* {dst} = {src,size} * {src2,size2}, updating size to the result size.
   mpn_mul returns the high limb of the product; when that limb is zero
   the result is one limb shorter than size+size2.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t high;                                     \
    ASSERT ((size) + (size2) <= (alloc));               \
    high = mpn_mul (dst, src, size, src2, size2);       \
    (size) += (size2) - (high == 0);                    \
  } while (0)
95:
/* In-place {ptr,size} *= the two-limb value {mult,2}, updating size.
   mpn_mul_2 writes size+1 limbs and returns the limb above those; the
   result grows by one limb always and by two when that carry is non-zero.  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t carry;                            \
    ASSERT ((size)+2 <= (alloc));               \
    carry = mpn_mul_2 (ptr, ptr, size, mult);   \
    (size)++;                                   \
    (ptr)[(size)] = carry;                      \
    (size) += (carry != 0);                     \
  } while (0)
105:
/* In-place {ptr,size} *= limb, updating size.  The result grows by one
   limb exactly when mpn_mul_1 returns a non-zero carry.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t carry;                            \
    ASSERT ((size)+1 <= (alloc));               \
    carry = mpn_mul_1 (ptr, ptr, size, limb);   \
    (ptr)[size] = carry;                        \
    (size) += (carry != 0);                     \
  } while (0)
114:
/* In-place {ptr,size} <<= shift bits, updating size.  The bits shifted
   out the top are returned by mpn_lshift and stored as an extra limb
   when non-zero.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t carry;                            \
    ASSERT ((size)+1 <= (alloc));               \
    carry = mpn_lshift (ptr, ptr, size, shift); \
    (ptr)[size] = carry;                        \
    (size) += (carry != 0);                     \
  } while (0)
123:
/* {dst,size} = {src,size} >> shift, or a plain copy when shift==0
   (mpn_rshift requires shift>=1).  A non-zero shift can clear the high
   limb, in which case size is reduced by one.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
134:
135:
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

/* Exchange the roles of the result and temporary areas (pointer swap
   only; the matching alloc sizes are swapped just for ASSERT builds).  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
146:
147:
/* Set r = {bp,bsize}^e.
   - bsize may be negative to indicate a negative base; r then gets the
     sign of b^e (negative exactly when bsize<0 and e is odd).
   - e==0 gives r=1, including 0^0==1; b==0 with e>=1 gives r=0.
   - {bp,bsize} must be normalized (non-zero high limb), and PTR(r) must
     either equal bp or be completely separate from it (see the ASSERTs).
   Powers of 2 in b are stripped up front and re-applied at the end as a
   block of low zero limbs plus one partial-limb lshift, so the powering
   loops only ever see an odd (shifted) base.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  unsigned long rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL (marker);

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Remember whether r shares storage with b, before bp is advanced past
     low zero limbs below.  Used to decide whether b must be copied aside
     in the bsize>2 case.  */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each zero limb of b contributes e zero
     limbs to the result.  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  The e*btwos result bits of 2s are split
     into whole limbs (added to rtwos_limbs) and a sub-limb lshift count
     (rtwos_bits, applied at the very end or merged into rl).  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK (marker);

  /* rl (and rl_high when mul_2 is available) accumulates the low-order
     part of the power built up while b still fits in a limb.  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          /* Right-to-left binary exponentiation within one limb: multiply
             rl by the current b^(2^k) when the low bit of e is set, then
             square blimb for the next bit.  */
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          /* The shifts move data past any nail bits for umul_ppmm.  */
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      /* Fold the btwos rshift of the two-limb base into blimb:bsecond.  */
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      /* After this blimb is the high limb of the base, as expected by the
         ralloc size calculation below.  */
      blimb = bsecond;
    }
  else
    {
      /* bsize > 2: a temporary copy of b is needed if it overlaps r (which
         gets realloc'ed below) or if the btwos rshift must be applied.  */
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start.  Here the whole power is already in rl (and rl_high).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which not needed by tp.  In the mpn_mul loop when the low bit of e
         is 1, tp must hold nearly the full result, so just size it the same
         as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space.
             Each loop iteration swaps rp and tp once (in MPN_SQR_N's
             SWAP_RP_TP), so the parity of the remaining iteration count
             determines where the result lands.  */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          /* Left-to-right binary exponentiation: square each step,
             multiply in the two-limb base when the bit of e is 1.  */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          /* Multiply in the leftover low-order factor rl_high:rl.  */
          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          /* Left-to-right binary exponentiation with a single-limb base. */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space.
             Here both the square and the conditional mpn_mul swap rp/tp,
             so the total number of swaps is i+1 plus the popcount of the
             low bits of e, hence the parity^i test.  */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          /* General left-to-right binary exponentiation using mpn_mul;
             mpn_mul can't work in-place so the multiply goes to tp and
             the pointers are swapped back.  */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE (marker);

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>