Annotation of OpenXM_contrib/gmp/mpz/n_pow_ui.c, Revision 1.1.1.1
1.1 ohara 1: /* mpz_n_pow_ui -- mpn raised to ulong.
2:
3: Copyright 2001, 2002 Free Software Foundation, Inc.
4:
5: This file is part of the GNU MP Library.
6:
7: The GNU MP Library is free software; you can redistribute it and/or modify
8: it under the terms of the GNU Lesser General Public License as published by
9: the Free Software Foundation; either version 2.1 of the License, or (at your
10: option) any later version.
11:
12: The GNU MP Library is distributed in the hope that it will be useful, but
13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: License for more details.
16:
17: You should have received a copy of the GNU Lesser General Public License
18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: MA 02111-1307, USA. */
21:
22: #include "gmp.h"
23: #include "gmp-impl.h"
24: #include "longlong.h"
25:
26:
/* Debug tracing hook: change this to "#define TRACE(x) x" to enable the
   printf/mpn_trace calls scattered through the code below.  */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine (forces the HAVE_NATIVE_mpn_mul_2 paths using the reference
   implementation).  */
#if 0
#define mpn_mul_2 refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2 1
#endif
37:
38:
39: /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
40: ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
41: bsize==2 or >2, but separating that isn't easy because there's shared
42: code both before and after (the size calculations and the powers of 2
43: handling).
44:
45: Alternatives:
46:
47: It would work to just use the mpn_mul powering loop for 1 and 2 limb
48: bases, but the current separate loop allows mul_1 and mul_2 to be done
49: in-place, which might help cache locality a bit. If mpn_mul was relaxed
50: to allow source==dest when vn==1 or 2 then some pointer twiddling might
51: let us get the same effect in one loop.
52:
53: The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
54: form the biggest possible power of b that fits, only the biggest power of
55: power-of-2 power, ie. b^(2^n). It'd be possible to choose a bigger power, perhaps
56: using __mp_bases[b].big_base for small b, and thereby get better value
57: from mpn_mul_1 or mpn_mul_2 in the bignum powering. It's felt that doing
58: so would be more complicated than it's worth, and could well end up being
59: a slowdown for small e. For big e on the other hand the algorithm is
60: dominated by mpn_sqr_n so there wouldn't be much of a saving. The current
61: code can be viewed as simply doing the first few steps of the powering in
62: a single or double limb where possible.
63:
64: If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
65: copy made of b is unnecessary. We could just use the old alloc'ed block
66: and free it at the end. But arranging this seems like a lot more trouble
67: than it's worth. */
68:
69:
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.
   (An underestimate is harmless here: its only use is as the bound on the
   single-limb powering loop below, which then merely stops a step early.)  */

#define GMP_NUMB_HALFMAX (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
75:
76:
/* The following are for convenience, they update the size and check the
   alloc.  */

/* {dst,2*size} = {src,size}^2, then update size to the normalized result
   size: the square is 2*size limbs, one less when its high limb is zero.  */
#define MPN_SQR_N(dst, alloc, src, size)        \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr_n (dst, src, size);                 \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
87:
/* {dst} = {src,size} * {src2,size2}, updating size to the result size.
   mpn_mul returns the high limb of the product; when that limb is zero
   the result is one limb shorter than size+size2.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t high;                                     \
    ASSERT ((size) + (size2) <= (alloc));               \
    high = mpn_mul (dst, src, size, src2, size2);       \
    (size) += (size2) - (high == 0);                    \
  } while (0)
95:
/* In-place {ptr,size} *= the two-limb value {mult,2}, updating size.
   mpn_mul_2 writes size+1 limbs and returns the limb above those; the
   result grows by one limb always and by two when that carry is non-zero.  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t carry;                            \
    ASSERT ((size)+2 <= (alloc));               \
    carry = mpn_mul_2 (ptr, ptr, size, mult);   \
    (size)++;                                   \
    (ptr)[(size)] = carry;                      \
    (size) += (carry != 0);                     \
  } while (0)
105:
/* In-place {ptr,size} *= limb, updating size.  The result grows by one
   limb exactly when mpn_mul_1 returns a non-zero carry.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t carry;                            \
    ASSERT ((size)+1 <= (alloc));               \
    carry = mpn_mul_1 (ptr, ptr, size, limb);   \
    (ptr)[size] = carry;                        \
    (size) += (carry != 0);                     \
  } while (0)
114:
/* In-place {ptr,size} <<= shift bits, updating size.  The bits shifted
   out the top are returned by mpn_lshift and stored as an extra limb
   when non-zero.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t carry;                            \
    ASSERT ((size)+1 <= (alloc));               \
    carry = mpn_lshift (ptr, ptr, size, shift); \
    (ptr)[size] = carry;                        \
    (size) += (carry != 0);                     \
  } while (0)
123:
/* {dst,size} = {src,size} >> shift, or a plain copy when shift==0
   (mpn_rshift requires shift>=1).  A non-zero shift can clear the high
   limb, in which case size is reduced by one.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
134:
135:
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

/* Exchange the roles of the result and temporary areas (pointer swap
   only; the matching alloc sizes are swapped just for ASSERT builds).  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
146:
147:
/* Set r = {bp,bsize}^e.
   - bsize may be negative to indicate a negative base; r then gets the
     sign of b^e (negative exactly when bsize<0 and e is odd).
   - e==0 gives r=1, including 0^0==1; b==0 with e>=1 gives r=0.
   - {bp,bsize} must be normalized (non-zero high limb), and PTR(r) must
     either equal bp or be completely separate from it (see the ASSERTs).
   Powers of 2 in b are stripped up front and re-applied at the end as a
   block of low zero limbs plus one partial-limb lshift, so the powering
   loops only ever see an odd (shifted) base.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  unsigned long rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL (marker);

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Remember whether r shares storage with b, before bp is advanced past
     low zero limbs below.  Used to decide whether b must be copied aside
     in the bsize>2 case.  */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each zero limb of b contributes e zero
     limbs to the result.  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  The e*btwos result bits of 2s are split
     into whole limbs (added to rtwos_limbs) and a sub-limb lshift count
     (rtwos_bits, applied at the very end or merged into rl).  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK (marker);

  /* rl (and rl_high when mul_2 is available) accumulates the low-order
     part of the power built up while b still fits in a limb.  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          /* Right-to-left binary exponentiation within one limb: multiply
             rl by the current b^(2^k) when the low bit of e is set, then
             square blimb for the next bit.  */
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          /* The shifts move data past any nail bits for umul_ppmm.  */
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      /* Fold the btwos rshift of the two-limb base into blimb:bsecond.  */
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      /* After this blimb is the high limb of the base, as expected by the
         ralloc size calculation below.  */
      blimb = bsecond;
    }
  else
    {
      /* bsize > 2: a temporary copy of b is needed if it overlaps r (which
         gets realloc'ed below) or if the btwos rshift must be applied.  */
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start.  Here the whole power is already in rl (and rl_high).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which not needed by tp.  In the mpn_mul loop when the low bit of e
         is 1, tp must hold nearly the full result, so just size it the same
         as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space.
             Each loop iteration swaps rp and tp once (in MPN_SQR_N's
             SWAP_RP_TP), so the parity of the remaining iteration count
             determines where the result lands.  */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          /* Left-to-right binary exponentiation: square each step,
             multiply in the two-limb base when the bit of e is 1.  */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          /* Multiply in the leftover low-order factor rl_high:rl.  */
          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          /* Left-to-right binary exponentiation with a single-limb base. */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space.
             Here both the square and the conditional mpn_mul swap rp/tp,
             so the total number of swaps is i+1 plus the popcount of the
             low bits of e, hence the parity^i test.  */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          /* General left-to-right binary exponentiation using mpn_mul;
             mpn_mul can't work in-place so the multiply goes to tp and
             the pointers are swapped back.  */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR_N (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE (marker);

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>