Annotation of OpenXM_contrib/gmp/mpn/generic/gcdext.c, Revision 1.1.1.3
1.1 maekawa 1: /* mpn_gcdext -- Extended Greatest Common Divisor.
2:
1.1.1.3 ! ohara 3: Copyright 1996, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4:
5: This file is part of the GNU MP Library.
6:
7: The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2 maekawa 8: it under the terms of the GNU Lesser General Public License as published by
9: the Free Software Foundation; either version 2.1 of the License, or (at your
1.1 maekawa 10: option) any later version.
11:
12: The GNU MP Library is distributed in the hope that it will be useful, but
13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2 maekawa 14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1.1 maekawa 15: License for more details.
16:
1.1.1.2 maekawa 17: You should have received a copy of the GNU Lesser General Public License
1.1 maekawa 18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: MA 02111-1307, USA. */
21:
22: #include "gmp.h"
23: #include "gmp-impl.h"
24: #include "longlong.h"
25:
1.1.1.2 maekawa 26: #ifndef GCDEXT_THRESHOLD
27: #define GCDEXT_THRESHOLD 17
28: #endif
29:
1.1 maekawa 30: #ifndef EXTEND
31: #define EXTEND 1
32: #endif
33:
34: #if STAT
1.1.1.3 ! ohara 35: int arr[GMP_LIMB_BITS + 1];
1.1 maekawa 36: #endif
37:
38:
1.1.1.2 maekawa 39: /* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)
40:
41: Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
42: greatest common divisor at GP (unless it is 0), and the first cofactor at
43: SP. Write the size of the cofactor through the pointer SSIZE. Return the
44: size of the value at GP. Note that SP might be a negative number; this is
45: denoted by storing the negative of the size through SSIZE.
46:
47: {UP,USIZE} and {VP,VSIZE} are both clobbered.
48:
49: The space allocation for all four areas needs to be USIZE+1.
50:
51: Preconditions: 1) U >= V.
52: 2) V > 0. */
53:
54: /* We use Lehmer's algorithm. The idea is to extract the most significant
55: bits of the operands, and compute the continued fraction for them. We then
56: apply the gathered cofactors to the full operands.
57:
58: Idea 1: After we have performed a full division, don't shift operands back,
1.1 maekawa 59: but instead account for the extra factors-of-2 thus introduced.
60: Idea 2: Simple generalization to use divide-and-conquer would give us an
61: algorithm that runs faster than O(n^2).
62: Idea 3: The input numbers need less space as the computation progresses,
1.1.1.2 maekawa 63: while the s0 and s1 variables need more space. To save memory, we
1.1 maekawa 64: could make them share space, and have the latter variables grow
1.1.1.2 maekawa 65: into the former.
66: Idea 4: We should not do double-limb arithmetic from the start. Instead,
67: do things in single-limb arithmetic until the quotients differ,
68: and then switch to double-limb arithmetic. */
69:
1.1 maekawa 70:
1.1.1.3 ! ohara 71: /* One-limb division optimized for small quotients. */
1.1.1.2 maekawa 72: static mp_limb_t
1.1.1.3 ! ohara 73: div1 (mp_limb_t n0, mp_limb_t d0)
1.1.1.2 maekawa 74: {
1.1.1.3 ! ohara 75: if ((mp_limb_signed_t) n0 < 0)
1.1.1.2 maekawa 76: {
1.1.1.3 ! ohara 77: mp_limb_t q;
! 78: int cnt;
! 79: for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
! 80: {
! 81: d0 = d0 << 1;
! 82: }
! 83:
! 84: q = 0;
! 85: while (cnt)
! 86: {
! 87: q <<= 1;
! 88: if (n0 >= d0)
! 89: {
! 90: n0 = n0 - d0;
! 91: q |= 1;
! 92: }
! 93: d0 = d0 >> 1;
! 94: cnt--;
! 95: }
! 96:
! 97: return q;
! 98: }
! 99: else
! 100: {
! 101: mp_limb_t q;
! 102: int cnt;
! 103: for (cnt = 0; n0 >= d0; cnt++)
! 104: {
! 105: d0 = d0 << 1;
! 106: }
! 107:
! 108: q = 0;
! 109: while (cnt)
! 110: {
! 111: d0 = d0 >> 1;
! 112: q <<= 1;
! 113: if (n0 >= d0)
! 114: {
! 115: n0 = n0 - d0;
! 116: q |= 1;
! 117: }
! 118: cnt--;
! 119: }
! 120:
! 121: return q;
1.1.1.2 maekawa 122: }
1.1.1.3 ! ohara 123: }
1.1.1.2 maekawa 124:
1.1.1.3 ! ohara 125: /* Two-limb division optimized for small quotients. */
! 126: static mp_limb_t
! 127: div2 (mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
! 128: {
1.1.1.2 maekawa 129: if ((mp_limb_signed_t) n1 < 0)
130: {
131: mp_limb_t q;
132: int cnt;
133: for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
134: {
1.1.1.3 ! ohara 135: d1 = (d1 << 1) | (d0 >> (GMP_LIMB_BITS - 1));
1.1.1.2 maekawa 136: d0 = d0 << 1;
137: }
138:
139: q = 0;
140: while (cnt)
141: {
142: q <<= 1;
143: if (n1 > d1 || (n1 == d1 && n0 >= d0))
144: {
145: sub_ddmmss (n1, n0, n1, n0, d1, d0);
146: q |= 1;
147: }
1.1.1.3 ! ohara 148: d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
1.1.1.2 maekawa 149: d1 = d1 >> 1;
150: cnt--;
151: }
152:
153: return q;
154: }
155: else
156: {
157: mp_limb_t q;
158: int cnt;
159: for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
160: {
1.1.1.3 ! ohara 161: d1 = (d1 << 1) | (d0 >> (GMP_LIMB_BITS - 1));
1.1.1.2 maekawa 162: d0 = d0 << 1;
163: }
164:
165: q = 0;
166: while (cnt)
167: {
1.1.1.3 ! ohara 168: d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
1.1.1.2 maekawa 169: d1 = d1 >> 1;
170: q <<= 1;
171: if (n1 > d1 || (n1 == d1 && n0 >= d0))
172: {
173: sub_ddmmss (n1, n0, n1, n0, d1, d0);
174: q |= 1;
175: }
176: cnt--;
177: }
178:
179: return q;
180: }
181: }
1.1 maekawa 182:
183: mp_size_t
184: #if EXTEND
1.1.1.2 maekawa 185: mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
1.1 maekawa 186: mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
187: #else
188: mpn_gcd (mp_ptr gp,
189: mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
190: #endif
191: {
1.1.1.2 maekawa 192: mp_limb_t A, B, C, D;
1.1 maekawa 193: int cnt;
194: mp_ptr tp, wp;
195: #if RECORD
1.1.1.2 maekawa 196: mp_limb_t max = 0;
1.1 maekawa 197: #endif
198: #if EXTEND
199: mp_ptr s1p;
200: mp_ptr orig_s0p = s0p;
1.1.1.2 maekawa 201: mp_size_t ssize;
202: int sign = 1;
203: #endif
204: int use_double_flag;
1.1 maekawa 205: TMP_DECL (mark);
206:
1.1.1.3 ! ohara 207: ASSERT (size >= vsize);
! 208: ASSERT (vsize >= 1);
! 209: ASSERT (up[size-1] != 0);
! 210: ASSERT (vp[vsize-1] != 0);
! 211: ASSERT (! MPN_OVERLAP_P (up, size+1, vp, vsize+1));
! 212: #if EXTEND
! 213: ASSERT (! MPN_OVERLAP_P (s0p, size, up, size+1));
! 214: ASSERT (! MPN_OVERLAP_P (s0p, size, vp, vsize+1));
! 215: #endif
! 216: ASSERT (MPN_SAME_OR_SEPARATE_P (gp, up, size));
! 217: ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, size, vp, vsize));
1.1 maekawa 218:
1.1.1.3 ! ohara 219: TMP_MARK (mark);
1.1.1.2 maekawa 220:
1.1 maekawa 221: tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
222: wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
1.1.1.2 maekawa 223: #if EXTEND
224: s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
1.1 maekawa 225:
1.1.1.3 ! ohara 226: #if ! WANT_GCDEXT_ONE_STEP
1.1 maekawa 227: MPN_ZERO (s0p, size);
228: MPN_ZERO (s1p, size);
1.1.1.3 ! ohara 229: #endif
1.1 maekawa 230:
231: s0p[0] = 1;
232: s1p[0] = 0;
233: ssize = 1;
234: #endif
235:
236: if (size > vsize)
237: {
1.1.1.3 ! ohara 238: mpn_tdiv_qr (tp, up, (mp_size_t) 0, up, size, vp, vsize);
1.1 maekawa 239:
240: #if EXTEND
241: /* This is really what it boils down to in this case... */
242: s0p[0] = 0;
243: s1p[0] = 1;
1.1.1.2 maekawa 244: sign = -sign;
1.1 maekawa 245: #endif
246: size = vsize;
1.1.1.2 maekawa 247: MP_PTR_SWAP (up, vp);
1.1 maekawa 248: }
249:
1.1.1.3 ! ohara 250: use_double_flag = ABOVE_THRESHOLD (size, GCDEXT_THRESHOLD);
! 251:
1.1 maekawa 252: for (;;)
253: {
1.1.1.2 maekawa 254: mp_limb_t asign;
1.1 maekawa 255: /* Figure out exact size of V. */
256: vsize = size;
257: MPN_NORMALIZE (vp, vsize);
258: if (vsize <= 1)
259: break;
260:
1.1.1.2 maekawa 261: if (use_double_flag)
1.1 maekawa 262: {
1.1.1.2 maekawa 263: mp_limb_t uh, vh, ul, vl;
264: /* Let UH,UL be the most significant limbs of U, and let VH,VL be
265: the corresponding bits from V. */
266: uh = up[size - 1];
267: vh = vp[size - 1];
268: ul = up[size - 2];
269: vl = vp[size - 2];
270: count_leading_zeros (cnt, uh);
1.1.1.3 ! ohara 271: #if GMP_NAIL_BITS == 0
1.1.1.2 maekawa 272: if (cnt != 0)
273: {
1.1.1.3 ! ohara 274: uh = (uh << cnt) | (ul >> (GMP_LIMB_BITS - cnt));
! 275: vh = (vh << cnt) | (vl >> (GMP_LIMB_BITS - cnt));
1.1.1.2 maekawa 276: vl <<= cnt;
277: ul <<= cnt;
278: if (size >= 3)
279: {
1.1.1.3 ! ohara 280: ul |= (up[size - 3] >> (GMP_LIMB_BITS - cnt));
! 281: vl |= (vp[size - 3] >> (GMP_LIMB_BITS - cnt));
1.1.1.2 maekawa 282: }
283: }
1.1.1.3 ! ohara 284: #else
! 285: uh = uh << cnt;
! 286: vh = vh << cnt;
! 287: if (cnt < GMP_NUMB_BITS)
! 288: { /* GMP_NAIL_BITS <= cnt < GMP_NUMB_BITS */
! 289: uh |= ul >> (GMP_NUMB_BITS - cnt);
! 290: vh |= vl >> (GMP_NUMB_BITS - cnt);
! 291: ul <<= cnt + GMP_NAIL_BITS;
! 292: vl <<= cnt + GMP_NAIL_BITS;
! 293: if (size >= 3)
! 294: {
! 295: if (cnt + GMP_NAIL_BITS > GMP_NUMB_BITS)
! 296: {
! 297: ul |= up[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS;
! 298: vl |= vp[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS;
! 299: if (size >= 4)
! 300: {
! 301: ul |= up[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt;
! 302: vl |= vp[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt;
! 303: }
! 304: }
! 305: else
! 306: {
! 307: ul |= up[size - 3] >> (GMP_LIMB_BITS - cnt - 2 * GMP_NAIL_BITS);
! 308: vl |= vp[size - 3] >> (GMP_LIMB_BITS - cnt - 2 * GMP_NAIL_BITS);
! 309: }
! 310: }
! 311: }
! 312: else
! 313: { /* GMP_NUMB_BITS <= cnt <= GMP_LIMB_BITS-1 */
! 314: uh |= ul << cnt - GMP_NUMB_BITS; /* 0 <= c <= GMP_NAIL_BITS-1 */
! 315: vh |= vl << cnt - GMP_NUMB_BITS;
! 316: if (size >= 3)
! 317: {
! 318: if (cnt - GMP_NUMB_BITS != 0)
! 319: { /* uh/vh need yet more bits! */
! 320: uh |= up[size - 3] >> 2 * GMP_NUMB_BITS - cnt;
! 321: vh |= vp[size - 3] >> 2 * GMP_NUMB_BITS - cnt;
! 322: ul = up[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS;
! 323: vl = vp[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS;
! 324: if (size >= 4)
! 325: {
! 326: ul |= up[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt;
! 327: vl |= vp[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt;
! 328: }
! 329: }
! 330: else
! 331: {
! 332: ul = up[size - 3] << GMP_LIMB_BITS - cnt;
! 333: vl = vp[size - 3] << GMP_LIMB_BITS - cnt;
! 334: if (size >= 4)
! 335: {
! 336: ul |= up[size - 4] >> GMP_NUMB_BITS - (GMP_LIMB_BITS - cnt);
! 337: vl |= vp[size - 4] >> GMP_NUMB_BITS - (GMP_LIMB_BITS - cnt);
! 338: }
! 339: }
! 340: }
! 341: else
! 342: {
! 343: ul = 0;
! 344: vl = 0;
! 345: }
! 346: }
! 347: #endif
1.1.1.2 maekawa 348:
349: A = 1;
350: B = 0;
351: C = 0;
352: D = 1;
353:
354: asign = 0;
355: for (;;)
356: {
1.1.1.3 ! ohara 357: mp_limb_t Tac, Tbd;
! 358: mp_limb_t q1, q2;
1.1.1.2 maekawa 359: mp_limb_t nh, nl, dh, dl;
360: mp_limb_t t1, t0;
361: mp_limb_t Th, Tl;
362:
363: sub_ddmmss (dh, dl, vh, vl, 0, C);
1.1.1.3 ! ohara 364: if (dh == 0)
1.1.1.2 maekawa 365: break;
366: add_ssaaaa (nh, nl, uh, ul, 0, A);
1.1.1.3 ! ohara 367: q1 = div2 (nh, nl, dh, dl);
1.1.1.2 maekawa 368:
369: add_ssaaaa (dh, dl, vh, vl, 0, D);
1.1.1.3 ! ohara 370: if (dh == 0)
1.1.1.2 maekawa 371: break;
372: sub_ddmmss (nh, nl, uh, ul, 0, B);
1.1.1.3 ! ohara 373: q2 = div2 (nh, nl, dh, dl);
1.1.1.2 maekawa 374:
375: if (q1 != q2)
376: break;
377:
1.1.1.3 ! ohara 378: Tac = A + q1 * C;
! 379: if (GMP_NAIL_BITS != 0 && Tac > GMP_NUMB_MAX)
! 380: break;
! 381: Tbd = B + q1 * D;
! 382: if (GMP_NAIL_BITS != 0 && Tbd > GMP_NUMB_MAX)
! 383: break;
1.1.1.2 maekawa 384: A = C;
1.1.1.3 ! ohara 385: C = Tac;
1.1.1.2 maekawa 386: B = D;
1.1.1.3 ! ohara 387: D = Tbd;
1.1.1.2 maekawa 388: umul_ppmm (t1, t0, q1, vl);
389: t1 += q1 * vh;
390: sub_ddmmss (Th, Tl, uh, ul, t1, t0);
391: uh = vh, ul = vl;
392: vh = Th, vl = Tl;
393:
1.1.1.3 ! ohara 394: asign = ~asign;
! 395:
1.1.1.2 maekawa 396: add_ssaaaa (dh, dl, vh, vl, 0, C);
1.1.1.3 ! ohara 397: /* if (dh == 0) should never happen
! 398: break; */
1.1.1.2 maekawa 399: sub_ddmmss (nh, nl, uh, ul, 0, A);
1.1.1.3 ! ohara 400: q1 = div2 (nh, nl, dh, dl);
1.1.1.2 maekawa 401:
402: sub_ddmmss (dh, dl, vh, vl, 0, D);
1.1.1.3 ! ohara 403: if (dh == 0)
1.1.1.2 maekawa 404: break;
405: add_ssaaaa (nh, nl, uh, ul, 0, B);
1.1.1.3 ! ohara 406: q2 = div2 (nh, nl, dh, dl);
1.1.1.2 maekawa 407:
408: if (q1 != q2)
409: break;
410:
1.1.1.3 ! ohara 411: Tac = A + q1 * C;
! 412: if (GMP_NAIL_BITS != 0 && Tac > GMP_NUMB_MAX)
! 413: break;
! 414: Tbd = B + q1 * D;
! 415: if (GMP_NAIL_BITS != 0 && Tbd > GMP_NUMB_MAX)
! 416: break;
1.1.1.2 maekawa 417: A = C;
1.1.1.3 ! ohara 418: C = Tac;
1.1.1.2 maekawa 419: B = D;
1.1.1.3 ! ohara 420: D = Tbd;
1.1.1.2 maekawa 421: umul_ppmm (t1, t0, q1, vl);
422: t1 += q1 * vh;
423: sub_ddmmss (Th, Tl, uh, ul, t1, t0);
424: uh = vh, ul = vl;
425: vh = Th, vl = Tl;
1.1.1.3 ! ohara 426:
! 427: asign = ~asign;
1.1.1.2 maekawa 428: }
429: #if EXTEND
430: if (asign)
431: sign = -sign;
432: #endif
1.1 maekawa 433: }
1.1.1.2 maekawa 434: else /* Same, but using single-limb calculations. */
435: {
436: mp_limb_t uh, vh;
437: /* Make UH be the most significant limb of U, and make VH be
438: corresponding bits from V. */
439: uh = up[size - 1];
440: vh = vp[size - 1];
441: count_leading_zeros (cnt, uh);
1.1.1.3 ! ohara 442: #if GMP_NAIL_BITS == 0
1.1.1.2 maekawa 443: if (cnt != 0)
444: {
1.1.1.3 ! ohara 445: uh = (uh << cnt) | (up[size - 2] >> (GMP_LIMB_BITS - cnt));
! 446: vh = (vh << cnt) | (vp[size - 2] >> (GMP_LIMB_BITS - cnt));
1.1.1.2 maekawa 447: }
1.1.1.3 ! ohara 448: #else
! 449: uh <<= cnt;
! 450: vh <<= cnt;
! 451: if (cnt < GMP_NUMB_BITS)
! 452: {
! 453: uh |= up[size - 2] >> (GMP_NUMB_BITS - cnt);
! 454: vh |= vp[size - 2] >> (GMP_NUMB_BITS - cnt);
! 455: }
! 456: else
! 457: {
! 458: uh |= up[size - 2] << cnt - GMP_NUMB_BITS;
! 459: vh |= vp[size - 2] << cnt - GMP_NUMB_BITS;
! 460: if (size >= 3)
! 461: {
! 462: uh |= up[size - 3] >> 2 * GMP_NUMB_BITS - cnt;
! 463: vh |= vp[size - 3] >> 2 * GMP_NUMB_BITS - cnt;
! 464: }
! 465: }
! 466: #endif
1.1 maekawa 467:
1.1.1.2 maekawa 468: A = 1;
469: B = 0;
470: C = 0;
471: D = 1;
472:
473: asign = 0;
474: for (;;)
475: {
476: mp_limb_t q, T;
477: if (vh - C == 0 || vh + D == 0)
478: break;
479:
480: q = (uh + A) / (vh - C);
481: if (q != (uh - B) / (vh + D))
482: break;
483:
484: T = A + q * C;
485: A = C;
486: C = T;
487: T = B + q * D;
488: B = D;
489: D = T;
490: T = uh - q * vh;
491: uh = vh;
492: vh = T;
493:
1.1.1.3 ! ohara 494: asign = ~asign;
! 495:
1.1.1.2 maekawa 496: if (vh - D == 0)
497: break;
498:
499: q = (uh - A) / (vh + C);
500: if (q != (uh + B) / (vh - D))
501: break;
502:
503: T = A + q * C;
504: A = C;
505: C = T;
506: T = B + q * D;
507: B = D;
508: D = T;
509: T = uh - q * vh;
510: uh = vh;
511: vh = T;
1.1.1.3 ! ohara 512:
! 513: asign = ~asign;
1.1.1.2 maekawa 514: }
515: #if EXTEND
516: if (asign)
517: sign = -sign;
518: #endif
1.1 maekawa 519: }
520:
521: #if RECORD
522: max = MAX (A, max); max = MAX (B, max);
523: max = MAX (C, max); max = MAX (D, max);
524: #endif
525:
526: if (B == 0)
527: {
528: /* This is quite rare. I.e., optimize something else! */
529:
1.1.1.3 ! ohara 530: mpn_tdiv_qr (wp, up, (mp_size_t) 0, up, size, vp, vsize);
1.1 maekawa 531:
532: #if EXTEND
533: MPN_COPY (tp, s0p, ssize);
534: {
1.1.1.2 maekawa 535: mp_size_t qsize;
1.1.1.3 ! ohara 536: mp_size_t i;
1.1.1.2 maekawa 537:
1.1.1.3 ! ohara 538: qsize = size - vsize + 1; /* size of stored quotient from division */
! 539: MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
! 540:
! 541: for (i = 0; i < qsize; i++)
1.1.1.2 maekawa 542: {
1.1.1.3 ! ohara 543: mp_limb_t cy;
! 544: cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
! 545: tp[ssize + i] = cy;
1.1.1.2 maekawa 546: }
1.1.1.3 ! ohara 547:
1.1.1.2 maekawa 548: ssize += qsize;
549: ssize -= tp[ssize - 1] == 0;
1.1 maekawa 550: }
1.1.1.2 maekawa 551:
552: sign = -sign;
553: MP_PTR_SWAP (s0p, s1p);
554: MP_PTR_SWAP (s1p, tp);
1.1 maekawa 555: #endif
556: size = vsize;
1.1.1.2 maekawa 557: MP_PTR_SWAP (up, vp);
1.1 maekawa 558: }
559: else
560: {
1.1.1.2 maekawa 561: #if EXTEND
562: mp_size_t tsize, wsize;
563: #endif
1.1 maekawa 564: /* T = U*A + V*B
565: W = U*C + V*D
566: U = T
567: V = W */
568:
569: #if STAT
1.1.1.2 maekawa 570: { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
1.1.1.3 ! ohara 571: arr[GMP_LIMB_BITS - cnt]++; }
1.1 maekawa 572: #endif
573: if (A == 0)
574: {
1.1.1.2 maekawa 575: /* B == 1 and C == 1 (D is arbitrary) */
576: mp_limb_t cy;
1.1 maekawa 577: MPN_COPY (tp, vp, size);
1.1.1.2 maekawa 578: MPN_COPY (wp, up, size);
579: mpn_submul_1 (wp, vp, size, D);
580: MP_PTR_SWAP (tp, up);
581: MP_PTR_SWAP (wp, vp);
1.1 maekawa 582: #if EXTEND
583: MPN_COPY (tp, s1p, ssize);
1.1.1.2 maekawa 584: tsize = ssize;
585: tp[ssize] = 0; /* must zero since wp might spill below */
586: MPN_COPY (wp, s0p, ssize);
587: cy = mpn_addmul_1 (wp, s1p, ssize, D);
588: wp[ssize] = cy;
589: wsize = ssize + (cy != 0);
590: MP_PTR_SWAP (tp, s0p);
591: MP_PTR_SWAP (wp, s1p);
592: ssize = MAX (wsize, tsize);
593: #endif
1.1 maekawa 594: }
595: else
596: {
1.1.1.3 ! ohara 597: mp_limb_t cy, cy1, cy2;
! 598:
1.1.1.2 maekawa 599: if (asign)
1.1 maekawa 600: {
1.1.1.2 maekawa 601: mpn_mul_1 (tp, vp, size, B);
602: mpn_submul_1 (tp, up, size, A);
603: mpn_mul_1 (wp, up, size, C);
604: mpn_submul_1 (wp, vp, size, D);
1.1 maekawa 605: }
606: else
607: {
1.1.1.2 maekawa 608: mpn_mul_1 (tp, up, size, A);
609: mpn_submul_1 (tp, vp, size, B);
610: mpn_mul_1 (wp, vp, size, D);
611: mpn_submul_1 (wp, up, size, C);
1.1.1.3 ! ohara 612: }
! 613: MP_PTR_SWAP (tp, up);
! 614: MP_PTR_SWAP (wp, vp);
1.1.1.2 maekawa 615: #if EXTEND
1.1.1.3 ! ohara 616: /* Compute new s0 */
! 617: cy1 = mpn_mul_1 (tp, s0p, ssize, A);
! 618: cy2 = mpn_addmul_1 (tp, s1p, ssize, B);
! 619: cy = cy1 + cy2;
! 620: tp[ssize] = cy & GMP_NUMB_MASK;
! 621: tsize = ssize + (cy != 0);
! 622: #if GMP_NAIL_BITS == 0
! 623: if (cy < cy1)
! 624: #else
! 625: if (cy > GMP_NUMB_MAX)
1.1.1.2 maekawa 626: #endif
1.1.1.3 ! ohara 627: {
! 628: tp[tsize] = 1;
! 629: wp[tsize] = 0;
! 630: tsize++;
! 631: /* This happens just for nails, since we get more work done
! 632: per numb there. */
1.1 maekawa 633: }
1.1.1.2 maekawa 634:
1.1.1.3 ! ohara 635: /* Compute new s1 */
! 636: cy1 = mpn_mul_1 (wp, s1p, ssize, D);
! 637: cy2 = mpn_addmul_1 (wp, s0p, ssize, C);
! 638: cy = cy1 + cy2;
! 639: wp[ssize] = cy & GMP_NUMB_MASK;
! 640: wsize = ssize + (cy != 0);
! 641: #if GMP_NAIL_BITS == 0
! 642: if (cy < cy1)
! 643: #else
! 644: if (cy > GMP_NUMB_MAX)
! 645: #endif
! 646: {
! 647: wp[wsize] = 1;
! 648: if (wsize >= tsize)
! 649: tp[wsize] = 0;
! 650: wsize++;
! 651: }
! 652:
! 653: MP_PTR_SWAP (tp, s0p);
! 654: MP_PTR_SWAP (wp, s1p);
! 655: ssize = MAX (wsize, tsize);
! 656: #endif
! 657: }
! 658: size -= up[size - 1] == 0;
! 659: #if GMP_NAIL_BITS != 0
1.1.1.2 maekawa 660: size -= up[size - 1] == 0;
1.1.1.3 ! ohara 661: #endif
1.1 maekawa 662: }
1.1.1.3 ! ohara 663:
! 664: #if WANT_GCDEXT_ONE_STEP
! 665: TMP_FREE (mark);
! 666: return 0;
! 667: #endif
1.1 maekawa 668: }
669:
670: #if RECORD
1.1.1.2 maekawa 671: printf ("max: %lx\n", max);
672: #endif
673:
674: #if STAT
1.1.1.3 ! ohara 675: {int i; for (i = 0; i <= GMP_LIMB_BITS; i++) printf ("%d:%d\n", i, arr[i]);}
1.1 maekawa 676: #endif
677:
678: if (vsize == 0)
679: {
1.1.1.2 maekawa 680: if (gp != up && gp != 0)
1.1 maekawa 681: MPN_COPY (gp, up, size);
682: #if EXTEND
1.1.1.2 maekawa 683: MPN_NORMALIZE (s0p, ssize);
1.1 maekawa 684: if (orig_s0p != s0p)
685: MPN_COPY (orig_s0p, s0p, ssize);
1.1.1.2 maekawa 686: *s0size = sign >= 0 ? ssize : -ssize;
1.1 maekawa 687: #endif
688: TMP_FREE (mark);
689: return size;
690: }
691: else
692: {
693: mp_limb_t vl, ul, t;
694: #if EXTEND
1.1.1.2 maekawa 695: mp_size_t qsize, i;
1.1 maekawa 696: #endif
697: vl = vp[0];
698: #if EXTEND
699: t = mpn_divmod_1 (wp, up, size, vl);
1.1.1.2 maekawa 700:
1.1 maekawa 701: MPN_COPY (tp, s0p, ssize);
1.1.1.2 maekawa 702:
703: qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
704: if (ssize < qsize)
1.1 maekawa 705: {
1.1.1.2 maekawa 706: MPN_ZERO (tp + ssize, qsize - ssize);
707: MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
708: for (i = 0; i < ssize; i++)
709: {
710: mp_limb_t cy;
711: cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
712: tp[qsize + i] = cy;
713: }
714: }
715: else
716: {
717: MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
718: for (i = 0; i < qsize; i++)
719: {
720: mp_limb_t cy;
721: cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
722: tp[ssize + i] = cy;
723: }
724: }
725: ssize += qsize;
726: ssize -= tp[ssize - 1] == 0;
727:
728: sign = -sign;
729: MP_PTR_SWAP (s0p, s1p);
730: MP_PTR_SWAP (s1p, tp);
1.1 maekawa 731: #else
732: t = mpn_mod_1 (up, size, vl);
733: #endif
734: ul = vl;
735: vl = t;
736: while (vl != 0)
737: {
738: mp_limb_t t;
739: #if EXTEND
1.1.1.2 maekawa 740: mp_limb_t q;
1.1 maekawa 741: q = ul / vl;
1.1.1.2 maekawa 742: t = ul - q * vl;
1.1 maekawa 743:
744: MPN_COPY (tp, s0p, ssize);
1.1.1.2 maekawa 745:
746: MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
747:
1.1 maekawa 748: {
1.1.1.2 maekawa 749: mp_limb_t cy;
750: cy = mpn_addmul_1 (tp, s1p, ssize, q);
751: tp[ssize] = cy;
1.1 maekawa 752: }
753:
1.1.1.2 maekawa 754: ssize += 1;
755: ssize -= tp[ssize - 1] == 0;
756:
757: sign = -sign;
758: MP_PTR_SWAP (s0p, s1p);
759: MP_PTR_SWAP (s1p, tp);
1.1 maekawa 760: #else
761: t = ul % vl;
762: #endif
763: ul = vl;
764: vl = t;
765: }
1.1.1.2 maekawa 766: if (gp != 0)
767: gp[0] = ul;
1.1 maekawa 768: #if EXTEND
1.1.1.2 maekawa 769: MPN_NORMALIZE (s0p, ssize);
1.1 maekawa 770: if (orig_s0p != s0p)
771: MPN_COPY (orig_s0p, s0p, ssize);
1.1.1.2 maekawa 772: *s0size = sign >= 0 ? ssize : -ssize;
1.1 maekawa 773: #endif
774: TMP_FREE (mark);
775: return 1;
776: }
777: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>