===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/generic/Attic/gcd.c,v
retrieving revision 1.1
retrieving revision 1.1.1.3
diff -u -p -r1.1 -r1.1.1.3
--- OpenXM_contrib/gmp/mpn/generic/Attic/gcd.c	2000/01/10 15:35:23	1.1
+++ OpenXM_contrib/gmp/mpn/generic/Attic/gcd.c	2003/08/25 16:06:20	1.1.1.3
@@ -1,20 +1,21 @@
 /* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
 
-Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002 Free
+Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
-it under the terms of the GNU Library General Public License as published by
-the Free Software Foundation; either version 2 of the License, or (at your
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
 option) any later version.
 
 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 License for more details.
 
-You should have received a copy of the GNU Library General Public License
+You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA. */
@@ -22,7 +23,7 @@ MA 02111-1307, USA. */
 /* Integer greatest common divisor of two unsigned integers, using
    the accelerated algorithm (see reference below).
 
-   mp_size_t mpn_gcd (vp, vsize, up, usize).
+   mp_size_t mpn_gcd (up, usize, vp, vsize).
 
    Preconditions [U = (up, usize) and V = (vp, vsize)]:
 
@@ -47,60 +48,52 @@ MA 02111-1307, USA. */
 #include "gmp-impl.h"
 #include "longlong.h"
 
-/* If MIN (usize, vsize) > ACCEL_THRESHOLD, then the accelerated algorithm is
-   used, otherwise the binary algorithm is used.  This may be adjusted for
-   different architectures.  */
-#ifndef ACCEL_THRESHOLD
-#define ACCEL_THRESHOLD 4
+/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
+   algorithm is used, otherwise the binary algorithm is used.  This may be
+   adjusted for different architectures.  */
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
 #endif
 
 /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
    algorithm reduces using the bmod operation.  Otherwise, the k-ary reduction
-   is used.  0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB.  */
+   is used.  0 <= BMOD_THRESHOLD < GMP_NUMB_BITS.  */
 enum
   {
-    BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
+    BMOD_THRESHOLD = GMP_NUMB_BITS/2
   };
 
-#define SIGN_BIT  (~(~(mp_limb_t)0 >> 1))
 
-
-#define SWAP_LIMB(UL, VL) do{mp_limb_t __l=(UL);(UL)=(VL);(VL)=__l;}while(0)
-#define SWAP_PTR(UP, VP) do{mp_ptr __p=(UP);(UP)=(VP);(VP)=__p;}while(0)
-#define SWAP_SZ(US, VS) do{mp_size_t __s=(US);(US)=(VS);(VS)=__s;}while(0)
-#define SWAP_MPN(UP, US, VP, VS) do{SWAP_PTR(UP,VP);SWAP_SZ(US,VS);}while(0)
-
 /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
    Both U and V must be odd.  */
-static __gmp_inline mp_size_t
-#if __STDC__
+static inline mp_size_t
 gcd_2 (mp_ptr vp, mp_srcptr up)
-#else
-gcd_2 (vp, up)
-     mp_ptr vp;
-     mp_srcptr up;
-#endif
 {
   mp_limb_t u0, u1, v0, v1;
   mp_size_t vsize;
 
-  u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
+  u0 = up[0];
+  u1 = up[1];
+  v0 = vp[0];
+  v1 = vp[1];
 
   while (u1 != v1 && u0 != v0)
     {
       unsigned long int r;
       if (u1 > v1)
 	{
-	  u1 -= v1 + (u0 < v0), u0 -= v0;
+	  u1 -= v1 + (u0 < v0);
+	  u0 = (u0 - v0) & GMP_NUMB_MASK;
 	  count_trailing_zeros (r, u0);
-	  u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
+	  u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
 	  u1 >>= r;
 	}
       else  /* u1 < v1.  */
 	{
-	  v1 -= u1 + (v0 < u0), v0 -= u0;
+	  v1 -= u1 + (v0 < u0);
+	  v0 = (v0 - u0) & GMP_NUMB_MASK;
 	  count_trailing_zeros (r, v0);
-	  v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
+	  v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
 	  v1 >>= r;
 	}
     }
@@ -117,85 +110,110 @@ gcd_2 (vp, up)
   return 1;
 }
 
-/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
-   0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
+/* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists
+   0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS).
    In the reference article, D was computed along with N, but it is better to
-   compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
+   compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating
    the result as a twos' complement signed integer.
 
-   Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB).  According to the reference
-   article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
-   2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
+   Initialize N1 to C mod 2^(2*GMP_NUMB_BITS).  According to the reference
+   article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use
+   2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double
    precision.  If N2 > N1 initially, the first iteration of the while loop
    will swap them.  In all other situations, N1 >= N2 is maintained.  */
 
-static __gmp_inline mp_limb_t
-#if __STDC__
-find_a (mp_srcptr cp)
+#if HAVE_NATIVE_mpn_gcd_finda
+#define find_a(cp)  mpn_gcd_finda (cp)
+
 #else
-find_a (cp)
-     mp_srcptr cp;
+static
+#if ! defined (__i386__)
+inline				/* don't inline this for the x86 */
 #endif
+mp_limb_t
+find_a (mp_srcptr cp)
 {
   unsigned long int leading_zero_bits = 0;
 
-  mp_limb_t n1_l = cp[0];	/* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l.  */
+  mp_limb_t n1_l = cp[0];	/* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l.  */
   mp_limb_t n1_h = cp[1];
 
-  mp_limb_t n2_l = -n1_l;	/* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l.  */
-  mp_limb_t n2_h = ~n1_h;
+  mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK);	/* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l.  */
+  mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK);
 
   /* Main loop.  */
-  while (n2_h)			/* While N2 >= 2^BITS_PER_MP_LIMB.  */
+  while (n2_h != 0)		/* While N2 >= 2^GMP_NUMB_BITS.  */
     {
       /* N1 <-- N1 % N2.  */
-      if ((SIGN_BIT >> leading_zero_bits & n2_h) == 0)
+      if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0)
 	{
 	  unsigned long int i;
 	  count_leading_zeros (i, n2_h);
-	  i -= leading_zero_bits, leading_zero_bits += i;
-	  n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
+	  i -= GMP_NAIL_BITS;
+	  i -= leading_zero_bits;
+	  leading_zero_bits += i;
+	  n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i));
+	  n2_l = (n2_l << i) & GMP_NUMB_MASK;
 	  do
 	    {
 	      if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
-		n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
-	      n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
+		{
+		  n1_h -= n2_h + (n1_l < n2_l);
+		  n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
+		}
+	      n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK);
+	      n2_h >>= 1;
 	      i -= 1;
 	    }
-	  while (i);
+	  while (i != 0);
 	}
       if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
-	n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+	{
+	  n1_h -= n2_h + (n1_l < n2_l);
+	  n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
+	}
 
-      SWAP_LIMB (n1_h, n2_h);
-      SWAP_LIMB (n1_l, n2_l);
+      MP_LIMB_T_SWAP (n1_h, n2_h);
+      MP_LIMB_T_SWAP (n1_l, n2_l);
     }
 
   return n2_l;
 }
+#endif
 
+
 mp_size_t
-#if __STDC__
-mpn_gcd (mp_ptr gp, mp_ptr vp, mp_size_t vsize, mp_ptr up, mp_size_t usize)
-#else
-mpn_gcd (gp, vp, vsize, up, usize)
-     mp_ptr gp;
-     mp_ptr vp;
-     mp_size_t vsize;
-     mp_ptr up;
-     mp_size_t usize;
-#endif
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
 {
   mp_ptr orig_vp = vp;
   mp_size_t orig_vsize = vsize;
   int binary_gcd_ctr;		/* Number of times binary gcd will execute.  */
   TMP_DECL (marker);
 
+  ASSERT (usize >= 1);
+  ASSERT (vsize >= 1);
+  ASSERT (usize >= vsize);
+  ASSERT (vp[0] & 1);
+  ASSERT (up[usize - 1] != 0);
+  ASSERT (vp[vsize - 1] != 0);
+#if WANT_ASSERT
+  if (usize == vsize)
+    {
+      int  uzeros, vzeros;
+      count_leading_zeros (uzeros, up[usize - 1]);
+      count_leading_zeros (vzeros, vp[vsize - 1]);
+      ASSERT (uzeros <= vzeros);
+    }
+#endif
+  ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize));
+
   TMP_MARK (marker);
 
-  /* Use accelerated algorithm if vsize is over ACCEL_THRESHOLD.
+  /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
      Two EXTRA limbs for U and V are required for kary reduction.  */
-  if (vsize > ACCEL_THRESHOLD)
+  if (vsize >= GCD_ACCEL_THRESHOLD)
     {
       unsigned long int vbitsize, d;
       mp_ptr orig_up = up;
@@ -205,10 +223,13 @@ mpn_gcd (gp, vp, vsize, up, usize)
       MPN_COPY (anchor_up, orig_up, usize);
       up = anchor_up;
 
-      count_leading_zeros (d, up[usize-1]);
-      d = usize * BITS_PER_MP_LIMB - d;
-      count_leading_zeros (vbitsize, vp[vsize-1]);
-      vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+      count_leading_zeros (d, up[usize - 1]);
+      d -= GMP_NAIL_BITS;
+      d = usize * GMP_NUMB_BITS - d;
+      count_leading_zeros (vbitsize, vp[vsize - 1]);
+      vbitsize -= GMP_NAIL_BITS;
+      vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
+      ASSERT (d >= vbitsize);
       d = d - vbitsize + 1;
 
       /* Use bmod reduction to quickly discover whether V divides U.  */
@@ -216,7 +237,7 @@ mpn_gcd (gp, vp, vsize, up, usize)
       mpn_bdivmod (up, up, usize, vp, vsize, d);
 
       /* Now skip U/V mod 2^d and any low zero limbs.  */
-      d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+      d /= GMP_NUMB_BITS, up += d, usize -= d;
       while (usize != 0 && up[0] == 0)
 	up++, usize--;
 
@@ -228,12 +249,14 @@ mpn_gcd (gp, vp, vsize, up, usize)
 
       do					/* Main loop.  */
 	{
-	  if (up[usize-1] & SIGN_BIT)		/* U < 0; take twos' compl. */
+	  /* mpn_com_n can't be used here because anchor_up and up may
+	     partially overlap */
+	  if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0)  /* U < 0; take twos' compl. */
 	    {
 	      mp_size_t i;
-	      anchor_up[0] = -up[0];
+	      anchor_up[0] = -up[0] & GMP_NUMB_MASK;
 	      for (i = 1; i < usize; i++)
-		anchor_up[i] = ~up[i];
+		anchor_up[i] = (~up[i] & GMP_NUMB_MASK);
 	      up = anchor_up;
 	    }
 
@@ -241,54 +264,69 @@ mpn_gcd (gp, vp, vsize, up, usize)
 
 	  if ((up[0] & 1) == 0)			/* Result even; remove twos. */
 	    {
-	      unsigned long int r;
+	      unsigned int r;
 	      count_trailing_zeros (r, up[0]);
 	      mpn_rshift (anchor_up, up, usize, r);
-	      usize -= (anchor_up[usize-1] == 0);
+	      usize -= (anchor_up[usize - 1] == 0);
 	    }
 	  else if (anchor_up != up)
-	    MPN_COPY (anchor_up, up, usize);
+	    MPN_COPY_INCR (anchor_up, up, usize);
 
-	  SWAP_MPN (anchor_up, usize, vp, vsize);
+	  MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
 	  up = anchor_up;
 
 	  if (vsize <= 2)		/* Kary can't handle < 2 limbs and  */
 	    break;			/* isn't efficient for == 2 limbs.  */
 
 	  d = vbitsize;
-	  count_leading_zeros (vbitsize, vp[vsize-1]);
-	  vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+	  count_leading_zeros (vbitsize, vp[vsize - 1]);
+	  vbitsize -= GMP_NAIL_BITS;
+	  vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
 	  d = d - vbitsize + 1;
 
 	  if (d > BMOD_THRESHOLD)	/* Bmod reduction.  */
 	    {
 	      up[usize++] = 0;
 	      mpn_bdivmod (up, up, usize, vp, vsize, d);
-	      d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+	      d /= GMP_NUMB_BITS, up += d, usize -= d;
 	    }
 	  else				/* Kary reduction.  */
 	    {
 	      mp_limb_t bp[2], cp[2];
 
-	      /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB).  */
-	      cp[0] = vp[0], cp[1] = vp[1];
-	      mpn_bdivmod (cp, cp, 2, up, 2, 2*BITS_PER_MP_LIMB);
+	      /* C <-- V/U mod 2^(2*GMP_NUMB_BITS).  */
+	      {
+		mp_limb_t u_inv, hi, lo;
+		modlimb_invert (u_inv, up[0]);
+		cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK;
+		umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS);
+		lo >>= GMP_NAIL_BITS;
+		cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK;
+	      }
 
 	      /* U <-- find_a (C)  *  U.  */
 	      up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
 	      usize++;
 
-	      /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
-		  bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
-		  bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 */
-	      bp[0] = up[0], bp[1] = up[1];
-	      mpn_bdivmod (bp, bp, 2, vp, 2, BITS_PER_MP_LIMB);
-	      bp[1] &= 1;	/* Since V is odd, division is unnecessary.  */
+	      /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1).
+		  bp[0] <-- U/V mod 2^GMP_NUMB_BITS and
+		  bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2
 
+		  Like V/U above, but simplified because only the low bit of
+		  bp[1] is wanted. */
+	      {
+		mp_limb_t  v_inv, hi, lo;
+		modlimb_invert (v_inv, vp[0]);
+		bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK;
+		umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS);
+		lo >>= GMP_NAIL_BITS;
+		bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1;
+	      }
+
 	      up[usize++] = 0;
-	      if (bp[1])	/* B < 0: U <-- U + (-B)  * V.  */
+	      if (bp[1] != 0)	/* B < 0: U <-- U + (-B)  * V.  */
 		{
-		   mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
+		   mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK);
 		   mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
 		}
 	      else		/* B >= 0:  U <-- U - B * V.  */
@@ -304,7 +342,7 @@ mpn_gcd (gp, vp, vsize, up, usize)
 	  while (usize != 0 && up[0] == 0)
 	    up++, usize--;
 	}
-      while (usize);
+      while (usize != 0);
 
       /* Compute GCD (ORIG_V, GCD (ORIG_U, V)).  Binary will execute twice.  */
       up = orig_up, usize = orig_usize;
@@ -319,15 +357,17 @@ mpn_gcd (gp, vp, vsize, up, usize)
       if (usize > 2)		/* First make U close to V in size.  */
 	{
 	  unsigned long int vbitsize, d;
-	  count_leading_zeros (d, up[usize-1]);
-	  d = usize * BITS_PER_MP_LIMB - d;
-	  count_leading_zeros (vbitsize, vp[vsize-1]);
-	  vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+	  count_leading_zeros (d, up[usize - 1]);
+	  d -= GMP_NAIL_BITS;
+	  d = usize * GMP_NUMB_BITS - d;
+	  count_leading_zeros (vbitsize, vp[vsize - 1]);
+	  vbitsize -= GMP_NAIL_BITS;
+	  vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
 	  d = d - vbitsize - 1;
 	  if (d != -(unsigned long int)1 && d > 2)
 	    {
 	      mpn_bdivmod (up, up, usize, vp, vsize, d);  /* Result > 0.  */
-	      d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
+	      d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d;
 	    }
 	}
 
@@ -342,15 +382,15 @@ mpn_gcd (gp, vp, vsize, up, usize)
 	    up += 1, usize -= 1;
 	  if ((up[0] & 1) == 0)
 	    {
-	      unsigned long int r;
+	      unsigned int r;
 	      count_trailing_zeros (r, up[0]);
 	      mpn_rshift (up, up, usize, r);
-	      usize -= (up[usize-1] == 0);
+	      usize -= (up[usize - 1] == 0);
 	    }
 
 	  /* Keep usize >= vsize.  */
 	  if (usize < vsize)
-	    SWAP_MPN (up, usize, vp, vsize);
+	    MPN_PTR_SWAP (up, usize, vp, vsize);
 
 	  if (usize <= 2)				/* Double precision. */
 	    {
@@ -375,7 +415,7 @@ mpn_gcd (gp, vp, vsize, up, usize)
 		size--;
 	      while (up[size] == vp[size]);
 	      if (up[size] < vp[size])			/* usize == vsize.  */
-		SWAP_PTR (up, vp);
+		MP_PTR_SWAP (up, vp);
 	      up += zeros, usize = size + 1 - zeros;
 	      mpn_sub_n (up, up, vp + zeros, usize);
 	    }
@@ -396,7 +436,7 @@ mpn_gcd (gp, vp, vsize, up, usize)
 
 done:
   if (vp != gp)
-    MPN_COPY (gp, vp, vsize);
+    MPN_COPY_INCR (gp, vp, vsize);
   TMP_FREE (marker);
   return vsize;
 }