=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/generic/Attic/mul_n.c,v retrieving revision 1.1.1.2 retrieving revision 1.1.1.3 diff -u -p -r1.1.1.2 -r1.1.1.3 --- OpenXM_contrib/gmp/mpn/generic/Attic/mul_n.c 2000/09/09 14:12:26 1.1.1.2 +++ OpenXM_contrib/gmp/mpn/generic/Attic/mul_n.c 2003/08/25 16:06:20 1.1.1.3 @@ -6,8 +6,8 @@ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. -Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software -Foundation, Inc. +Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free +Software Foundation, Inc. This file is part of the GNU MP Library. @@ -31,27 +31,30 @@ MA 02111-1307, USA. */ #include "longlong.h" -/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. - 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ -#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) +#if GMP_NAIL_BITS != 0 +/* The open-coded interpolate3 stuff has not been generalized for nails. */ +#define USE_MORE_MPN 1 +#endif +#ifndef USE_MORE_MPN #if !defined (__alpha) && !defined (__mips) /* For all other machines, we want to call mpn functions for the compund operations instead of open-coding them. */ -#define USE_MORE_MPN +#define USE_MORE_MPN 1 #endif +#endif /*== Function declarations =================================================*/ static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, - mp_ptr, mp_ptr, mp_ptr, - mp_srcptr, mp_srcptr, mp_srcptr, - mp_size_t, mp_size_t)); + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, mp_srcptr, mp_srcptr, + mp_size_t, mp_size_t)); static void interpolate3 _PROTO ((mp_srcptr, - mp_ptr, mp_ptr, mp_ptr, - mp_srcptr, - mp_ptr, mp_ptr, mp_ptr, - mp_size_t, mp_size_t)); + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_size_t, mp_size_t)); static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); @@ -64,25 +67,18 @@ static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, */ void -#if __STDC__ mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) -#else -mpn_kara_mul_n(p, a, b, n, ws) - mp_ptr p; - mp_srcptr a; - mp_srcptr b; - mp_size_t n; - mp_ptr ws; -#endif { - mp_limb_t i, sign, w, w0, w1; + mp_limb_t w, w0, w1; mp_size_t n2; mp_srcptr x, y; + mp_size_t i; + int sign; n2 = n >> 1; ASSERT (n2 > 0); - if (n & 1) + if ((n & 1) != 0) { /* Odd length. 
*/ mp_size_t n1, n3, nm1; @@ -100,14 +96,14 @@ mpn_kara_mul_n(p, a, b, n, ws) { --i; w0 = a[i]; - w1 = a[n3+i]; + w1 = a[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n3; y = a; - sign = 1; + sign = ~0; } else { @@ -124,18 +120,18 @@ mpn_kara_mul_n(p, a, b, n, ws) else { i = n2; - do + do { --i; - w0 = b[i]; - w1 = b[n3+i]; + w0 = b[i]; + w1 = b[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = b + n3; y = b; - sign ^= 1; + sign = ~sign; } else { @@ -147,9 +143,9 @@ mpn_kara_mul_n(p, a, b, n, ws) p[n] = w; n1 = n + 1; - if (n2 < KARATSUBA_MUL_THRESHOLD) + if (n2 < MUL_KARATSUBA_THRESHOLD) { - if (n3 < KARATSUBA_MUL_THRESHOLD) + if (n3 < MUL_KARATSUBA_THRESHOLD) { mpn_mul_basecase (ws, p, n3, p + n3, n3); mpn_mul_basecase (p, a, n3, b, n3); @@ -176,34 +172,25 @@ mpn_kara_mul_n(p, a, b, n, ws) nm1 = n - 1; if (mpn_add_n (ws, p + n1, ws, nm1)) { - mp_limb_t x = ws[nm1] + 1; + mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; ws[nm1] = x; if (x == 0) - ++ws[n]; + ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; } if (mpn_add_n (p + n3, p + n3, ws, n1)) { - mp_limb_t x; - i = n1 + n3; - do - { - x = p[i] + 1; - p[i] = x; - ++i; - } while (x == 0); + mpn_incr_u (p + n1 + n3, 1); } } else { /* Even length. */ - mp_limb_t t; - i = n2; do { --i; w0 = a[i]; - w1 = a[n2+i]; + w1 = a[n2 + i]; } while (w0 == w1 && i != 0); sign = 0; @@ -211,7 +198,7 @@ mpn_kara_mul_n(p, a, b, n, ws) { x = a + n2; y = a; - sign = 1; + sign = ~0; } else { @@ -221,18 +208,18 @@ mpn_kara_mul_n(p, a, b, n, ws) mpn_sub_n (p, x, y, n2); i = n2; - do + do { --i; w0 = b[i]; - w1 = b[n2+i]; + w1 = b[n2 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = b + n2; y = b; - sign ^= 1; + sign = ~sign; } else { @@ -242,7 +229,7 @@ mpn_kara_mul_n(p, a, b, n, ws) mpn_sub_n (p + n2, x, y, n2); /* Pointwise products. */ - if (n2 < KARATSUBA_MUL_THRESHOLD) + if (n2 < MUL_KARATSUBA_THRESHOLD) { mpn_mul_basecase (ws, p, n2, p + n2, n2); mpn_mul_basecase (p, a, n2, b, n2); @@ -262,52 +249,28 @@ mpn_kara_mul_n(p, a, b, n, ws) w = -mpn_sub_n (ws, p, ws, n); w += mpn_add_n (ws, p + n, ws, n); w += mpn_add_n (p + n2, p + n2, ws, n); - /* TO DO: could put "if (w) { ... }" here. - * Less work but badly predicted branch. - * No measurable difference in speed on Alpha. - */ - i = n + n2; - t = p[i] + w; - p[i] = t; - if (t < w) - { - do - { - ++i; - w = p[i] + 1; - p[i] = w; - } - while (w == 0); - } + MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); } } void -#if __STDC__ mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) -#else -mpn_kara_sqr_n (p, a, n, ws) - mp_ptr p; - mp_srcptr a; - mp_size_t n; - mp_ptr ws; -#endif { - mp_limb_t i, sign, w, w0, w1; + mp_limb_t w, w0, w1; mp_size_t n2; mp_srcptr x, y; + mp_size_t i; n2 = n >> 1; ASSERT (n2 > 0); - if (n & 1) + if ((n & 1) != 0) { /* Odd length. */ mp_size_t n1, n3, nm1; n3 = n - n2; - sign = 0; w = a[n2]; if (w != 0) w -= mpn_sub_n (p, a, a + n3, n2); @@ -318,14 +281,13 @@ mpn_kara_sqr_n (p, a, n, ws) { --i; w0 = a[i]; - w1 = a[n3+i]; + w1 = a[n3 + i]; } while (w0 == w1 && i != 0); if (w0 < w1) { x = a + n3; y = a; - sign = 1; } else { @@ -336,100 +298,68 @@ mpn_kara_sqr_n (p, a, n, ws) } p[n2] = w; - w = a[n2]; - if (w != 0) - w -= mpn_sub_n (p + n3, a, a + n3, n2); - else + n1 = n + 1; + + /* n2 is always either n3 or n3-1 so maybe the two sets of tests here + could be combined. But that's not important, since the tests will + take a miniscule amount of time compared to the function calls. 
*/ + if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD)) { - i = n2; - do - { - --i; - w0 = a[i]; - w1 = a[n3+i]; - } - while (w0 == w1 && i != 0); - if (w0 < w1) - { - x = a + n3; - y = a; - sign ^= 1; - } - else - { - x = a; - y = a + n3; - } - mpn_sub_n (p + n3, x, y, n2); + mpn_mul_basecase (ws, p, n3, p, n3); + mpn_mul_basecase (p, a, n3, a, n3); } - p[n] = w; - - n1 = n + 1; - if (n2 < KARATSUBA_SQR_THRESHOLD) + else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD)) { - if (n3 < KARATSUBA_SQR_THRESHOLD) - { - mpn_sqr_basecase (ws, p, n3); - mpn_sqr_basecase (p, a, n3); - } - else - { - mpn_kara_sqr_n (ws, p, n3, ws + n1); - mpn_kara_sqr_n (p, a, n3, ws + n1); - } - mpn_sqr_basecase (p + n1, a + n3, n2); + mpn_sqr_basecase (ws, p, n3); + mpn_sqr_basecase (p, a, n3); } else { - mpn_kara_sqr_n (ws, p, n3, ws + n1); - mpn_kara_sqr_n (p, a, n3, ws + n1); - mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); + mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */ + mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */ } - - if (sign) - mpn_add_n (ws, p, ws, n1); + if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) + mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2); + else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) + mpn_sqr_basecase (p + n1, a + n3, n2); else - mpn_sub_n (ws, p, ws, n1); + mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */ + /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the + borrow from mpn_sub_n. If it occurs then it'll be cancelled by a + carry from ws[n]. Further, since 2xy fits in n1 limbs there won't + be any carry out of ws[n] other than cancelling that borrow. */ + + mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */ + nm1 = n - 1; - if (mpn_add_n (ws, p + n1, ws, nm1)) + if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */ { - mp_limb_t x = ws[nm1] + 1; + mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; ws[nm1] = x; if (x == 0) - ++ws[n]; + ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; } if (mpn_add_n (p + n3, p + n3, ws, n1)) { - mp_limb_t x; - i = n1 + n3; - do - { - x = p[i] + 1; - p[i] = x; - ++i; - } while (x == 0); + mpn_incr_u (p + n1 + n3, 1); } } else { /* Even length. */ - mp_limb_t t; - i = n2; do { --i; w0 = a[i]; - w1 = a[n2+i]; + w1 = a[n2 + i]; } while (w0 == w1 && i != 0); - sign = 0; if (w0 < w1) { x = a + n2; y = a; - sign = 1; } else { @@ -438,83 +368,43 @@ mpn_kara_sqr_n (p, a, n, ws) } mpn_sub_n (p, x, y, n2); - i = n2; - do + /* Pointwise products. */ + if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) { - --i; - w0 = a[i]; - w1 = a[n2+i]; + mpn_mul_basecase (ws, p, n2, p, n2); + mpn_mul_basecase (p, a, n2, a, n2); + mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2); } - while (w0 == w1 && i != 0); - if (w0 < w1) + else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) { - x = a + n2; - y = a; - sign ^= 1; - } - else - { - x = a; - y = a + n2; - } - mpn_sub_n (p + n2, x, y, n2); - - /* Pointwise products. */ - if (n2 < KARATSUBA_SQR_THRESHOLD) - { - mpn_sqr_basecase (ws, p, n2); - mpn_sqr_basecase (p, a, n2); + mpn_sqr_basecase (ws, p, n2); + mpn_sqr_basecase (p, a, n2); mpn_sqr_basecase (p + n, a + n2, n2); } else { - mpn_kara_sqr_n (ws, p, n2, ws + n); - mpn_kara_sqr_n (p, a, n2, ws + n); + mpn_kara_sqr_n (ws, p, n2, ws + n); + mpn_kara_sqr_n (p, a, n2, ws + n); mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); } /* Interpolate. 
*/ - if (sign) - w = mpn_add_n (ws, p, ws, n); - else - w = -mpn_sub_n (ws, p, ws, n); + w = -mpn_sub_n (ws, p, ws, n); w += mpn_add_n (ws, p + n, ws, n); w += mpn_add_n (p + n2, p + n2, ws, n); - /* TO DO: could put "if (w) { ... }" here. - * Less work but badly predicted branch. - * No measurable difference in speed on Alpha. - */ - i = n + n2; - t = p[i] + w; - p[i] = t; - if (t < w) - { - do - { - ++i; - w = p[i] + 1; - p[i] = w; - } - while (w == 0); - } + MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); } } /*-- add2Times -------------------------------------------------------------*/ /* z[] = x[] + 2 * y[] - Note that z and x might point to the same vectors. */ -#ifdef USE_MORE_MPN + Note that z and x might point to the same vectors. + FIXME: gcc won't inline this because it uses alloca. */ +#if USE_MORE_MPN + static inline mp_limb_t -#if __STDC__ add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) -#else -add2Times (z, x, y, n) - mp_ptr z; - mp_srcptr x; - mp_srcptr y; - mp_size_t n; -#endif { mp_ptr t; mp_limb_t c; @@ -526,23 +416,17 @@ add2Times (z, x, y, n) TMP_FREE (marker); return c; } + #else static mp_limb_t -#if __STDC__ add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) -#else -add2Times (z, x, y, n) - mp_ptr z; - mp_srcptr x; - mp_srcptr y; - mp_size_t n; -#endif { mp_limb_t c, v, w; ASSERT (n > 0); - v = *x; w = *y; + v = *x; + w = *y; c = w >> (BITS_PER_MP_LIMB - 1); w <<= 1; v += w; @@ -578,29 +462,14 @@ add2Times (z, x, y, n) * C[] has length len2 with len-len2 = 0, 1 or 2. * Returns top words (overflow) at pth, pt1 and pt2 respectively. */ -#ifdef USE_MORE_MPN +#if USE_MORE_MPN + static void -#if __STDC__ evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, - mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) -#else -evaluate3 (ph, p1, p2, pth, pt1, pt2, - A, B, C, len, len2) - mp_ptr ph; - mp_ptr p1; - mp_ptr p2; - mp_ptr pth; - mp_ptr pt1; - mp_ptr pt2; - mp_srcptr A; - mp_srcptr B; - mp_srcptr C; - mp_size_t len; - mp_size_t len2; -#endif + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len,mp_size_t len2) { mp_limb_t c, d, e; - + ASSERT (len - len2 <= 2); e = mpn_lshift (p1, B, len, 1); @@ -608,18 +477,27 @@ evaluate3 (ph, p1, p2, pth, pt1, pt2, c = mpn_lshift (ph, A, len, 2); c += e + mpn_add_n (ph, ph, p1, len); d = mpn_add_n (ph, ph, C, len2); - if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); + if (len2 == len) + c += d; + else + c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); ASSERT (c < 7); *pth = c; c = mpn_lshift (p2, C, len2, 2); #if 1 - if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } + if (len2 != len) + { + p2[len-1] = 0; + p2[len2] = c; + c = 0; + } c += e + mpn_add_n (p2, p2, p1, len); #else d = mpn_add_n (p2, p2, p1, len2); c += d; - if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); + if (len2 != len) + c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); c += e; #endif c += mpn_add_n (p2, p2, A, len); @@ -628,34 +506,19 @@ evaluate3 (ph, p1, p2, pth, pt1, pt2, c = mpn_add_n (p1, A, B, len); d = mpn_add_n (p1, p1, C, len2); - if (len2 == len) c += d; - else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); + if (len2 == len) + c += d; + else + c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); ASSERT (c < 3); *pt1 = c; - } #else static void -#if __STDC__ evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) -#else -evaluate3 (ph, p1, p2, pth, pt1, 
pt2, - A, B, C, l, ls) - mp_ptr ph; - mp_ptr p1; - mp_ptr p2; - mp_ptr pth; - mp_ptr pt1; - mp_ptr pt2; - mp_srcptr A; - mp_srcptr B; - mp_srcptr C; - mp_size_t l; - mp_size_t ls; -#endif { mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; @@ -744,25 +607,11 @@ evaluate3 (ph, p1, p2, pth, pt1, pt2, * and returns new top words there. */ -#ifdef USE_MORE_MPN +#if USE_MORE_MPN + static void -#if __STDC__ interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, - mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) -#else -interpolate3 (A, B, C, D, E, - ptb, ptc, ptd, len, len2) - mp_srcptr A; - mp_ptr B; - mp_ptr C; - mp_ptr D; - mp_srcptr E; - mp_ptr ptb; - mp_ptr ptc; - mp_ptr ptd; - mp_size_t len; - mp_size_t len2; -#endif + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2) { mp_ptr ws; mp_limb_t t, tb,tc,td; @@ -772,9 +621,9 @@ interpolate3 (A, B, C, D, E, ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); /* Let x1, x2, x3 be the values to interpolate. We have: - * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e - * c = a + x1 + x2 + x3 + e - * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e */ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); @@ -782,33 +631,39 @@ interpolate3 (A, B, C, D, E, tb = *ptb; tc = *ptc; td = *ptd; - /* b := b - 16*a - e - * c := c - a - e - * d := d - a - 16*e + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e */ t = mpn_lshift (ws, A, len, 4); tb -= t + mpn_sub_n (B, B, ws, len); t = mpn_sub_n (B, B, E, len2); - if (len2 == len) tb -= t; - else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); + if (len2 == len) + tb -= t; + else + tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); tc -= mpn_sub_n (C, C, A, len); t = mpn_sub_n (C, C, E, len2); - if (len2 == len) tc -= t; - else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); + if (len2 == len) + tc -= t; + else + tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); t = mpn_lshift (ws, E, len2, 4); t += mpn_add_n (ws, ws, A, len2); #if 1 - if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); + if (len2 != len) + t = mpn_add_1 (ws+len2, A+len2, len-len2, t); td -= t + mpn_sub_n (D, D, ws, len); #else t += mpn_sub_n (D, D, ws, len2); - if (len2 != len) { - t = mpn_sub_1 (D+len2, D+len2, len-len2, t); - t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); - } /* end if/else */ + if (len2 != len) + { + t = mpn_sub_1 (D+len2, D+len2, len-len2, t); + t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); + } td -= t; #endif @@ -818,12 +673,12 @@ interpolate3 (A, B, C, D, E, #ifdef HAVE_MPN_ADD_SUB_N /* #error TO DO ... */ #else - t = tb + td + mpn_add_n (ws, B, D, len); - td = tb - td - mpn_sub_n (D, B, D, len); + t = tb + td + mpn_add_n (ws, B, D, len); + td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; tb = t; MPN_COPY (B, ws, len); #endif - + /* b := b-8*c */ t = 8 * tc + mpn_lshift (ws, C, len, 3); tb -= t + mpn_sub_n (B, B, ws, len); @@ -833,14 +688,14 @@ interpolate3 (A, B, C, D, E, tc -= tb + mpn_sub_n (C, C, B, len); /* d := d/3 */ - td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; + td = ((td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; /* b, d := b + d, b - d */ #ifdef HAVE_MPN_ADD_SUB_N /* #error TO DO ... 
*/ #else - t = tb + td + mpn_add_n (ws, B, D, len); - td = tb - td - mpn_sub_n (D, B, D, len); + t = (tb + td + mpn_add_n (ws, B, D, len)) & GMP_NUMB_MASK; + td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; tb = t; MPN_COPY (B, ws, len); #endif @@ -853,20 +708,20 @@ interpolate3 (A, B, C, D, E, ASSERT(!(*B & 3)); mpn_rshift (B, B, len, 2); - B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); - ASSERT((long)tb >= 0); + B[len-1] |= (tb << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; + ASSERT((mp_limb_signed_t)tb >= 0); tb >>= 2; ASSERT(!(*C & 1)); mpn_rshift (C, C, len, 1); - C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); - ASSERT((long)tc >= 0); + C[len-1] |= (tc << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; + ASSERT((mp_limb_signed_t)tc >= 0); tc >>= 1; ASSERT(!(*D & 3)); mpn_rshift (D, D, len, 2); - D[len-1] |= td<<(BITS_PER_MP_LIMB-2); - ASSERT((long)td >= 0); + D[len-1] |= (td << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; + ASSERT((mp_limb_signed_t)td >= 0); td >>= 2; #if WANT_ASSERT @@ -893,23 +748,8 @@ interpolate3 (A, B, C, D, E, #else static void -#if __STDC__ interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) -#else -interpolate3 (A, B, C, D, E, - ptb, ptc, ptd, l, ls) - mp_srcptr A; - mp_ptr B; - mp_ptr C; - mp_ptr D; - mp_srcptr E; - mp_ptr ptb; - mp_ptr ptc; - mp_ptr ptd; - mp_size_t l; - mp_size_t ls; -#endif { mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); @@ -974,9 +814,9 @@ interpolate3 (A, B, C, D, E, c = t - b; /* d := d/3 */ - d *= INVERSE_3; + d *= MODLIMB_INVERSE_3; td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); - td *= INVERSE_3; + td *= MODLIMB_INVERSE_3; /* b, d := b + d, b - d */ t = b + d; @@ -1004,10 +844,10 @@ interpolate3 (A, B, C, D, E, tc += c < sc; /* TO DO: choose one of the following alternatives. */ #if 1 - sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); + sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1); sc += tc; #else - sc = tc - ((long)sc < 0L); + sc = tc - ((mp_limb_signed_t) sc < 0L); #endif /* sd has period 2. */ @@ -1040,7 +880,7 @@ interpolate3 (A, B, C, D, E, b = t; b -= c << 3; c = (c << 1) - b; - d *= INVERSE_3; + d *= MODLIMB_INVERSE_3; t = b + d; d = b - d; b = t; @@ -1086,32 +926,23 @@ interpolate3 (A, B, C, D, E, * ws is workspace. */ -/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the - * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() - * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. +/* TO DO: If MUL_TOOM3_THRESHOLD is much bigger than MUL_KARATSUBA_THRESHOLD then the + * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() + * because the "n < MUL_KARATSUBA_THRESHOLD" test here will always be false. 
*/ #define TOOM3_MUL_REC(p, a, b, n, ws) \ do { \ - if (n < KARATSUBA_MUL_THRESHOLD) \ + if (n < MUL_KARATSUBA_THRESHOLD) \ mpn_mul_basecase (p, a, n, b, n); \ - else if (n < TOOM3_MUL_THRESHOLD) \ + else if (n < MUL_TOOM3_THRESHOLD) \ mpn_kara_mul_n (p, a, b, n, ws); \ else \ mpn_toom3_mul_n (p, a, b, n, ws); \ } while (0) void -#if __STDC__ mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) -#else -mpn_toom3_mul_n (p, a, b, n, ws) - mp_ptr p; - mp_srcptr a; - mp_srcptr b; - mp_size_t n; - mp_ptr ws; -#endif { mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; mp_limb_t *A,*B,*C,*D,*E, *W; @@ -1125,7 +956,9 @@ mpn_toom3_mul_n (p, a, b, n, ws) { mp_limb_t m; - ASSERT (n >= TOOM3_MUL_THRESHOLD); + /* this is probably unnecessarily strict */ + ASSERT (n >= MUL_TOOM3_THRESHOLD); + l = ls = n / 3; m = n - l * 3; if (m != 0) @@ -1145,6 +978,9 @@ mpn_toom3_mul_n (p, a, b, n, ws) W = ws + l4; } + ASSERT (l >= 1); + ASSERT (ls >= 1); + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); @@ -1186,40 +1022,33 @@ mpn_toom3_mul_n (p, a, b, n, ws) interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); /** Final stage: add up the coefficients. **/ - { - mp_limb_t i, x, y; - tB += mpn_add_n (p + l, p + l, B, l2); - tD += mpn_add_n (p + l3, p + l3, D, l2); - mpn_incr_u (p + l3, tB); - mpn_incr_u (p + l4, tC); - mpn_incr_u (p + l5, tD); - } + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + MPN_INCR_U (p + l3, 2 * n - l3, tB); + MPN_INCR_U (p + l4, 2 * n - l4, tC); + MPN_INCR_U (p + l5, 2 * n - l5, tD); } /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ /* Like previous function but for squaring */ -#define TOOM3_SQR_REC(p, a, n, ws) \ - do { \ - if (n < KARATSUBA_SQR_THRESHOLD) \ - mpn_sqr_basecase (p, a, n); \ - else if (n < TOOM3_SQR_THRESHOLD) \ - mpn_kara_sqr_n (p, a, n, ws); \ - else \ - mpn_toom3_sqr_n (p, a, n, ws); \ +/* FIXME: If SQR_TOOM3_THRESHOLD is big enough it might never get into the + basecase range. Try to arrange those conditonals go dead. */ +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, a, n); \ + else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ + mpn_kara_sqr_n (p, a, n, ws); \ + else \ + mpn_toom3_sqr_n (p, a, n, ws); \ } while (0) void -#if __STDC__ mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) -#else -mpn_toom3_sqr_n (p, a, n, ws) - mp_ptr p; - mp_srcptr a; - mp_size_t n; - mp_ptr ws; -#endif { mp_limb_t cB,cC,cD, tB,tC,tD; mp_limb_t *A,*B,*C,*D,*E, *W; @@ -1233,7 +1062,9 @@ mpn_toom3_sqr_n (p, a, n, ws) { mp_limb_t m; - ASSERT (n >= TOOM3_MUL_THRESHOLD); + /* this is probably unnecessarily strict */ + ASSERT (n >= SQR_TOOM3_THRESHOLD); + l = ls = n / 3; m = n - l * 3; if (m != 0) @@ -1253,6 +1084,9 @@ mpn_toom3_sqr_n (p, a, n, ws) W = ws + l4; } + ASSERT (l >= 1); + ASSERT (ls >= 1); + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. 
**/ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); @@ -1271,7 +1105,7 @@ mpn_toom3_sqr_n (p, a, n, ws) { tC += add2Times (C + l, C + l, B, l); if (cC == 2) - tC += add2Times (C + l, C + l, B, l); + tC += add2Times (C + l, C + l, B, l); } #endif ASSERT (tC < 9); @@ -1286,58 +1120,51 @@ mpn_toom3_sqr_n (p, a, n, ws) interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); /** Final stage: add up the coefficients. **/ - { - mp_limb_t i, x, y; - tB += mpn_add_n (p + l, p + l, B, l2); - tD += mpn_add_n (p + l3, p + l3, D, l2); - mpn_incr_u (p + l3, tB); - mpn_incr_u (p + l4, tC); - mpn_incr_u (p + l5, tD); - } + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + MPN_INCR_U (p + l3, 2 * n - l3, tB); + MPN_INCR_U (p + l4, 2 * n - l4, tC); + MPN_INCR_U (p + l5, 2 * n - l5, tD); } void -#if __STDC__ mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) -#else -mpn_mul_n (p, a, b, n) - mp_ptr p; - mp_srcptr a; - mp_srcptr b; - mp_size_t n; -#endif { - if (n < KARATSUBA_MUL_THRESHOLD) + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); + + if (n < MUL_KARATSUBA_THRESHOLD) mpn_mul_basecase (p, a, n, b, n); - else if (n < TOOM3_MUL_THRESHOLD) + else if (n < MUL_TOOM3_THRESHOLD) { /* Allocate workspace of fixed size on stack: fast! */ #if TUNE_PROGRAM_BUILD - mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; + mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)]; #else - mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; + mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD-1)]; #endif mpn_kara_mul_n (p, a, b, n, ws); } #if WANT_FFT || TUNE_PROGRAM_BUILD - else if (n < FFT_MUL_THRESHOLD) + else if (n < MUL_FFT_THRESHOLD) #else else #endif { /* Use workspace of unknown size in heap, as stack space may - * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the + * be limited. Since n is at least MUL_TOOM3_THRESHOLD, the * multiplication will take much longer than malloc()/free(). */ mp_limb_t wsLen, *ws; - wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; - ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); + wsLen = MPN_TOOM3_MUL_N_TSIZE (n); + ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen); mpn_toom3_mul_n (p, a, b, n, ws); - (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); + __GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen); } #if WANT_FFT || TUNE_PROGRAM_BUILD else { - mpn_mul_fft_full (p, a, n, b, n); + mpn_mul_fft_full (p, a, n, b, n); } #endif }
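Note (illustrative only, not part of the patch): mpn_kara_mul_n above splits each n-limb operand into a high half x1 and a low half x0, forms the middle Karatsuba term from the absolute differences |x1-x0| and |y1-y0|, and keeps the sign of (x1-x0)*(y1-y0) separately (the sign variable, now set with ~0 / ~sign instead of 1 / ^1). A minimal single-word sketch of the same scheme, assuming 16-bit halves stand in for the n/2-limb halves and with purely illustrative names:

#include <stdint.h>
#include <stdio.h>

/* Multiply two 32-bit numbers, each split into two 16-bit "limbs",
   using three 16x16->32 multiplies (absolute-difference Karatsuba).  */
static uint64_t kara32 (uint32_t x, uint32_t y)
{
  uint32_t x1 = x >> 16, x0 = x & 0xFFFF;
  uint32_t y1 = y >> 16, y0 = y & 0xFFFF;

  uint32_t hh = x1 * y1;                      /* high product x1*y1 */
  uint32_t ll = x0 * y0;                      /* low product  x0*y0 */

  /* |x1-x0| * |y1-y0|, sign of (x1-x0)*(y1-y0) kept aside, as the
     mpn code does with mpn_sub_n plus the sign variable.  */
  uint32_t dx = x1 >= x0 ? x1 - x0 : x0 - x1;
  uint32_t dy = y1 >= y0 ? y1 - y0 : y0 - y1;
  int negative = (x1 >= x0) != (y1 >= y0);
  uint32_t mm = dx * dy;

  /* x*y = hh*(B^2+B) + ll*(B+1) - sign*mm*B  with B = 2^16.  */
  uint64_t r = ((uint64_t) hh << 32) + ((uint64_t) hh << 16)
             + ((uint64_t) ll << 16) + ll;
  return negative ? r + ((uint64_t) mm << 16)
                  : r - ((uint64_t) mm << 16);
}

int main (void)
{
  uint32_t x = 0xDEADBEEF, y = 0x12345678;
  printf ("karatsuba %llu, direct %llu\n",
          (unsigned long long) kara32 (x, y),
          (unsigned long long) ((uint64_t) x * y));
  return 0;
}

For squaring the middle term is (x1-x0)^2, which is never negative, and since x^2 + y^2 - (x-y)^2 = 2*x*y >= 0 (the new comment in mpn_kara_sqr_n) the sign variable can be dropped from the squaring routine entirely, which is what this revision does.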
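Note (illustrative only, not part of the patch): the removed INVERSE_3 was the multiplicative inverse of 3 modulo 2^BITS_PER_MP_LIMB (0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits); the code now uses the library-wide MODLIMB_INVERSE_3 and masks with GMP_NUMB_MASK so the same trick also works with nails. The point is that an exact division by 3 costs a single multiplication. A minimal sketch assuming a plain 64-bit word and no nails:

#include <stdint.h>
#include <stdio.h>

/* 3 * 0xAAAAAAAAAAAAAAAB = 2^65 + 1 == 1 (mod 2^64), so this constant
   is the inverse of 3 modulo 2^64.  */
#define INV3 UINT64_C(0xAAAAAAAAAAAAAAAB)

int main (void)
{
  uint64_t n = UINT64_C(123456789012345678) * 3;  /* known multiple of 3 */
  uint64_t q = n * INV3;                          /* wraps modulo 2^64   */

  /* q == n/3 exactly: n = 3*(n/3), so n*INV3 == (n/3)*(3*INV3) == n/3.  */
  printf ("%llu / 3 = %llu\n", (unsigned long long) n, (unsigned long long) q);
  return 0;
}

mpn_divexact_by3 applies the same identity limb by limb, and interpolate3 uses it again for the top word td, where the GMP_NUMB_MASK added in this revision keeps the result within the numb range.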
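Note (illustrative only, not part of the patch): interpolate3 recovers the three middle coefficients x1, x2, x3 of the degree-4 product W(t) = a + x1*t + x2*t^2 + x3*t^3 + e*t^4 from the five pointwise products a = W(0), b = 16*W(1/2), c = W(1), d = W(2), e = W(inf), using only shifts, additions and one exact division by 3, as its comment states. A stand-alone sketch of the same step sequence, assuming small signed words in place of the limb vectors with separate top words tb, tc, td:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main (void)
{
  /* (3 + 5t + 7t^2) * (2 + 4t + 6t^2) = 6 + 22t + 52t^2 + 58t^3 + 42t^4 */
  int64_t a = 6, x1 = 22, x2 = 52, x3 = 58, e = 42;

  /* The five evaluations the pointwise products would have produced.  */
  int64_t b = 16*a + 8*x1 + 4*x2 + 2*x3 + e;     /* 16 * W(1/2) */
  int64_t c = a + x1 + x2 + x3 + e;              /* W(1)        */
  int64_t d = a + 2*x1 + 4*x2 + 8*x3 + 16*e;     /* W(2)        */
  int64_t t;

  /* Same step sequence as interpolate3.  */
  b -= 16*a + e;                        /* b := b - 16a - e  */
  c -= a + e;                           /* c := c - a - e    */
  d -= a + 16*e;                        /* d := d - a - 16e  */
  t = b + d; d = b - d; b = t;          /* b, d := b+d, b-d  */
  b -= 8*c;                             /* b := b - 8c       */
  c = 2*c - b;                          /* c := 2c - b       */
  d /= 3;                               /* d := d/3, exact   */
  t = b + d; d = b - d; b = t;          /* b, d := b+d, b-d  */

  /* Now b = 4*x1, c = 2*x2, d = 4*x3, matching the final right shifts.  */
  assert (b == 4*x1 && c == 2*x2 && d == 4*x3);
  printf ("x1 = %lld, x2 = %lld, x3 = %lld\n",
          (long long) (b / 4), (long long) (c / 2), (long long) (d / 4));
  return 0;
}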
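Note (illustrative only, not part of the patch): mpn_mul_n is the entry point whose dispatch this revision reworks; the new ASSERTs spell out its contract (n >= 1 and a 2n-limb product area that does not overlap either operand), and the renamed thresholds (MUL_KARATSUBA_THRESHOLD, MUL_TOOM3_THRESHOLD, MUL_FFT_THRESHOLD) only change which internal routine it selects. A minimal usage sketch through the documented mpn interface, assuming a link against libgmp (cc prog.c -lgmp):

#include <gmp.h>

int main (void)
{
  mp_limb_t a[4], b[4], p[8];   /* product needs 2n limbs */
  mp_size_t n = 4, i;

  mpn_random (a, n);            /* fill the operands with random limbs */
  mpn_random (b, n);

  mpn_mul_n (p, a, b, n);       /* {p,2n} = {a,n} * {b,n} */

  for (i = 2 * n; i-- > 0;)
    gmp_printf ("%Mx ", p[i]);  /* most significant limb first */
  gmp_printf ("\n");
  return 0;
}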