===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/generic/Attic/sqrtrem.c,v
retrieving revision 1.1.1.2
retrieving revision 1.1.1.3
diff -u -p -r1.1.1.2 -r1.1.1.3
--- OpenXM_contrib/gmp/mpn/generic/Attic/sqrtrem.c	2000/09/09 14:12:27	1.1.1.2
+++ OpenXM_contrib/gmp/mpn/generic/Attic/sqrtrem.c	2003/08/25 16:06:20	1.1.1.3
@@ -1,20 +1,8 @@
-/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+/* mpn_sqrtrem -- square root and remainder */
 
-   Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
-   Write the remainder at REM_PTR, if REM_PTR != NULL.
-   Return the size of the remainder.
-   (The size of the root is always half of the size of the operand.)
+/*
+Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 
-   OP_PTR and ROOT_PTR may not point to the same object.
-   OP_PTR and REM_PTR may point to the same object.
-
-   If REM_PTR is NULL, only the root is computed and the return value of
-   the function is 0 if OP is a perfect square, and *any* non-zero number
-   otherwise.
-
-Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
-Foundation, Inc.
-
 This file is part of the GNU MP Library.
 
 The GNU MP Library is free software; you can redistribute it and/or modify
@@ -30,480 +18,270 @@ License for more details.
 You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA. */
+MA 02111-1307, USA.
+*/
 
-/* This code is just correct if "unsigned char" has at least 8 bits.  It
-   doesn't help to use CHAR_BIT from limits.h, as the real problem is
-   the static arrays.  */
 
-#include <stdio.h> /* for NULL */
+/* Contributed by Paul Zimmermann.
+   See "Karatsuba Square Root", reference in gmp.texi.  */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
 #include "gmp.h"
 #include "gmp-impl.h"
 #include "longlong.h"
 
-/* Square root algorithm:
 
-   1. Shift OP (the input) to the left an even number of bits s.t. there
-      are an even number of words and either (or both) of the most
-      significant bits are set.  This way, sqrt(OP) has exactly half as
-      many words as OP, and has its most significant bit set.
 
-   2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
-      This approximation is used for the first single-precision
-      iterations of Newton's method, yielding a full-word approximation
-      to sqrt(OP).
+/* Square roots table.  Generated by the following program:
+#include "gmp.h"
+main(){mpz_t x;int i;mpz_init(x);for(i=64;i<256;i++){mpz_set_ui(x,256*i);
+mpz_sqrt(x,x);mpz_out_str(0,10,x);printf(",");if(i%16==15)printf("\n");}}
+*/
+static const unsigned char approx_tab[192] =
+  {
+    128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,
+    143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,
+    156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,
+    169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,
+    181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,
+    192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,
+    202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,
+    212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,
+    221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,
+    230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,
+    239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,
+    247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255
+  };
 
-   3. Perform multiple-precision Newton iteration until we have the
-      exact result.  Only about half of the input operand is used in
-      this calculation, as the square root is perfectly determinable
-      from just the higher half of a number.  */
-
-/* Define this macro for IEEE P854 machines with a fast sqrt instruction.  */
-#if defined __GNUC__ && ! defined __SOFT_FLOAT
+#define HALF_NAIL (GMP_NAIL_BITS / 2)
 
-#if defined (__sparc__) && BITS_PER_MP_LIMB == 32
-#define SQRT(a) \
-  ({									\
-    double __sqrt_res;							\
-    asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a));			\
-    __sqrt_res;								\
-  })
-#endif
-
-#if defined (__HAVE_68881__)
-#define SQRT(a) \
-  ({									\
-    double __sqrt_res;							\
-    asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a));			\
-    __sqrt_res;								\
-  })
-#endif
-
-#if defined (__hppa) && BITS_PER_MP_LIMB == 32
-#define SQRT(a) \
-  ({									\
-    double __sqrt_res;							\
-    asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a));		\
-    __sqrt_res;								\
-  })
-#endif
-
-#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
-#define SQRT(a) \
-  ({									\
-    double __sqrt_res;							\
-    asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a));			\
-    __sqrt_res;								\
-  })
-#endif
-
-#if 0
-#if defined (__i386__) || defined (__i486__)
-#define SQRT(a) \
-  ({									\
-    double __sqrt_res;							\
-    asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a));			\
-    __sqrt_res;								\
-  })
-#endif
-#endif
-
-#endif
-
-#ifndef SQRT
-
-/* Tables for initial approximation of the square root.  These are
-   indexed with bits 1-8 of the operand for which the square root is
-   calculated, where bit 0 is the most significant non-zero bit.  I.e.
-   the most significant one-bit is not used, since that per definition
-   is one.  Likewise, the tables don't return the highest bit of the
-   result.  That bit must be inserted by or:ing the returned value with
-   0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */
-
-/* Table to be used for operands with an even total number of bits.
-   (Exactly as in the decimal system there are similarities between the
-   square root of numbers with the same initial digits and an even
-   difference in the total number of digits.  Consider the square root
-   of 1, 10, 100, 1000, ...)  */
-static const unsigned char even_approx_tab[256] =
+/* same as mpn_sqrtrem, but for size=1 and {np, 1} normalized */
+static mp_size_t
+mpn_sqrtrem1 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
 {
-  0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
-  0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
-  0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
-  0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
-  0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
-  0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
-  0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
-  0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
-  0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
-  0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
-  0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
-  0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
-  0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
-  0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
-  0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
-  0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
-  0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
-  0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
-  0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
-  0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
-  0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
-  0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
-  0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
-  0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
-  0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
-  0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
-  0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
-  0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
-  0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
-  0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
-  0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
-  0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
-};
+  mp_limb_t np0, s, r, q, u;
+  int prec;
 
-/* Table to be used for operands with an odd total number of bits.
-   (Further comments before previous table.)  */
-static const unsigned char odd_approx_tab[256] =
-{
-  0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
-  0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
-  0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
-  0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
-  0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
-  0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
-  0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
-  0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
-  0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
-  0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
-  0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
-  0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
-  0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
-  0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
-  0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
-  0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
-  0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
-  0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
-  0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
-  0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
-  0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
-  0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
-  0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
-  0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
-  0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
-  0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
-  0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
-  0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
-  0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
-  0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
-  0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
-  0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
-};
-#endif
+  ASSERT (np[0] >= GMP_NUMB_HIGHBIT / 2);
+  ASSERT (GMP_LIMB_BITS >= 16);
 
-
-mp_size_t
-#if __STDC__
-mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
-#else
-mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
-     mp_ptr root_ptr;
-     mp_ptr rem_ptr;
-     mp_srcptr op_ptr;
-     mp_size_t op_size;
-#endif
-{
-  /* R (root result) */
-  mp_ptr rp;			/* Pointer to least significant word */
-  mp_size_t rsize;		/* The size in words */
+  /* first compute a 8-bit approximation from the high 8-bits of np[0] */
+  np0 = np[0] << GMP_NAIL_BITS;
+  q = np0 >> (GMP_LIMB_BITS - 8);
+  /* 2^6 = 64 <= q < 256 = 2^8 */
+  s = approx_tab[q - 64];				/* 128 <= s < 255 */
+  r = (np0 >> (GMP_LIMB_BITS - 16)) - s * s;		/* r <= 2*s */
+  if (r > 2 * s)
+    {
+      r -= 2 * s + 1;
+      s++;
+    }
 
-  /* T (OP shifted to the left a.k.a. normalized) */
-  mp_ptr tp;			/* Pointer to least significant word */
-  mp_size_t tsize;		/* The size in words */
-  mp_ptr t_end_ptr;		/* Pointer right beyond most sign. word */
-  mp_limb_t t_high0, t_high1;	/* The two most significant words */
-
-  /* TT (temporary for numerator/remainder) */
-  mp_ptr ttp;			/* Pointer to least significant word */
-
-  /* X (temporary for quotient in main loop) */
-  mp_ptr xp;			/* Pointer to least significant word */
-  mp_size_t xsize;		/* The size in words */
-
-  unsigned cnt;
-  mp_limb_t initial_approx;	/* Initially made approximation */
-  mp_size_t tsizes[BITS_PER_MP_LIMB];	/* Successive calculation precisions */
-  mp_size_t tmp;
-  mp_size_t i;
-
-  mp_limb_t cy_limb;
-  TMP_DECL (marker);
-
-  /* If OP is zero, both results are zero.  */
-  if (op_size == 0)
-    return 0;
-
-  count_leading_zeros (cnt, op_ptr[op_size - 1]);
-  tsize = op_size;
-  if ((tsize & 1) != 0)
+  prec = 8;
+  np0 <<= 2 * prec;
+  while (2 * prec < GMP_LIMB_BITS)
     {
-      cnt += BITS_PER_MP_LIMB;
-      tsize++;
+      /* invariant: s has prec bits, and r <= 2*s */
+      r = (r << prec) + (np0 >> (GMP_LIMB_BITS - prec));
+      np0 <<= prec;
+      u = 2 * s;
+      q = r / u;
+      u = r - q * u;
+      s = (s << prec) + q;
+      u = (u << prec) + (np0 >> (GMP_LIMB_BITS - prec));
+      q = q * q;
+      r = u - q;
+      if (u < q)
+	{
+	  r += 2 * s - 1;
+	  s --;
+	}
+      np0 <<= prec;
+      prec = 2 * prec;
     }
 
-  rsize = tsize / 2;
-  rp = root_ptr;
+  ASSERT (2 * prec == GMP_LIMB_BITS); /* GMP_LIMB_BITS must be a power of 2 */
 
-  TMP_MARK (marker);
+  /* normalize back, assuming GMP_NAIL_BITS is even */
+  ASSERT (GMP_NAIL_BITS % 2 == 0);
+  sp[0] = s >> HALF_NAIL;
+  u = s - (sp[0] << HALF_NAIL); /* s mod 2^HALF_NAIL */
+  r += u * ((sp[0] << (HALF_NAIL + 1)) + u);
+  r = r >> GMP_NAIL_BITS;
 
-  /* Shift OP an even number of bits into T, such that either the most or
-     the second most significant bit is set, and such that the number of
-     words in T becomes even.  This way, the number of words in R=sqrt(OP)
-     is exactly half as many as in OP, and the most significant bit of R
-     is set.
+  if (rp != NULL)
+    rp[0] = r;
+  return r != 0 ? 1 : 0;
+}
 
-     Also, the initial approximation is simplified by this up-shifted OP.
 
-     Finally, the Newtonian iteration which is the main part of this
-     program performs division by R.  The fast division routine expects
-     the divisor to be "normalized" in exactly the sense of having the
-     most significant bit set.  */
+#define Prec (GMP_NUMB_BITS >> 1)
 
-  tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
+   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
+static mp_limb_t
+mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
+{
+  mp_limb_t qhl, q, u, np0;
+  int cc;
 
-  if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
-    t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
-			  (cnt & ~1) % BITS_PER_MP_LIMB);
-  else
-    MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
+  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
 
-  if (cnt >= BITS_PER_MP_LIMB)
-    tp[0] = 0;
-
-  t_high0 = tp[tsize - 1];
-  t_high1 = tp[tsize - 2];	/* Never stray.  TSIZE is >= 2.  */
-
-/* Is there a fast sqrt instruction defined for this machine?  */
-#ifdef SQRT
-  {
-    initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
-    /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
-       become incorrect due to overflow in the conversion from double to
-       mp_limb_t above.  It will typically be zero in that case, but might be
-       a small number on some machines.  The most significant bit of
-       INITIAL_APPROX should be set, so that bit is a good overflow
-       indication.  */
-    if ((mp_limb_signed_t) initial_approx >= 0)
-      initial_approx = ~(mp_limb_t)0;
-  }
-#else
-  /* Get a 9 bit approximation from the tables.  The tables expect to
-     be indexed with the 8 high bits right below the highest bit.
-     Also, the highest result bit is not returned by the tables, and
-     must be or:ed into the result.  The scheme gives 9 bits of start
-     approximation with just 256-entry 8 bit tables.  */
-
-  if ((cnt & 1) == 0)
+  np0 = np[0];
+  mpn_sqrtrem1 (sp, rp, np + 1);
+  qhl = 0;
+  while (rp[0] >= sp[0])
     {
-      /* The most significant bit of t_high0 is set.  */
-      initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
-      initial_approx &= 0xff;
-      initial_approx = even_approx_tab[initial_approx];
+      qhl++;
+      rp[0] -= sp[0];
     }
-  else
+  /* now rp[0] < sp[0] < 2^Prec */
+  rp[0] = (rp[0] << Prec) + (np0 >> Prec);
+  u = 2 * sp[0];
+  q = rp[0] / u;
+  u = rp[0] - q * u;
+  q += (qhl & 1) << (Prec - 1);
+  qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
+  /* now we have (initial rp[0])<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp[0]) + u */
+  sp[0] = ((sp[0] + qhl) << Prec) + q;
+  cc = u >> Prec;
+  rp[0] = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
+  /* subtract q * q or qhl*2^(2*Prec) from rp */
+  cc -= mpn_sub_1 (rp, rp, 1, q * q) + qhl;
+  /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
+  if (cc < 0)
     {
-      /* The most significant bit of t_high0 is unset,
-	 the second most significant is set.  */
-      initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
-      initial_approx &= 0xff;
-      initial_approx = odd_approx_tab[initial_approx];
+      cc += sp[0] != 0 ? mpn_add_1 (rp, rp, 1, sp[0]) : 1;
+      cc += mpn_add_1 (rp, rp, 1, --sp[0]);
     }
-  initial_approx |= 0x100;
-  initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
 
-  /* Perform small precision Newtonian iterations to get a full word
-     approximation.  For small operands, these iterations will do the
-     entire job.  */
-  if (t_high0 == ~(mp_limb_t)0)
-    initial_approx = t_high0;
-  else
-    {
-      mp_limb_t quot;
+  return cc;
+}
 
-      if (t_high0 >= initial_approx)
-	initial_approx = t_high0 + 1;
+/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
+   and in {np, n} the low n limbs of the remainder, returns the high
+   limb of the remainder (which is 0 or 1).
+   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
+   where B=2^GMP_NUMB_BITS.  */
+static mp_limb_t
+mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
+{
+  mp_limb_t q;			/* carry out of {sp, n} */
+  int c, b;			/* carry out of remainder */
+  mp_size_t l, h;
 
-      /* First get about 18 bits with pure C arithmetics.  */
-      quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
-      initial_approx = (initial_approx + quot) / 2;
-      initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
 
-      /* Now get a full word by one (or for > 36 bit machines) several
-	 iterations.  */
-      for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
-	{
-	  mp_limb_t ignored_remainder;
+  if (n == 1)
+    c = mpn_sqrtrem2 (sp, np, np);
+  else
+    {
+      l = n / 2;
+      h = n - l;
+      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
+      if (q != 0)
+        mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
+      q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
+      c = sp[0] & 1;
+      mpn_rshift (sp, sp, l, 1);
+      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
+      q >>= 1;
+      if (c != 0)
+        c = mpn_add_n (np + l, np + l, sp + l, h);
+      mpn_sqr_n (np + n, sp, l);
+      b = q + mpn_sub_n (np, np, np + n, 2 * l);
+      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, b);
+      q = mpn_add_1 (sp + l, sp + l, h, q);
 
-	  udiv_qrnnd (quot, ignored_remainder,
-		      t_high0, t_high1, initial_approx);
-	  initial_approx = (initial_approx + quot) / 2;
-	  initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
-	}
+      if (c < 0)
+        {
+          c += mpn_addmul_1 (np, sp, n, 2) + 2 * q;
+          c -= mpn_sub_1 (np, np, n, 1);
+          q -= mpn_sub_1 (sp, sp, n, 1);
+        }
     }
-#endif
 
-  rp[0] = initial_approx;
-  rsize = 1;
+  return c;
+}
 
-#ifdef SQRT_DEBUG
-	  printf ("\n\nT = ");
-	  mpn_dump (tp, tsize);
-#endif
 
-  if (tsize > 2)
-    {
-      /* Determine the successive precisions to use in the iteration.  We
-	 minimize the precisions, beginning with the highest (i.e. last
-	 iteration) to the lowest (i.e. first iteration).  */
+mp_size_t
+mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
+{
+  mp_limb_t *tp, s0[1], cc, high, rl;
+  int c;
+  mp_size_t rn, tn;
+  TMP_DECL (marker);
 
-      xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
-      ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+  ASSERT (nn >= 0);
 
-      t_end_ptr = tp + tsize;
+  /* If OP is zero, both results are zero.  */
+  if (nn == 0)
+    return 0;
 
-      tmp = tsize / 2;
-      for (i = 0;; i++)
-	{
-	  tsize = (tmp + 1) / 2;
-	  if (tmp == tsize)
-	    break;
-	  tsizes[i] = tsize + tmp;
-	  tmp = tsize;
-	}
+  ASSERT (np[nn - 1] != 0);
+  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
+  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
+  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
 
-      /* Main Newton iteration loop.  For big arguments, most of the
-	 time is spent here.  */
+  high = np[nn - 1];
+  if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
+    return mpn_sqrtrem1 (sp, rp, np);
+  count_leading_zeros (c, high);
+  c -= GMP_NAIL_BITS;
 
-      /* It is possible to do a great optimization here.  The successive
-	 divisors in the mpn_divmod call below have more and more leading
-	 words equal to its predecessor.  Therefore the beginning of
-	 each division will repeat the same work as did the last
-	 division.  If we could guarantee that the leading words of two
-	 consecutive divisors are the same (i.e. in this case, a later
-	 divisor has just more digits at the end) it would be a simple
-	 matter of just using the old remainder of the last division in
-	 a subsequent division, to take care of this optimization.  This
-	 idea would surely make a difference even for small arguments.  */
+  c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
+  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
 
-      /* Loop invariants:
-
-	 R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
-	 X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
-	 R <= shiftdown_to_same_size(X).  */
-
-      while (--i >= 0)
+  TMP_MARK (marker);
+  if (nn % 2 != 0 || c > 0)
+    {
+      tp = TMP_ALLOC_LIMBS (2 * tn);
+      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
+      if (c != 0)
+	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
+      else
+	MPN_COPY (tp + 2 * tn - nn, np, nn);
+      rl = mpn_dc_sqrtrem (sp, tp, tn);
+      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
+	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
+      c += (nn % 2) * GMP_NUMB_BITS / 2;		/* c now represents k */
+      s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);	/* S mod 2^k */
+      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
+      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
+      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
+      mpn_rshift (sp, sp, tn, c);
+      tp[tn] = rl;
+      if (rp == NULL)
+	rp = tp;
+      c = c << 1;
+      if (c < GMP_NUMB_BITS)
+	tn++;
+      else
 	{
-	  mp_limb_t cy;
-#ifdef SQRT_DEBUG
-	  mp_limb_t old_least_sign_r = rp[0];
-	  mp_size_t old_rsize = rsize;
-
-	  printf ("R = ");
-	  mpn_dump (rp, rsize);
-#endif
-	  tsize = tsizes[i];
-
-	  /* Need to copy the numerator into temporary space, as
-	     mpn_divmod overwrites its numerator argument with the
-	     remainder (which we currently ignore).  */
-	  MPN_COPY (ttp, t_end_ptr - tsize, tsize);
-	  cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
-	  xsize = tsize - rsize;
-
-#ifdef SQRT_DEBUG
-	  printf ("X =%d ", cy);
-	  mpn_dump (xp, xsize);
-#endif
-
-	  /* Add X and R with the most significant limbs aligned,
-	     temporarily ignoring at least one limb at the low end of X.  */
-	  tmp = xsize - rsize;
-	  cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
-
-	  /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
-	     intermediate roots that'd need an extra bit.  We don't want to
-	     handle that since it would make the subsequent divisor
-	     non-normalized, so round such roots down to be only ones in the
-	     current precision.  */
-	  if (cy == 2)
-	    {
-	      mp_size_t j;
-	      for (j = xsize; j >= 0; j--)
-		xp[j] = ~(mp_limb_t)0;
-	    }
-
-	  /* Divide X by 2 and put the result in R.  This is the new
-	     approximation.  Shift in the carry from the addition.  */
-	  mpn_rshift (rp, xp, xsize, 1);
-	  rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
-	  rsize = xsize;
-#ifdef SQRT_DEBUG
-	  if (old_least_sign_r != rp[rsize - old_rsize])
-	    printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
-		    i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
-		    2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
-#endif
+	  tp++;
+	  c -= GMP_NUMB_BITS;
 	}
+      if (c != 0)
+	mpn_rshift (rp, tp, tn, c);
+      else
+	MPN_COPY_INCR (rp, tp, tn);
+      rn = tn;
     }
-
-#ifdef SQRT_DEBUG
-  printf ("(final) R = ");
-  mpn_dump (rp, rsize);
-#endif
-
-  /* We computed the square root of OP * 2**(2*floor(cnt/2)).
-     This has resulted in R being 2**floor(cnt/2) to large.
-     Shift it down here to fix that.  */
-  if (cnt / 2 != 0)
+  else
     {
-      mpn_rshift (rp, rp, rsize, cnt/2);
-      rsize -= rp[rsize - 1] == 0;
+      if (rp == NULL)
+	rp = TMP_ALLOC_LIMBS (nn);
+      if (rp != np)
+	MPN_COPY (rp, np, nn);
+      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
     }
 
-  /* Calculate the remainder.  */
-  mpn_mul_n (tp, rp, rp, rsize);
-  tsize = rsize + rsize;
-  tsize -= tp[tsize - 1] == 0;
-  if (op_size < tsize
-      || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
-    {
-      /* R is too large.  Decrement it.  */
+  MPN_NORMALIZE (rp, rn);
 
-      /* These operations can't overflow.  */
-      cy_limb  = mpn_sub_n (tp, tp, rp, rsize);
-      cy_limb += mpn_sub_n (tp, tp, rp, rsize);
-      mpn_decr_u (tp + rsize, cy_limb);
-      mpn_incr_u (tp, (mp_limb_t) 1);
-
-      mpn_decr_u (rp, (mp_limb_t) 1);
-
-#ifdef SQRT_DEBUG
-      printf ("(adjusted) R = ");
-      mpn_dump (rp, rsize);
-#endif
-    }
-
-  if (rem_ptr != NULL)
-    {
-      cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
-      MPN_NORMALIZE (rem_ptr, op_size);
-      TMP_FREE (marker);
-      return op_size;
-    }
-  else
-    {
-      int res;
-      res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
-      TMP_FREE (marker);
-      return res;
-    }
+  TMP_FREE (marker);
+  return rn;
 }