[BACK]Return to sqrtrem.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / generic

Diff for /OpenXM_contrib/gmp/mpn/generic/Attic/sqrtrem.c between version 1.1.1.2 and 1.1.1.3

version 1.1.1.2, 2000/09/09 14:12:27 version 1.1.1.3, 2003/08/25 16:06:20
Line 1 
Line 1 
 /* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)  /* mpn_sqrtrem -- square root and remainder */
   
    Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.  /*
    Write the remainder at REM_PTR, if REM_PTR != NULL.  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
    Return the size of the remainder.  
    (The size of the root is always half of the size of the operand.)  
   
    OP_PTR and ROOT_PTR may not point to the same object.  
    OP_PTR and REM_PTR may point to the same object.  
   
    If REM_PTR is NULL, only the root is computed and the return value of  
    the function is 0 if OP is a perfect square, and *any* non-zero number  
    otherwise.  
   
 Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software  
 Foundation, Inc.  
   
 This file is part of the GNU MP Library.  This file is part of the GNU MP Library.
   
 The GNU MP Library is free software; you can redistribute it and/or modify  The GNU MP Library is free software; you can redistribute it and/or modify
Line 30  License for more details.
Line 18  License for more details.
 You should have received a copy of the GNU Lesser General Public License  You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 MA 02111-1307, USA. */  MA 02111-1307, USA.
   */
   
 /* This code is just correct if "unsigned char" has at least 8 bits.  It  
    doesn't help to use CHAR_BIT from limits.h, as the real problem is  
    the static arrays.  */  
   
 #include <stdio.h> /* for NULL */  /* Contributed by Paul Zimmermann.
      See "Karatsuba Square Root", reference in gmp.texi.  */
   
   
   #include <stdio.h>
   #include <stdlib.h>
   
 #include "gmp.h"  #include "gmp.h"
 #include "gmp-impl.h"  #include "gmp-impl.h"
 #include "longlong.h"  #include "longlong.h"
   
 /* Square root algorithm:  
   
    1. Shift OP (the input) to the left an even number of bits s.t. there  
       are an even number of words and either (or both) of the most  
       significant bits are set.  This way, sqrt(OP) has exactly half as  
       many words as OP, and has its most significant bit set.  
   
    2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.  /* Square roots table.  Generated by the following program:
       This approximation is used for the first single-precision  #include "gmp.h"
       iterations of Newton's method, yielding a full-word approximation  main(){mpz_t x;int i;mpz_init(x);for(i=64;i<256;i++){mpz_set_ui(x,256*i);
       to sqrt(OP).  mpz_sqrt(x,x);mpz_out_str(0,10,x);printf(",");if(i%16==15)printf("\n");}}
   */
   static const unsigned char approx_tab[192] =
     {
       128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,
       143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,
       156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,
       169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,
       181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,
       192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,
       202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,
       212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,
       221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,
       230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,
       239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,
       247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255
     };
   
    3. Perform multiple-precision Newton iteration until we have the  #define HALF_NAIL (GMP_NAIL_BITS / 2)
       exact result.  Only about half of the input operand is used in  
       this calculation, as the square root is perfectly determinable  
       from just the higher half of a number.  */  
   
 /* Define this macro for IEEE P854 machines with a fast sqrt instruction.  */  
 #if defined __GNUC__ && ! defined __SOFT_FLOAT  
   
 #if defined (__sparc__) && BITS_PER_MP_LIMB == 32  /* same as mpn_sqrtrem, but for size=1 and {np, 1} normalized */
 #define SQRT(a) \  static mp_size_t
   ({                                                                    \  mpn_sqrtrem1 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
     double __sqrt_res;                                                  \  
     asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a));                 \  
     __sqrt_res;                                                         \  
   })  
 #endif  
   
 #if defined (__HAVE_68881__)  
 #define SQRT(a) \  
   ({                                                                    \  
     double __sqrt_res;                                                  \  
     asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a));                 \  
     __sqrt_res;                                                         \  
   })  
 #endif  
   
 #if defined (__hppa) && BITS_PER_MP_LIMB == 32  
 #define SQRT(a) \  
   ({                                                                    \  
     double __sqrt_res;                                                  \  
     asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a));            \  
     __sqrt_res;                                                         \  
   })  
 #endif  
   
 #if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32  
 #define SQRT(a) \  
   ({                                                                    \  
     double __sqrt_res;                                                  \  
     asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a));                  \  
     __sqrt_res;                                                         \  
   })  
 #endif  
   
 #if 0  
 #if defined (__i386__) || defined (__i486__)  
 #define SQRT(a) \  
   ({                                                                    \  
     double __sqrt_res;                                                  \  
     asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a));                        \  
     __sqrt_res;                                                         \  
   })  
 #endif  
 #endif  
   
 #endif  
   
 #ifndef SQRT  
   
 /* Tables for initial approximation of the square root.  These are  
    indexed with bits 1-8 of the operand for which the square root is  
    calculated, where bit 0 is the most significant non-zero bit.  I.e.  
    the most significant one-bit is not used, since that per definition  
    is one.  Likewise, the tables don't return the highest bit of the  
    result.  That bit must be inserted by or:ing the returned value with  
    0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */  
   
 /* Table to be used for operands with an even total number of bits.  
    (Exactly as in the decimal system there are similarities between the  
    square root of numbers with the same initial digits and an even  
    difference in the total number of digits.  Consider the square root  
    of 1, 10, 100, 1000, ...)  */  
 static const unsigned char even_approx_tab[256] =  
 {  {
   0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,    mp_limb_t np0, s, r, q, u;
   0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,    int prec;
   0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,  
   0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,  
   0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,  
   0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,  
   0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,  
   0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,  
   0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,  
   0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,  
   0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,  
   0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,  
   0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,  
   0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,  
   0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,  
   0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,  
   0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,  
   0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,  
   0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,  
   0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,  
   0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,  
   0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,  
   0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,  
   0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,  
   0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,  
   0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,  
   0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,  
   0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,  
   0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,  
   0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,  
   0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,  
   0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,  
 };  
   
 /* Table to be used for operands with an odd total number of bits.    ASSERT (np[0] >= GMP_NUMB_HIGHBIT / 2);
    (Further comments before previous table.)  */    ASSERT (GMP_LIMB_BITS >= 16);
 static const unsigned char odd_approx_tab[256] =  
 {  
   0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,  
   0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,  
   0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,  
   0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,  
   0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,  
   0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,  
   0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,  
   0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,  
   0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,  
   0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,  
   0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,  
   0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,  
   0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,  
   0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,  
   0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,  
   0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,  
   0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,  
   0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,  
   0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,  
   0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,  
   0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,  
   0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,  
   0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,  
   0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,  
   0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,  
   0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,  
   0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,  
   0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,  
   0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,  
   0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,  
   0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,  
   0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,  
 };  
 #endif  
   
     /* first compute a 8-bit approximation from the high 8-bits of np[0] */
 mp_size_t    np0 = np[0] << GMP_NAIL_BITS;
 #if __STDC__    q = np0 >> (GMP_LIMB_BITS - 8);
 mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)    /* 2^6 = 64 <= q < 256 = 2^8 */
 #else    s = approx_tab[q - 64];                               /* 128 <= s < 255 */
 mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)    r = (np0 >> (GMP_LIMB_BITS - 16)) - s * s;            /* r <= 2*s */
      mp_ptr root_ptr;    if (r > 2 * s)
      mp_ptr rem_ptr;      {
      mp_srcptr op_ptr;        r -= 2 * s + 1;
      mp_size_t op_size;        s++;
 #endif      }
 {  
   /* R (root result) */  
   mp_ptr rp;                    /* Pointer to least significant word */  
   mp_size_t rsize;              /* The size in words */  
   
   /* T (OP shifted to the left a.k.a. normalized) */    prec = 8;
   mp_ptr tp;                    /* Pointer to least significant word */    np0 <<= 2 * prec;
   mp_size_t tsize;              /* The size in words */    while (2 * prec < GMP_LIMB_BITS)
   mp_ptr t_end_ptr;             /* Pointer right beyond most sign. word */  
   mp_limb_t t_high0, t_high1;   /* The two most significant words */  
   
   /* TT (temporary for numerator/remainder) */  
   mp_ptr ttp;                   /* Pointer to least significant word */  
   
   /* X (temporary for quotient in main loop) */  
   mp_ptr xp;                    /* Pointer to least significant word */  
   mp_size_t xsize;              /* The size in words */  
   
   unsigned cnt;  
   mp_limb_t initial_approx;     /* Initially made approximation */  
   mp_size_t tsizes[BITS_PER_MP_LIMB];   /* Successive calculation precisions */  
   mp_size_t tmp;  
   mp_size_t i;  
   
   mp_limb_t cy_limb;  
   TMP_DECL (marker);  
   
   /* If OP is zero, both results are zero.  */  
   if (op_size == 0)  
     return 0;  
   
   count_leading_zeros (cnt, op_ptr[op_size - 1]);  
   tsize = op_size;  
   if ((tsize & 1) != 0)  
     {      {
       cnt += BITS_PER_MP_LIMB;        /* invariant: s has prec bits, and r <= 2*s */
       tsize++;        r = (r << prec) + (np0 >> (GMP_LIMB_BITS - prec));
         np0 <<= prec;
         u = 2 * s;
         q = r / u;
         u = r - q * u;
         s = (s << prec) + q;
         u = (u << prec) + (np0 >> (GMP_LIMB_BITS - prec));
         q = q * q;
         r = u - q;
         if (u < q)
           {
             r += 2 * s - 1;
             s --;
           }
         np0 <<= prec;
         prec = 2 * prec;
     }      }
   
   rsize = tsize / 2;    ASSERT (2 * prec == GMP_LIMB_BITS); /* GMP_LIMB_BITS must be a power of 2 */
   rp = root_ptr;  
   
   TMP_MARK (marker);    /* normalize back, assuming GMP_NAIL_BITS is even */
     ASSERT (GMP_NAIL_BITS % 2 == 0);
     sp[0] = s >> HALF_NAIL;
     u = s - (sp[0] << HALF_NAIL); /* s mod 2^HALF_NAIL */
     r += u * ((sp[0] << (HALF_NAIL + 1)) + u);
     r = r >> GMP_NAIL_BITS;
   
   /* Shift OP an even number of bits into T, such that either the most or    if (rp != NULL)
      the second most significant bit is set, and such that the number of      rp[0] = r;
      words in T becomes even.  This way, the number of words in R=sqrt(OP)    return r != 0 ? 1 : 0;
      is exactly half as many as in OP, and the most significant bit of R  }
      is set.  
   
      Also, the initial approximation is simplified by this up-shifted OP.  
   
      Finally, the Newtonian iteration which is the main part of this  #define Prec (GMP_NUMB_BITS >> 1)
      program performs division by R.  The fast division routine expects  
      the divisor to be "normalized" in exactly the sense of having the  
      most significant bit set.  */  
   
   tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);  /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
      return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
   static mp_limb_t
   mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
   {
     mp_limb_t qhl, q, u, np0;
     int cc;
   
   if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)    ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
     t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,  
                           (cnt & ~1) % BITS_PER_MP_LIMB);  
   else  
     MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);  
   
   if (cnt >= BITS_PER_MP_LIMB)    np0 = np[0];
     tp[0] = 0;    mpn_sqrtrem1 (sp, rp, np + 1);
     qhl = 0;
   t_high0 = tp[tsize - 1];    while (rp[0] >= sp[0])
   t_high1 = tp[tsize - 2];      /* Never stray.  TSIZE is >= 2.  */  
   
 /* Is there a fast sqrt instruction defined for this machine?  */  
 #ifdef SQRT  
   {  
     initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);  
     /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have  
        become incorrect due to overflow in the conversion from double to  
        mp_limb_t above.  It will typically be zero in that case, but might be  
        a small number on some machines.  The most significant bit of  
        INITIAL_APPROX should be set, so that bit is a good overflow  
        indication.  */  
     if ((mp_limb_signed_t) initial_approx >= 0)  
       initial_approx = ~(mp_limb_t)0;  
   }  
 #else  
   /* Get a 9 bit approximation from the tables.  The tables expect to  
      be indexed with the 8 high bits right below the highest bit.  
      Also, the highest result bit is not returned by the tables, and  
      must be or:ed into the result.  The scheme gives 9 bits of start  
      approximation with just 256-entry 8 bit tables.  */  
   
   if ((cnt & 1) == 0)  
     {      {
       /* The most significant bit of t_high0 is set.  */        qhl++;
       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);        rp[0] -= sp[0];
       initial_approx &= 0xff;  
       initial_approx = even_approx_tab[initial_approx];  
     }      }
   else    /* now rp[0] < sp[0] < 2^Prec */
     rp[0] = (rp[0] << Prec) + (np0 >> Prec);
     u = 2 * sp[0];
     q = rp[0] / u;
     u = rp[0] - q * u;
     q += (qhl & 1) << (Prec - 1);
     qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */
     /* now we have (initial rp[0])<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp[0]) + u */
     sp[0] = ((sp[0] + qhl) << Prec) + q;
     cc = u >> Prec;
     rp[0] = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
     /* subtract q * q or qhl*2^(2*Prec) from rp */
     cc -= mpn_sub_1 (rp, rp, 1, q * q) + qhl;
     /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
     if (cc < 0)
     {      {
       /* The most significant bit of t_high0 is unset,        cc += sp[0] != 0 ? mpn_add_1 (rp, rp, 1, sp[0]) : 1;
          the second most significant is set.  */        cc += mpn_add_1 (rp, rp, 1, --sp[0]);
       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);  
       initial_approx &= 0xff;  
       initial_approx = odd_approx_tab[initial_approx];  
     }      }
   initial_approx |= 0x100;  
   initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;  
   
   /* Perform small precision Newtonian iterations to get a full word    return cc;
      approximation.  For small operands, these iterations will do the  }
      entire job.  */  
   if (t_high0 == ~(mp_limb_t)0)  
     initial_approx = t_high0;  
   else  
     {  
       mp_limb_t quot;  
   
       if (t_high0 >= initial_approx)  /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
         initial_approx = t_high0 + 1;     and in {np, n} the low n limbs of the remainder, returns the high
      limb of the remainder (which is 0 or 1).
      Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
      where B=2^GMP_NUMB_BITS.  */
   static mp_limb_t
   mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
   {
     mp_limb_t q;                  /* carry out of {sp, n} */
     int c, b;                     /* carry out of remainder */
     mp_size_t l, h;
   
       /* First get about 18 bits with pure C arithmetics.  */    ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
       quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;  
       initial_approx = (initial_approx + quot) / 2;  
       initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);  
   
       /* Now get a full word by one (or for > 36 bit machines) several    if (n == 1)
          iterations.  */      c = mpn_sqrtrem2 (sp, np, np);
       for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)    else
         {      {
           mp_limb_t ignored_remainder;        l = n / 2;
         h = n - l;
         q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
         if (q != 0)
           mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
         q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
         c = sp[0] & 1;
         mpn_rshift (sp, sp, l, 1);
         sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
         q >>= 1;
         if (c != 0)
           c = mpn_add_n (np + l, np + l, sp + l, h);
         mpn_sqr_n (np + n, sp, l);
         b = q + mpn_sub_n (np, np, np + n, 2 * l);
         c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, b);
         q = mpn_add_1 (sp + l, sp + l, h, q);
   
           udiv_qrnnd (quot, ignored_remainder,        if (c < 0)
                       t_high0, t_high1, initial_approx);          {
           initial_approx = (initial_approx + quot) / 2;            c += mpn_addmul_1 (np, sp, n, 2) + 2 * q;
           initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);            c -= mpn_sub_1 (np, np, n, 1);
         }            q -= mpn_sub_1 (sp, sp, n, 1);
           }
     }      }
 #endif  
   
   rp[0] = initial_approx;    return c;
   rsize = 1;  }
   
 #ifdef SQRT_DEBUG  
           printf ("\n\nT = ");  
           mpn_dump (tp, tsize);  
 #endif  
   
   if (tsize > 2)  mp_size_t
     {  mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
       /* Determine the successive precisions to use in the iteration.  We  {
          minimize the precisions, beginning with the highest (i.e. last    mp_limb_t *tp, s0[1], cc, high, rl;
          iteration) to the lowest (i.e. first iteration).  */    int c;
     mp_size_t rn, tn;
     TMP_DECL (marker);
   
       xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);    ASSERT (nn >= 0);
       ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);  
   
       t_end_ptr = tp + tsize;    /* If OP is zero, both results are zero.  */
     if (nn == 0)
       return 0;
   
       tmp = tsize / 2;    ASSERT (np[nn - 1] != 0);
       for (i = 0;; i++)    ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
         {    ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
           tsize = (tmp + 1) / 2;    ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
           if (tmp == tsize)  
             break;  
           tsizes[i] = tsize + tmp;  
           tmp = tsize;  
         }  
   
       /* Main Newton iteration loop.  For big arguments, most of the    high = np[nn - 1];
          time is spent here.  */    if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
       return mpn_sqrtrem1 (sp, rp, np);
     count_leading_zeros (c, high);
     c -= GMP_NAIL_BITS;
   
       /* It is possible to do a great optimization here.  The successive    c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
          divisors in the mpn_divmod call below have more and more leading    tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
          words equal to its predecessor.  Therefore the beginning of  
          each division will repeat the same work as did the last  
          division.  If we could guarantee that the leading words of two  
          consecutive divisors are the same (i.e. in this case, a later  
          divisor has just more digits at the end) it would be a simple  
          matter of just using the old remainder of the last division in  
          a subsequent division, to take care of this optimization.  This  
          idea would surely make a difference even for small arguments.  */  
   
       /* Loop invariants:    TMP_MARK (marker);
     if (nn % 2 != 0 || c > 0)
          R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.      {
          X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.        tp = TMP_ALLOC_LIMBS (2 * tn);
          R <= shiftdown_to_same_size(X).  */        tp[0] = 0;             /* needed only when 2*tn > nn, but saves a test */
         if (c != 0)
       while (--i >= 0)          mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
         else
           MPN_COPY (tp + 2 * tn - nn, np, nn);
         rl = mpn_dc_sqrtrem (sp, tp, tn);
         /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
            thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
         c += (nn % 2) * GMP_NUMB_BITS / 2;                /* c now represents k */
         s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);       /* S mod 2^k */
         rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);       /* R = R + 2*s0*S */
         cc = mpn_submul_1 (tp, s0, 1, s0[0]);
         rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
         mpn_rshift (sp, sp, tn, c);
         tp[tn] = rl;
         if (rp == NULL)
           rp = tp;
         c = c << 1;
         if (c < GMP_NUMB_BITS)
           tn++;
         else
         {          {
           mp_limb_t cy;            tp++;
 #ifdef SQRT_DEBUG            c -= GMP_NUMB_BITS;
           mp_limb_t old_least_sign_r = rp[0];  
           mp_size_t old_rsize = rsize;  
   
           printf ("R = ");  
           mpn_dump (rp, rsize);  
 #endif  
           tsize = tsizes[i];  
   
           /* Need to copy the numerator into temporary space, as  
              mpn_divmod overwrites its numerator argument with the  
              remainder (which we currently ignore).  */  
           MPN_COPY (ttp, t_end_ptr - tsize, tsize);  
           cy = mpn_divmod (xp, ttp, tsize, rp, rsize);  
           xsize = tsize - rsize;  
   
 #ifdef SQRT_DEBUG  
           printf ("X =%d ", cy);  
           mpn_dump (xp, xsize);  
 #endif  
   
           /* Add X and R with the most significant limbs aligned,  
              temporarily ignoring at least one limb at the low end of X.  */  
           tmp = xsize - rsize;  
           cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);  
   
           /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get  
              intermediate roots that'd need an extra bit.  We don't want to  
              handle that since it would make the subsequent divisor  
              non-normalized, so round such roots down to be only ones in the  
              current precision.  */  
           if (cy == 2)  
             {  
               mp_size_t j;  
               for (j = xsize; j >= 0; j--)  
                 xp[j] = ~(mp_limb_t)0;  
             }  
   
           /* Divide X by 2 and put the result in R.  This is the new  
              approximation.  Shift in the carry from the addition.  */  
           mpn_rshift (rp, xp, xsize, 1);  
           rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));  
           rsize = xsize;  
 #ifdef SQRT_DEBUG  
           if (old_least_sign_r != rp[rsize - old_rsize])  
             printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",  
                     i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,  
                     2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);  
 #endif  
         }          }
         if (c != 0)
           mpn_rshift (rp, tp, tn, c);
         else
           MPN_COPY_INCR (rp, tp, tn);
         rn = tn;
     }      }
     else
 #ifdef SQRT_DEBUG  
   printf ("(final) R = ");  
   mpn_dump (rp, rsize);  
 #endif  
   
   /* We computed the square root of OP * 2**(2*floor(cnt/2)).  
      This has resulted in R being 2**floor(cnt/2) to large.  
      Shift it down here to fix that.  */  
   if (cnt / 2 != 0)  
     {      {
       mpn_rshift (rp, rp, rsize, cnt/2);        if (rp == NULL)
       rsize -= rp[rsize - 1] == 0;          rp = TMP_ALLOC_LIMBS (nn);
         if (rp != np)
           MPN_COPY (rp, np, nn);
         rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
     }      }
   
   /* Calculate the remainder.  */    MPN_NORMALIZE (rp, rn);
   mpn_mul_n (tp, rp, rp, rsize);  
   tsize = rsize + rsize;  
   tsize -= tp[tsize - 1] == 0;  
   if (op_size < tsize  
       || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))  
     {  
       /* R is too large.  Decrement it.  */  
   
       /* These operations can't overflow.  */    TMP_FREE (marker);
       cy_limb  = mpn_sub_n (tp, tp, rp, rsize);    return rn;
       cy_limb += mpn_sub_n (tp, tp, rp, rsize);  
       mpn_decr_u (tp + rsize, cy_limb);  
       mpn_incr_u (tp, (mp_limb_t) 1);  
   
       mpn_decr_u (rp, (mp_limb_t) 1);  
   
 #ifdef SQRT_DEBUG  
       printf ("(adjusted) R = ");  
       mpn_dump (rp, rsize);  
 #endif  
     }  
   
   if (rem_ptr != NULL)  
     {  
       cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);  
       MPN_NORMALIZE (rem_ptr, op_size);  
       TMP_FREE (marker);  
       return op_size;  
     }  
   else  
     {  
       int res;  
       res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);  
       TMP_FREE (marker);  
       return res;  
     }  
 }  }

Legend:
Removed from v.1.1.1.2  
changed lines
  Added in v.1.1.1.3

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>