version 1.1.1.2, 2000/09/09 14:12:27 |
version 1.1.1.3, 2003/08/25 16:06:20 |
|
|
/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) |
/* mpn_sqrtrem -- square root and remainder */ |
|
|
Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR. |
/* |
Write the remainder at REM_PTR, if REM_PTR != NULL. |
Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
Return the size of the remainder. |
|
(The size of the root is always half of the size of the operand.) |
|
|
|
OP_PTR and ROOT_PTR may not point to the same object. |
|
OP_PTR and REM_PTR may point to the same object. |
|
|
|
If REM_PTR is NULL, only the root is computed and the return value of |
|
the function is 0 if OP is a perfect square, and *any* non-zero number |
|
otherwise. |
|
|
|
Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software |
|
Foundation, Inc. |
|
|
|
This file is part of the GNU MP Library. |
This file is part of the GNU MP Library. |
|
|
The GNU MP Library is free software; you can redistribute it and/or modify |
The GNU MP Library is free software; you can redistribute it and/or modify |
Line 30 License for more details. |
|
Line 18 License for more details. |
|
You should have received a copy of the GNU Lesser General Public License |
You should have received a copy of the GNU Lesser General Public License |
along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
MA 02111-1307, USA. */ |
MA 02111-1307, USA. |
|
*/ |
|
|
/* This code is just correct if "unsigned char" has at least 8 bits. It |
|
doesn't help to use CHAR_BIT from limits.h, as the real problem is |
|
the static arrays. */ |
|
|
|
#include <stdio.h> /* for NULL */ |
/* Contributed by Paul Zimmermann. |
|
See "Karatsuba Square Root", reference in gmp.texi. */ |
|
|
|
|
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
|
#include "gmp.h" |
#include "gmp.h" |
#include "gmp-impl.h" |
#include "gmp-impl.h" |
#include "longlong.h" |
#include "longlong.h" |
|
|
/* Square root algorithm: |
|
|
|
1. Shift OP (the input) to the left an even number of bits s.t. there |
|
are an even number of words and either (or both) of the most |
|
significant bits are set. This way, sqrt(OP) has exactly half as |
|
many words as OP, and has its most significant bit set. |
|
|
|
2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables. |
/* Square roots table. Generated by the following program: |
This approximation is used for the first single-precision |
#include "gmp.h" |
iterations of Newton's method, yielding a full-word approximation |
main(){mpz_t x;int i;mpz_init(x);for(i=64;i<256;i++){mpz_set_ui(x,256*i); |
to sqrt(OP). |
mpz_sqrt(x,x);mpz_out_str(0,10,x);printf(",");if(i%16==15)printf("\n");}} |
|
*/ |
|
static const unsigned char approx_tab[192] = |
|
{ |
|
128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142, |
|
143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155, |
|
156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168, |
|
169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180, |
|
181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191, |
|
192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201, |
|
202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211, |
|
212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221, |
|
221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230, |
|
230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238, |
|
239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247, |
|
247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255 |
|
}; |
|
|
3. Perform multiple-precision Newton iteration until we have the |
#define HALF_NAIL (GMP_NAIL_BITS / 2) |
exact result. Only about half of the input operand is used in |
|
this calculation, as the square root is perfectly determinable |
|
from just the higher half of a number. */ |
|
|
|
/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */ |
|
#if defined __GNUC__ && ! defined __SOFT_FLOAT |
|
|
|
#if defined (__sparc__) && BITS_PER_MP_LIMB == 32 |
/* same as mpn_sqrtrem, but for size=1 and {np, 1} normalized */ |
#define SQRT(a) \ |
static mp_size_t |
({ \ |
mpn_sqrtrem1 (mp_ptr sp, mp_ptr rp, mp_srcptr np) |
double __sqrt_res; \ |
|
asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ |
|
__sqrt_res; \ |
|
}) |
|
#endif |
|
|
|
#if defined (__HAVE_68881__) |
|
#define SQRT(a) \ |
|
({ \ |
|
double __sqrt_res; \ |
|
asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ |
|
__sqrt_res; \ |
|
}) |
|
#endif |
|
|
|
#if defined (__hppa) && BITS_PER_MP_LIMB == 32 |
|
#define SQRT(a) \ |
|
({ \ |
|
double __sqrt_res; \ |
|
asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \ |
|
__sqrt_res; \ |
|
}) |
|
#endif |
|
|
|
#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32 |
|
#define SQRT(a) \ |
|
({ \ |
|
double __sqrt_res; \ |
|
asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \ |
|
__sqrt_res; \ |
|
}) |
|
#endif |
|
|
|
#if 0 |
|
#if defined (__i386__) || defined (__i486__) |
|
#define SQRT(a) \ |
|
({ \ |
|
double __sqrt_res; \ |
|
asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \ |
|
__sqrt_res; \ |
|
}) |
|
#endif |
|
#endif |
|
|
|
#endif |
|
|
|
#ifndef SQRT |
|
|
|
/* Tables for initial approximation of the square root. These are |
|
indexed with bits 1-8 of the operand for which the square root is |
|
calculated, where bit 0 is the most significant non-zero bit. I.e. |
|
the most significant one-bit is not used, since that per definition |
|
is one. Likewise, the tables don't return the highest bit of the |
|
result. That bit must be inserted by or:ing the returned value with |
|
0x100. This way, we get a 9-bit approximation from 8-bit tables! */ |
|
|
|
/* Table to be used for operands with an even total number of bits. |
|
(Exactly as in the decimal system there are similarities between the |
|
square root of numbers with the same initial digits and an even |
|
difference in the total number of digits. Consider the square root |
|
of 1, 10, 100, 1000, ...) */ |
|
static const unsigned char even_approx_tab[256] = |
|
{ |
{ |
0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e, |
mp_limb_t np0, s, r, q, u; |
0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74, |
int prec; |
0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79, |
|
0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f, |
|
0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84, |
|
0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89, |
|
0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f, |
|
0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94, |
|
0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99, |
|
0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e, |
|
0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3, |
|
0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7, |
|
0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac, |
|
0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1, |
|
0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6, |
|
0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba, |
|
0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf, |
|
0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3, |
|
0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8, |
|
0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc, |
|
0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1, |
|
0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5, |
|
0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda, |
|
0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde, |
|
0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2, |
|
0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6, |
|
0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb, |
|
0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef, |
|
0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3, |
|
0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7, |
|
0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb, |
|
0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff, |
|
}; |
|
|
|
/* Table to be used for operands with an odd total number of bits. |
ASSERT (np[0] >= GMP_NUMB_HIGHBIT / 2); |
(Further comments before previous table.) */ |
ASSERT (GMP_LIMB_BITS >= 16); |
static const unsigned char odd_approx_tab[256] = |
|
{ |
|
0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, |
|
0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, |
|
0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, |
|
0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, |
|
0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, |
|
0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, |
|
0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a, |
|
0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d, |
|
0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21, |
|
0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, |
|
0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28, |
|
0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b, |
|
0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f, |
|
0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32, |
|
0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35, |
|
0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39, |
|
0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c, |
|
0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f, |
|
0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42, |
|
0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45, |
|
0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49, |
|
0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c, |
|
0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f, |
|
0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52, |
|
0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55, |
|
0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58, |
|
0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b, |
|
0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e, |
|
0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61, |
|
0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63, |
|
0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66, |
|
0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69, |
|
}; |
|
#endif |
|
|
|
|
/* first compute a 8-bit approximation from the high 8-bits of np[0] */ |
mp_size_t |
np0 = np[0] << GMP_NAIL_BITS; |
#if __STDC__ |
q = np0 >> (GMP_LIMB_BITS - 8); |
mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size) |
/* 2^6 = 64 <= q < 256 = 2^8 */ |
#else |
s = approx_tab[q - 64]; /* 128 <= s < 255 */ |
mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) |
r = (np0 >> (GMP_LIMB_BITS - 16)) - s * s; /* r <= 2*s */ |
mp_ptr root_ptr; |
if (r > 2 * s) |
mp_ptr rem_ptr; |
{ |
mp_srcptr op_ptr; |
r -= 2 * s + 1; |
mp_size_t op_size; |
s++; |
#endif |
} |
{ |
|
/* R (root result) */ |
|
mp_ptr rp; /* Pointer to least significant word */ |
|
mp_size_t rsize; /* The size in words */ |
|
|
|
/* T (OP shifted to the left a.k.a. normalized) */ |
prec = 8; |
mp_ptr tp; /* Pointer to least significant word */ |
np0 <<= 2 * prec; |
mp_size_t tsize; /* The size in words */ |
while (2 * prec < GMP_LIMB_BITS) |
mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */ |
|
mp_limb_t t_high0, t_high1; /* The two most significant words */ |
|
|
|
/* TT (temporary for numerator/remainder) */ |
|
mp_ptr ttp; /* Pointer to least significant word */ |
|
|
|
/* X (temporary for quotient in main loop) */ |
|
mp_ptr xp; /* Pointer to least significant word */ |
|
mp_size_t xsize; /* The size in words */ |
|
|
|
unsigned cnt; |
|
mp_limb_t initial_approx; /* Initially made approximation */ |
|
mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */ |
|
mp_size_t tmp; |
|
mp_size_t i; |
|
|
|
mp_limb_t cy_limb; |
|
TMP_DECL (marker); |
|
|
|
/* If OP is zero, both results are zero. */ |
|
if (op_size == 0) |
|
return 0; |
|
|
|
count_leading_zeros (cnt, op_ptr[op_size - 1]); |
|
tsize = op_size; |
|
if ((tsize & 1) != 0) |
|
{ |
{ |
cnt += BITS_PER_MP_LIMB; |
/* invariant: s has prec bits, and r <= 2*s */ |
tsize++; |
r = (r << prec) + (np0 >> (GMP_LIMB_BITS - prec)); |
|
np0 <<= prec; |
|
u = 2 * s; |
|
q = r / u; |
|
u = r - q * u; |
|
s = (s << prec) + q; |
|
u = (u << prec) + (np0 >> (GMP_LIMB_BITS - prec)); |
|
q = q * q; |
|
r = u - q; |
|
if (u < q) |
|
{ |
|
r += 2 * s - 1; |
|
s --; |
|
} |
|
np0 <<= prec; |
|
prec = 2 * prec; |
} |
} |
|
|
rsize = tsize / 2; |
ASSERT (2 * prec == GMP_LIMB_BITS); /* GMP_LIMB_BITS must be a power of 2 */ |
rp = root_ptr; |
|
|
|
TMP_MARK (marker); |
/* normalize back, assuming GMP_NAIL_BITS is even */ |
|
ASSERT (GMP_NAIL_BITS % 2 == 0); |
|
sp[0] = s >> HALF_NAIL; |
|
u = s - (sp[0] << HALF_NAIL); /* s mod 2^HALF_NAIL */ |
|
r += u * ((sp[0] << (HALF_NAIL + 1)) + u); |
|
r = r >> GMP_NAIL_BITS; |
|
|
/* Shift OP an even number of bits into T, such that either the most or |
if (rp != NULL) |
the second most significant bit is set, and such that the number of |
rp[0] = r; |
words in T becomes even. This way, the number of words in R=sqrt(OP) |
return r != 0 ? 1 : 0; |
is exactly half as many as in OP, and the most significant bit of R |
} |
is set. |
|
|
|
Also, the initial approximation is simplified by this up-shifted OP. |
|
|
|
Finally, the Newtonian iteration which is the main part of this |
#define Prec (GMP_NUMB_BITS >> 1) |
program performs division by R. The fast division routine expects |
|
the divisor to be "normalized" in exactly the sense of having the |
|
most significant bit set. */ |
|
|
|
tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); |
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized |
|
return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */ |
|
static mp_limb_t |
|
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np) |
|
{ |
|
mp_limb_t qhl, q, u, np0; |
|
int cc; |
|
|
if ((cnt & ~1) % BITS_PER_MP_LIMB != 0) |
ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2); |
t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size, |
|
(cnt & ~1) % BITS_PER_MP_LIMB); |
|
else |
|
MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size); |
|
|
|
if (cnt >= BITS_PER_MP_LIMB) |
np0 = np[0]; |
tp[0] = 0; |
mpn_sqrtrem1 (sp, rp, np + 1); |
|
qhl = 0; |
t_high0 = tp[tsize - 1]; |
while (rp[0] >= sp[0]) |
t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */ |
|
|
|
/* Is there a fast sqrt instruction defined for this machine? */ |
|
#ifdef SQRT |
|
{ |
|
initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1); |
|
/* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have |
|
become incorrect due to overflow in the conversion from double to |
|
mp_limb_t above. It will typically be zero in that case, but might be |
|
a small number on some machines. The most significant bit of |
|
INITIAL_APPROX should be set, so that bit is a good overflow |
|
indication. */ |
|
if ((mp_limb_signed_t) initial_approx >= 0) |
|
initial_approx = ~(mp_limb_t)0; |
|
} |
|
#else |
|
/* Get a 9 bit approximation from the tables. The tables expect to |
|
be indexed with the 8 high bits right below the highest bit. |
|
Also, the highest result bit is not returned by the tables, and |
|
must be or:ed into the result. The scheme gives 9 bits of start |
|
approximation with just 256-entry 8 bit tables. */ |
|
|
|
if ((cnt & 1) == 0) |
|
{ |
{ |
/* The most significant bit of t_high0 is set. */ |
qhl++; |
initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1); |
rp[0] -= sp[0]; |
initial_approx &= 0xff; |
|
initial_approx = even_approx_tab[initial_approx]; |
|
} |
} |
else |
/* now rp[0] < sp[0] < 2^Prec */ |
|
rp[0] = (rp[0] << Prec) + (np0 >> Prec); |
|
u = 2 * sp[0]; |
|
q = rp[0] / u; |
|
u = rp[0] - q * u; |
|
q += (qhl & 1) << (Prec - 1); |
|
qhl >>= 1; /* if qhl=1, necessary q=0 as qhl*2^Prec + q <= 2^Prec */ |
|
/* now we have (initial rp[0])<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp[0]) + u */ |
|
sp[0] = ((sp[0] + qhl) << Prec) + q; |
|
cc = u >> Prec; |
|
rp[0] = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1)); |
|
/* subtract q * q or qhl*2^(2*Prec) from rp */ |
|
cc -= mpn_sub_1 (rp, rp, 1, q * q) + qhl; |
|
/* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */ |
|
if (cc < 0) |
{ |
{ |
/* The most significant bit of t_high0 is unset, |
cc += sp[0] != 0 ? mpn_add_1 (rp, rp, 1, sp[0]) : 1; |
the second most significant is set. */ |
cc += mpn_add_1 (rp, rp, 1, --sp[0]); |
initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2); |
|
initial_approx &= 0xff; |
|
initial_approx = odd_approx_tab[initial_approx]; |
|
} |
} |
initial_approx |= 0x100; |
|
initial_approx <<= BITS_PER_MP_LIMB - 8 - 1; |
|
|
|
/* Perform small precision Newtonian iterations to get a full word |
return cc; |
approximation. For small operands, these iterations will do the |
} |
entire job. */ |
|
if (t_high0 == ~(mp_limb_t)0) |
|
initial_approx = t_high0; |
|
else |
|
{ |
|
mp_limb_t quot; |
|
|
|
if (t_high0 >= initial_approx) |
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n}, |
initial_approx = t_high0 + 1; |
and in {np, n} the low n limbs of the remainder, returns the high |
|
limb of the remainder (which is 0 or 1). |
|
Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4 |
|
where B=2^GMP_NUMB_BITS. */ |
|
static mp_limb_t |
|
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n) |
|
{ |
|
mp_limb_t q; /* carry out of {sp, n} */ |
|
int c, b; /* carry out of remainder */ |
|
mp_size_t l, h; |
|
|
/* First get about 18 bits with pure C arithmetics. */ |
ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2); |
quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2; |
|
initial_approx = (initial_approx + quot) / 2; |
|
initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); |
|
|
|
/* Now get a full word by one (or for > 36 bit machines) several |
if (n == 1) |
iterations. */ |
c = mpn_sqrtrem2 (sp, np, np); |
for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1) |
else |
{ |
{ |
mp_limb_t ignored_remainder; |
l = n / 2; |
|
h = n - l; |
|
q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h); |
|
if (q != 0) |
|
mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h); |
|
q += mpn_divrem (sp, 0, np + l, n, sp + l, h); |
|
c = sp[0] & 1; |
|
mpn_rshift (sp, sp, l, 1); |
|
sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; |
|
q >>= 1; |
|
if (c != 0) |
|
c = mpn_add_n (np + l, np + l, sp + l, h); |
|
mpn_sqr_n (np + n, sp, l); |
|
b = q + mpn_sub_n (np, np, np + n, 2 * l); |
|
c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, b); |
|
q = mpn_add_1 (sp + l, sp + l, h, q); |
|
|
udiv_qrnnd (quot, ignored_remainder, |
if (c < 0) |
t_high0, t_high1, initial_approx); |
{ |
initial_approx = (initial_approx + quot) / 2; |
c += mpn_addmul_1 (np, sp, n, 2) + 2 * q; |
initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); |
c -= mpn_sub_1 (np, np, n, 1); |
} |
q -= mpn_sub_1 (sp, sp, n, 1); |
|
} |
} |
} |
#endif |
|
|
|
rp[0] = initial_approx; |
return c; |
rsize = 1; |
} |
|
|
#ifdef SQRT_DEBUG |
|
printf ("\n\nT = "); |
|
mpn_dump (tp, tsize); |
|
#endif |
|
|
|
if (tsize > 2) |
mp_size_t |
{ |
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn) |
/* Determine the successive precisions to use in the iteration. We |
{ |
minimize the precisions, beginning with the highest (i.e. last |
mp_limb_t *tp, s0[1], cc, high, rl; |
iteration) to the lowest (i.e. first iteration). */ |
int c; |
|
mp_size_t rn, tn; |
|
TMP_DECL (marker); |
|
|
xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); |
ASSERT (nn >= 0); |
ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); |
|
|
|
t_end_ptr = tp + tsize; |
/* If OP is zero, both results are zero. */ |
|
if (nn == 0) |
|
return 0; |
|
|
tmp = tsize / 2; |
ASSERT (np[nn - 1] != 0); |
for (i = 0;; i++) |
ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn)); |
{ |
ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn)); |
tsize = (tmp + 1) / 2; |
ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn)); |
if (tmp == tsize) |
|
break; |
|
tsizes[i] = tsize + tmp; |
|
tmp = tsize; |
|
} |
|
|
|
/* Main Newton iteration loop. For big arguments, most of the |
high = np[nn - 1]; |
time is spent here. */ |
if (nn == 1 && (high & GMP_NUMB_HIGHBIT)) |
|
return mpn_sqrtrem1 (sp, rp, np); |
|
count_leading_zeros (c, high); |
|
c -= GMP_NAIL_BITS; |
|
|
/* It is possible to do a great optimization here. The successive |
c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */ |
divisors in the mpn_divmod call below have more and more leading |
tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */ |
words equal to its predecessor. Therefore the beginning of |
|
each division will repeat the same work as did the last |
|
division. If we could guarantee that the leading words of two |
|
consecutive divisors are the same (i.e. in this case, a later |
|
divisor has just more digits at the end) it would be a simple |
|
matter of just using the old remainder of the last division in |
|
a subsequent division, to take care of this optimization. This |
|
idea would surely make a difference even for small arguments. */ |
|
|
|
/* Loop invariants: |
TMP_MARK (marker); |
|
if (nn % 2 != 0 || c > 0) |
R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1. |
{ |
X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X. |
tp = TMP_ALLOC_LIMBS (2 * tn); |
R <= shiftdown_to_same_size(X). */ |
tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */ |
|
if (c != 0) |
while (--i >= 0) |
mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c); |
|
else |
|
MPN_COPY (tp + 2 * tn - nn, np, nn); |
|
rl = mpn_dc_sqrtrem (sp, tp, tn); |
|
/* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2, |
|
thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */ |
|
c += (nn % 2) * GMP_NUMB_BITS / 2; /* c now represents k */ |
|
s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1); /* S mod 2^k */ |
|
rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */ |
|
cc = mpn_submul_1 (tp, s0, 1, s0[0]); |
|
rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc; |
|
mpn_rshift (sp, sp, tn, c); |
|
tp[tn] = rl; |
|
if (rp == NULL) |
|
rp = tp; |
|
c = c << 1; |
|
if (c < GMP_NUMB_BITS) |
|
tn++; |
|
else |
{ |
{ |
mp_limb_t cy; |
tp++; |
#ifdef SQRT_DEBUG |
c -= GMP_NUMB_BITS; |
mp_limb_t old_least_sign_r = rp[0]; |
|
mp_size_t old_rsize = rsize; |
|
|
|
printf ("R = "); |
|
mpn_dump (rp, rsize); |
|
#endif |
|
tsize = tsizes[i]; |
|
|
|
/* Need to copy the numerator into temporary space, as |
|
mpn_divmod overwrites its numerator argument with the |
|
remainder (which we currently ignore). */ |
|
MPN_COPY (ttp, t_end_ptr - tsize, tsize); |
|
cy = mpn_divmod (xp, ttp, tsize, rp, rsize); |
|
xsize = tsize - rsize; |
|
|
|
#ifdef SQRT_DEBUG |
|
printf ("X =%d ", cy); |
|
mpn_dump (xp, xsize); |
|
#endif |
|
|
|
/* Add X and R with the most significant limbs aligned, |
|
temporarily ignoring at least one limb at the low end of X. */ |
|
tmp = xsize - rsize; |
|
cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize); |
|
|
|
/* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get |
|
intermediate roots that'd need an extra bit. We don't want to |
|
handle that since it would make the subsequent divisor |
|
non-normalized, so round such roots down to be only ones in the |
|
current precision. */ |
|
if (cy == 2) |
|
{ |
|
mp_size_t j; |
|
for (j = xsize; j >= 0; j--) |
|
xp[j] = ~(mp_limb_t)0; |
|
} |
|
|
|
/* Divide X by 2 and put the result in R. This is the new |
|
approximation. Shift in the carry from the addition. */ |
|
mpn_rshift (rp, xp, xsize, 1); |
|
rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)); |
|
rsize = xsize; |
|
#ifdef SQRT_DEBUG |
|
if (old_least_sign_r != rp[rsize - old_rsize]) |
|
printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n", |
|
i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r, |
|
2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]); |
|
#endif |
|
} |
} |
|
if (c != 0) |
|
mpn_rshift (rp, tp, tn, c); |
|
else |
|
MPN_COPY_INCR (rp, tp, tn); |
|
rn = tn; |
} |
} |
|
else |
#ifdef SQRT_DEBUG |
|
printf ("(final) R = "); |
|
mpn_dump (rp, rsize); |
|
#endif |
|
|
|
/* We computed the square root of OP * 2**(2*floor(cnt/2)). |
|
This has resulted in R being 2**floor(cnt/2) to large. |
|
Shift it down here to fix that. */ |
|
if (cnt / 2 != 0) |
|
{ |
{ |
mpn_rshift (rp, rp, rsize, cnt/2); |
if (rp == NULL) |
rsize -= rp[rsize - 1] == 0; |
rp = TMP_ALLOC_LIMBS (nn); |
|
if (rp != np) |
|
MPN_COPY (rp, np, nn); |
|
rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn)); |
} |
} |
|
|
/* Calculate the remainder. */ |
MPN_NORMALIZE (rp, rn); |
mpn_mul_n (tp, rp, rp, rsize); |
|
tsize = rsize + rsize; |
|
tsize -= tp[tsize - 1] == 0; |
|
if (op_size < tsize |
|
|| (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0)) |
|
{ |
|
/* R is too large. Decrement it. */ |
|
|
|
/* These operations can't overflow. */ |
TMP_FREE (marker); |
cy_limb = mpn_sub_n (tp, tp, rp, rsize); |
return rn; |
cy_limb += mpn_sub_n (tp, tp, rp, rsize); |
|
mpn_decr_u (tp + rsize, cy_limb); |
|
mpn_incr_u (tp, (mp_limb_t) 1); |
|
|
|
mpn_decr_u (rp, (mp_limb_t) 1); |
|
|
|
#ifdef SQRT_DEBUG |
|
printf ("(adjusted) R = "); |
|
mpn_dump (rp, rsize); |
|
#endif |
|
} |
|
|
|
if (rem_ptr != NULL) |
|
{ |
|
cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize); |
|
MPN_NORMALIZE (rem_ptr, op_size); |
|
TMP_FREE (marker); |
|
return op_size; |
|
} |
|
else |
|
{ |
|
int res; |
|
res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size); |
|
TMP_FREE (marker); |
|
return res; |
|
} |
|
} |
} |