OpenXM_contrib/gmp/mpn/generic/sqrtrem.c - annotate

Return to sqrtrem.c CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / generic
Annotation of OpenXM_contrib/gmp/mpn/generic/sqrtrem.c, Revision 1.1.1.2

1.1       maekawa     1: /* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
                      2:
                      3:    Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
                      4:    Write the remainder at REM_PTR, if REM_PTR != NULL.
                      5:    Return the size of the remainder.
                      6:    (The size of the root is always half of the size of the operand.)
                      7:
                      8:    OP_PTR and ROOT_PTR may not point to the same object.
                      9:    OP_PTR and REM_PTR may point to the same object.
                     10:
                     11:    If REM_PTR is NULL, only the root is computed and the return value of
                     12:    the function is 0 if OP is a perfect square, and *any* non-zero number
                     13:    otherwise.
                     14:
1.1.1.2 ! maekawa    15: Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
        !            16: Foundation, Inc.
1.1       maekawa    17:
                     18: This file is part of the GNU MP Library.
                     19:
                     20: The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2 ! maekawa    21: it under the terms of the GNU Lesser General Public License as published by
        !            22: the Free Software Foundation; either version 2.1 of the License, or (at your
1.1       maekawa    23: option) any later version.
                     24:
                     25: The GNU MP Library is distributed in the hope that it will be useful, but
                     26: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2 ! maekawa    27: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
1.1       maekawa    28: License for more details.
                     29:
1.1.1.2 ! maekawa    30: You should have received a copy of the GNU Lesser General Public License
1.1       maekawa    31: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     32: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     33: MA 02111-1307, USA. */
                     34:
                     35: /* This code is just correct if "unsigned char" has at least 8 bits.  It
                     36:    doesn't help to use CHAR_BIT from limits.h, as the real problem is
                     37:    the static arrays.  */
                     38:
1.1.1.2 ! maekawa    39: #include <stdio.h> /* for NULL */
1.1       maekawa    40: #include "gmp.h"
                     41: #include "gmp-impl.h"
                     42: #include "longlong.h"
                     43:
                     44: /* Square root algorithm:
                     45:
                     46:    1. Shift OP (the input) to the left an even number of bits s.t. there
                     47:       are an even number of words and either (or both) of the most
                     48:       significant bits are set.  This way, sqrt(OP) has exactly half as
                     49:       many words as OP, and has its most significant bit set.
                     50:
                     51:    2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
                     52:       This approximation is used for the first single-precision
                     53:       iterations of Newton's method, yielding a full-word approximation
                     54:       to sqrt(OP).
                     55:
                     56:    3. Perform multiple-precision Newton iteration until we have the
                     57:       exact result.  Only about half of the input operand is used in
                     58:       this calculation, as the square root is perfectly determinable
                     59:       from just the higher half of a number.  */
                     60: 
                     61: /* Define this macro for IEEE P854 machines with a fast sqrt instruction.  */
                     62: #if defined __GNUC__ && ! defined __SOFT_FLOAT
                     63:
1.1.1.2 ! maekawa    64: #if defined (__sparc__) && BITS_PER_MP_LIMB == 32
1.1       maekawa    65: #define SQRT(a) \
                     66:   ({                                                                   \
                     67:     double __sqrt_res;                                                 \
                     68:     asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a));                        \
                     69:     __sqrt_res;                                                                \
                     70:   })
                     71: #endif
                     72:
1.1.1.2 ! maekawa    73: #if defined (__HAVE_68881__)
1.1       maekawa    74: #define SQRT(a) \
                     75:   ({                                                                   \
                     76:     double __sqrt_res;                                                 \
                     77:     asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a));                        \
                     78:     __sqrt_res;                                                                \
                     79:   })
                     80: #endif
                     81:
1.1.1.2 ! maekawa    82: #if defined (__hppa) && BITS_PER_MP_LIMB == 32
1.1       maekawa    83: #define SQRT(a) \
                     84:   ({                                                                   \
                     85:     double __sqrt_res;                                                 \
                     86:     asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a));           \
                     87:     __sqrt_res;                                                                \
                     88:   })
                     89: #endif
                     90:
1.1.1.2 ! maekawa    91: #if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
1.1       maekawa    92: #define SQRT(a) \
                     93:   ({                                                                   \
                     94:     double __sqrt_res;                                                 \
                     95:     asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a));                 \
                     96:     __sqrt_res;                                                                \
                     97:   })
                     98: #endif
                     99:
1.1.1.2 ! maekawa   100: #if 0
        !           101: #if defined (__i386__) || defined (__i486__)
        !           102: #define SQRT(a) \
        !           103:   ({                                                                   \
        !           104:     double __sqrt_res;                                                 \
        !           105:     asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a));                       \
        !           106:     __sqrt_res;                                                                \
        !           107:   })
        !           108: #endif
        !           109: #endif
        !           110:
1.1       maekawa   111: #endif
                    112:
                    113: #ifndef SQRT
                    114:
                    115: /* Tables for initial approximation of the square root.  These are
                    116:    indexed with bits 1-8 of the operand for which the square root is
                    117:    calculated, where bit 0 is the most significant non-zero bit.  I.e.
                    118:    the most significant one-bit is not used, since that per definition
                    119:    is one.  Likewise, the tables don't return the highest bit of the
                    120:    result.  That bit must be inserted by or:ing the returned value with
                    121:    0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */
                    122:
                    123: /* Table to be used for operands with an even total number of bits.
                    124:    (Exactly as in the decimal system there are similarities between the
                    125:    square root of numbers with the same initial digits and an even
                    126:    difference in the total number of digits.  Consider the square root
                    127:    of 1, 10, 100, 1000, ...)  */
1.1.1.2 ! maekawa   128: static const unsigned char even_approx_tab[256] =
1.1       maekawa   129: {
                    130:   0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
                    131:   0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
                    132:   0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
                    133:   0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
                    134:   0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
                    135:   0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
                    136:   0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
                    137:   0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
                    138:   0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
                    139:   0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
                    140:   0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
                    141:   0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
                    142:   0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
                    143:   0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
                    144:   0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
                    145:   0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
                    146:   0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
                    147:   0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
                    148:   0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
                    149:   0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
                    150:   0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
                    151:   0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
                    152:   0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
                    153:   0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
                    154:   0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
                    155:   0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
                    156:   0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
                    157:   0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
                    158:   0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
                    159:   0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
                    160:   0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
                    161:   0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
                    162: };
                    163:
                    164: /* Table to be used for operands with an odd total number of bits.
                    165:    (Further comments before previous table.)  */
1.1.1.2 ! maekawa   166: static const unsigned char odd_approx_tab[256] =
1.1       maekawa   167: {
                    168:   0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
                    169:   0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
                    170:   0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
                    171:   0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
                    172:   0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
                    173:   0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
                    174:   0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
                    175:   0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
                    176:   0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
                    177:   0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
                    178:   0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
                    179:   0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
                    180:   0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
                    181:   0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
                    182:   0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
                    183:   0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
                    184:   0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
                    185:   0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
                    186:   0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
                    187:   0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
                    188:   0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
                    189:   0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
                    190:   0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
                    191:   0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
                    192:   0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
                    193:   0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
                    194:   0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
                    195:   0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
                    196:   0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
                    197:   0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
                    198:   0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
                    199:   0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
                    200: };
                    201: #endif
                    202:
                    203: 
                    204: mp_size_t
                    205: #if __STDC__
                    206: mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
                    207: #else
                    208: mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
                    209:      mp_ptr root_ptr;
                    210:      mp_ptr rem_ptr;
                    211:      mp_srcptr op_ptr;
                    212:      mp_size_t op_size;
                    213: #endif
                    214: {
                    215:   /* R (root result) */
                    216:   mp_ptr rp;                   /* Pointer to least significant word */
                    217:   mp_size_t rsize;             /* The size in words */
                    218:
                    219:   /* T (OP shifted to the left a.k.a. normalized) */
                    220:   mp_ptr tp;                   /* Pointer to least significant word */
                    221:   mp_size_t tsize;             /* The size in words */
                    222:   mp_ptr t_end_ptr;            /* Pointer right beyond most sign. word */
                    223:   mp_limb_t t_high0, t_high1;  /* The two most significant words */
                    224:
                    225:   /* TT (temporary for numerator/remainder) */
                    226:   mp_ptr ttp;                  /* Pointer to least significant word */
                    227:
                    228:   /* X (temporary for quotient in main loop) */
                    229:   mp_ptr xp;                   /* Pointer to least significant word */
                    230:   mp_size_t xsize;             /* The size in words */
                    231:
                    232:   unsigned cnt;
                    233:   mp_limb_t initial_approx;    /* Initially made approximation */
                    234:   mp_size_t tsizes[BITS_PER_MP_LIMB];  /* Successive calculation precisions */
                    235:   mp_size_t tmp;
                    236:   mp_size_t i;
                    237:
                    238:   mp_limb_t cy_limb;
                    239:   TMP_DECL (marker);
                    240:
                    241:   /* If OP is zero, both results are zero.  */
                    242:   if (op_size == 0)
                    243:     return 0;
                    244:
                    245:   count_leading_zeros (cnt, op_ptr[op_size - 1]);
                    246:   tsize = op_size;
                    247:   if ((tsize & 1) != 0)
                    248:     {
                    249:       cnt += BITS_PER_MP_LIMB;
                    250:       tsize++;
                    251:     }
                    252:
                    253:   rsize = tsize / 2;
                    254:   rp = root_ptr;
                    255:
                    256:   TMP_MARK (marker);
                    257:
                    258:   /* Shift OP an even number of bits into T, such that either the most or
                    259:      the second most significant bit is set, and such that the number of
                    260:      words in T becomes even.  This way, the number of words in R=sqrt(OP)
                    261:      is exactly half as many as in OP, and the most significant bit of R
                    262:      is set.
                    263:
                    264:      Also, the initial approximation is simplified by this up-shifted OP.
                    265:
                    266:      Finally, the Newtonian iteration which is the main part of this
                    267:      program performs division by R.  The fast division routine expects
                    268:      the divisor to be "normalized" in exactly the sense of having the
                    269:      most significant bit set.  */
                    270:
                    271:   tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
                    272:
                    273:   if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
                    274:     t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
                    275:                          (cnt & ~1) % BITS_PER_MP_LIMB);
                    276:   else
                    277:     MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
                    278:
                    279:   if (cnt >= BITS_PER_MP_LIMB)
                    280:     tp[0] = 0;
                    281:
                    282:   t_high0 = tp[tsize - 1];
                    283:   t_high1 = tp[tsize - 2];     /* Never stray.  TSIZE is >= 2.  */
                    284:
                    285: /* Is there a fast sqrt instruction defined for this machine?  */
                    286: #ifdef SQRT
                    287:   {
1.1.1.2 ! maekawa   288:     initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
1.1       maekawa   289:     /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
                    290:        become incorrect due to overflow in the conversion from double to
                    291:        mp_limb_t above.  It will typically be zero in that case, but might be
                    292:        a small number on some machines.  The most significant bit of
                    293:        INITIAL_APPROX should be set, so that bit is a good overflow
                    294:        indication.  */
                    295:     if ((mp_limb_signed_t) initial_approx >= 0)
                    296:       initial_approx = ~(mp_limb_t)0;
                    297:   }
                    298: #else
                    299:   /* Get a 9 bit approximation from the tables.  The tables expect to
                    300:      be indexed with the 8 high bits right below the highest bit.
                    301:      Also, the highest result bit is not returned by the tables, and
                    302:      must be or:ed into the result.  The scheme gives 9 bits of start
                    303:      approximation with just 256-entry 8 bit tables.  */
                    304:
                    305:   if ((cnt & 1) == 0)
                    306:     {
1.1.1.2 ! maekawa   307:       /* The most significant bit of t_high0 is set.  */
1.1       maekawa   308:       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
                    309:       initial_approx &= 0xff;
                    310:       initial_approx = even_approx_tab[initial_approx];
                    311:     }
                    312:   else
                    313:     {
1.1.1.2 ! maekawa   314:       /* The most significant bit of t_high0 is unset,
1.1       maekawa   315:         the second most significant is set.  */
                    316:       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
                    317:       initial_approx &= 0xff;
                    318:       initial_approx = odd_approx_tab[initial_approx];
                    319:     }
                    320:   initial_approx |= 0x100;
                    321:   initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
                    322:
                    323:   /* Perform small precision Newtonian iterations to get a full word
1.1.1.2 ! maekawa   324:      approximation.  For small operands, these iterations will do the
1.1       maekawa   325:      entire job.  */
                    326:   if (t_high0 == ~(mp_limb_t)0)
                    327:     initial_approx = t_high0;
                    328:   else
                    329:     {
                    330:       mp_limb_t quot;
                    331:
                    332:       if (t_high0 >= initial_approx)
                    333:        initial_approx = t_high0 + 1;
                    334:
                    335:       /* First get about 18 bits with pure C arithmetics.  */
                    336:       quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
                    337:       initial_approx = (initial_approx + quot) / 2;
                    338:       initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
                    339:
                    340:       /* Now get a full word by one (or for > 36 bit machines) several
                    341:         iterations.  */
1.1.1.2 ! maekawa   342:       for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
1.1       maekawa   343:        {
                    344:          mp_limb_t ignored_remainder;
                    345:
                    346:          udiv_qrnnd (quot, ignored_remainder,
                    347:                      t_high0, t_high1, initial_approx);
                    348:          initial_approx = (initial_approx + quot) / 2;
                    349:          initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
                    350:        }
                    351:     }
                    352: #endif
                    353:
                    354:   rp[0] = initial_approx;
                    355:   rsize = 1;
                    356:
1.1.1.2 ! maekawa   357: #ifdef SQRT_DEBUG
1.1       maekawa   358:          printf ("\n\nT = ");
                    359:          mpn_dump (tp, tsize);
                    360: #endif
                    361:
                    362:   if (tsize > 2)
                    363:     {
                    364:       /* Determine the successive precisions to use in the iteration.  We
                    365:         minimize the precisions, beginning with the highest (i.e. last
                    366:         iteration) to the lowest (i.e. first iteration).  */
                    367:
                    368:       xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
                    369:       ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
                    370:
                    371:       t_end_ptr = tp + tsize;
                    372:
                    373:       tmp = tsize / 2;
                    374:       for (i = 0;; i++)
                    375:        {
                    376:          tsize = (tmp + 1) / 2;
                    377:          if (tmp == tsize)
                    378:            break;
                    379:          tsizes[i] = tsize + tmp;
                    380:          tmp = tsize;
                    381:        }
                    382:
                    383:       /* Main Newton iteration loop.  For big arguments, most of the
                    384:         time is spent here.  */
                    385:
                    386:       /* It is possible to do a great optimization here.  The successive
1.1.1.2 ! maekawa   387:         divisors in the mpn_divmod call below have more and more leading
1.1       maekawa   388:         words equal to its predecessor.  Therefore the beginning of
                    389:         each division will repeat the same work as did the last
                    390:         division.  If we could guarantee that the leading words of two
                    391:         consecutive divisors are the same (i.e. in this case, a later
                    392:         divisor has just more digits at the end) it would be a simple
                    393:         matter of just using the old remainder of the last division in
                    394:         a subsequent division, to take care of this optimization.  This
                    395:         idea would surely make a difference even for small arguments.  */
                    396:
                    397:       /* Loop invariants:
                    398:
                    399:         R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
                    400:         X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
                    401:         R <= shiftdown_to_same_size(X).  */
                    402:
                    403:       while (--i >= 0)
                    404:        {
                    405:          mp_limb_t cy;
1.1.1.2 ! maekawa   406: #ifdef SQRT_DEBUG
1.1       maekawa   407:          mp_limb_t old_least_sign_r = rp[0];
                    408:          mp_size_t old_rsize = rsize;
                    409:
                    410:          printf ("R = ");
                    411:          mpn_dump (rp, rsize);
                    412: #endif
                    413:          tsize = tsizes[i];
                    414:
                    415:          /* Need to copy the numerator into temporary space, as
                    416:             mpn_divmod overwrites its numerator argument with the
                    417:             remainder (which we currently ignore).  */
                    418:          MPN_COPY (ttp, t_end_ptr - tsize, tsize);
                    419:          cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
                    420:          xsize = tsize - rsize;
                    421:
1.1.1.2 ! maekawa   422: #ifdef SQRT_DEBUG
1.1       maekawa   423:          printf ("X =%d ", cy);
                    424:          mpn_dump (xp, xsize);
                    425: #endif
                    426:
                    427:          /* Add X and R with the most significant limbs aligned,
                    428:             temporarily ignoring at least one limb at the low end of X.  */
                    429:          tmp = xsize - rsize;
                    430:          cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
                    431:
                    432:          /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
                    433:             intermediate roots that'd need an extra bit.  We don't want to
                    434:             handle that since it would make the subsequent divisor
                    435:             non-normalized, so round such roots down to be only ones in the
                    436:             current precision.  */
                    437:          if (cy == 2)
                    438:            {
                    439:              mp_size_t j;
                    440:              for (j = xsize; j >= 0; j--)
                    441:                xp[j] = ~(mp_limb_t)0;
                    442:            }
                    443:
                    444:          /* Divide X by 2 and put the result in R.  This is the new
                    445:             approximation.  Shift in the carry from the addition.  */
                    446:          mpn_rshift (rp, xp, xsize, 1);
                    447:          rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
                    448:          rsize = xsize;
1.1.1.2 ! maekawa   449: #ifdef SQRT_DEBUG
1.1       maekawa   450:          if (old_least_sign_r != rp[rsize - old_rsize])
                    451:            printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
                    452:                    i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
                    453:                    2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
                    454: #endif
                    455:        }
                    456:     }
                    457:
1.1.1.2 ! maekawa   458: #ifdef SQRT_DEBUG
1.1       maekawa   459:   printf ("(final) R = ");
                    460:   mpn_dump (rp, rsize);
                    461: #endif
                    462:
                    463:   /* We computed the square root of OP * 2**(2*floor(cnt/2)).
                    464:      This has resulted in R being 2**floor(cnt/2) to large.
                    465:      Shift it down here to fix that.  */
                    466:   if (cnt / 2 != 0)
                    467:     {
                    468:       mpn_rshift (rp, rp, rsize, cnt/2);
                    469:       rsize -= rp[rsize - 1] == 0;
                    470:     }
                    471:
                    472:   /* Calculate the remainder.  */
                    473:   mpn_mul_n (tp, rp, rp, rsize);
                    474:   tsize = rsize + rsize;
                    475:   tsize -= tp[tsize - 1] == 0;
                    476:   if (op_size < tsize
                    477:       || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
                    478:     {
                    479:       /* R is too large.  Decrement it.  */
                    480:
                    481:       /* These operations can't overflow.  */
                    482:       cy_limb  = mpn_sub_n (tp, tp, rp, rsize);
                    483:       cy_limb += mpn_sub_n (tp, tp, rp, rsize);
1.1.1.2 ! maekawa   484:       mpn_decr_u (tp + rsize, cy_limb);
        !           485:       mpn_incr_u (tp, (mp_limb_t) 1);
1.1       maekawa   486:
1.1.1.2 ! maekawa   487:       mpn_decr_u (rp, (mp_limb_t) 1);
1.1       maekawa   488:
1.1.1.2 ! maekawa   489: #ifdef SQRT_DEBUG
1.1       maekawa   490:       printf ("(adjusted) R = ");
                    491:       mpn_dump (rp, rsize);
                    492: #endif
                    493:     }
                    494:
                    495:   if (rem_ptr != NULL)
                    496:     {
                    497:       cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
                    498:       MPN_NORMALIZE (rem_ptr, op_size);
                    499:       TMP_FREE (marker);
                    500:       return op_size;
                    501:     }
                    502:   else
                    503:     {
                    504:       int res;
                    505:       res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
                    506:       TMP_FREE (marker);
                    507:       return res;
                    508:     }
                    509: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>