[BACK]Return to sqrtrem.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / generic

Annotation of OpenXM_contrib/gmp/mpn/generic/sqrtrem.c, Revision 1.1

1.1     ! maekawa     1: /* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
        !             2:
        !             3:    Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
        !             4:    Write the remainder at REM_PTR, if REM_PTR != NULL.
        !             5:    Return the size of the remainder.
        !             6:    (The size of the root is always half of the size of the operand.)
        !             7:
        !             8:    OP_PTR and ROOT_PTR may not point to the same object.
        !             9:    OP_PTR and REM_PTR may point to the same object.
        !            10:
        !            11:    If REM_PTR is NULL, only the root is computed and the return value of
        !            12:    the function is 0 if OP is a perfect square, and *any* non-zero number
        !            13:    otherwise.
        !            14:
        !            15: Copyright (C) 1993, 1994, 1996 Free Software Foundation, Inc.
        !            16:
        !            17: This file is part of the GNU MP Library.
        !            18:
        !            19: The GNU MP Library is free software; you can redistribute it and/or modify
        !            20: it under the terms of the GNU Library General Public License as published by
        !            21: the Free Software Foundation; either version 2 of the License, or (at your
        !            22: option) any later version.
        !            23:
        !            24: The GNU MP Library is distributed in the hope that it will be useful, but
        !            25: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
        !            26: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
        !            27: License for more details.
        !            28:
        !            29: You should have received a copy of the GNU Library General Public License
        !            30: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
        !            31: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
        !            32: MA 02111-1307, USA. */
        !            33:
        !            34: /* This code is just correct if "unsigned char" has at least 8 bits.  It
        !            35:    doesn't help to use CHAR_BIT from limits.h, as the real problem is
        !            36:    the static arrays.  */
        !            37:
        !            38: #include "gmp.h"
        !            39: #include "gmp-impl.h"
        !            40: #include "longlong.h"
        !            41:
        !            42: /* Square root algorithm:
        !            43:
        !            44:    1. Shift OP (the input) to the left an even number of bits s.t. there
        !            45:       are an even number of words and either (or both) of the most
        !            46:       significant bits are set.  This way, sqrt(OP) has exactly half as
        !            47:       many words as OP, and has its most significant bit set.
        !            48:
        !            49:    2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
        !            50:       This approximation is used for the first single-precision
        !            51:       iterations of Newton's method, yielding a full-word approximation
        !            52:       to sqrt(OP).
        !            53:
        !            54:    3. Perform multiple-precision Newton iteration until we have the
        !            55:       exact result.  Only about half of the input operand is used in
        !            56:       this calculation, as the square root is perfectly determinable
        !            57:       from just the higher half of a number.  */
        !            58: 
        !            59: /* Define this macro for IEEE P854 machines with a fast sqrt instruction.  */
        !            60: #if defined __GNUC__ && ! defined __SOFT_FLOAT
        !            61:
        !            62: #if defined __sparc__
        !            63: #define SQRT(a) \
        !            64:   ({                                                                   \
        !            65:     double __sqrt_res;                                                 \
        !            66:     asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a));                        \
        !            67:     __sqrt_res;                                                                \
        !            68:   })
        !            69: #endif
        !            70:
        !            71: #if defined __HAVE_68881__
        !            72: #define SQRT(a) \
        !            73:   ({                                                                   \
        !            74:     double __sqrt_res;                                                 \
        !            75:     asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a));                        \
        !            76:     __sqrt_res;                                                                \
        !            77:   })
        !            78: #endif
        !            79:
        !            80: #if defined __hppa
        !            81: #define SQRT(a) \
        !            82:   ({                                                                   \
        !            83:     double __sqrt_res;                                                 \
        !            84:     asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a));           \
        !            85:     __sqrt_res;                                                                \
        !            86:   })
        !            87: #endif
        !            88:
        !            89: #if defined _ARCH_PWR2
        !            90: #define SQRT(a) \
        !            91:   ({                                                                   \
        !            92:     double __sqrt_res;                                                 \
        !            93:     asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a));                 \
        !            94:     __sqrt_res;                                                                \
        !            95:   })
        !            96: #endif
        !            97:
        !            98: #endif
        !            99:
        !           100: #ifndef SQRT
        !           101:
        !           102: /* Tables for initial approximation of the square root.  These are
        !           103:    indexed with bits 1-8 of the operand for which the square root is
        !           104:    calculated, where bit 0 is the most significant non-zero bit.  I.e.
        !           105:    the most significant one-bit is not used, since that per definition
        !           106:    is one.  Likewise, the tables don't return the highest bit of the
        !           107:    result.  That bit must be inserted by or:ing the returned value with
        !           108:    0x100.  This way, we get a 9-bit approximation from 8-bit tables!  */
        !           109:
        !           110: /* Table to be used for operands with an even total number of bits.
        !           111:    (Exactly as in the decimal system there are similarities between the
        !           112:    square root of numbers with the same initial digits and an even
        !           113:    difference in the total number of digits.  Consider the square root
        !           114:    of 1, 10, 100, 1000, ...)  */
        !           115: static unsigned char even_approx_tab[256] =
        !           116: {
        !           117:   0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
        !           118:   0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
        !           119:   0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
        !           120:   0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
        !           121:   0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
        !           122:   0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
        !           123:   0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
        !           124:   0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
        !           125:   0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
        !           126:   0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
        !           127:   0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
        !           128:   0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
        !           129:   0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
        !           130:   0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
        !           131:   0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
        !           132:   0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
        !           133:   0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
        !           134:   0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
        !           135:   0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
        !           136:   0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
        !           137:   0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
        !           138:   0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
        !           139:   0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
        !           140:   0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
        !           141:   0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
        !           142:   0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
        !           143:   0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
        !           144:   0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
        !           145:   0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
        !           146:   0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
        !           147:   0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
        !           148:   0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
        !           149: };
        !           150:
        !           151: /* Table to be used for operands with an odd total number of bits.
        !           152:    (Further comments before previous table.)  */
        !           153: static unsigned char odd_approx_tab[256] =
        !           154: {
        !           155:   0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
        !           156:   0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
        !           157:   0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
        !           158:   0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
        !           159:   0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
        !           160:   0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
        !           161:   0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
        !           162:   0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
        !           163:   0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
        !           164:   0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
        !           165:   0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
        !           166:   0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
        !           167:   0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
        !           168:   0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
        !           169:   0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
        !           170:   0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
        !           171:   0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
        !           172:   0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
        !           173:   0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
        !           174:   0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
        !           175:   0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
        !           176:   0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
        !           177:   0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
        !           178:   0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
        !           179:   0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
        !           180:   0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
        !           181:   0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
        !           182:   0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
        !           183:   0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
        !           184:   0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
        !           185:   0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
        !           186:   0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
        !           187: };
        !           188: #endif
        !           189:
        !           190: 
        !           191: mp_size_t
        !           192: #if __STDC__
        !           193: mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
        !           194: #else
        !           195: mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
        !           196:      mp_ptr root_ptr;
        !           197:      mp_ptr rem_ptr;
        !           198:      mp_srcptr op_ptr;
        !           199:      mp_size_t op_size;
        !           200: #endif
        !           201: {
        !           202:   /* R (root result) */
        !           203:   mp_ptr rp;                   /* Pointer to least significant word */
        !           204:   mp_size_t rsize;             /* The size in words */
        !           205:
        !           206:   /* T (OP shifted to the left a.k.a. normalized) */
        !           207:   mp_ptr tp;                   /* Pointer to least significant word */
        !           208:   mp_size_t tsize;             /* The size in words */
        !           209:   mp_ptr t_end_ptr;            /* Pointer right beyond most sign. word */
        !           210:   mp_limb_t t_high0, t_high1;  /* The two most significant words */
        !           211:
        !           212:   /* TT (temporary for numerator/remainder) */
        !           213:   mp_ptr ttp;                  /* Pointer to least significant word */
        !           214:
        !           215:   /* X (temporary for quotient in main loop) */
        !           216:   mp_ptr xp;                   /* Pointer to least significant word */
        !           217:   mp_size_t xsize;             /* The size in words */
        !           218:
        !           219:   unsigned cnt;
        !           220:   mp_limb_t initial_approx;    /* Initially made approximation */
        !           221:   mp_size_t tsizes[BITS_PER_MP_LIMB];  /* Successive calculation precisions */
        !           222:   mp_size_t tmp;
        !           223:   mp_size_t i;
        !           224:
        !           225:   mp_limb_t cy_limb;
        !           226:   TMP_DECL (marker);
        !           227:
        !           228:   /* If OP is zero, both results are zero.  */
        !           229:   if (op_size == 0)
        !           230:     return 0;
        !           231:
        !           232:   count_leading_zeros (cnt, op_ptr[op_size - 1]);
        !           233:   tsize = op_size;
        !           234:   if ((tsize & 1) != 0)
        !           235:     {
        !           236:       cnt += BITS_PER_MP_LIMB;
        !           237:       tsize++;
        !           238:     }
        !           239:
        !           240:   rsize = tsize / 2;
        !           241:   rp = root_ptr;
        !           242:
        !           243:   TMP_MARK (marker);
        !           244:
        !           245:   /* Shift OP an even number of bits into T, such that either the most or
        !           246:      the second most significant bit is set, and such that the number of
        !           247:      words in T becomes even.  This way, the number of words in R=sqrt(OP)
        !           248:      is exactly half as many as in OP, and the most significant bit of R
        !           249:      is set.
        !           250:
        !           251:      Also, the initial approximation is simplified by this up-shifted OP.
        !           252:
        !           253:      Finally, the Newtonian iteration which is the main part of this
        !           254:      program performs division by R.  The fast division routine expects
        !           255:      the divisor to be "normalized" in exactly the sense of having the
        !           256:      most significant bit set.  */
        !           257:
        !           258:   tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
        !           259:
        !           260:   if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
        !           261:     t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
        !           262:                          (cnt & ~1) % BITS_PER_MP_LIMB);
        !           263:   else
        !           264:     MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
        !           265:
        !           266:   if (cnt >= BITS_PER_MP_LIMB)
        !           267:     tp[0] = 0;
        !           268:
        !           269:   t_high0 = tp[tsize - 1];
        !           270:   t_high1 = tp[tsize - 2];     /* Never stray.  TSIZE is >= 2.  */
        !           271:
        !           272: /* Is there a fast sqrt instruction defined for this machine?  */
        !           273: #ifdef SQRT
        !           274:   {
        !           275:     initial_approx = SQRT (t_high0 * 2.0
        !           276:                           * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1))
        !           277:                           + t_high1);
        !           278:     /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
        !           279:        become incorrect due to overflow in the conversion from double to
        !           280:        mp_limb_t above.  It will typically be zero in that case, but might be
        !           281:        a small number on some machines.  The most significant bit of
        !           282:        INITIAL_APPROX should be set, so that bit is a good overflow
        !           283:        indication.  */
        !           284:     if ((mp_limb_signed_t) initial_approx >= 0)
        !           285:       initial_approx = ~(mp_limb_t)0;
        !           286:   }
        !           287: #else
        !           288:   /* Get a 9 bit approximation from the tables.  The tables expect to
        !           289:      be indexed with the 8 high bits right below the highest bit.
        !           290:      Also, the highest result bit is not returned by the tables, and
        !           291:      must be or:ed into the result.  The scheme gives 9 bits of start
        !           292:      approximation with just 256-entry 8 bit tables.  */
        !           293:
        !           294:   if ((cnt & 1) == 0)
        !           295:     {
        !           296:       /* The most sign bit of t_high0 is set.  */
        !           297:       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
        !           298:       initial_approx &= 0xff;
        !           299:       initial_approx = even_approx_tab[initial_approx];
        !           300:     }
        !           301:   else
        !           302:     {
        !           303:       /* The most significant bit of T_HIGH0 is unset,
        !           304:         the second most significant is set.  */
        !           305:       initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
        !           306:       initial_approx &= 0xff;
        !           307:       initial_approx = odd_approx_tab[initial_approx];
        !           308:     }
        !           309:   initial_approx |= 0x100;
        !           310:   initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
        !           311:
        !           312:   /* Perform small precision Newtonian iterations to get a full word
        !           313:      approximation.  For small operands, these iteration will make the
        !           314:      entire job.  */
        !           315:   if (t_high0 == ~(mp_limb_t)0)
        !           316:     initial_approx = t_high0;
        !           317:   else
        !           318:     {
        !           319:       mp_limb_t quot;
        !           320:
        !           321:       if (t_high0 >= initial_approx)
        !           322:        initial_approx = t_high0 + 1;
        !           323:
        !           324:       /* First get about 18 bits with pure C arithmetics.  */
        !           325:       quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
        !           326:       initial_approx = (initial_approx + quot) / 2;
        !           327:       initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
        !           328:
        !           329:       /* Now get a full word by one (or for > 36 bit machines) several
        !           330:         iterations.  */
        !           331:       for (i = 16; i < BITS_PER_MP_LIMB; i <<= 1)
        !           332:        {
        !           333:          mp_limb_t ignored_remainder;
        !           334:
        !           335:          udiv_qrnnd (quot, ignored_remainder,
        !           336:                      t_high0, t_high1, initial_approx);
        !           337:          initial_approx = (initial_approx + quot) / 2;
        !           338:          initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
        !           339:        }
        !           340:     }
        !           341: #endif
        !           342:
        !           343:   rp[0] = initial_approx;
        !           344:   rsize = 1;
        !           345:
        !           346: #ifdef DEBUG
        !           347:          printf ("\n\nT = ");
        !           348:          mpn_dump (tp, tsize);
        !           349: #endif
        !           350:
        !           351:   if (tsize > 2)
        !           352:     {
        !           353:       /* Determine the successive precisions to use in the iteration.  We
        !           354:         minimize the precisions, beginning with the highest (i.e. last
        !           355:         iteration) to the lowest (i.e. first iteration).  */
        !           356:
        !           357:       xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
        !           358:       ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
        !           359:
        !           360:       t_end_ptr = tp + tsize;
        !           361:
        !           362:       tmp = tsize / 2;
        !           363:       for (i = 0;; i++)
        !           364:        {
        !           365:          tsize = (tmp + 1) / 2;
        !           366:          if (tmp == tsize)
        !           367:            break;
        !           368:          tsizes[i] = tsize + tmp;
        !           369:          tmp = tsize;
        !           370:        }
        !           371:
        !           372:       /* Main Newton iteration loop.  For big arguments, most of the
        !           373:         time is spent here.  */
        !           374:
        !           375:       /* It is possible to do a great optimization here.  The successive
        !           376:         divisors in the mpn_divmod call below has more and more leading
        !           377:         words equal to its predecessor.  Therefore the beginning of
        !           378:         each division will repeat the same work as did the last
        !           379:         division.  If we could guarantee that the leading words of two
        !           380:         consecutive divisors are the same (i.e. in this case, a later
        !           381:         divisor has just more digits at the end) it would be a simple
        !           382:         matter of just using the old remainder of the last division in
        !           383:         a subsequent division, to take care of this optimization.  This
        !           384:         idea would surely make a difference even for small arguments.  */
        !           385:
        !           386:       /* Loop invariants:
        !           387:
        !           388:         R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
        !           389:         X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
        !           390:         R <= shiftdown_to_same_size(X).  */
        !           391:
        !           392:       while (--i >= 0)
        !           393:        {
        !           394:          mp_limb_t cy;
        !           395: #ifdef DEBUG
        !           396:          mp_limb_t old_least_sign_r = rp[0];
        !           397:          mp_size_t old_rsize = rsize;
        !           398:
        !           399:          printf ("R = ");
        !           400:          mpn_dump (rp, rsize);
        !           401: #endif
        !           402:          tsize = tsizes[i];
        !           403:
        !           404:          /* Need to copy the numerator into temporary space, as
        !           405:             mpn_divmod overwrites its numerator argument with the
        !           406:             remainder (which we currently ignore).  */
        !           407:          MPN_COPY (ttp, t_end_ptr - tsize, tsize);
        !           408:          cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
        !           409:          xsize = tsize - rsize;
        !           410:
        !           411: #ifdef DEBUG
        !           412:          printf ("X =%d ", cy);
        !           413:          mpn_dump (xp, xsize);
        !           414: #endif
        !           415:
        !           416:          /* Add X and R with the most significant limbs aligned,
        !           417:             temporarily ignoring at least one limb at the low end of X.  */
        !           418:          tmp = xsize - rsize;
        !           419:          cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
        !           420:
        !           421:          /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
        !           422:             intermediate roots that'd need an extra bit.  We don't want to
        !           423:             handle that since it would make the subsequent divisor
        !           424:             non-normalized, so round such roots down to be only ones in the
        !           425:             current precision.  */
        !           426:          if (cy == 2)
        !           427:            {
        !           428:              mp_size_t j;
        !           429:              for (j = xsize; j >= 0; j--)
        !           430:                xp[j] = ~(mp_limb_t)0;
        !           431:            }
        !           432:
        !           433:          /* Divide X by 2 and put the result in R.  This is the new
        !           434:             approximation.  Shift in the carry from the addition.  */
        !           435:          mpn_rshift (rp, xp, xsize, 1);
        !           436:          rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
        !           437:          rsize = xsize;
        !           438: #ifdef DEBUG
        !           439:          if (old_least_sign_r != rp[rsize - old_rsize])
        !           440:            printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
        !           441:                    i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
        !           442:                    2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
        !           443: #endif
        !           444:        }
        !           445:     }
        !           446:
        !           447: #ifdef DEBUG
        !           448:   printf ("(final) R = ");
        !           449:   mpn_dump (rp, rsize);
        !           450: #endif
        !           451:
        !           452:   /* We computed the square root of OP * 2**(2*floor(cnt/2)).
        !           453:      This has resulted in R being 2**floor(cnt/2) to large.
        !           454:      Shift it down here to fix that.  */
        !           455:   if (cnt / 2 != 0)
        !           456:     {
        !           457:       mpn_rshift (rp, rp, rsize, cnt/2);
        !           458:       rsize -= rp[rsize - 1] == 0;
        !           459:     }
        !           460:
        !           461:   /* Calculate the remainder.  */
        !           462:   mpn_mul_n (tp, rp, rp, rsize);
        !           463:   tsize = rsize + rsize;
        !           464:   tsize -= tp[tsize - 1] == 0;
        !           465:   if (op_size < tsize
        !           466:       || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
        !           467:     {
        !           468:       /* R is too large.  Decrement it.  */
        !           469:
        !           470:       /* These operations can't overflow.  */
        !           471:       cy_limb  = mpn_sub_n (tp, tp, rp, rsize);
        !           472:       cy_limb += mpn_sub_n (tp, tp, rp, rsize);
        !           473:       mpn_sub_1 (tp + rsize, tp + rsize, tsize - rsize, cy_limb);
        !           474:       mpn_add_1 (tp, tp, tsize, (mp_limb_t) 1);
        !           475:
        !           476:       mpn_sub_1 (rp, rp, rsize, (mp_limb_t) 1);
        !           477:
        !           478: #ifdef DEBUG
        !           479:       printf ("(adjusted) R = ");
        !           480:       mpn_dump (rp, rsize);
        !           481: #endif
        !           482:     }
        !           483:
        !           484:   if (rem_ptr != NULL)
        !           485:     {
        !           486:       cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
        !           487:       MPN_NORMALIZE (rem_ptr, op_size);
        !           488:       TMP_FREE (marker);
        !           489:       return op_size;
        !           490:     }
        !           491:   else
        !           492:     {
        !           493:       int res;
        !           494:       res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
        !           495:       TMP_FREE (marker);
        !           496:       return res;
        !           497:     }
        !           498: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>