=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/generic/Attic/gcd.c,v retrieving revision 1.1.1.1 retrieving revision 1.1.1.2 diff -u -p -r1.1.1.1 -r1.1.1.2 --- OpenXM_contrib/gmp/mpn/generic/Attic/gcd.c 2000/01/10 15:35:23 1.1.1.1 +++ OpenXM_contrib/gmp/mpn/generic/Attic/gcd.c 2000/09/09 14:12:24 1.1.1.2 @@ -1,20 +1,21 @@ /* mpn/gcd.c: mpn_gcd for gcd of two odd integers. -Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc. +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. This file is part of the GNU MP Library. The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Library General Public License as published by -the Free Software Foundation; either version 2 of the License, or (at your +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU MP Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. -You should have received a copy of the GNU Library General Public License +You should have received a copy of the GNU Lesser General Public License along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -22,7 +23,7 @@ MA 02111-1307, USA. */ /* Integer greatest common divisor of two unsigned integers, using the accelerated algorithm (see reference below). - mp_size_t mpn_gcd (vp, vsize, up, usize). + mp_size_t mpn_gcd (up, usize, vp, vsize). Preconditions [U = (up, usize) and V = (vp, vsize)]: @@ -47,11 +48,11 @@ MA 02111-1307, USA. */ #include "gmp-impl.h" #include "longlong.h" -/* If MIN (usize, vsize) > ACCEL_THRESHOLD, then the accelerated algorithm is - used, otherwise the binary algorithm is used. This may be adjusted for - different architectures. */ -#ifndef ACCEL_THRESHOLD -#define ACCEL_THRESHOLD 4 +/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated + algorithm is used, otherwise the binary algorithm is used. This may be + adjusted for different architectures. */ +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 #endif /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated @@ -62,14 +63,7 @@ enum BMOD_THRESHOLD = BITS_PER_MP_LIMB/2 }; -#define SIGN_BIT (~(~(mp_limb_t)0 >> 1)) - -#define SWAP_LIMB(UL, VL) do{mp_limb_t __l=(UL);(UL)=(VL);(VL)=__l;}while(0) -#define SWAP_PTR(UP, VP) do{mp_ptr __p=(UP);(UP)=(VP);(VP)=__p;}while(0) -#define SWAP_SZ(US, VS) do{mp_size_t __s=(US);(US)=(VS);(VS)=__s;}while(0) -#define SWAP_MPN(UP, US, VP, VS) do{SWAP_PTR(UP,VP);SWAP_SZ(US,VS);}while(0) - /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. Both U and V must be odd. */ static __gmp_inline mp_size_t @@ -129,7 +123,11 @@ gcd_2 (vp, up) precision. If N2 > N1 initially, the first iteration of the while loop will swap them. In all other situations, N1 >= N2 is maintained. */ -static __gmp_inline mp_limb_t +static +#if ! defined (__i386__) +__gmp_inline /* don't inline this for the x86 */ +#endif +mp_limb_t #if __STDC__ find_a (mp_srcptr cp) #else @@ -149,7 +147,7 @@ find_a (cp) while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */ { /* N1 <-- N1 % N2. */ - if ((SIGN_BIT >> leading_zero_bits & n2_h) == 0) + if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0) { unsigned long int i; count_leading_zeros (i, n2_h); @@ -167,8 +165,8 @@ find_a (cp) if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; - SWAP_LIMB (n1_h, n2_h); - SWAP_LIMB (n1_l, n2_l); + MP_LIMB_T_SWAP (n1_h, n2_h); + MP_LIMB_T_SWAP (n1_l, n2_l); } return n2_l; @@ -176,14 +174,14 @@ find_a (cp) mp_size_t #if __STDC__ -mpn_gcd (mp_ptr gp, mp_ptr vp, mp_size_t vsize, mp_ptr up, mp_size_t usize) +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) #else -mpn_gcd (gp, vp, vsize, up, usize) +mpn_gcd (gp, up, usize, vp, vsize) mp_ptr gp; - mp_ptr vp; - mp_size_t vsize; mp_ptr up; mp_size_t usize; + mp_ptr vp; + mp_size_t vsize; #endif { mp_ptr orig_vp = vp; @@ -193,9 +191,9 @@ mpn_gcd (gp, vp, vsize, up, usize) TMP_MARK (marker); - /* Use accelerated algorithm if vsize is over ACCEL_THRESHOLD. + /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. Two EXTRA limbs for U and V are required for kary reduction. */ - if (vsize > ACCEL_THRESHOLD) + if (vsize >= GCD_ACCEL_THRESHOLD) { unsigned long int vbitsize, d; mp_ptr orig_up = up; @@ -228,7 +226,9 @@ mpn_gcd (gp, vp, vsize, up, usize) do /* Main loop. */ { - if (up[usize-1] & SIGN_BIT) /* U < 0; take twos' compl. */ + /* mpn_com_n can't be used here because anchor_up and up may + partially overlap */ + if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */ { mp_size_t i; anchor_up[0] = -up[0]; @@ -241,15 +241,15 @@ mpn_gcd (gp, vp, vsize, up, usize) if ((up[0] & 1) == 0) /* Result even; remove twos. */ { - unsigned long int r; + unsigned int r; count_trailing_zeros (r, up[0]); mpn_rshift (anchor_up, up, usize, r); usize -= (anchor_up[usize-1] == 0); } else if (anchor_up != up) - MPN_COPY (anchor_up, up, usize); + MPN_COPY_INCR (anchor_up, up, usize); - SWAP_MPN (anchor_up, usize, vp, vsize); + MPN_PTR_SWAP (anchor_up,usize, vp,vsize); up = anchor_up; if (vsize <= 2) /* Kary can't handle < 2 limbs and */ @@ -271,8 +271,13 @@ mpn_gcd (gp, vp, vsize, up, usize) mp_limb_t bp[2], cp[2]; /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */ - cp[0] = vp[0], cp[1] = vp[1]; - mpn_bdivmod (cp, cp, 2, up, 2, 2*BITS_PER_MP_LIMB); + { + mp_limb_t u_inv, hi, lo; + modlimb_invert (u_inv, up[0]); + cp[0] = vp[0] * u_inv; + umul_ppmm (hi, lo, cp[0], up[0]); + cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv; + } /* U <-- find_a (C) * U. */ up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); @@ -280,11 +285,18 @@ mpn_gcd (gp, vp, vsize, up, usize) /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1). bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and - bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 */ - bp[0] = up[0], bp[1] = up[1]; - mpn_bdivmod (bp, bp, 2, vp, 2, BITS_PER_MP_LIMB); - bp[1] &= 1; /* Since V is odd, division is unnecessary. */ + bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 + Like V/U above, but simplified because only the low bit of + bp[1] is wanted. */ + { + mp_limb_t v_inv, hi, lo; + modlimb_invert (v_inv, vp[0]); + bp[0] = up[0] * v_inv; + umul_ppmm (hi, lo, bp[0], vp[0]); + bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1; + } + up[usize++] = 0; if (bp[1]) /* B < 0: U <-- U + (-B) * V. */ { @@ -342,7 +354,7 @@ mpn_gcd (gp, vp, vsize, up, usize) up += 1, usize -= 1; if ((up[0] & 1) == 0) { - unsigned long int r; + unsigned int r; count_trailing_zeros (r, up[0]); mpn_rshift (up, up, usize, r); usize -= (up[usize-1] == 0); @@ -350,7 +362,7 @@ mpn_gcd (gp, vp, vsize, up, usize) /* Keep usize >= vsize. */ if (usize < vsize) - SWAP_MPN (up, usize, vp, vsize); + MPN_PTR_SWAP (up, usize, vp, vsize); if (usize <= 2) /* Double precision. */ { @@ -375,7 +387,7 @@ mpn_gcd (gp, vp, vsize, up, usize) size--; while (up[size] == vp[size]); if (up[size] < vp[size]) /* usize == vsize. */ - SWAP_PTR (up, vp); + MP_PTR_SWAP (up, vp); up += zeros, usize = size + 1 - zeros; mpn_sub_n (up, up, vp + zeros, usize); }