version 1.1.1.1, 2000/01/10 15:35:23 |
version 1.1.1.2, 2000/09/09 14:12:26 |
|
|
/* mpn_mul_n -- Multiply two natural numbers of length n. |
/* mpn_mul_n and helper function -- Multiply/square natural numbers. |
|
|
Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. |
THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) |
|
ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH |
|
THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED |
|
THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
|
|
|
|
|
Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software |
|
Foundation, Inc. |
|
|
This file is part of the GNU MP Library. |
This file is part of the GNU MP Library. |
|
|
The GNU MP Library is free software; you can redistribute it and/or modify |
The GNU MP Library is free software; you can redistribute it and/or modify |
it under the terms of the GNU Library General Public License as published by |
it under the terms of the GNU Lesser General Public License as published by |
the Free Software Foundation; either version 2 of the License, or (at your |
the Free Software Foundation; either version 2.1 of the License, or (at your |
option) any later version. |
option) any later version. |
|
|
The GNU MP Library is distributed in the hope that it will be useful, but |
The GNU MP Library is distributed in the hope that it will be useful, but |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public |
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
License for more details. |
License for more details. |
|
|
You should have received a copy of the GNU Library General Public License |
You should have received a copy of the GNU Lesser General Public License |
along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
MA 02111-1307, USA. */ |
MA 02111-1307, USA. */ |
|
|
#include "gmp.h" |
#include "gmp.h" |
#include "gmp-impl.h" |
#include "gmp-impl.h" |
|
#include "longlong.h" |
|
|
/* Multiply the natural numbers u (pointed to by UP) and v (pointed to by VP), |
|
both with SIZE limbs, and store the result at PRODP. 2 * SIZE limbs are |
|
always stored. Return the most significant limb. |
|
|
|
Argument constraints: |
/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. |
1. PRODP != UP and PRODP != VP, i.e. the destination |
0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ |
must be distinct from the multiplier and the multiplicand. */ |
#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) |
|
|
/* If KARATSUBA_THRESHOLD is not already defined, define it to a |
#if !defined (__alpha) && !defined (__mips) |
value which is good on most machines. */ |
/* For all other machines, we want to call mpn functions for the compund |
#ifndef KARATSUBA_THRESHOLD |
operations instead of open-coding them. */ |
#define KARATSUBA_THRESHOLD 32 |
#define USE_MORE_MPN |
#endif |
#endif |
|
|
/* The code can't handle KARATSUBA_THRESHOLD smaller than 2. */ |
/*== Function declarations =================================================*/ |
#if KARATSUBA_THRESHOLD < 2 |
|
#undef KARATSUBA_THRESHOLD |
|
#define KARATSUBA_THRESHOLD 2 |
|
#endif |
|
|
|
/* Handle simple cases with traditional multiplication. |
static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, |
|
mp_ptr, mp_ptr, mp_ptr, |
|
mp_srcptr, mp_srcptr, mp_srcptr, |
|
mp_size_t, mp_size_t)); |
|
static void interpolate3 _PROTO ((mp_srcptr, |
|
mp_ptr, mp_ptr, mp_ptr, |
|
mp_srcptr, |
|
mp_ptr, mp_ptr, mp_ptr, |
|
mp_size_t, mp_size_t)); |
|
static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); |
|
|
This is the most critical code of multiplication. All multiplies rely |
|
on this, both small and huge. Small ones arrive here immediately. Huge |
|
ones arrive here as this is the base case for Karatsuba's recursive |
|
algorithm below. */ |
|
|
|
|
/*-- mpn_kara_mul_n ---------------------------------------------------------------*/ |
|
|
|
/* Multiplies using 3 half-sized mults and so on recursively. |
|
* p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. |
|
* No overlap of p[...] with a[...] or b[...]. |
|
* ws is workspace. |
|
*/ |
|
|
void |
void |
#if __STDC__ |
#if __STDC__ |
impn_mul_n_basecase (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size) |
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) |
#else |
#else |
impn_mul_n_basecase (prodp, up, vp, size) |
mpn_kara_mul_n(p, a, b, n, ws) |
mp_ptr prodp; |
mp_ptr p; |
mp_srcptr up; |
mp_srcptr a; |
mp_srcptr vp; |
mp_srcptr b; |
mp_size_t size; |
mp_size_t n; |
|
mp_ptr ws; |
#endif |
#endif |
{ |
{ |
mp_size_t i; |
mp_limb_t i, sign, w, w0, w1; |
mp_limb_t cy_limb; |
mp_size_t n2; |
mp_limb_t v_limb; |
mp_srcptr x, y; |
|
|
/* Multiply by the first limb in V separately, as the result can be |
n2 = n >> 1; |
stored (not added) to PROD. We also avoid a loop for zeroing. */ |
ASSERT (n2 > 0); |
v_limb = vp[0]; |
|
if (v_limb <= 1) |
if (n & 1) |
{ |
{ |
if (v_limb == 1) |
/* Odd length. */ |
MPN_COPY (prodp, up, size); |
mp_size_t n1, n3, nm1; |
|
|
|
n3 = n - n2; |
|
|
|
sign = 0; |
|
w = a[n2]; |
|
if (w != 0) |
|
w -= mpn_sub_n (p, a, a + n3, n2); |
else |
else |
MPN_ZERO (prodp, size); |
{ |
cy_limb = 0; |
i = n2; |
|
do |
|
{ |
|
--i; |
|
w0 = a[i]; |
|
w1 = a[n3+i]; |
|
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = a + n3; |
|
y = a; |
|
sign = 1; |
|
} |
|
else |
|
{ |
|
x = a; |
|
y = a + n3; |
|
} |
|
mpn_sub_n (p, x, y, n2); |
|
} |
|
p[n2] = w; |
|
|
|
w = b[n2]; |
|
if (w != 0) |
|
w -= mpn_sub_n (p + n3, b, b + n3, n2); |
|
else |
|
{ |
|
i = n2; |
|
do |
|
{ |
|
--i; |
|
w0 = b[i]; |
|
w1 = b[n3+i]; |
|
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = b + n3; |
|
y = b; |
|
sign ^= 1; |
|
} |
|
else |
|
{ |
|
x = b; |
|
y = b + n3; |
|
} |
|
mpn_sub_n (p + n3, x, y, n2); |
|
} |
|
p[n] = w; |
|
|
|
n1 = n + 1; |
|
if (n2 < KARATSUBA_MUL_THRESHOLD) |
|
{ |
|
if (n3 < KARATSUBA_MUL_THRESHOLD) |
|
{ |
|
mpn_mul_basecase (ws, p, n3, p + n3, n3); |
|
mpn_mul_basecase (p, a, n3, b, n3); |
|
} |
|
else |
|
{ |
|
mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); |
|
mpn_kara_mul_n (p, a, b, n3, ws + n1); |
|
} |
|
mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2); |
|
} |
|
else |
|
{ |
|
mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); |
|
mpn_kara_mul_n (p, a, b, n3, ws + n1); |
|
mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1); |
|
} |
|
|
|
if (sign) |
|
mpn_add_n (ws, p, ws, n1); |
|
else |
|
mpn_sub_n (ws, p, ws, n1); |
|
|
|
nm1 = n - 1; |
|
if (mpn_add_n (ws, p + n1, ws, nm1)) |
|
{ |
|
mp_limb_t x = ws[nm1] + 1; |
|
ws[nm1] = x; |
|
if (x == 0) |
|
++ws[n]; |
|
} |
|
if (mpn_add_n (p + n3, p + n3, ws, n1)) |
|
{ |
|
mp_limb_t x; |
|
i = n1 + n3; |
|
do |
|
{ |
|
x = p[i] + 1; |
|
p[i] = x; |
|
++i; |
|
} while (x == 0); |
|
} |
} |
} |
else |
else |
cy_limb = mpn_mul_1 (prodp, up, size, v_limb); |
{ |
|
/* Even length. */ |
|
mp_limb_t t; |
|
|
prodp[size] = cy_limb; |
i = n2; |
prodp++; |
do |
|
{ |
|
--i; |
|
w0 = a[i]; |
|
w1 = a[n2+i]; |
|
} |
|
while (w0 == w1 && i != 0); |
|
sign = 0; |
|
if (w0 < w1) |
|
{ |
|
x = a + n2; |
|
y = a; |
|
sign = 1; |
|
} |
|
else |
|
{ |
|
x = a; |
|
y = a + n2; |
|
} |
|
mpn_sub_n (p, x, y, n2); |
|
|
/* For each iteration in the outer loop, multiply one limb from |
i = n2; |
U with one limb from V, and add it to PROD. */ |
do |
for (i = 1; i < size; i++) |
|
{ |
|
v_limb = vp[i]; |
|
if (v_limb <= 1) |
|
{ |
{ |
cy_limb = 0; |
--i; |
if (v_limb == 1) |
w0 = b[i]; |
cy_limb = mpn_add_n (prodp, prodp, up, size); |
w1 = b[n2+i]; |
} |
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = b + n2; |
|
y = b; |
|
sign ^= 1; |
|
} |
else |
else |
cy_limb = mpn_addmul_1 (prodp, up, size, v_limb); |
{ |
|
x = b; |
|
y = b + n2; |
|
} |
|
mpn_sub_n (p + n2, x, y, n2); |
|
|
prodp[size] = cy_limb; |
/* Pointwise products. */ |
prodp++; |
if (n2 < KARATSUBA_MUL_THRESHOLD) |
|
{ |
|
mpn_mul_basecase (ws, p, n2, p + n2, n2); |
|
mpn_mul_basecase (p, a, n2, b, n2); |
|
mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2); |
|
} |
|
else |
|
{ |
|
mpn_kara_mul_n (ws, p, p + n2, n2, ws + n); |
|
mpn_kara_mul_n (p, a, b, n2, ws + n); |
|
mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n); |
|
} |
|
|
|
/* Interpolate. */ |
|
if (sign) |
|
w = mpn_add_n (ws, p, ws, n); |
|
else |
|
w = -mpn_sub_n (ws, p, ws, n); |
|
w += mpn_add_n (ws, p + n, ws, n); |
|
w += mpn_add_n (p + n2, p + n2, ws, n); |
|
/* TO DO: could put "if (w) { ... }" here. |
|
* Less work but badly predicted branch. |
|
* No measurable difference in speed on Alpha. |
|
*/ |
|
i = n + n2; |
|
t = p[i] + w; |
|
p[i] = t; |
|
if (t < w) |
|
{ |
|
do |
|
{ |
|
++i; |
|
w = p[i] + 1; |
|
p[i] = w; |
|
} |
|
while (w == 0); |
|
} |
} |
} |
} |
} |
|
|
void |
void |
#if __STDC__ |
#if __STDC__ |
impn_mul_n (mp_ptr prodp, |
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) |
mp_srcptr up, mp_srcptr vp, mp_size_t size, mp_ptr tspace) |
|
#else |
#else |
impn_mul_n (prodp, up, vp, size, tspace) |
mpn_kara_sqr_n (p, a, n, ws) |
mp_ptr prodp; |
mp_ptr p; |
mp_srcptr up; |
mp_srcptr a; |
mp_srcptr vp; |
mp_size_t n; |
mp_size_t size; |
mp_ptr ws; |
mp_ptr tspace; |
|
#endif |
#endif |
{ |
{ |
if ((size & 1) != 0) |
mp_limb_t i, sign, w, w0, w1; |
{ |
mp_size_t n2; |
/* The size is odd, the code code below doesn't handle that. |
mp_srcptr x, y; |
Multiply the least significant (size - 1) limbs with a recursive |
|
call, and handle the most significant limb of S1 and S2 |
|
separately. */ |
|
/* A slightly faster way to do this would be to make the Karatsuba |
|
code below behave as if the size were even, and let it check for |
|
odd size in the end. I.e., in essence move this code to the end. |
|
Doing so would save us a recursive call, and potentially make the |
|
stack grow a lot less. */ |
|
|
|
mp_size_t esize = size - 1; /* even size */ |
n2 = n >> 1; |
mp_limb_t cy_limb; |
ASSERT (n2 > 0); |
|
|
MPN_MUL_N_RECURSE (prodp, up, vp, esize, tspace); |
if (n & 1) |
cy_limb = mpn_addmul_1 (prodp + esize, up, esize, vp[esize]); |
|
prodp[esize + esize] = cy_limb; |
|
cy_limb = mpn_addmul_1 (prodp + esize, vp, size, up[esize]); |
|
|
|
prodp[esize + size] = cy_limb; |
|
} |
|
else |
|
{ |
{ |
/* Anatolij Alekseevich Karatsuba's divide-and-conquer algorithm. |
/* Odd length. */ |
|
mp_size_t n1, n3, nm1; |
|
|
Split U in two pieces, U1 and U0, such that |
n3 = n - n2; |
U = U0 + U1*(B**n), |
|
and V in V1 and V0, such that |
|
V = V0 + V1*(B**n). |
|
|
|
UV is then computed recursively using the identity |
sign = 0; |
|
w = a[n2]; |
|
if (w != 0) |
|
w -= mpn_sub_n (p, a, a + n3, n2); |
|
else |
|
{ |
|
i = n2; |
|
do |
|
{ |
|
--i; |
|
w0 = a[i]; |
|
w1 = a[n3+i]; |
|
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = a + n3; |
|
y = a; |
|
sign = 1; |
|
} |
|
else |
|
{ |
|
x = a; |
|
y = a + n3; |
|
} |
|
mpn_sub_n (p, x, y, n2); |
|
} |
|
p[n2] = w; |
|
|
2n n n n |
w = a[n2]; |
UV = (B + B )U V + B (U -U )(V -V ) + (B + 1)U V |
if (w != 0) |
1 1 1 0 0 1 0 0 |
w -= mpn_sub_n (p + n3, a, a + n3, n2); |
|
else |
|
{ |
|
i = n2; |
|
do |
|
{ |
|
--i; |
|
w0 = a[i]; |
|
w1 = a[n3+i]; |
|
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = a + n3; |
|
y = a; |
|
sign ^= 1; |
|
} |
|
else |
|
{ |
|
x = a; |
|
y = a + n3; |
|
} |
|
mpn_sub_n (p + n3, x, y, n2); |
|
} |
|
p[n] = w; |
|
|
Where B = 2**BITS_PER_MP_LIMB. */ |
n1 = n + 1; |
|
if (n2 < KARATSUBA_SQR_THRESHOLD) |
|
{ |
|
if (n3 < KARATSUBA_SQR_THRESHOLD) |
|
{ |
|
mpn_sqr_basecase (ws, p, n3); |
|
mpn_sqr_basecase (p, a, n3); |
|
} |
|
else |
|
{ |
|
mpn_kara_sqr_n (ws, p, n3, ws + n1); |
|
mpn_kara_sqr_n (p, a, n3, ws + n1); |
|
} |
|
mpn_sqr_basecase (p + n1, a + n3, n2); |
|
} |
|
else |
|
{ |
|
mpn_kara_sqr_n (ws, p, n3, ws + n1); |
|
mpn_kara_sqr_n (p, a, n3, ws + n1); |
|
mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); |
|
} |
|
|
mp_size_t hsize = size >> 1; |
if (sign) |
mp_limb_t cy; |
mpn_add_n (ws, p, ws, n1); |
int negflg; |
else |
|
mpn_sub_n (ws, p, ws, n1); |
|
|
/*** Product H. ________________ ________________ |
nm1 = n - 1; |
|_____U1 x V1____||____U0 x V0_____| */ |
if (mpn_add_n (ws, p + n1, ws, nm1)) |
/* Put result in upper part of PROD and pass low part of TSPACE |
{ |
as new TSPACE. */ |
mp_limb_t x = ws[nm1] + 1; |
MPN_MUL_N_RECURSE (prodp + size, up + hsize, vp + hsize, hsize, tspace); |
ws[nm1] = x; |
|
if (x == 0) |
|
++ws[n]; |
|
} |
|
if (mpn_add_n (p + n3, p + n3, ws, n1)) |
|
{ |
|
mp_limb_t x; |
|
i = n1 + n3; |
|
do |
|
{ |
|
x = p[i] + 1; |
|
p[i] = x; |
|
++i; |
|
} while (x == 0); |
|
} |
|
} |
|
else |
|
{ |
|
/* Even length. */ |
|
mp_limb_t t; |
|
|
/*** Product M. ________________ |
i = n2; |
|_(U1-U0)(V0-V1)_| */ |
do |
if (mpn_cmp (up + hsize, up, hsize) >= 0) |
|
{ |
{ |
mpn_sub_n (prodp, up + hsize, up, hsize); |
--i; |
negflg = 0; |
w0 = a[i]; |
|
w1 = a[n2+i]; |
} |
} |
|
while (w0 == w1 && i != 0); |
|
sign = 0; |
|
if (w0 < w1) |
|
{ |
|
x = a + n2; |
|
y = a; |
|
sign = 1; |
|
} |
else |
else |
{ |
{ |
mpn_sub_n (prodp, up, up + hsize, hsize); |
x = a; |
negflg = 1; |
y = a + n2; |
} |
} |
if (mpn_cmp (vp + hsize, vp, hsize) >= 0) |
mpn_sub_n (p, x, y, n2); |
|
|
|
i = n2; |
|
do |
{ |
{ |
mpn_sub_n (prodp + hsize, vp + hsize, vp, hsize); |
--i; |
negflg ^= 1; |
w0 = a[i]; |
|
w1 = a[n2+i]; |
} |
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = a + n2; |
|
y = a; |
|
sign ^= 1; |
|
} |
else |
else |
{ |
{ |
mpn_sub_n (prodp + hsize, vp, vp + hsize, hsize); |
x = a; |
/* No change of NEGFLG. */ |
y = a + n2; |
} |
} |
/* Read temporary operands from low part of PROD. |
mpn_sub_n (p + n2, x, y, n2); |
Put result in low part of TSPACE using upper part of TSPACE |
|
as new TSPACE. */ |
|
MPN_MUL_N_RECURSE (tspace, prodp, prodp + hsize, hsize, tspace + size); |
|
|
|
/*** Add/copy product H. */ |
/* Pointwise products. */ |
MPN_COPY (prodp + hsize, prodp + size, hsize); |
if (n2 < KARATSUBA_SQR_THRESHOLD) |
cy = mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize); |
{ |
|
mpn_sqr_basecase (ws, p, n2); |
|
mpn_sqr_basecase (p, a, n2); |
|
mpn_sqr_basecase (p + n, a + n2, n2); |
|
} |
|
else |
|
{ |
|
mpn_kara_sqr_n (ws, p, n2, ws + n); |
|
mpn_kara_sqr_n (p, a, n2, ws + n); |
|
mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); |
|
} |
|
|
/*** Add product M (if NEGFLG M is a negative number). */ |
/* Interpolate. */ |
if (negflg) |
if (sign) |
cy -= mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size); |
w = mpn_add_n (ws, p, ws, n); |
else |
else |
cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size); |
w = -mpn_sub_n (ws, p, ws, n); |
|
w += mpn_add_n (ws, p + n, ws, n); |
|
w += mpn_add_n (p + n2, p + n2, ws, n); |
|
/* TO DO: could put "if (w) { ... }" here. |
|
* Less work but badly predicted branch. |
|
* No measurable difference in speed on Alpha. |
|
*/ |
|
i = n + n2; |
|
t = p[i] + w; |
|
p[i] = t; |
|
if (t < w) |
|
{ |
|
do |
|
{ |
|
++i; |
|
w = p[i] + 1; |
|
p[i] = w; |
|
} |
|
while (w == 0); |
|
} |
|
} |
|
} |
|
|
/*** Product L. ________________ ________________ |
/*-- add2Times -------------------------------------------------------------*/ |
|________________||____U0 x V0_____| */ |
|
/* Read temporary operands from low part of PROD. |
|
Put result in low part of TSPACE using upper part of TSPACE |
|
as new TSPACE. */ |
|
MPN_MUL_N_RECURSE (tspace, up, vp, hsize, tspace + size); |
|
|
|
/*** Add/copy Product L (twice). */ |
/* z[] = x[] + 2 * y[] |
|
Note that z and x might point to the same vectors. */ |
|
#ifdef USE_MORE_MPN |
|
static inline mp_limb_t |
|
#if __STDC__ |
|
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) |
|
#else |
|
add2Times (z, x, y, n) |
|
mp_ptr z; |
|
mp_srcptr x; |
|
mp_srcptr y; |
|
mp_size_t n; |
|
#endif |
|
{ |
|
mp_ptr t; |
|
mp_limb_t c; |
|
TMP_DECL (marker); |
|
TMP_MARK (marker); |
|
t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); |
|
c = mpn_lshift (t, y, n, 1); |
|
c += mpn_add_n (z, x, t, n); |
|
TMP_FREE (marker); |
|
return c; |
|
} |
|
#else |
|
|
cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size); |
static mp_limb_t |
if (cy) |
#if __STDC__ |
mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy); |
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) |
|
#else |
|
add2Times (z, x, y, n) |
|
mp_ptr z; |
|
mp_srcptr x; |
|
mp_srcptr y; |
|
mp_size_t n; |
|
#endif |
|
{ |
|
mp_limb_t c, v, w; |
|
|
MPN_COPY (prodp, tspace, hsize); |
ASSERT (n > 0); |
cy = mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize); |
v = *x; w = *y; |
if (cy) |
c = w >> (BITS_PER_MP_LIMB - 1); |
mpn_add_1 (prodp + size, prodp + size, size, 1); |
w <<= 1; |
|
v += w; |
|
c += v < w; |
|
*z = v; |
|
++x; ++y; ++z; |
|
while (--n) |
|
{ |
|
v = *x; |
|
w = *y; |
|
v += c; |
|
c = v < c; |
|
c += w >> (BITS_PER_MP_LIMB - 1); |
|
w <<= 1; |
|
v += w; |
|
c += v < w; |
|
*z = v; |
|
++x; ++y; ++z; |
} |
} |
|
|
|
return c; |
} |
} |
|
#endif |
|
|
void |
/*-- evaluate3 -------------------------------------------------------------*/ |
|
|
|
/* Evaluates: |
|
* ph := 4*A+2*B+C |
|
* p1 := A+B+C |
|
* p2 := A+2*B+4*C |
|
* where: |
|
* ph[], p1[], p2[], A[] and B[] all have length len, |
|
* C[] has length len2 with len-len2 = 0, 1 or 2. |
|
* Returns top words (overflow) at pth, pt1 and pt2 respectively. |
|
*/ |
|
#ifdef USE_MORE_MPN |
|
static void |
#if __STDC__ |
#if __STDC__ |
impn_sqr_n_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t size) |
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, |
|
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) |
#else |
#else |
impn_sqr_n_basecase (prodp, up, size) |
evaluate3 (ph, p1, p2, pth, pt1, pt2, |
mp_ptr prodp; |
A, B, C, len, len2) |
mp_srcptr up; |
mp_ptr ph; |
mp_size_t size; |
mp_ptr p1; |
|
mp_ptr p2; |
|
mp_ptr pth; |
|
mp_ptr pt1; |
|
mp_ptr pt2; |
|
mp_srcptr A; |
|
mp_srcptr B; |
|
mp_srcptr C; |
|
mp_size_t len; |
|
mp_size_t len2; |
#endif |
#endif |
{ |
{ |
mp_size_t i; |
mp_limb_t c, d, e; |
mp_limb_t cy_limb; |
|
mp_limb_t v_limb; |
ASSERT (len - len2 <= 2); |
|
|
/* Multiply by the first limb in V separately, as the result can be |
e = mpn_lshift (p1, B, len, 1); |
stored (not added) to PROD. We also avoid a loop for zeroing. */ |
|
v_limb = up[0]; |
c = mpn_lshift (ph, A, len, 2); |
if (v_limb <= 1) |
c += e + mpn_add_n (ph, ph, p1, len); |
|
d = mpn_add_n (ph, ph, C, len2); |
|
if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); |
|
ASSERT (c < 7); |
|
*pth = c; |
|
|
|
c = mpn_lshift (p2, C, len2, 2); |
|
#if 1 |
|
if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } |
|
c += e + mpn_add_n (p2, p2, p1, len); |
|
#else |
|
d = mpn_add_n (p2, p2, p1, len2); |
|
c += d; |
|
if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); |
|
c += e; |
|
#endif |
|
c += mpn_add_n (p2, p2, A, len); |
|
ASSERT (c < 7); |
|
*pt2 = c; |
|
|
|
c = mpn_add_n (p1, A, B, len); |
|
d = mpn_add_n (p1, p1, C, len2); |
|
if (len2 == len) c += d; |
|
else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); |
|
ASSERT (c < 3); |
|
*pt1 = c; |
|
|
|
} |
|
|
|
#else |
|
|
|
static void |
|
#if __STDC__ |
|
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, |
|
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) |
|
#else |
|
evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
A, B, C, l, ls) |
|
mp_ptr ph; |
|
mp_ptr p1; |
|
mp_ptr p2; |
|
mp_ptr pth; |
|
mp_ptr pt1; |
|
mp_ptr pt2; |
|
mp_srcptr A; |
|
mp_srcptr B; |
|
mp_srcptr C; |
|
mp_size_t l; |
|
mp_size_t ls; |
|
#endif |
|
{ |
|
mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; |
|
|
|
ASSERT (l - ls <= 2); |
|
|
|
th = t1 = t2 = 0; |
|
for (i = 0; i < l; ++i) |
{ |
{ |
if (v_limb == 1) |
a = *A; |
MPN_COPY (prodp, up, size); |
b = *B; |
else |
c = i < ls ? *C : 0; |
MPN_ZERO (prodp, size); |
|
cy_limb = 0; |
/* TO DO: choose one of the following alternatives. */ |
|
#if 0 |
|
t = a << 2; |
|
vh = th + t; |
|
th = vh < t; |
|
th += a >> (BITS_PER_MP_LIMB - 2); |
|
t = b << 1; |
|
vh += t; |
|
th += vh < t; |
|
th += b >> (BITS_PER_MP_LIMB - 1); |
|
vh += c; |
|
th += vh < c; |
|
#else |
|
vh = th + c; |
|
th = vh < c; |
|
t = b << 1; |
|
vh += t; |
|
th += vh < t; |
|
th += b >> (BITS_PER_MP_LIMB - 1); |
|
t = a << 2; |
|
vh += t; |
|
th += vh < t; |
|
th += a >> (BITS_PER_MP_LIMB - 2); |
|
#endif |
|
|
|
v1 = t1 + a; |
|
t1 = v1 < a; |
|
v1 += b; |
|
t1 += v1 < b; |
|
v1 += c; |
|
t1 += v1 < c; |
|
|
|
v2 = t2 + a; |
|
t2 = v2 < a; |
|
t = b << 1; |
|
v2 += t; |
|
t2 += v2 < t; |
|
t2 += b >> (BITS_PER_MP_LIMB - 1); |
|
t = c << 2; |
|
v2 += t; |
|
t2 += v2 < t; |
|
t2 += c >> (BITS_PER_MP_LIMB - 2); |
|
|
|
*ph = vh; |
|
*p1 = v1; |
|
*p2 = v2; |
|
|
|
++A; ++B; ++C; |
|
++ph; ++p1; ++p2; |
} |
} |
|
|
|
ASSERT (th < 7); |
|
ASSERT (t1 < 3); |
|
ASSERT (t2 < 7); |
|
|
|
*pth = th; |
|
*pt1 = t1; |
|
*pt2 = t2; |
|
} |
|
#endif |
|
|
|
|
|
/*-- interpolate3 ----------------------------------------------------------*/ |
|
|
|
/* Interpolates B, C, D (in-place) from: |
|
* 16*A+8*B+4*C+2*D+E |
|
* A+B+C+D+E |
|
* A+2*B+4*C+8*D+16*E |
|
* where: |
|
* A[], B[], C[] and D[] all have length l, |
|
* E[] has length ls with l-ls = 0, 2 or 4. |
|
* |
|
* Reads top words (from earlier overflow) from ptb, ptc and ptd, |
|
* and returns new top words there. |
|
*/ |
|
|
|
#ifdef USE_MORE_MPN |
|
static void |
|
#if __STDC__ |
|
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, |
|
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) |
|
#else |
|
interpolate3 (A, B, C, D, E, |
|
ptb, ptc, ptd, len, len2) |
|
mp_srcptr A; |
|
mp_ptr B; |
|
mp_ptr C; |
|
mp_ptr D; |
|
mp_srcptr E; |
|
mp_ptr ptb; |
|
mp_ptr ptc; |
|
mp_ptr ptd; |
|
mp_size_t len; |
|
mp_size_t len2; |
|
#endif |
|
{ |
|
mp_ptr ws; |
|
mp_limb_t t, tb,tc,td; |
|
TMP_DECL (marker); |
|
TMP_MARK (marker); |
|
|
|
ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); |
|
|
|
/* Let x1, x2, x3 be the values to interpolate. We have: |
|
* b = 16*a + 8*x1 + 4*x2 + 2*x3 + e |
|
* c = a + x1 + x2 + x3 + e |
|
* d = a + 2*x1 + 4*x2 + 8*x3 + 16*e |
|
*/ |
|
|
|
ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); |
|
|
|
tb = *ptb; tc = *ptc; td = *ptd; |
|
|
|
|
|
/* b := b - 16*a - e |
|
* c := c - a - e |
|
* d := d - a - 16*e |
|
*/ |
|
|
|
t = mpn_lshift (ws, A, len, 4); |
|
tb -= t + mpn_sub_n (B, B, ws, len); |
|
t = mpn_sub_n (B, B, E, len2); |
|
if (len2 == len) tb -= t; |
|
else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); |
|
|
|
tc -= mpn_sub_n (C, C, A, len); |
|
t = mpn_sub_n (C, C, E, len2); |
|
if (len2 == len) tc -= t; |
|
else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); |
|
|
|
t = mpn_lshift (ws, E, len2, 4); |
|
t += mpn_add_n (ws, ws, A, len2); |
|
#if 1 |
|
if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); |
|
td -= t + mpn_sub_n (D, D, ws, len); |
|
#else |
|
t += mpn_sub_n (D, D, ws, len2); |
|
if (len2 != len) { |
|
t = mpn_sub_1 (D+len2, D+len2, len-len2, t); |
|
t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); |
|
} /* end if/else */ |
|
td -= t; |
|
#endif |
|
|
|
|
|
/* b, d := b + d, b - d */ |
|
|
|
#ifdef HAVE_MPN_ADD_SUB_N |
|
/* #error TO DO ... */ |
|
#else |
|
t = tb + td + mpn_add_n (ws, B, D, len); |
|
td = tb - td - mpn_sub_n (D, B, D, len); |
|
tb = t; |
|
MPN_COPY (B, ws, len); |
|
#endif |
|
|
|
/* b := b-8*c */ |
|
t = 8 * tc + mpn_lshift (ws, C, len, 3); |
|
tb -= t + mpn_sub_n (B, B, ws, len); |
|
|
|
/* c := 2*c - b */ |
|
tc = 2 * tc + mpn_lshift (C, C, len, 1); |
|
tc -= tb + mpn_sub_n (C, C, B, len); |
|
|
|
/* d := d/3 */ |
|
td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; |
|
|
|
/* b, d := b + d, b - d */ |
|
#ifdef HAVE_MPN_ADD_SUB_N |
|
/* #error TO DO ... */ |
|
#else |
|
t = tb + td + mpn_add_n (ws, B, D, len); |
|
td = tb - td - mpn_sub_n (D, B, D, len); |
|
tb = t; |
|
MPN_COPY (B, ws, len); |
|
#endif |
|
|
|
/* Now: |
|
* b = 4*x1 |
|
* c = 2*x2 |
|
* d = 4*x3 |
|
*/ |
|
|
|
ASSERT(!(*B & 3)); |
|
mpn_rshift (B, B, len, 2); |
|
B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); |
|
ASSERT((long)tb >= 0); |
|
tb >>= 2; |
|
|
|
ASSERT(!(*C & 1)); |
|
mpn_rshift (C, C, len, 1); |
|
C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); |
|
ASSERT((long)tc >= 0); |
|
tc >>= 1; |
|
|
|
ASSERT(!(*D & 3)); |
|
mpn_rshift (D, D, len, 2); |
|
D[len-1] |= td<<(BITS_PER_MP_LIMB-2); |
|
ASSERT((long)td >= 0); |
|
td >>= 2; |
|
|
|
#if WANT_ASSERT |
|
ASSERT (tb < 2); |
|
if (len == len2) |
|
{ |
|
ASSERT (tc < 3); |
|
ASSERT (td < 2); |
|
} |
else |
else |
cy_limb = mpn_mul_1 (prodp, up, size, v_limb); |
{ |
|
ASSERT (tc < 2); |
|
ASSERT (!td); |
|
} |
|
#endif |
|
|
prodp[size] = cy_limb; |
*ptb = tb; |
prodp++; |
*ptc = tc; |
|
*ptd = td; |
|
|
/* For each iteration in the outer loop, multiply one limb from |
TMP_FREE (marker); |
U with one limb from V, and add it to PROD. */ |
} |
for (i = 1; i < size; i++) |
|
|
#else |
|
|
|
static void |
|
#if __STDC__ |
|
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, |
|
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) |
|
#else |
|
interpolate3 (A, B, C, D, E, |
|
ptb, ptc, ptd, l, ls) |
|
mp_srcptr A; |
|
mp_ptr B; |
|
mp_ptr C; |
|
mp_ptr D; |
|
mp_srcptr E; |
|
mp_ptr ptb; |
|
mp_ptr ptc; |
|
mp_ptr ptd; |
|
mp_size_t l; |
|
mp_size_t ls; |
|
#endif |
|
{ |
|
mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; |
|
const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); |
|
|
|
#if WANT_ASSERT |
|
t = l - ls; |
|
ASSERT (t == 0 || t == 2 || t == 4); |
|
#endif |
|
|
|
sb = sc = sd = 0; |
|
for (i = 0; i < l; ++i) |
{ |
{ |
v_limb = up[i]; |
mp_limb_t tb, tc, td, tt; |
if (v_limb <= 1) |
|
|
a = *A; |
|
b = *B; |
|
c = *C; |
|
d = *D; |
|
e = i < ls ? *E : 0; |
|
|
|
/* Let x1, x2, x3 be the values to interpolate. We have: |
|
* b = 16*a + 8*x1 + 4*x2 + 2*x3 + e |
|
* c = a + x1 + x2 + x3 + e |
|
* d = a + 2*x1 + 4*x2 + 8*x3 + 16*e |
|
*/ |
|
|
|
/* b := b - 16*a - e |
|
* c := c - a - e |
|
* d := d - a - 16*e |
|
*/ |
|
t = a << 4; |
|
tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t); |
|
b -= t; |
|
tb -= b < e; |
|
b -= e; |
|
tc = -(c < a); |
|
c -= a; |
|
tc -= c < e; |
|
c -= e; |
|
td = -(d < a); |
|
d -= a; |
|
t = e << 4; |
|
td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t); |
|
d -= t; |
|
|
|
/* b, d := b + d, b - d */ |
|
t = b + d; |
|
tt = tb + td + (t < b); |
|
td = tb - td - (b < d); |
|
d = b - d; |
|
b = t; |
|
tb = tt; |
|
|
|
/* b := b-8*c */ |
|
t = c << 3; |
|
tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t); |
|
b -= t; |
|
|
|
/* c := 2*c - b */ |
|
t = c << 1; |
|
tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b); |
|
c = t - b; |
|
|
|
/* d := d/3 */ |
|
d *= INVERSE_3; |
|
td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); |
|
td *= INVERSE_3; |
|
|
|
/* b, d := b + d, b - d */ |
|
t = b + d; |
|
tt = tb + td + (t < b); |
|
td = tb - td - (b < d); |
|
d = b - d; |
|
b = t; |
|
tb = tt; |
|
|
|
/* Now: |
|
* b = 4*x1 |
|
* c = 2*x2 |
|
* d = 4*x3 |
|
*/ |
|
|
|
/* sb has period 2. */ |
|
b += sb; |
|
tb += b < sb; |
|
sb &= maskOffHalf; |
|
sb |= sb >> (BITS_PER_MP_LIMB >> 1); |
|
sb += tb; |
|
|
|
/* sc has period 1. */ |
|
c += sc; |
|
tc += c < sc; |
|
/* TO DO: choose one of the following alternatives. */ |
|
#if 1 |
|
sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); |
|
sc += tc; |
|
#else |
|
sc = tc - ((long)sc < 0L); |
|
#endif |
|
|
|
/* sd has period 2. */ |
|
d += sd; |
|
td += d < sd; |
|
sd &= maskOffHalf; |
|
sd |= sd >> (BITS_PER_MP_LIMB >> 1); |
|
sd += td; |
|
|
|
if (i != 0) |
{ |
{ |
cy_limb = 0; |
B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); |
if (v_limb == 1) |
C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); |
cy_limb = mpn_add_n (prodp, prodp, up, size); |
D[-1] = od | d << (BITS_PER_MP_LIMB - 2); |
} |
} |
else |
ob = b >> 2; |
cy_limb = mpn_addmul_1 (prodp, up, size, v_limb); |
oc = c >> 1; |
|
od = d >> 2; |
|
|
prodp[size] = cy_limb; |
++A; ++B; ++C; ++D; ++E; |
prodp++; |
|
} |
} |
|
|
|
/* Handle top words. */ |
|
b = *ptb; |
|
c = *ptc; |
|
d = *ptd; |
|
|
|
t = b + d; |
|
d = b - d; |
|
b = t; |
|
b -= c << 3; |
|
c = (c << 1) - b; |
|
d *= INVERSE_3; |
|
t = b + d; |
|
d = b - d; |
|
b = t; |
|
|
|
b += sb; |
|
c += sc; |
|
d += sd; |
|
|
|
B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); |
|
C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); |
|
D[-1] = od | d << (BITS_PER_MP_LIMB - 2); |
|
|
|
b >>= 2; |
|
c >>= 1; |
|
d >>= 2; |
|
|
|
#if WANT_ASSERT |
|
ASSERT (b < 2); |
|
if (l == ls) |
|
{ |
|
ASSERT (c < 3); |
|
ASSERT (d < 2); |
|
} |
|
else |
|
{ |
|
ASSERT (c < 2); |
|
ASSERT (!d); |
|
} |
|
#endif |
|
|
|
*ptb = b; |
|
*ptc = c; |
|
*ptd = d; |
} |
} |
|
#endif |
|
|
|
|
|
/*-- mpn_toom3_mul_n --------------------------------------------------------------*/ |
|
|
|
/* Multiplies using 5 mults of one third size and so on recursively. |
|
* p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. |
|
* No overlap of p[...] with a[...] or b[...]. |
|
* ws is workspace. |
|
*/ |
|
|
|
/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the |
|
* recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() |
|
* because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. |
|
*/ |
|
|
|
#define TOOM3_MUL_REC(p, a, b, n, ws) \ |
|
do { \ |
|
if (n < KARATSUBA_MUL_THRESHOLD) \ |
|
mpn_mul_basecase (p, a, n, b, n); \ |
|
else if (n < TOOM3_MUL_THRESHOLD) \ |
|
mpn_kara_mul_n (p, a, b, n, ws); \ |
|
else \ |
|
mpn_toom3_mul_n (p, a, b, n, ws); \ |
|
} while (0) |
|
|
void |
void |
#if __STDC__ |
#if __STDC__ |
impn_sqr_n (mp_ptr prodp, |
mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) |
mp_srcptr up, mp_size_t size, mp_ptr tspace) |
|
#else |
#else |
impn_sqr_n (prodp, up, size, tspace) |
mpn_toom3_mul_n (p, a, b, n, ws) |
mp_ptr prodp; |
mp_ptr p; |
mp_srcptr up; |
mp_srcptr a; |
mp_size_t size; |
mp_srcptr b; |
mp_ptr tspace; |
mp_size_t n; |
|
mp_ptr ws; |
#endif |
#endif |
{ |
{ |
if ((size & 1) != 0) |
mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; |
{ |
mp_limb_t *A,*B,*C,*D,*E, *W; |
/* The size is odd, the code code below doesn't handle that. |
mp_size_t l,l2,l3,l4,l5,ls; |
Multiply the least significant (size - 1) limbs with a recursive |
|
call, and handle the most significant limb of S1 and S2 |
|
separately. */ |
|
/* A slightly faster way to do this would be to make the Karatsuba |
|
code below behave as if the size were even, and let it check for |
|
odd size in the end. I.e., in essence move this code to the end. |
|
Doing so would save us a recursive call, and potentially make the |
|
stack grow a lot less. */ |
|
|
|
mp_size_t esize = size - 1; /* even size */ |
/* Break n words into chunks of size l, l and ls. |
mp_limb_t cy_limb; |
* n = 3*k => l = k, ls = k |
|
* n = 3*k+1 => l = k+1, ls = k-1 |
|
* n = 3*k+2 => l = k+1, ls = k |
|
*/ |
|
{ |
|
mp_limb_t m; |
|
|
MPN_SQR_N_RECURSE (prodp, up, esize, tspace); |
ASSERT (n >= TOOM3_MUL_THRESHOLD); |
cy_limb = mpn_addmul_1 (prodp + esize, up, esize, up[esize]); |
l = ls = n / 3; |
prodp[esize + esize] = cy_limb; |
m = n - l * 3; |
cy_limb = mpn_addmul_1 (prodp + esize, up, size, up[esize]); |
if (m != 0) |
|
++l; |
|
if (m == 1) |
|
--ls; |
|
|
prodp[esize + size] = cy_limb; |
l2 = l * 2; |
|
l3 = l * 3; |
|
l4 = l * 4; |
|
l5 = l * 5; |
|
A = p; |
|
B = ws; |
|
C = p + l2; |
|
D = ws + l2; |
|
E = p + l4; |
|
W = ws + l4; |
|
} |
|
|
|
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ |
|
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); |
|
evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); |
|
|
|
/** Second stage: pointwise multiplies. **/ |
|
TOOM3_MUL_REC(D, C, C + l, l, W); |
|
tD = cD*dD; |
|
if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD); |
|
if (dD) tD += mpn_addmul_1 (D + l, C, l, dD); |
|
ASSERT (tD < 49); |
|
TOOM3_MUL_REC(C, B, B + l, l, W); |
|
tC = cC*dC; |
|
/* TO DO: choose one of the following alternatives. */ |
|
#if 0 |
|
if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC); |
|
if (dC) tC += mpn_addmul_1 (C + l, B, l, dC); |
|
#else |
|
if (cC) |
|
{ |
|
if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l); |
|
else tC += add2Times (C + l, C + l, B + l, l); |
} |
} |
else |
if (dC) |
{ |
{ |
mp_size_t hsize = size >> 1; |
if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l); |
mp_limb_t cy; |
else tC += add2Times (C + l, C + l, B, l); |
|
} |
|
#endif |
|
ASSERT (tC < 9); |
|
TOOM3_MUL_REC(B, A, A + l, l, W); |
|
tB = cB*dB; |
|
if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB); |
|
if (dB) tB += mpn_addmul_1 (B + l, A, l, dB); |
|
ASSERT (tB < 49); |
|
TOOM3_MUL_REC(A, a, b, l, W); |
|
TOOM3_MUL_REC(E, a + l2, b + l2, ls, W); |
|
|
/*** Product H. ________________ ________________ |
/** Third stage: interpolation. **/ |
|_____U1 x U1____||____U0 x U0_____| */ |
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); |
/* Put result in upper part of PROD and pass low part of TSPACE |
|
as new TSPACE. */ |
|
MPN_SQR_N_RECURSE (prodp + size, up + hsize, hsize, tspace); |
|
|
|
/*** Product M. ________________ |
/** Final stage: add up the coefficients. **/ |
|_(U1-U0)(U0-U1)_| */ |
{ |
if (mpn_cmp (up + hsize, up, hsize) >= 0) |
mp_limb_t i, x, y; |
{ |
tB += mpn_add_n (p + l, p + l, B, l2); |
mpn_sub_n (prodp, up + hsize, up, hsize); |
tD += mpn_add_n (p + l3, p + l3, D, l2); |
} |
mpn_incr_u (p + l3, tB); |
else |
mpn_incr_u (p + l4, tC); |
{ |
mpn_incr_u (p + l5, tD); |
mpn_sub_n (prodp, up, up + hsize, hsize); |
} |
} |
} |
|
|
/* Read temporary operands from low part of PROD. |
/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ |
Put result in low part of TSPACE using upper part of TSPACE |
|
as new TSPACE. */ |
|
MPN_SQR_N_RECURSE (tspace, prodp, hsize, tspace + size); |
|
|
|
/*** Add/copy product H. */ |
/* Like previous function but for squaring */ |
MPN_COPY (prodp + hsize, prodp + size, hsize); |
|
cy = mpn_add_n (prodp + size, prodp + size, prodp + size + hsize, hsize); |
|
|
|
/*** Add product M (if NEGFLG M is a negative number). */ |
#define TOOM3_SQR_REC(p, a, n, ws) \ |
cy -= mpn_sub_n (prodp + hsize, prodp + hsize, tspace, size); |
do { \ |
|
if (n < KARATSUBA_SQR_THRESHOLD) \ |
|
mpn_sqr_basecase (p, a, n); \ |
|
else if (n < TOOM3_SQR_THRESHOLD) \ |
|
mpn_kara_sqr_n (p, a, n, ws); \ |
|
else \ |
|
mpn_toom3_sqr_n (p, a, n, ws); \ |
|
} while (0) |
|
|
/*** Product L. ________________ ________________ |
void |
|________________||____U0 x U0_____| */ |
#if __STDC__ |
/* Read temporary operands from low part of PROD. |
mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) |
Put result in low part of TSPACE using upper part of TSPACE |
#else |
as new TSPACE. */ |
mpn_toom3_sqr_n (p, a, n, ws) |
MPN_SQR_N_RECURSE (tspace, up, hsize, tspace + size); |
mp_ptr p; |
|
mp_srcptr a; |
|
mp_size_t n; |
|
mp_ptr ws; |
|
#endif |
|
{ |
|
mp_limb_t cB,cC,cD, tB,tC,tD; |
|
mp_limb_t *A,*B,*C,*D,*E, *W; |
|
mp_size_t l,l2,l3,l4,l5,ls; |
|
|
/*** Add/copy Product L (twice). */ |
/* Break n words into chunks of size l, l and ls. |
|
* n = 3*k => l = k, ls = k |
|
* n = 3*k+1 => l = k+1, ls = k-1 |
|
* n = 3*k+2 => l = k+1, ls = k |
|
*/ |
|
{ |
|
mp_limb_t m; |
|
|
cy += mpn_add_n (prodp + hsize, prodp + hsize, tspace, size); |
ASSERT (n >= TOOM3_MUL_THRESHOLD); |
if (cy) |
l = ls = n / 3; |
mpn_add_1 (prodp + hsize + size, prodp + hsize + size, hsize, cy); |
m = n - l * 3; |
|
if (m != 0) |
|
++l; |
|
if (m == 1) |
|
--ls; |
|
|
MPN_COPY (prodp, tspace, hsize); |
l2 = l * 2; |
cy = mpn_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize); |
l3 = l * 3; |
if (cy) |
l4 = l * 4; |
mpn_add_1 (prodp + size, prodp + size, size, 1); |
l5 = l * 5; |
|
A = p; |
|
B = ws; |
|
C = p + l2; |
|
D = ws + l2; |
|
E = p + l4; |
|
W = ws + l4; |
|
} |
|
|
|
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ |
|
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); |
|
|
|
/** Second stage: pointwise multiplies. **/ |
|
TOOM3_SQR_REC(D, C, l, W); |
|
tD = cD*cD; |
|
if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD); |
|
ASSERT (tD < 49); |
|
TOOM3_SQR_REC(C, B, l, W); |
|
tC = cC*cC; |
|
/* TO DO: choose one of the following alternatives. */ |
|
#if 0 |
|
if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC); |
|
#else |
|
if (cC >= 1) |
|
{ |
|
tC += add2Times (C + l, C + l, B, l); |
|
if (cC == 2) |
|
tC += add2Times (C + l, C + l, B, l); |
} |
} |
|
#endif |
|
ASSERT (tC < 9); |
|
TOOM3_SQR_REC(B, A, l, W); |
|
tB = cB*cB; |
|
if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB); |
|
ASSERT (tB < 49); |
|
TOOM3_SQR_REC(A, a, l, W); |
|
TOOM3_SQR_REC(E, a + l2, ls, W); |
|
|
|
/** Third stage: interpolation. **/ |
|
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); |
|
|
|
/** Final stage: add up the coefficients. **/ |
|
{ |
|
mp_limb_t i, x, y; |
|
tB += mpn_add_n (p + l, p + l, B, l2); |
|
tD += mpn_add_n (p + l3, p + l3, D, l2); |
|
mpn_incr_u (p + l3, tB); |
|
mpn_incr_u (p + l4, tC); |
|
mpn_incr_u (p + l5, tD); |
|
} |
} |
} |
|
|
/* This should be made into an inline function in gmp.h. */ |
void |
inline void |
|
#if __STDC__ |
#if __STDC__ |
mpn_mul_n (mp_ptr prodp, mp_srcptr up, mp_srcptr vp, mp_size_t size) |
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) |
#else |
#else |
mpn_mul_n (prodp, up, vp, size) |
mpn_mul_n (p, a, b, n) |
mp_ptr prodp; |
mp_ptr p; |
mp_srcptr up; |
mp_srcptr a; |
mp_srcptr vp; |
mp_srcptr b; |
mp_size_t size; |
mp_size_t n; |
#endif |
#endif |
{ |
{ |
TMP_DECL (marker); |
if (n < KARATSUBA_MUL_THRESHOLD) |
TMP_MARK (marker); |
mpn_mul_basecase (p, a, n, b, n); |
if (up == vp) |
else if (n < TOOM3_MUL_THRESHOLD) |
{ |
{ |
if (size < KARATSUBA_THRESHOLD) |
/* Allocate workspace of fixed size on stack: fast! */ |
{ |
#if TUNE_PROGRAM_BUILD |
impn_sqr_n_basecase (prodp, up, size); |
mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; |
} |
#else |
else |
mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; |
{ |
#endif |
mp_ptr tspace; |
mpn_kara_mul_n (p, a, b, n, ws); |
tspace = (mp_ptr) TMP_ALLOC (2 * size * BYTES_PER_MP_LIMB); |
|
impn_sqr_n (prodp, up, size, tspace); |
|
} |
|
} |
} |
|
#if WANT_FFT || TUNE_PROGRAM_BUILD |
|
else if (n < FFT_MUL_THRESHOLD) |
|
#else |
else |
else |
|
#endif |
{ |
{ |
if (size < KARATSUBA_THRESHOLD) |
/* Use workspace of unknown size in heap, as stack space may |
{ |
* be limited. Since n is at least TOOM3_MUL_THRESHOLD, the |
impn_mul_n_basecase (prodp, up, vp, size); |
* multiplication will take much longer than malloc()/free(). */ |
} |
mp_limb_t wsLen, *ws; |
else |
wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; |
{ |
ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); |
mp_ptr tspace; |
mpn_toom3_mul_n (p, a, b, n, ws); |
tspace = (mp_ptr) TMP_ALLOC (2 * size * BYTES_PER_MP_LIMB); |
(*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); |
impn_mul_n (prodp, up, vp, size, tspace); |
|
} |
|
} |
} |
TMP_FREE (marker); |
#if WANT_FFT || TUNE_PROGRAM_BUILD |
|
else |
|
{ |
|
mpn_mul_fft_full (p, a, n, b, n); |
|
} |
|
#endif |
} |
} |