version 1.1.1.2, 2000/09/09 14:12:26 |
version 1.1.1.3, 2003/08/25 16:06:20 |
|
|
THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
|
|
|
|
Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software |
Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free |
Foundation, Inc. |
Software Foundation, Inc. |
|
|
This file is part of the GNU MP Library. |
This file is part of the GNU MP Library. |
|
|
Line 31 MA 02111-1307, USA. */ |
|
Line 31 MA 02111-1307, USA. */ |
|
#include "longlong.h" |
#include "longlong.h" |
|
|
|
|
/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. |
#if GMP_NAIL_BITS != 0 |
0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ |
/* The open-coded interpolate3 stuff has not been generalized for nails. */ |
#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) |
#define USE_MORE_MPN 1 |
|
#endif |
|
|
|
#ifndef USE_MORE_MPN |
#if !defined (__alpha) && !defined (__mips) |
#if !defined (__alpha) && !defined (__mips) |
/* For all other machines, we want to call mpn functions for the compund |
/* For all other machines, we want to call mpn functions for the compund |
operations instead of open-coding them. */ |
operations instead of open-coding them. */ |
#define USE_MORE_MPN |
#define USE_MORE_MPN 1 |
#endif |
#endif |
|
#endif |
|
|
/*== Function declarations =================================================*/ |
/*== Function declarations =================================================*/ |
|
|
static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, |
static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, |
mp_ptr, mp_ptr, mp_ptr, |
mp_ptr, mp_ptr, mp_ptr, |
mp_srcptr, mp_srcptr, mp_srcptr, |
mp_srcptr, mp_srcptr, mp_srcptr, |
mp_size_t, mp_size_t)); |
mp_size_t, mp_size_t)); |
static void interpolate3 _PROTO ((mp_srcptr, |
static void interpolate3 _PROTO ((mp_srcptr, |
mp_ptr, mp_ptr, mp_ptr, |
mp_ptr, mp_ptr, mp_ptr, |
mp_srcptr, |
mp_srcptr, |
mp_ptr, mp_ptr, mp_ptr, |
mp_ptr, mp_ptr, mp_ptr, |
mp_size_t, mp_size_t)); |
mp_size_t, mp_size_t)); |
static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); |
static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); |
|
|
|
|
Line 64 static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, |
|
Line 67 static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, |
|
*/ |
*/ |
|
|
void |
void |
#if __STDC__ |
|
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) |
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) |
#else |
|
mpn_kara_mul_n(p, a, b, n, ws) |
|
mp_ptr p; |
|
mp_srcptr a; |
|
mp_srcptr b; |
|
mp_size_t n; |
|
mp_ptr ws; |
|
#endif |
|
{ |
{ |
mp_limb_t i, sign, w, w0, w1; |
mp_limb_t w, w0, w1; |
mp_size_t n2; |
mp_size_t n2; |
mp_srcptr x, y; |
mp_srcptr x, y; |
|
mp_size_t i; |
|
int sign; |
|
|
n2 = n >> 1; |
n2 = n >> 1; |
ASSERT (n2 > 0); |
ASSERT (n2 > 0); |
|
|
if (n & 1) |
if ((n & 1) != 0) |
{ |
{ |
/* Odd length. */ |
/* Odd length. */ |
mp_size_t n1, n3, nm1; |
mp_size_t n1, n3, nm1; |
Line 100 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 96 mpn_kara_mul_n(p, a, b, n, ws) |
|
{ |
{ |
--i; |
--i; |
w0 = a[i]; |
w0 = a[i]; |
w1 = a[n3+i]; |
w1 = a[n3 + i]; |
} |
} |
while (w0 == w1 && i != 0); |
while (w0 == w1 && i != 0); |
if (w0 < w1) |
if (w0 < w1) |
{ |
{ |
x = a + n3; |
x = a + n3; |
y = a; |
y = a; |
sign = 1; |
sign = ~0; |
} |
} |
else |
else |
{ |
{ |
Line 124 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 120 mpn_kara_mul_n(p, a, b, n, ws) |
|
else |
else |
{ |
{ |
i = n2; |
i = n2; |
do |
do |
{ |
{ |
--i; |
--i; |
w0 = b[i]; |
w0 = b[i]; |
w1 = b[n3+i]; |
w1 = b[n3 + i]; |
} |
} |
while (w0 == w1 && i != 0); |
while (w0 == w1 && i != 0); |
if (w0 < w1) |
if (w0 < w1) |
{ |
{ |
x = b + n3; |
x = b + n3; |
y = b; |
y = b; |
sign ^= 1; |
sign = ~sign; |
} |
} |
else |
else |
{ |
{ |
Line 147 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 143 mpn_kara_mul_n(p, a, b, n, ws) |
|
p[n] = w; |
p[n] = w; |
|
|
n1 = n + 1; |
n1 = n + 1; |
if (n2 < KARATSUBA_MUL_THRESHOLD) |
if (n2 < MUL_KARATSUBA_THRESHOLD) |
{ |
{ |
if (n3 < KARATSUBA_MUL_THRESHOLD) |
if (n3 < MUL_KARATSUBA_THRESHOLD) |
{ |
{ |
mpn_mul_basecase (ws, p, n3, p + n3, n3); |
mpn_mul_basecase (ws, p, n3, p + n3, n3); |
mpn_mul_basecase (p, a, n3, b, n3); |
mpn_mul_basecase (p, a, n3, b, n3); |
Line 176 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 172 mpn_kara_mul_n(p, a, b, n, ws) |
|
nm1 = n - 1; |
nm1 = n - 1; |
if (mpn_add_n (ws, p + n1, ws, nm1)) |
if (mpn_add_n (ws, p + n1, ws, nm1)) |
{ |
{ |
mp_limb_t x = ws[nm1] + 1; |
mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; |
ws[nm1] = x; |
ws[nm1] = x; |
if (x == 0) |
if (x == 0) |
++ws[n]; |
ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; |
} |
} |
if (mpn_add_n (p + n3, p + n3, ws, n1)) |
if (mpn_add_n (p + n3, p + n3, ws, n1)) |
{ |
{ |
mp_limb_t x; |
mpn_incr_u (p + n1 + n3, 1); |
i = n1 + n3; |
|
do |
|
{ |
|
x = p[i] + 1; |
|
p[i] = x; |
|
++i; |
|
} while (x == 0); |
|
} |
} |
} |
} |
else |
else |
{ |
{ |
/* Even length. */ |
/* Even length. */ |
mp_limb_t t; |
|
|
|
i = n2; |
i = n2; |
do |
do |
{ |
{ |
--i; |
--i; |
w0 = a[i]; |
w0 = a[i]; |
w1 = a[n2+i]; |
w1 = a[n2 + i]; |
} |
} |
while (w0 == w1 && i != 0); |
while (w0 == w1 && i != 0); |
sign = 0; |
sign = 0; |
Line 211 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 198 mpn_kara_mul_n(p, a, b, n, ws) |
|
{ |
{ |
x = a + n2; |
x = a + n2; |
y = a; |
y = a; |
sign = 1; |
sign = ~0; |
} |
} |
else |
else |
{ |
{ |
Line 221 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 208 mpn_kara_mul_n(p, a, b, n, ws) |
|
mpn_sub_n (p, x, y, n2); |
mpn_sub_n (p, x, y, n2); |
|
|
i = n2; |
i = n2; |
do |
do |
{ |
{ |
--i; |
--i; |
w0 = b[i]; |
w0 = b[i]; |
w1 = b[n2+i]; |
w1 = b[n2 + i]; |
} |
} |
while (w0 == w1 && i != 0); |
while (w0 == w1 && i != 0); |
if (w0 < w1) |
if (w0 < w1) |
{ |
{ |
x = b + n2; |
x = b + n2; |
y = b; |
y = b; |
sign ^= 1; |
sign = ~sign; |
} |
} |
else |
else |
{ |
{ |
Line 242 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 229 mpn_kara_mul_n(p, a, b, n, ws) |
|
mpn_sub_n (p + n2, x, y, n2); |
mpn_sub_n (p + n2, x, y, n2); |
|
|
/* Pointwise products. */ |
/* Pointwise products. */ |
if (n2 < KARATSUBA_MUL_THRESHOLD) |
if (n2 < MUL_KARATSUBA_THRESHOLD) |
{ |
{ |
mpn_mul_basecase (ws, p, n2, p + n2, n2); |
mpn_mul_basecase (ws, p, n2, p + n2, n2); |
mpn_mul_basecase (p, a, n2, b, n2); |
mpn_mul_basecase (p, a, n2, b, n2); |
Line 262 mpn_kara_mul_n(p, a, b, n, ws) |
|
Line 249 mpn_kara_mul_n(p, a, b, n, ws) |
|
w = -mpn_sub_n (ws, p, ws, n); |
w = -mpn_sub_n (ws, p, ws, n); |
w += mpn_add_n (ws, p + n, ws, n); |
w += mpn_add_n (ws, p + n, ws, n); |
w += mpn_add_n (p + n2, p + n2, ws, n); |
w += mpn_add_n (p + n2, p + n2, ws, n); |
/* TO DO: could put "if (w) { ... }" here. |
MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); |
* Less work but badly predicted branch. |
|
* No measurable difference in speed on Alpha. |
|
*/ |
|
i = n + n2; |
|
t = p[i] + w; |
|
p[i] = t; |
|
if (t < w) |
|
{ |
|
do |
|
{ |
|
++i; |
|
w = p[i] + 1; |
|
p[i] = w; |
|
} |
|
while (w == 0); |
|
} |
|
} |
} |
} |
} |
|
|
void |
void |
#if __STDC__ |
|
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) |
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) |
#else |
|
mpn_kara_sqr_n (p, a, n, ws) |
|
mp_ptr p; |
|
mp_srcptr a; |
|
mp_size_t n; |
|
mp_ptr ws; |
|
#endif |
|
{ |
{ |
mp_limb_t i, sign, w, w0, w1; |
mp_limb_t w, w0, w1; |
mp_size_t n2; |
mp_size_t n2; |
mp_srcptr x, y; |
mp_srcptr x, y; |
|
mp_size_t i; |
|
|
n2 = n >> 1; |
n2 = n >> 1; |
ASSERT (n2 > 0); |
ASSERT (n2 > 0); |
|
|
if (n & 1) |
if ((n & 1) != 0) |
{ |
{ |
/* Odd length. */ |
/* Odd length. */ |
mp_size_t n1, n3, nm1; |
mp_size_t n1, n3, nm1; |
|
|
n3 = n - n2; |
n3 = n - n2; |
|
|
sign = 0; |
|
w = a[n2]; |
w = a[n2]; |
if (w != 0) |
if (w != 0) |
w -= mpn_sub_n (p, a, a + n3, n2); |
w -= mpn_sub_n (p, a, a + n3, n2); |
Line 318 mpn_kara_sqr_n (p, a, n, ws) |
|
Line 281 mpn_kara_sqr_n (p, a, n, ws) |
|
{ |
{ |
--i; |
--i; |
w0 = a[i]; |
w0 = a[i]; |
w1 = a[n3+i]; |
w1 = a[n3 + i]; |
} |
} |
while (w0 == w1 && i != 0); |
while (w0 == w1 && i != 0); |
if (w0 < w1) |
if (w0 < w1) |
{ |
{ |
x = a + n3; |
x = a + n3; |
y = a; |
y = a; |
sign = 1; |
|
} |
} |
else |
else |
{ |
{ |
Line 336 mpn_kara_sqr_n (p, a, n, ws) |
|
Line 298 mpn_kara_sqr_n (p, a, n, ws) |
|
} |
} |
p[n2] = w; |
p[n2] = w; |
|
|
w = a[n2]; |
n1 = n + 1; |
if (w != 0) |
|
w -= mpn_sub_n (p + n3, a, a + n3, n2); |
/* n2 is always either n3 or n3-1 so maybe the two sets of tests here |
else |
could be combined. But that's not important, since the tests will |
|
take a miniscule amount of time compared to the function calls. */ |
|
if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD)) |
{ |
{ |
i = n2; |
mpn_mul_basecase (ws, p, n3, p, n3); |
do |
mpn_mul_basecase (p, a, n3, a, n3); |
{ |
|
--i; |
|
w0 = a[i]; |
|
w1 = a[n3+i]; |
|
} |
|
while (w0 == w1 && i != 0); |
|
if (w0 < w1) |
|
{ |
|
x = a + n3; |
|
y = a; |
|
sign ^= 1; |
|
} |
|
else |
|
{ |
|
x = a; |
|
y = a + n3; |
|
} |
|
mpn_sub_n (p + n3, x, y, n2); |
|
} |
} |
p[n] = w; |
else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD)) |
|
|
n1 = n + 1; |
|
if (n2 < KARATSUBA_SQR_THRESHOLD) |
|
{ |
{ |
if (n3 < KARATSUBA_SQR_THRESHOLD) |
mpn_sqr_basecase (ws, p, n3); |
{ |
mpn_sqr_basecase (p, a, n3); |
mpn_sqr_basecase (ws, p, n3); |
|
mpn_sqr_basecase (p, a, n3); |
|
} |
|
else |
|
{ |
|
mpn_kara_sqr_n (ws, p, n3, ws + n1); |
|
mpn_kara_sqr_n (p, a, n3, ws + n1); |
|
} |
|
mpn_sqr_basecase (p + n1, a + n3, n2); |
|
} |
} |
else |
else |
{ |
{ |
mpn_kara_sqr_n (ws, p, n3, ws + n1); |
mpn_kara_sqr_n (ws, p, n3, ws + n1); /* (x-y)^2 */ |
mpn_kara_sqr_n (p, a, n3, ws + n1); |
mpn_kara_sqr_n (p, a, n3, ws + n1); /* x^2 */ |
mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); |
|
} |
} |
|
if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) |
if (sign) |
mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2); |
mpn_add_n (ws, p, ws, n1); |
else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) |
|
mpn_sqr_basecase (p + n1, a + n3, n2); |
else |
else |
mpn_sub_n (ws, p, ws, n1); |
mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); /* y^2 */ |
|
|
|
/* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the |
|
borrow from mpn_sub_n. If it occurs then it'll be cancelled by a |
|
carry from ws[n]. Further, since 2xy fits in n1 limbs there won't |
|
be any carry out of ws[n] other than cancelling that borrow. */ |
|
|
|
mpn_sub_n (ws, p, ws, n1); /* x^2-(x-y)^2 */ |
|
|
nm1 = n - 1; |
nm1 = n - 1; |
if (mpn_add_n (ws, p + n1, ws, nm1)) |
if (mpn_add_n (ws, p + n1, ws, nm1)) /* x^2+y^2-(x-y)^2 = 2xy */ |
{ |
{ |
mp_limb_t x = ws[nm1] + 1; |
mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK; |
ws[nm1] = x; |
ws[nm1] = x; |
if (x == 0) |
if (x == 0) |
++ws[n]; |
ws[n] = (ws[n] + 1) & GMP_NUMB_MASK; |
} |
} |
if (mpn_add_n (p + n3, p + n3, ws, n1)) |
if (mpn_add_n (p + n3, p + n3, ws, n1)) |
{ |
{ |
mp_limb_t x; |
mpn_incr_u (p + n1 + n3, 1); |
i = n1 + n3; |
|
do |
|
{ |
|
x = p[i] + 1; |
|
p[i] = x; |
|
++i; |
|
} while (x == 0); |
|
} |
} |
} |
} |
else |
else |
{ |
{ |
/* Even length. */ |
/* Even length. */ |
mp_limb_t t; |
|
|
|
i = n2; |
i = n2; |
do |
do |
{ |
{ |
--i; |
--i; |
w0 = a[i]; |
w0 = a[i]; |
w1 = a[n2+i]; |
w1 = a[n2 + i]; |
} |
} |
while (w0 == w1 && i != 0); |
while (w0 == w1 && i != 0); |
sign = 0; |
|
if (w0 < w1) |
if (w0 < w1) |
{ |
{ |
x = a + n2; |
x = a + n2; |
y = a; |
y = a; |
sign = 1; |
|
} |
} |
else |
else |
{ |
{ |
Line 438 mpn_kara_sqr_n (p, a, n, ws) |
|
Line 368 mpn_kara_sqr_n (p, a, n, ws) |
|
} |
} |
mpn_sub_n (p, x, y, n2); |
mpn_sub_n (p, x, y, n2); |
|
|
i = n2; |
/* Pointwise products. */ |
do |
if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD)) |
{ |
{ |
--i; |
mpn_mul_basecase (ws, p, n2, p, n2); |
w0 = a[i]; |
mpn_mul_basecase (p, a, n2, a, n2); |
w1 = a[n2+i]; |
mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2); |
} |
} |
while (w0 == w1 && i != 0); |
else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD)) |
if (w0 < w1) |
|
{ |
{ |
x = a + n2; |
mpn_sqr_basecase (ws, p, n2); |
y = a; |
mpn_sqr_basecase (p, a, n2); |
sign ^= 1; |
|
} |
|
else |
|
{ |
|
x = a; |
|
y = a + n2; |
|
} |
|
mpn_sub_n (p + n2, x, y, n2); |
|
|
|
/* Pointwise products. */ |
|
if (n2 < KARATSUBA_SQR_THRESHOLD) |
|
{ |
|
mpn_sqr_basecase (ws, p, n2); |
|
mpn_sqr_basecase (p, a, n2); |
|
mpn_sqr_basecase (p + n, a + n2, n2); |
mpn_sqr_basecase (p + n, a + n2, n2); |
} |
} |
else |
else |
{ |
{ |
mpn_kara_sqr_n (ws, p, n2, ws + n); |
mpn_kara_sqr_n (ws, p, n2, ws + n); |
mpn_kara_sqr_n (p, a, n2, ws + n); |
mpn_kara_sqr_n (p, a, n2, ws + n); |
mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); |
mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); |
} |
} |
|
|
/* Interpolate. */ |
/* Interpolate. */ |
if (sign) |
w = -mpn_sub_n (ws, p, ws, n); |
w = mpn_add_n (ws, p, ws, n); |
|
else |
|
w = -mpn_sub_n (ws, p, ws, n); |
|
w += mpn_add_n (ws, p + n, ws, n); |
w += mpn_add_n (ws, p + n, ws, n); |
w += mpn_add_n (p + n2, p + n2, ws, n); |
w += mpn_add_n (p + n2, p + n2, ws, n); |
/* TO DO: could put "if (w) { ... }" here. |
MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w); |
* Less work but badly predicted branch. |
|
* No measurable difference in speed on Alpha. |
|
*/ |
|
i = n + n2; |
|
t = p[i] + w; |
|
p[i] = t; |
|
if (t < w) |
|
{ |
|
do |
|
{ |
|
++i; |
|
w = p[i] + 1; |
|
p[i] = w; |
|
} |
|
while (w == 0); |
|
} |
|
} |
} |
} |
} |
|
|
/*-- add2Times -------------------------------------------------------------*/ |
/*-- add2Times -------------------------------------------------------------*/ |
|
|
/* z[] = x[] + 2 * y[] |
/* z[] = x[] + 2 * y[] |
Note that z and x might point to the same vectors. */ |
Note that z and x might point to the same vectors. |
#ifdef USE_MORE_MPN |
FIXME: gcc won't inline this because it uses alloca. */ |
|
#if USE_MORE_MPN |
|
|
static inline mp_limb_t |
static inline mp_limb_t |
#if __STDC__ |
|
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) |
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) |
#else |
|
add2Times (z, x, y, n) |
|
mp_ptr z; |
|
mp_srcptr x; |
|
mp_srcptr y; |
|
mp_size_t n; |
|
#endif |
|
{ |
{ |
mp_ptr t; |
mp_ptr t; |
mp_limb_t c; |
mp_limb_t c; |
Line 526 add2Times (z, x, y, n) |
|
Line 416 add2Times (z, x, y, n) |
|
TMP_FREE (marker); |
TMP_FREE (marker); |
return c; |
return c; |
} |
} |
|
|
#else |
#else |
|
|
static mp_limb_t |
static mp_limb_t |
#if __STDC__ |
|
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) |
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) |
#else |
|
add2Times (z, x, y, n) |
|
mp_ptr z; |
|
mp_srcptr x; |
|
mp_srcptr y; |
|
mp_size_t n; |
|
#endif |
|
{ |
{ |
mp_limb_t c, v, w; |
mp_limb_t c, v, w; |
|
|
ASSERT (n > 0); |
ASSERT (n > 0); |
v = *x; w = *y; |
v = *x; |
|
w = *y; |
c = w >> (BITS_PER_MP_LIMB - 1); |
c = w >> (BITS_PER_MP_LIMB - 1); |
w <<= 1; |
w <<= 1; |
v += w; |
v += w; |
Line 578 add2Times (z, x, y, n) |
|
Line 462 add2Times (z, x, y, n) |
|
* C[] has length len2 with len-len2 = 0, 1 or 2. |
* C[] has length len2 with len-len2 = 0, 1 or 2. |
* Returns top words (overflow) at pth, pt1 and pt2 respectively. |
* Returns top words (overflow) at pth, pt1 and pt2 respectively. |
*/ |
*/ |
#ifdef USE_MORE_MPN |
#if USE_MORE_MPN |
|
|
static void |
static void |
#if __STDC__ |
|
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, |
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, |
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) |
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len,mp_size_t len2) |
#else |
|
evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
A, B, C, len, len2) |
|
mp_ptr ph; |
|
mp_ptr p1; |
|
mp_ptr p2; |
|
mp_ptr pth; |
|
mp_ptr pt1; |
|
mp_ptr pt2; |
|
mp_srcptr A; |
|
mp_srcptr B; |
|
mp_srcptr C; |
|
mp_size_t len; |
|
mp_size_t len2; |
|
#endif |
|
{ |
{ |
mp_limb_t c, d, e; |
mp_limb_t c, d, e; |
|
|
ASSERT (len - len2 <= 2); |
ASSERT (len - len2 <= 2); |
|
|
e = mpn_lshift (p1, B, len, 1); |
e = mpn_lshift (p1, B, len, 1); |
Line 608 evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
Line 477 evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
c = mpn_lshift (ph, A, len, 2); |
c = mpn_lshift (ph, A, len, 2); |
c += e + mpn_add_n (ph, ph, p1, len); |
c += e + mpn_add_n (ph, ph, p1, len); |
d = mpn_add_n (ph, ph, C, len2); |
d = mpn_add_n (ph, ph, C, len2); |
if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); |
if (len2 == len) |
|
c += d; |
|
else |
|
c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); |
ASSERT (c < 7); |
ASSERT (c < 7); |
*pth = c; |
*pth = c; |
|
|
c = mpn_lshift (p2, C, len2, 2); |
c = mpn_lshift (p2, C, len2, 2); |
#if 1 |
#if 1 |
if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } |
if (len2 != len) |
|
{ |
|
p2[len-1] = 0; |
|
p2[len2] = c; |
|
c = 0; |
|
} |
c += e + mpn_add_n (p2, p2, p1, len); |
c += e + mpn_add_n (p2, p2, p1, len); |
#else |
#else |
d = mpn_add_n (p2, p2, p1, len2); |
d = mpn_add_n (p2, p2, p1, len2); |
c += d; |
c += d; |
if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); |
if (len2 != len) |
|
c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); |
c += e; |
c += e; |
#endif |
#endif |
c += mpn_add_n (p2, p2, A, len); |
c += mpn_add_n (p2, p2, A, len); |
Line 628 evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
Line 506 evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
|
|
c = mpn_add_n (p1, A, B, len); |
c = mpn_add_n (p1, A, B, len); |
d = mpn_add_n (p1, p1, C, len2); |
d = mpn_add_n (p1, p1, C, len2); |
if (len2 == len) c += d; |
if (len2 == len) |
else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); |
c += d; |
|
else |
|
c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); |
ASSERT (c < 3); |
ASSERT (c < 3); |
*pt1 = c; |
*pt1 = c; |
|
|
} |
} |
|
|
#else |
#else |
|
|
static void |
static void |
#if __STDC__ |
|
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, |
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, |
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) |
mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) |
#else |
|
evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
A, B, C, l, ls) |
|
mp_ptr ph; |
|
mp_ptr p1; |
|
mp_ptr p2; |
|
mp_ptr pth; |
|
mp_ptr pt1; |
|
mp_ptr pt2; |
|
mp_srcptr A; |
|
mp_srcptr B; |
|
mp_srcptr C; |
|
mp_size_t l; |
|
mp_size_t ls; |
|
#endif |
|
{ |
{ |
mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; |
mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; |
|
|
Line 744 evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
Line 607 evaluate3 (ph, p1, p2, pth, pt1, pt2, |
|
* and returns new top words there. |
* and returns new top words there. |
*/ |
*/ |
|
|
#ifdef USE_MORE_MPN |
#if USE_MORE_MPN |
|
|
static void |
static void |
#if __STDC__ |
|
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, |
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, |
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) |
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2) |
#else |
|
interpolate3 (A, B, C, D, E, |
|
ptb, ptc, ptd, len, len2) |
|
mp_srcptr A; |
|
mp_ptr B; |
|
mp_ptr C; |
|
mp_ptr D; |
|
mp_srcptr E; |
|
mp_ptr ptb; |
|
mp_ptr ptc; |
|
mp_ptr ptd; |
|
mp_size_t len; |
|
mp_size_t len2; |
|
#endif |
|
{ |
{ |
mp_ptr ws; |
mp_ptr ws; |
mp_limb_t t, tb,tc,td; |
mp_limb_t t, tb,tc,td; |
Line 772 interpolate3 (A, B, C, D, E, |
|
Line 621 interpolate3 (A, B, C, D, E, |
|
ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); |
ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); |
|
|
/* Let x1, x2, x3 be the values to interpolate. We have: |
/* Let x1, x2, x3 be the values to interpolate. We have: |
* b = 16*a + 8*x1 + 4*x2 + 2*x3 + e |
* b = 16*a + 8*x1 + 4*x2 + 2*x3 + e |
* c = a + x1 + x2 + x3 + e |
* c = a + x1 + x2 + x3 + e |
* d = a + 2*x1 + 4*x2 + 8*x3 + 16*e |
* d = a + 2*x1 + 4*x2 + 8*x3 + 16*e |
*/ |
*/ |
|
|
ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); |
ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); |
Line 782 interpolate3 (A, B, C, D, E, |
|
Line 631 interpolate3 (A, B, C, D, E, |
|
tb = *ptb; tc = *ptc; td = *ptd; |
tb = *ptb; tc = *ptc; td = *ptd; |
|
|
|
|
/* b := b - 16*a - e |
/* b := b - 16*a - e |
* c := c - a - e |
* c := c - a - e |
* d := d - a - 16*e |
* d := d - a - 16*e |
*/ |
*/ |
|
|
t = mpn_lshift (ws, A, len, 4); |
t = mpn_lshift (ws, A, len, 4); |
tb -= t + mpn_sub_n (B, B, ws, len); |
tb -= t + mpn_sub_n (B, B, ws, len); |
t = mpn_sub_n (B, B, E, len2); |
t = mpn_sub_n (B, B, E, len2); |
if (len2 == len) tb -= t; |
if (len2 == len) |
else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); |
tb -= t; |
|
else |
|
tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); |
|
|
tc -= mpn_sub_n (C, C, A, len); |
tc -= mpn_sub_n (C, C, A, len); |
t = mpn_sub_n (C, C, E, len2); |
t = mpn_sub_n (C, C, E, len2); |
if (len2 == len) tc -= t; |
if (len2 == len) |
else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); |
tc -= t; |
|
else |
|
tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); |
|
|
t = mpn_lshift (ws, E, len2, 4); |
t = mpn_lshift (ws, E, len2, 4); |
t += mpn_add_n (ws, ws, A, len2); |
t += mpn_add_n (ws, ws, A, len2); |
#if 1 |
#if 1 |
if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); |
if (len2 != len) |
|
t = mpn_add_1 (ws+len2, A+len2, len-len2, t); |
td -= t + mpn_sub_n (D, D, ws, len); |
td -= t + mpn_sub_n (D, D, ws, len); |
#else |
#else |
t += mpn_sub_n (D, D, ws, len2); |
t += mpn_sub_n (D, D, ws, len2); |
if (len2 != len) { |
if (len2 != len) |
t = mpn_sub_1 (D+len2, D+len2, len-len2, t); |
{ |
t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); |
t = mpn_sub_1 (D+len2, D+len2, len-len2, t); |
} /* end if/else */ |
t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); |
|
} |
td -= t; |
td -= t; |
#endif |
#endif |
|
|
Line 818 interpolate3 (A, B, C, D, E, |
|
Line 673 interpolate3 (A, B, C, D, E, |
|
#ifdef HAVE_MPN_ADD_SUB_N |
#ifdef HAVE_MPN_ADD_SUB_N |
/* #error TO DO ... */ |
/* #error TO DO ... */ |
#else |
#else |
t = tb + td + mpn_add_n (ws, B, D, len); |
t = tb + td + mpn_add_n (ws, B, D, len); |
td = tb - td - mpn_sub_n (D, B, D, len); |
td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; |
tb = t; |
tb = t; |
MPN_COPY (B, ws, len); |
MPN_COPY (B, ws, len); |
#endif |
#endif |
|
|
/* b := b-8*c */ |
/* b := b-8*c */ |
t = 8 * tc + mpn_lshift (ws, C, len, 3); |
t = 8 * tc + mpn_lshift (ws, C, len, 3); |
tb -= t + mpn_sub_n (B, B, ws, len); |
tb -= t + mpn_sub_n (B, B, ws, len); |
Line 833 interpolate3 (A, B, C, D, E, |
|
Line 688 interpolate3 (A, B, C, D, E, |
|
tc -= tb + mpn_sub_n (C, C, B, len); |
tc -= tb + mpn_sub_n (C, C, B, len); |
|
|
/* d := d/3 */ |
/* d := d/3 */ |
td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; |
td = ((td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; |
|
|
/* b, d := b + d, b - d */ |
/* b, d := b + d, b - d */ |
#ifdef HAVE_MPN_ADD_SUB_N |
#ifdef HAVE_MPN_ADD_SUB_N |
/* #error TO DO ... */ |
/* #error TO DO ... */ |
#else |
#else |
t = tb + td + mpn_add_n (ws, B, D, len); |
t = (tb + td + mpn_add_n (ws, B, D, len)) & GMP_NUMB_MASK; |
td = tb - td - mpn_sub_n (D, B, D, len); |
td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; |
tb = t; |
tb = t; |
MPN_COPY (B, ws, len); |
MPN_COPY (B, ws, len); |
#endif |
#endif |
Line 853 interpolate3 (A, B, C, D, E, |
|
Line 708 interpolate3 (A, B, C, D, E, |
|
|
|
ASSERT(!(*B & 3)); |
ASSERT(!(*B & 3)); |
mpn_rshift (B, B, len, 2); |
mpn_rshift (B, B, len, 2); |
B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); |
B[len-1] |= (tb << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; |
ASSERT((long)tb >= 0); |
ASSERT((mp_limb_signed_t)tb >= 0); |
tb >>= 2; |
tb >>= 2; |
|
|
ASSERT(!(*C & 1)); |
ASSERT(!(*C & 1)); |
mpn_rshift (C, C, len, 1); |
mpn_rshift (C, C, len, 1); |
C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); |
C[len-1] |= (tc << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; |
ASSERT((long)tc >= 0); |
ASSERT((mp_limb_signed_t)tc >= 0); |
tc >>= 1; |
tc >>= 1; |
|
|
ASSERT(!(*D & 3)); |
ASSERT(!(*D & 3)); |
mpn_rshift (D, D, len, 2); |
mpn_rshift (D, D, len, 2); |
D[len-1] |= td<<(BITS_PER_MP_LIMB-2); |
D[len-1] |= (td << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; |
ASSERT((long)td >= 0); |
ASSERT((mp_limb_signed_t)td >= 0); |
td >>= 2; |
td >>= 2; |
|
|
#if WANT_ASSERT |
#if WANT_ASSERT |
Line 893 interpolate3 (A, B, C, D, E, |
|
Line 748 interpolate3 (A, B, C, D, E, |
|
#else |
#else |
|
|
static void |
static void |
#if __STDC__ |
|
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, |
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, |
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) |
mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) |
#else |
|
interpolate3 (A, B, C, D, E, |
|
ptb, ptc, ptd, l, ls) |
|
mp_srcptr A; |
|
mp_ptr B; |
|
mp_ptr C; |
|
mp_ptr D; |
|
mp_srcptr E; |
|
mp_ptr ptb; |
|
mp_ptr ptc; |
|
mp_ptr ptd; |
|
mp_size_t l; |
|
mp_size_t ls; |
|
#endif |
|
{ |
{ |
mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; |
mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; |
const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); |
const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); |
Line 974 interpolate3 (A, B, C, D, E, |
|
Line 814 interpolate3 (A, B, C, D, E, |
|
c = t - b; |
c = t - b; |
|
|
/* d := d/3 */ |
/* d := d/3 */ |
d *= INVERSE_3; |
d *= MODLIMB_INVERSE_3; |
td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); |
td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); |
td *= INVERSE_3; |
td *= MODLIMB_INVERSE_3; |
|
|
/* b, d := b + d, b - d */ |
/* b, d := b + d, b - d */ |
t = b + d; |
t = b + d; |
Line 1004 interpolate3 (A, B, C, D, E, |
|
Line 844 interpolate3 (A, B, C, D, E, |
|
tc += c < sc; |
tc += c < sc; |
/* TO DO: choose one of the following alternatives. */ |
/* TO DO: choose one of the following alternatives. */ |
#if 1 |
#if 1 |
sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); |
sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1); |
sc += tc; |
sc += tc; |
#else |
#else |
sc = tc - ((long)sc < 0L); |
sc = tc - ((mp_limb_signed_t) sc < 0L); |
#endif |
#endif |
|
|
/* sd has period 2. */ |
/* sd has period 2. */ |
Line 1040 interpolate3 (A, B, C, D, E, |
|
Line 880 interpolate3 (A, B, C, D, E, |
|
b = t; |
b = t; |
b -= c << 3; |
b -= c << 3; |
c = (c << 1) - b; |
c = (c << 1) - b; |
d *= INVERSE_3; |
d *= MODLIMB_INVERSE_3; |
t = b + d; |
t = b + d; |
d = b - d; |
d = b - d; |
b = t; |
b = t; |
Line 1086 interpolate3 (A, B, C, D, E, |
|
Line 926 interpolate3 (A, B, C, D, E, |
|
* ws is workspace. |
* ws is workspace. |
*/ |
*/ |
|
|
/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the |
/* TO DO: If MUL_TOOM3_THRESHOLD is much bigger than MUL_KARATSUBA_THRESHOLD then the |
* recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() |
* recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() |
* because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. |
* because the "n < MUL_KARATSUBA_THRESHOLD" test here will always be false. |
*/ |
*/ |
|
|
#define TOOM3_MUL_REC(p, a, b, n, ws) \ |
#define TOOM3_MUL_REC(p, a, b, n, ws) \ |
do { \ |
do { \ |
if (n < KARATSUBA_MUL_THRESHOLD) \ |
if (n < MUL_KARATSUBA_THRESHOLD) \ |
mpn_mul_basecase (p, a, n, b, n); \ |
mpn_mul_basecase (p, a, n, b, n); \ |
else if (n < TOOM3_MUL_THRESHOLD) \ |
else if (n < MUL_TOOM3_THRESHOLD) \ |
mpn_kara_mul_n (p, a, b, n, ws); \ |
mpn_kara_mul_n (p, a, b, n, ws); \ |
else \ |
else \ |
mpn_toom3_mul_n (p, a, b, n, ws); \ |
mpn_toom3_mul_n (p, a, b, n, ws); \ |
} while (0) |
} while (0) |
|
|
void |
void |
#if __STDC__ |
|
mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) |
mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) |
#else |
|
mpn_toom3_mul_n (p, a, b, n, ws) |
|
mp_ptr p; |
|
mp_srcptr a; |
|
mp_srcptr b; |
|
mp_size_t n; |
|
mp_ptr ws; |
|
#endif |
|
{ |
{ |
mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; |
mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; |
mp_limb_t *A,*B,*C,*D,*E, *W; |
mp_limb_t *A,*B,*C,*D,*E, *W; |
Line 1125 mpn_toom3_mul_n (p, a, b, n, ws) |
|
Line 956 mpn_toom3_mul_n (p, a, b, n, ws) |
|
{ |
{ |
mp_limb_t m; |
mp_limb_t m; |
|
|
ASSERT (n >= TOOM3_MUL_THRESHOLD); |
/* this is probably unnecessarily strict */ |
|
ASSERT (n >= MUL_TOOM3_THRESHOLD); |
|
|
l = ls = n / 3; |
l = ls = n / 3; |
m = n - l * 3; |
m = n - l * 3; |
if (m != 0) |
if (m != 0) |
Line 1145 mpn_toom3_mul_n (p, a, b, n, ws) |
|
Line 978 mpn_toom3_mul_n (p, a, b, n, ws) |
|
W = ws + l4; |
W = ws + l4; |
} |
} |
|
|
|
ASSERT (l >= 1); |
|
ASSERT (ls >= 1); |
|
|
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ |
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ |
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); |
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); |
evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); |
evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); |
Line 1186 mpn_toom3_mul_n (p, a, b, n, ws) |
|
Line 1022 mpn_toom3_mul_n (p, a, b, n, ws) |
|
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); |
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); |
|
|
/** Final stage: add up the coefficients. **/ |
/** Final stage: add up the coefficients. **/ |
{ |
tB += mpn_add_n (p + l, p + l, B, l2); |
mp_limb_t i, x, y; |
tD += mpn_add_n (p + l3, p + l3, D, l2); |
tB += mpn_add_n (p + l, p + l, B, l2); |
MPN_INCR_U (p + l3, 2 * n - l3, tB); |
tD += mpn_add_n (p + l3, p + l3, D, l2); |
MPN_INCR_U (p + l4, 2 * n - l4, tC); |
mpn_incr_u (p + l3, tB); |
MPN_INCR_U (p + l5, 2 * n - l5, tD); |
mpn_incr_u (p + l4, tC); |
|
mpn_incr_u (p + l5, tD); |
|
} |
|
} |
} |
|
|
/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ |
/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ |
|
|
/* Like previous function but for squaring */ |
/* Like previous function but for squaring */ |
|
|
#define TOOM3_SQR_REC(p, a, n, ws) \ |
/* FIXME: If SQR_TOOM3_THRESHOLD is big enough it might never get into the |
do { \ |
basecase range. Try to arrange those conditonals go dead. */ |
if (n < KARATSUBA_SQR_THRESHOLD) \ |
#define TOOM3_SQR_REC(p, a, n, ws) \ |
mpn_sqr_basecase (p, a, n); \ |
do { \ |
else if (n < TOOM3_SQR_THRESHOLD) \ |
if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \ |
mpn_kara_sqr_n (p, a, n, ws); \ |
mpn_mul_basecase (p, a, n, a, n); \ |
else \ |
else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \ |
mpn_toom3_sqr_n (p, a, n, ws); \ |
mpn_sqr_basecase (p, a, n); \ |
|
else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ |
|
mpn_kara_sqr_n (p, a, n, ws); \ |
|
else \ |
|
mpn_toom3_sqr_n (p, a, n, ws); \ |
} while (0) |
} while (0) |
|
|
void |
void |
#if __STDC__ |
|
mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) |
mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) |
#else |
|
mpn_toom3_sqr_n (p, a, n, ws) |
|
mp_ptr p; |
|
mp_srcptr a; |
|
mp_size_t n; |
|
mp_ptr ws; |
|
#endif |
|
{ |
{ |
mp_limb_t cB,cC,cD, tB,tC,tD; |
mp_limb_t cB,cC,cD, tB,tC,tD; |
mp_limb_t *A,*B,*C,*D,*E, *W; |
mp_limb_t *A,*B,*C,*D,*E, *W; |
Line 1233 mpn_toom3_sqr_n (p, a, n, ws) |
|
Line 1062 mpn_toom3_sqr_n (p, a, n, ws) |
|
{ |
{ |
mp_limb_t m; |
mp_limb_t m; |
|
|
ASSERT (n >= TOOM3_MUL_THRESHOLD); |
/* this is probably unnecessarily strict */ |
|
ASSERT (n >= SQR_TOOM3_THRESHOLD); |
|
|
l = ls = n / 3; |
l = ls = n / 3; |
m = n - l * 3; |
m = n - l * 3; |
if (m != 0) |
if (m != 0) |
Line 1253 mpn_toom3_sqr_n (p, a, n, ws) |
|
Line 1084 mpn_toom3_sqr_n (p, a, n, ws) |
|
W = ws + l4; |
W = ws + l4; |
} |
} |
|
|
|
ASSERT (l >= 1); |
|
ASSERT (ls >= 1); |
|
|
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ |
/** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ |
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); |
evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); |
|
|
Line 1271 mpn_toom3_sqr_n (p, a, n, ws) |
|
Line 1105 mpn_toom3_sqr_n (p, a, n, ws) |
|
{ |
{ |
tC += add2Times (C + l, C + l, B, l); |
tC += add2Times (C + l, C + l, B, l); |
if (cC == 2) |
if (cC == 2) |
tC += add2Times (C + l, C + l, B, l); |
tC += add2Times (C + l, C + l, B, l); |
} |
} |
#endif |
#endif |
ASSERT (tC < 9); |
ASSERT (tC < 9); |
Line 1286 mpn_toom3_sqr_n (p, a, n, ws) |
|
Line 1120 mpn_toom3_sqr_n (p, a, n, ws) |
|
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); |
interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); |
|
|
/** Final stage: add up the coefficients. **/ |
/** Final stage: add up the coefficients. **/ |
{ |
tB += mpn_add_n (p + l, p + l, B, l2); |
mp_limb_t i, x, y; |
tD += mpn_add_n (p + l3, p + l3, D, l2); |
tB += mpn_add_n (p + l, p + l, B, l2); |
MPN_INCR_U (p + l3, 2 * n - l3, tB); |
tD += mpn_add_n (p + l3, p + l3, D, l2); |
MPN_INCR_U (p + l4, 2 * n - l4, tC); |
mpn_incr_u (p + l3, tB); |
MPN_INCR_U (p + l5, 2 * n - l5, tD); |
mpn_incr_u (p + l4, tC); |
|
mpn_incr_u (p + l5, tD); |
|
} |
|
} |
} |
|
|
void |
void |
#if __STDC__ |
|
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) |
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) |
#else |
|
mpn_mul_n (p, a, b, n) |
|
mp_ptr p; |
|
mp_srcptr a; |
|
mp_srcptr b; |
|
mp_size_t n; |
|
#endif |
|
{ |
{ |
if (n < KARATSUBA_MUL_THRESHOLD) |
ASSERT (n >= 1); |
|
ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); |
|
ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); |
|
|
|
if (n < MUL_KARATSUBA_THRESHOLD) |
mpn_mul_basecase (p, a, n, b, n); |
mpn_mul_basecase (p, a, n, b, n); |
else if (n < TOOM3_MUL_THRESHOLD) |
else if (n < MUL_TOOM3_THRESHOLD) |
{ |
{ |
/* Allocate workspace of fixed size on stack: fast! */ |
/* Allocate workspace of fixed size on stack: fast! */ |
#if TUNE_PROGRAM_BUILD |
#if TUNE_PROGRAM_BUILD |
mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; |
mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)]; |
#else |
#else |
mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; |
mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD-1)]; |
#endif |
#endif |
mpn_kara_mul_n (p, a, b, n, ws); |
mpn_kara_mul_n (p, a, b, n, ws); |
} |
} |
#if WANT_FFT || TUNE_PROGRAM_BUILD |
#if WANT_FFT || TUNE_PROGRAM_BUILD |
else if (n < FFT_MUL_THRESHOLD) |
else if (n < MUL_FFT_THRESHOLD) |
#else |
#else |
else |
else |
#endif |
#endif |
{ |
{ |
/* Use workspace of unknown size in heap, as stack space may |
/* Use workspace of unknown size in heap, as stack space may |
* be limited. Since n is at least TOOM3_MUL_THRESHOLD, the |
* be limited. Since n is at least MUL_TOOM3_THRESHOLD, the |
* multiplication will take much longer than malloc()/free(). */ |
* multiplication will take much longer than malloc()/free(). */ |
mp_limb_t wsLen, *ws; |
mp_limb_t wsLen, *ws; |
wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; |
wsLen = MPN_TOOM3_MUL_N_TSIZE (n); |
ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); |
ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen); |
mpn_toom3_mul_n (p, a, b, n, ws); |
mpn_toom3_mul_n (p, a, b, n, ws); |
(*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); |
__GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen); |
} |
} |
#if WANT_FFT || TUNE_PROGRAM_BUILD |
#if WANT_FFT || TUNE_PROGRAM_BUILD |
else |
else |
{ |
{ |
mpn_mul_fft_full (p, a, n, b, n); |
mpn_mul_fft_full (p, a, n, b, n); |
} |
} |
#endif |
#endif |
} |
} |