/* An implementation in GMP of Scho"nhage's fast multiplication algorithm
   modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.

   THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
   MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
   A FUTURE GNU MP RELEASE.

Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */


/* References:

   Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
   Strassen, Computing 7, p. 281-292, 1971.

   Asymptotically fast algorithms for the numerical multiplication
   and division of polynomials with complex coefficients, by Arnold Scho"nhage,
   Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.

   Tapes versus Pointers, a study in implementing fast algorithms,
   by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.

   See also http://www.loria.fr/~zimmerma/bignum


   Future:

   K==2 isn't needed in the current uses of this code and the bits specific
   for that could be dropped.

   It might be possible to avoid a small number of MPN_COPYs by using a
   rotating temporary or two.

   Multiplications of unequal sized operands can be done with this code, but
   it needs a tighter test for identifying squaring (same sizes as well as
   same pointers). */


#include <stdio.h>
#include "gmp.h"
#include "gmp-impl.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)



FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
  FFT_MUL_TABLE,
  FFT_SQR_TABLE
};


static void mpn_mul_fft_internal
_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
         int k, int K,
         mp_limb_t **Ap, mp_limb_t **Bp,
         mp_limb_t *A, mp_limb_t *B,
         mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
         mp_limb_t *T, int rec));


/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
   sqr==0 if for a multiply, sqr==1 for a square */
int
#if __STDC__
mpn_fft_best_k (mp_size_t n, int sqr)
#else
mpn_fft_best_k (n, sqr)
     mp_size_t n;
     int sqr;
#endif
{
  mp_size_t t;
  int i;

  for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
    if (n < mpn_fft_table[sqr][i])
      return i + FFT_FIRST_K;

  /* treat 4*last as one further entry */
  if (i == 0 || n < 4*mpn_fft_table[sqr][i-1])
    return i + FFT_FIRST_K;
  else
    return i + FFT_FIRST_K + 1;
}
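
/* Illustrative example of the lookup above (the actual thresholds come from
   the CPU-specific FFT_MUL_TABLE/FFT_SQR_TABLE, so the numbers here are only
   an assumed sample): with FFT_FIRST_K == 4 and mpn_fft_table[0] beginning
   {528, 1184, 2880, ...}, a multiply of n == 2000 limbs first falls below the
   third entry, so mpn_fft_best_k returns 2 + FFT_FIRST_K == 6, i.e. a
   transform of length 2^6 = 64.  */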


/* Returns smallest possible number of limbs >= pl for a fft of size 2^k.
   FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */

mp_size_t
#if __STDC__
mpn_fft_next_size (mp_size_t pl, int k)
#else
mpn_fft_next_size (pl, k)
     mp_size_t pl;
     int k;
#endif
{
  mp_size_t N, M;
  int K;

  /* if (k==0) k = mpn_fft_best_k (pl, sqr); */
  N = pl*BITS_PER_MP_LIMB;
  K = 1<<k;
  if (N%K) N=(N/K+1)*K;
  M = N/K;
  if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K;
  return (N/BITS_PER_MP_LIMB);
}
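
/* Worked example (assuming BITS_PER_MP_LIMB == 32): pl = 1000, k = 4 gives
   K = 16 and N = 32000, already a multiple of K, so M = 2000; since
   2000 % 32 != 0, N is raised to 63*32*16 = 32256 and 1008 limbs are
   returned -- which is pl rounded up to the next multiple of 2^k = 16,
   consistent with the FIXME above.  */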


static void
#if __STDC__
mpn_fft_initl(int **l, int k)
#else
mpn_fft_initl(l, k)
     int **l;
     int k;
#endif
{
  int i,j,K;

  l[0][0] = 0;
  for (i=1,K=2;i<=k;i++,K*=2) {
    for (j=0;j<K/2;j++) {
      l[i][j] = 2*l[i-1][j];
      l[i][K/2+j] = 1+l[i][j];
    }
  }
}
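
/* The tables built above are the bit-reversal permutations: for instance
   mpn_fft_initl gives l[2] = {0, 2, 1, 3} and
   l[3] = {0, 4, 2, 6, 1, 5, 3, 7}, i.e. l[k][i] is i with its k low bits
   reversed.  */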


/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
static void
#if __STDC__
mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n)
#else
mpn_fft_neg_modF(ap, n)
     mp_limb_t *ap;
     mp_size_t n;
#endif
{
  mp_limb_t c;

  c = ap[n]+2;
  mpn_com_n (ap, ap, n);
  ap[n]=0; mpn_incr_u(ap, c);
}
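
/* Sketch of why the negation above works: writing a = ah*2^N + al with
   ah = ap[n] and al the low n limbs, we have
   -a = -al - ah*2^N = (2^N - 1 - al) + 2 + ah = ~al + 2 + ah  (mod 2^N+1),
   using 2^N == -1.  That is exactly "complement the low limbs, then add
   c = ap[n] + 2".  */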


/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
static void
#if __STDC__
mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
#else
mpn_fft_mul_2exp_modF(ap, e, n, tp)
     mp_limb_t *ap;
     int e;
     mp_size_t n;
     mp_limb_t *tp;
#endif
{
  int d, sh, i; mp_limb_t cc;

  d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
  sh = d % BITS_PER_MP_LIMB;
  if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
  else MPN_COPY(tp, ap, n+1);
  d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
  if (d) {
    /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
    /* mpn_xor would be more efficient here */
    for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
    cc = 1-mpn_add_1(ap, ap, d, 1);
    if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
    else MPN_COPY(ap+d, tp, n-d);
    if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
      ap[n]=mpn_add_1(ap, ap, n, cc);
    else ap[n]=0;
  }
  else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
    ap[n]=mpn_add_1(ap, ap, n, 1);
  }
  if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
}
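
/* Example of the sign handling above (assuming BITS_PER_MP_LIMB == 32 and
   n == 1, so N == 32): shifting by e = 35 gives d = 3 and e/N = 1, which is
   odd, so the result is -a*2^3 mod 2^32+1, reflecting 2^32 == -1.  */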


/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
static void
#if __STDC__
mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
#else
mpn_fft_add_modF (ap, bp, n)
     mp_limb_t *ap,*bp;
     int n;
#endif
{
  mp_limb_t c;

  c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n);
  if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1);
  ap[n]=c;
}
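
/* Note on the reduction above: for the semi-normalized residues used here
   the high limbs ap[n] and bp[n] are 0 or 1, so c is at most 3.  Whenever
   c >= 2 one multiple of 2^N+1 is removed: 1 is subtracted from the low
   limbs and c is decreased by 1 plus any borrow that propagates out.  */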


/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
          N=n*BITS_PER_MP_LIMB
          2^omega is a primitive root mod 2^N+1
   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */

static void
#if __STDC__
mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
                 mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
#else
mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
     mp_limb_t **Ap,*tp;
     mp_size_t K,omega,n,inc;
     int **ll;
#endif
{
  if (K==2) {
#ifdef ADDSUB
    if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
#else
    MPN_COPY(tp, Ap[0], n+1);
    mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
    if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
#endif
      Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
  }
  else {
    int j, inc2=2*inc;
    int *lk = *ll;
    mp_limb_t *tmp;
    TMP_DECL(marker);

    TMP_MARK(marker);
    tmp = TMP_ALLOC_LIMBS (n+1);
    mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
    mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
    /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
       A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
    for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
      MPN_COPY(tp, Ap[inc], n+1);
      mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
      mpn_fft_add_modF(Ap[inc], Ap[0], n);
      mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
      mpn_fft_add_modF(Ap[0], tp, n);
    }
    TMP_FREE(marker);
  }
}


/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
          N=n*BITS_PER_MP_LIMB
          2^omega is a primitive root mod 2^N+1
   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */

static void
#if __STDC__
mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
             mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
#else
mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
     mp_limb_t **Ap,**Bp,*tp;
     mp_size_t K,omega,n,inc;
     int **ll;
#endif
{
  if (K==2) {
#ifdef ADDSUB
    if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
#else
    MPN_COPY(tp, Ap[0], n+1);
    mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
    if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
#endif
      Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
#ifdef ADDSUB
    if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
#else
    MPN_COPY(tp, Bp[0], n+1);
    mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
    if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
#endif
      Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
  }
  else {
    int j, inc2=2*inc;
    int *lk=*ll;
    mp_limb_t *tmp;
    TMP_DECL(marker);

    TMP_MARK(marker);
    tmp = TMP_ALLOC_LIMBS (n+1);
    mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
    mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
    /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
       A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
    for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
      MPN_COPY(tp, Ap[inc], n+1);
      mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
      mpn_fft_add_modF(Ap[inc], Ap[0], n);
      mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
      mpn_fft_add_modF(Ap[0], tp, n);
      MPN_COPY(tp, Bp[inc], n+1);
      mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
      mpn_fft_add_modF(Bp[inc], Bp[0], n);
      mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
      mpn_fft_add_modF(Bp[0], tp, n);
    }
    TMP_FREE(marker);
  }
}


/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
static void
#if __STDC__
mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
#else
mpn_fft_mul_modF_K(ap, bp, n, K)
     mp_limb_t **ap, **bp;
     mp_size_t n;
     int K;
#endif
{
  int i;
  int sqr = (ap == bp);
  TMP_DECL(marker);

  TMP_MARK(marker);

  if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
    int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
    int **_fft_l;
    mp_limb_t **Ap,**Bp,*A,*B,*T;

    k = mpn_fft_best_k (n, sqr);
    K2 = 1<<k;
    maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
    M2 = n*BITS_PER_MP_LIMB/K2;
    l = n/K2;
    Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
    nprime2 = Nprime2/BITS_PER_MP_LIMB;
    Mp2 = Nprime2/K2;

    Ap = TMP_ALLOC_MP_PTRS (K2);
    Bp = TMP_ALLOC_MP_PTRS (K2);
    A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
    T = TMP_ALLOC_LIMBS (nprime2+1);
    B = A + K2*(nprime2+1);
    _fft_l = TMP_ALLOC_TYPE (k+1, int*);
    for (i=0;i<=k;i++)
      _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
    mpn_fft_initl(_fft_l, k);

    TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
                  n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));

    for (i=0;i<K;i++,ap++,bp++)
      mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
                           l, Mp2, _fft_l, T, 1);
  }
  else {
    mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
    tp = TMP_ALLOC_LIMBS (n2);
    tpn = tp+n;
    TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
    for (i=0;i<K;i++) {
      a = *ap++; b=*bp++;
      if (sqr)
        mpn_sqr_n(tp, a, n);
      else
        mpn_mul_n(tp, b, a, n);
      if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
      if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
      if (cc) {
        cc = mpn_add_1(tp, tp, n2, cc);
        ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
      }
      a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
    }
  }
  TMP_FREE(marker);
}
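
/* The mpn_mul_n branch above relies on the usual wrap-around identity: the
   2n-limb product {tp,2n} equals tl + th*2^N with tl = {tp,n} and
   th = {tpn,n}, and since 2^N == -1 mod 2^N+1 it reduces to tl - th.  The
   preceding cc corrections account for the high limbs a[n] and b[n], and the
   final mpn_sub_n/mpn_add_1 pair adds back 2^N+1 when the subtraction
   borrows.  */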


/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
   output: K*A[0] K*A[K-1] ... K*A[1] */

static void
#if __STDC__
mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
                mp_limb_t *tp)
#else
mpn_fft_fftinv(Ap,K,omega,n,tp)
     mp_limb_t **Ap, *tp;
     int K;
     mp_size_t omega, n;
#endif
{
  if (K==2) {
#ifdef ADDSUB
    if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
#else
    MPN_COPY(tp, Ap[0], n+1);
    mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
    if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
#endif
      Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
  }
  else {
    int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
    TMP_DECL(marker);

    TMP_MARK(marker);
    tmp = TMP_ALLOC_LIMBS (n+1);
    mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
    mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
    /* A[j]     <- A[j] + omega^j A[j+K/2]
       A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
    for (j=0;j<K2;j++,Ap++,Bp++) {
      MPN_COPY(tp, Bp[0], n+1);
      mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
      mpn_fft_add_modF(Bp[0], Ap[0], n);
      mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
      mpn_fft_add_modF(Ap[0], tp, n);
    }
    TMP_FREE(marker);
  }
}


/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
static void
#if __STDC__
mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
#else
mpn_fft_div_2exp_modF(ap,k,n,tp)
     mp_limb_t *ap,*tp;
     int k;
     mp_size_t n;
#endif
{
  int i;

  i = 2*n*BITS_PER_MP_LIMB;
  i = (i-k) % i;
  mpn_fft_mul_2exp_modF(ap,i,n,tp);
  /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
  /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
  if (ap[n]==1) {
    for (i=0;i<n && ap[i]==0;i++);
    if (i<n) {
      ap[n]=0;
      mpn_sub_1(ap, ap, n, 1);
    }
  }
}


/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
static void
#if __STDC__
mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
#else
mpn_fft_norm_modF(rp, ap, n, an)
     mp_limb_t *rp;
     mp_limb_t *ap;
     mp_size_t n;
     mp_size_t an;
#endif
{
  mp_size_t l;

  if (an>2*n) {
    l = n;
    rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
                      mpn_add_n(rp,ap,ap+2*n,an-2*n));
  }
  else {
    l = an-n;
    MPN_COPY(rp, ap, n);
    rp[n]=0;
  }
  if (mpn_sub_n(rp,rp,ap+n,l)) {
    if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
      rp[n]=mpn_add_1(rp,rp,n,1);
  }
}
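
/* mpn_fft_norm_modF uses the same wrap-around reduction: an operand of an
   limbs (n <= an <= 3n) is viewed as A0 + A1*2^N + A2*2^(2N) and, because
   2^N == -1 mod 2^N+1, is reduced to A0 - A1 + A2, with a final borrow fixed
   up by adding back 2^N+1.  */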


static void
#if __STDC__
mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
                     int k, int K,
                     mp_limb_t **Ap, mp_limb_t **Bp,
                     mp_limb_t *A, mp_limb_t *B,
                     mp_size_t nprime, mp_size_t l, mp_size_t Mp,
                     int **_fft_l,
                     mp_limb_t *T, int rec)
#else
mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
     mp_limb_t *op;
     mp_srcptr n, m;
     mp_limb_t **Ap,**Bp,*A,*B,*T;
     mp_size_t pl,nprime;
     int **_fft_l;
     int k,K,l,Mp,rec;
#endif
{
  int i, sqr, pla, lo, sh, j;
  mp_limb_t *p;

  sqr = (n==m);

  TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
                 pl,k,K,nprime,l,Mp,rec,sqr));

  /* decomposition of inputs into arrays Ap[i] and Bp[i] */
  if (rec) for (i=0;i<K;i++) {
    Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
    /* store the next M bits of n into A[i] */
    /* supposes that M is a multiple of BITS_PER_MP_LIMB */
    MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
    /* set most significant bits of n and m (important in recursive calls) */
    if (i==K-1) Ap[i][l]=n[0];
    mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
    if (!sqr) {
      MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
      if (i==K-1) Bp[i][l]=m[0];
      mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
    }
  }

  /* direct fft's */
  if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
  else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);

  /* term to term multiplications */
  mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);

  /* inverse fft's */
  mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);

  /* division of terms after inverse fft */
  for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);

  /* addition of terms in result p */
  MPN_ZERO(T,nprime+1);
  pla = l*(K-1)+nprime+1; /* number of required limbs for p */
  p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
  MPN_ZERO(p, pla);
  sqr=0; /* will accumulate the (signed) carry at p[pla] */
  for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
    mp_ptr n = p+sh;
    j = (K-i)%K;
    if (mpn_add_n(n,n,Ap[j],nprime+1))
      sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
    T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
    if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
      sqr -= mpn_sub_1(n,n,pla-sh,1);
      sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
    }
  }
  if (sqr==-1) {
    if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
      /* p[pla-pl]...p[pla-1] are all zero */
      mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
      mpn_sub_1(p+pla-1,p+pla-1,1,1);
    }
  }
  else if (sqr==1) {
    if (pla>=2*pl)
      while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
    else {
      sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
      ASSERT (sqr == 0);
    }
  }
  else
    ASSERT (sqr == 0);

  /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
       < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
       < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
  mpn_fft_norm_modF(op,p,pl,pla);
}


/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
   n and m have respectively nl and ml limbs
   op must have space for pl+1 limbs
   One must have pl = mpn_fft_next_size(pl, k).
*/

void
#if __STDC__
mpn_mul_fft (mp_ptr op, mp_size_t pl,
             mp_srcptr n, mp_size_t nl,
             mp_srcptr m, mp_size_t ml,
             int k)
#else
mpn_mul_fft (op, pl, n, nl, m, ml, k)
     mp_ptr op;
     mp_size_t pl;
     mp_srcptr n;
     mp_size_t nl;
     mp_srcptr m;
     mp_size_t ml;
     int k;
#endif
{
  int K,maxLK,i,j;
  mp_size_t N,Nprime,nprime,M,Mp,l;
  mp_limb_t **Ap,**Bp,*A,*T,*B;
  int **_fft_l;
  int sqr = (n==m && nl==ml);
  TMP_DECL(marker);

  TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
                 pl, nl, ml, k));
  ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);

  TMP_MARK(marker);
  N = pl*BITS_PER_MP_LIMB;
  _fft_l = TMP_ALLOC_TYPE (k+1, int*);
  for (i=0;i<=k;i++)
    _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
  mpn_fft_initl(_fft_l, k);
  K = 1<<k;
  M = N/K; /* N = 2^k M */
  l = M/BITS_PER_MP_LIMB;
  maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;

  Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
  nprime = Nprime/BITS_PER_MP_LIMB;
  TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
                 N, K, M, l, maxLK, Nprime, nprime));
  if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
    maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
    if (Nprime % maxLK) {
      Nprime=((Nprime/maxLK)+1)*maxLK;
      nprime = Nprime/BITS_PER_MP_LIMB;
    }
    TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
  }

  T = TMP_ALLOC_LIMBS (nprime+1);
  Mp = Nprime/K;

  TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
                pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
         printf(" temp space %ld\n", 2*K*(nprime+1)));

  A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
  B = A+K*(nprime+1);
  Ap = TMP_ALLOC_MP_PTRS (K);
  Bp = TMP_ALLOC_MP_PTRS (K);
  /* special decomposition for main call */
  for (i=0;i<K;i++) {
    Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
    /* store the next M bits of n into A[i] */
    /* supposes that M is a multiple of BITS_PER_MP_LIMB */
    if (nl>0) {
      j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
      MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
      mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
    }
    else MPN_ZERO(Ap[i], nprime+1);
    nl -= l;
    if (n!=m) {
      if (ml>0) {
        j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
        MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
        mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
      }
      else MPN_ZERO(Bp[i], nprime+1);
    }
    ml -= l;
  }
  mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
  TMP_FREE(marker);
  _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
}


#if WANT_ASSERT
static int
#if __STDC__
mpn_zero_p (mp_ptr p, mp_size_t n)
#else
mpn_zero_p (p, n)
     mp_ptr p;
     mp_size_t n;
#endif
{
  mp_size_t i;

  for (i = 0; i < n; i++)
    {
      if (p[i] != 0)
        return 0;
    }

  return 1;
}
#endif


/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.

   FIXME: Duplicating the result like this is wasteful, do something better
   perhaps at the norm_modF stage above. */

void
#if __STDC__
mpn_mul_fft_full (mp_ptr op,
                  mp_srcptr n, mp_size_t nl,
                  mp_srcptr m, mp_size_t ml)
#else
mpn_mul_fft_full (op, n, nl, m, ml)
     mp_ptr op;
     mp_srcptr n;
     mp_size_t nl;
     mp_srcptr m;
     mp_size_t ml;
#endif
{
  mp_ptr pad_op;
  mp_size_t pl;
  int k;
  int sqr = (n==m && nl==ml);

  k = mpn_fft_best_k (nl+ml, sqr);
  pl = mpn_fft_next_size (nl+ml, k);

  TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
                 nl, ml, pl, k));

  pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1);
  mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);

  ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml)));
  MPN_COPY (op, pad_op, nl+ml);

  _MP_FREE_FUNC_LIMBS (pad_op, pl+1);
}
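

/* Illustrative usage sketch, not compiled into the library (these entry
   points are internal and their interfaces may change, as the notice at the
   top of the file says).  It shows how a caller holding two mpn operands
   could form their full product via mpn_mul_fft_full, or a product mod
   2^N+1 via mpn_mul_fft after adjusting pl with mpn_fft_next_size; the
   function and parameter names here are hypothetical.  */
#if 0
static void
example_fft_products (mp_ptr full, mp_ptr modf, mp_size_t pl,
                      mp_srcptr up, mp_size_t un,
                      mp_srcptr vp, mp_size_t vn)
{
  int k;

  /* {full, un+vn} <- {up,un} * {vp,vn} */
  mpn_mul_fft_full (full, up, un, vp, vn);

  /* {modf, pl+1} <- {up,un} * {vp,vn} mod 2^(pl*BITS_PER_MP_LIMB)+1,
     after rounding pl up as mpn_mul_fft requires.  */
  k = mpn_fft_best_k (pl, 0);
  pl = mpn_fft_next_size (pl, k);
  mpn_mul_fft (modf, pl, up, un, vp, vn, k);
}
#endif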