[BACK]Return to mul_fft.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / generic

Annotation of OpenXM_contrib/gmp/mpn/generic/mul_fft.c, Revision 1.1

1.1     ! maekawa     1: /* An implementation in GMP of Scho"nhage's fast multiplication algorithm
        !             2:    modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
        !             3:
        !             4:    THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
        !             5:    MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
        !             6:    INTERFACES.  IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
        !             7:    A FUTURE GNU MP RELEASE.
        !             8:
        !             9: Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
        !            10:
        !            11: This file is part of the GNU MP Library.
        !            12:
        !            13: The GNU MP Library is free software; you can redistribute it and/or modify
        !            14: it under the terms of the GNU Lesser General Public License as published by
        !            15: the Free Software Foundation; either version 2.1 of the License, or (at your
        !            16: option) any later version.
        !            17:
        !            18: The GNU MP Library is distributed in the hope that it will be useful, but
        !            19: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
        !            20: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
        !            21: License for more details.
        !            22:
        !            23: You should have received a copy of the GNU Lesser General Public License
        !            24: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
        !            25: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
        !            26: MA 02111-1307, USA. */
        !            27:
        !            28:
        !            29: /* References:
        !            30:
        !            31:    Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
        !            32:    Strassen, Computing 7, p. 281-292, 1971.
        !            33:
        !            34:    Asymptotically fast algorithms for the numerical multiplication
        !            35:    and division of polynomials with complex coefficients, by Arnold Scho"nhage,
        !            36:    Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
        !            37:
        !            38:    Tapes versus Pointers, a study in implementing fast algorithms,
        !            39:    by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
        !            40:
        !            41:    See also http://www.loria.fr/~zimmerma/bignum
        !            42:
        !            43:
        !            44:    Future:
        !            45:
        !            46:    K==2 isn't needed in the current uses of this code and the bits specific
        !            47:    for that could be dropped.
        !            48:
        !            49:    It might be possible to avoid a small number of MPN_COPYs by using a
        !            50:    rotating temporary or two.
        !            51:
        !            52:    Multiplications of unequal sized operands can be done with this code, but
        !            53:    it needs a tighter test for identifying squaring (same sizes as well as
        !            54:    same pointers).  */
        !            55:
        !            56:
        !            57: #include <stdio.h>
        !            58: #include "gmp.h"
        !            59: #include "gmp-impl.h"
        !            60:
        !            61:
        !            62: /* Change this to "#define TRACE(x) x" for some traces. */
        !            63: #define TRACE(x)
        !            64:
        !            65:
        !            66:
        !            67: FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
        !            68:   FFT_MUL_TABLE,
        !            69:   FFT_SQR_TABLE
        !            70: };
        !            71:
        !            72:
        !            73: static void mpn_mul_fft_internal
        !            74: _PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
        !            75:          int k, int K,
        !            76:          mp_limb_t **Ap, mp_limb_t **Bp,
        !            77:          mp_limb_t *A, mp_limb_t *B,
        !            78:          mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
        !            79:          mp_limb_t *T, int rec));
        !            80:
        !            81:
        !            82: /* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
        !            83:    sqr==0 if for a multiply, sqr==1 for a square */
        !            84: int
        !            85: #if __STDC__
        !            86: mpn_fft_best_k (mp_size_t n, int sqr)
        !            87: #else
        !            88: mpn_fft_best_k (n, sqr)
        !            89:      mp_size_t n;
        !            90:      int       sqr;
        !            91: #endif
        !            92: {
        !            93:   mp_size_t  t;
        !            94:   int        i;
        !            95:
        !            96:   for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
        !            97:     if (n < mpn_fft_table[sqr][i])
        !            98:       return i + FFT_FIRST_K;
        !            99:
        !           100:   /* treat 4*last as one further entry */
        !           101:   if (i == 0 || n < 4*mpn_fft_table[sqr][i-1])
        !           102:     return i + FFT_FIRST_K;
        !           103:   else
        !           104:     return i + FFT_FIRST_K + 1;
        !           105: }
        !           106:
        !           107:
        !           108: /* Returns smallest possible number of limbs >= pl for a fft of size 2^k.
        !           109:    FIXME: Is this simply pl rounded up to the next multiple of 2^k ?  */
        !           110:
        !           111: mp_size_t
        !           112: #if __STDC__
        !           113: mpn_fft_next_size (mp_size_t pl, int k)
        !           114: #else
        !           115: mpn_fft_next_size (pl, k)
        !           116:      mp_size_t pl;
        !           117:      int       k;
        !           118: #endif
        !           119: {
        !           120:   mp_size_t N, M;
        !           121:   int       K;
        !           122:
        !           123:   /*  if (k==0) k = mpn_fft_best_k (pl, sqr); */
        !           124:   N = pl*BITS_PER_MP_LIMB;
        !           125:   K = 1<<k;
        !           126:   if (N%K) N=(N/K+1)*K;
        !           127:   M = N/K;
        !           128:   if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K;
        !           129:   return (N/BITS_PER_MP_LIMB);
        !           130: }
        !           131:
        !           132:
        !           133: static void
        !           134: #if __STDC__
        !           135: mpn_fft_initl(int **l, int k)
        !           136: #else
        !           137: mpn_fft_initl(l, k)
        !           138:      int  **l;
        !           139:      int  k;
        !           140: #endif
        !           141: {
        !           142:     int i,j,K;
        !           143:
        !           144:     l[0][0] = 0;
        !           145:     for (i=1,K=2;i<=k;i++,K*=2) {
        !           146:        for (j=0;j<K/2;j++) {
        !           147:            l[i][j] = 2*l[i-1][j];
        !           148:            l[i][K/2+j] = 1+l[i][j];
        !           149:        }
        !           150:     }
        !           151: }
        !           152:
        !           153:
        !           154: /* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
        !           155: static void
        !           156: #if __STDC__
        !           157: mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n)
        !           158: #else
        !           159: mpn_fft_neg_modF(ap, n)
        !           160:      mp_limb_t *ap;
        !           161:      mp_size_t n;
        !           162: #endif
        !           163: {
        !           164:   mp_limb_t c;
        !           165:
        !           166:   c = ap[n]+2;
        !           167:   mpn_com_n (ap, ap, n);
        !           168:   ap[n]=0; mpn_incr_u(ap, c);
        !           169: }
        !           170:
        !           171:
        !           172: /* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
        !           173: static void
        !           174: #if __STDC__
        !           175: mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
        !           176: #else
        !           177: mpn_fft_mul_2exp_modF(ap, e, n, tp)
        !           178:      mp_limb_t *ap;
        !           179:      int e;
        !           180:      mp_size_t n;
        !           181:      mp_limb_t *tp;
        !           182: #endif
        !           183: {
        !           184:   int d, sh, i; mp_limb_t cc;
        !           185:
        !           186:   d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
        !           187:   sh = d % BITS_PER_MP_LIMB;
        !           188:   if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
        !           189:   else MPN_COPY(tp, ap, n+1);
        !           190:   d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
        !           191:  if (d) {
        !           192:    /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
        !           193:    /* mpn_xor would be more efficient here */
        !           194:    for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
        !           195:    cc = 1-mpn_add_1(ap, ap, d, 1);
        !           196:    if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
        !           197:    else MPN_COPY(ap+d, tp, n-d);
        !           198:    if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
        !           199:      ap[n]=mpn_add_1(ap, ap, n, cc);
        !           200:    else ap[n]=0;
        !           201:   }
        !           202:   else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
        !           203:     ap[n]=mpn_add_1(ap, ap, n, 1);
        !           204:   }
        !           205:   if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
        !           206: }
        !           207:
        !           208:
        !           209: /* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
        !           210: static void
        !           211: #if __STDC__
        !           212: mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
        !           213: #else
        !           214: mpn_fft_add_modF (ap, bp, n)
        !           215:      mp_limb_t *ap,*bp;
        !           216:      int n;
        !           217: #endif
        !           218: {
        !           219:   mp_limb_t c;
        !           220:
        !           221:   c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n);
        !           222:   if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1);
        !           223:   ap[n]=c;
        !           224: }
        !           225:
        !           226:
        !           227: /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
        !           228:           N=n*BITS_PER_MP_LIMB
        !           229:           2^omega is a primitive root mod 2^N+1
        !           230:    output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
        !           231:
        !           232: static void
        !           233: #if __STDC__
        !           234: mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
        !           235:                  mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
        !           236: #else
        !           237: mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
        !           238: mp_limb_t **Ap,*tp;
        !           239: mp_size_t K,omega,n,inc;
        !           240: int       **ll;
        !           241: #endif
        !           242: {
        !           243:   if (K==2) {
        !           244: #ifdef ADDSUB
        !           245:       if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
        !           246: #else
        !           247:       MPN_COPY(tp, Ap[0], n+1);
        !           248:       mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
        !           249:       if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
        !           250: #endif
        !           251:        Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
        !           252:     }
        !           253:     else {
        !           254:       int       j, inc2=2*inc;
        !           255:       int       *lk = *ll;
        !           256:       mp_limb_t *tmp;
        !           257:       TMP_DECL(marker);
        !           258:
        !           259:       TMP_MARK(marker);
        !           260:       tmp = TMP_ALLOC_LIMBS (n+1);
        !           261:        mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
        !           262:        mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
        !           263:        /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
        !           264:           A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
        !           265:        for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
        !           266:          MPN_COPY(tp, Ap[inc], n+1);
        !           267:          mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
        !           268:          mpn_fft_add_modF(Ap[inc], Ap[0], n);
        !           269:          mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
        !           270:          mpn_fft_add_modF(Ap[0], tp, n);
        !           271:        }
        !           272:         TMP_FREE(marker);
        !           273:     }
        !           274: }
        !           275:
        !           276:
        !           277: /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
        !           278:           N=n*BITS_PER_MP_LIMB
        !           279:          2^omega is a primitive root mod 2^N+1
        !           280:    output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
        !           281:
        !           282: static void
        !           283: #if __STDC__
        !           284: mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
        !           285:              mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
        !           286: #else
        !           287: mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
        !           288:      mp_limb_t **Ap,**Bp,*tp;
        !           289:      mp_size_t K,omega,n,inc;
        !           290:      int       **ll;
        !           291: #endif
        !           292: {
        !           293:   if (K==2) {
        !           294: #ifdef ADDSUB
        !           295:       if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
        !           296: #else
        !           297:       MPN_COPY(tp, Ap[0], n+1);
        !           298:       mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
        !           299:       if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
        !           300: #endif
        !           301:        Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
        !           302: #ifdef ADDSUB
        !           303:       if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
        !           304: #else
        !           305:       MPN_COPY(tp, Bp[0], n+1);
        !           306:       mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
        !           307:       if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
        !           308: #endif
        !           309:        Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
        !           310:     }
        !           311:     else {
        !           312:        int       j, inc2=2*inc;
        !           313:         int       *lk=*ll;
        !           314:         mp_limb_t *tmp;
        !           315:        TMP_DECL(marker);
        !           316:
        !           317:        TMP_MARK(marker);
        !           318:        tmp = TMP_ALLOC_LIMBS (n+1);
        !           319:        mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
        !           320:        mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
        !           321:        /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
        !           322:           A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
        !           323:        for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
        !           324:          MPN_COPY(tp, Ap[inc], n+1);
        !           325:          mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
        !           326:          mpn_fft_add_modF(Ap[inc], Ap[0], n);
        !           327:          mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
        !           328:          mpn_fft_add_modF(Ap[0], tp, n);
        !           329:          MPN_COPY(tp, Bp[inc], n+1);
        !           330:          mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
        !           331:          mpn_fft_add_modF(Bp[inc], Bp[0], n);
        !           332:          mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
        !           333:          mpn_fft_add_modF(Bp[0], tp, n);
        !           334:        }
        !           335:        TMP_FREE(marker);
        !           336:     }
        !           337: }
        !           338:
        !           339:
        !           340: /* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
        !           341: static void
        !           342: #if __STDC__
        !           343: mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
        !           344: #else
        !           345: mpn_fft_mul_modF_K(ap, bp, n, K)
        !           346:      mp_limb_t **ap, **bp;
        !           347:      mp_size_t n;
        !           348:      int       K;
        !           349: #endif
        !           350: {
        !           351:   int  i;
        !           352:   int  sqr = (ap == bp);
        !           353:   TMP_DECL(marker);
        !           354:
        !           355:   TMP_MARK(marker);
        !           356:
        !           357:   if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
        !           358:     int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
        !           359:     int       **_fft_l;
        !           360:     mp_limb_t **Ap,**Bp,*A,*B,*T;
        !           361:
        !           362:     k = mpn_fft_best_k (n, sqr);
        !           363:     K2 = 1<<k;
        !           364:     maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
        !           365:     M2 = n*BITS_PER_MP_LIMB/K2;
        !           366:     l = n/K2;
        !           367:     Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
        !           368:     nprime2 = Nprime2/BITS_PER_MP_LIMB;
        !           369:     Mp2 = Nprime2/K2;
        !           370:
        !           371:     Ap = TMP_ALLOC_MP_PTRS (K2);
        !           372:     Bp = TMP_ALLOC_MP_PTRS (K2);
        !           373:     A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
        !           374:     T = TMP_ALLOC_LIMBS (nprime2+1);
        !           375:     B = A + K2*(nprime2+1);
        !           376:     _fft_l = TMP_ALLOC_TYPE (k+1, int*);
        !           377:     for (i=0;i<=k;i++)
        !           378:       _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
        !           379:     mpn_fft_initl(_fft_l, k);
        !           380:
        !           381:     TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
        !           382:                   n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
        !           383:
        !           384:     for (i=0;i<K;i++,ap++,bp++)
        !           385:       mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
        !           386:         l, Mp2, _fft_l, T, 1);
        !           387:   }
        !           388:   else {
        !           389:      mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
        !           390:      tp = TMP_ALLOC_LIMBS (n2);
        !           391:      tpn = tp+n;
        !           392:      TRACE (printf ("  mpn_mul_n %d of %d limbs\n", K, n));
        !           393:      for (i=0;i<K;i++) {
        !           394:         a = *ap++; b=*bp++;
        !           395:         if (sqr)
        !           396:           mpn_sqr_n(tp, a, n);
        !           397:         else
        !           398:           mpn_mul_n(tp, b, a, n);
        !           399:        if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
        !           400:        if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
        !           401:        if (cc) {
        !           402:           cc = mpn_add_1(tp, tp, n2, cc);
        !           403:           ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
        !           404:         }
        !           405:        a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
        !           406:      }
        !           407:   }
        !           408:   TMP_FREE(marker);
        !           409: }
        !           410:
        !           411:
        !           412: /* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
        !           413:    output: K*A[0] K*A[K-1] ... K*A[1] */
        !           414:
        !           415: static void
        !           416: #if __STDC__
        !           417: mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
        !           418:                 mp_limb_t *tp)
        !           419: #else
        !           420: mpn_fft_fftinv(Ap,K,omega,n,tp)
        !           421:      mp_limb_t **Ap, *tp;
        !           422:      int       K;
        !           423:      mp_size_t omega, n;
        !           424: #endif
        !           425: {
        !           426:     if (K==2) {
        !           427: #ifdef ADDSUB
        !           428:       if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
        !           429: #else
        !           430:       MPN_COPY(tp, Ap[0], n+1);
        !           431:       mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
        !           432:       if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
        !           433: #endif
        !           434:         Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
        !           435:     }
        !           436:     else {
        !           437:        int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
        !           438:        TMP_DECL(marker);
        !           439:
        !           440:        TMP_MARK(marker);
        !           441:        tmp = TMP_ALLOC_LIMBS (n+1);
        !           442:        mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
        !           443:        mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
        !           444:        /* A[j]     <- A[j] + omega^j A[j+K/2]
        !           445:           A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
        !           446:         for (j=0;j<K2;j++,Ap++,Bp++) {
        !           447:          MPN_COPY(tp, Bp[0], n+1);
        !           448:          mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
        !           449:          mpn_fft_add_modF(Bp[0], Ap[0], n);
        !           450:          mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
        !           451:          mpn_fft_add_modF(Ap[0], tp, n);
        !           452:        }
        !           453:        TMP_FREE(marker);
        !           454:     }
        !           455: }
        !           456:
        !           457:
        !           458: /* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
        !           459: static void
        !           460: #if __STDC__
        !           461: mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
        !           462: #else
        !           463: mpn_fft_div_2exp_modF(ap,k,n,tp)
        !           464:      mp_limb_t *ap,*tp;
        !           465:      int       k;
        !           466:      mp_size_t n;
        !           467: #endif
        !           468: {
        !           469:     int i;
        !           470:
        !           471:     i = 2*n*BITS_PER_MP_LIMB;
        !           472:     i = (i-k) % i;
        !           473:     mpn_fft_mul_2exp_modF(ap,i,n,tp);
        !           474:     /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
        !           475:     /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
        !           476:     if (ap[n]==1) {
        !           477:       for (i=0;i<n && ap[i]==0;i++);
        !           478:       if (i<n) {
        !           479:        ap[n]=0;
        !           480:        mpn_sub_1(ap, ap, n, 1);
        !           481:       }
        !           482:     }
        !           483: }
        !           484:
        !           485:
        !           486: /* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
        !           487: static void
        !           488: #if __STDC__
        !           489: mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
        !           490: #else
        !           491: mpn_fft_norm_modF(rp, ap, n, an)
        !           492:      mp_limb_t *rp;
        !           493:      mp_limb_t *ap;
        !           494:      mp_size_t n;
        !           495:      mp_size_t an;
        !           496: #endif
        !           497: {
        !           498:   mp_size_t l;
        !           499:
        !           500:    if (an>2*n) {
        !           501:      l = n;
        !           502:      rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
        !           503:                       mpn_add_n(rp,ap,ap+2*n,an-2*n));
        !           504:    }
        !           505:    else {
        !           506:      l = an-n;
        !           507:      MPN_COPY(rp, ap, n);
        !           508:      rp[n]=0;
        !           509:    }
        !           510:    if (mpn_sub_n(rp,rp,ap+n,l)) {
        !           511:      if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
        !           512:        rp[n]=mpn_add_1(rp,rp,n,1);
        !           513:    }
        !           514: }
        !           515:
        !           516:
        !           517: static void
        !           518: #if __STDC__
        !           519: mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
        !           520:                      int k, int K,
        !           521:                      mp_limb_t **Ap, mp_limb_t **Bp,
        !           522:                      mp_limb_t *A, mp_limb_t *B,
        !           523:                      mp_size_t nprime, mp_size_t l, mp_size_t Mp,
        !           524:                      int **_fft_l,
        !           525:                      mp_limb_t *T, int rec)
        !           526: #else
        !           527: mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
        !           528:      mp_limb_t *op;
        !           529:      mp_srcptr n, m;
        !           530:      mp_limb_t **Ap,**Bp,*A,*B,*T;
        !           531:      mp_size_t pl,nprime;
        !           532:      int       **_fft_l;
        !           533:      int       k,K,l,Mp,rec;
        !           534: #endif
        !           535: {
        !           536:   int       i, sqr, pla, lo, sh, j;
        !           537:   mp_limb_t *p;
        !           538:
        !           539:     sqr = (n==m);
        !           540:
        !           541:     TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
        !           542:                    pl,k,K,nprime,l,Mp,rec,sqr));
        !           543:
        !           544:     /* decomposition of inputs into arrays Ap[i] and Bp[i] */
        !           545:     if (rec) for (i=0;i<K;i++) {
        !           546:       Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
        !           547:       /* store the next M bits of n into A[i] */
        !           548:       /* supposes that M is a multiple of BITS_PER_MP_LIMB */
        !           549:       MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
        !           550:       /* set most significant bits of n and m (important in recursive calls) */
        !           551:       if (i==K-1) Ap[i][l]=n[0];
        !           552:       mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
        !           553:       if (!sqr) {
        !           554:        MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
        !           555:        if (i==K-1) Bp[i][l]=m[0];
        !           556:        mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
        !           557:       }
        !           558:     }
        !           559:
        !           560:     /* direct fft's */
        !           561:     if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
        !           562:     else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);
        !           563:
        !           564:     /* term to term multiplications */
        !           565:     mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);
        !           566:
        !           567:     /* inverse fft's */
        !           568:     mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);
        !           569:
        !           570:     /* division of terms after inverse fft */
        !           571:     for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);
        !           572:
        !           573:     /* addition of terms in result p */
        !           574:     MPN_ZERO(T,nprime+1);
        !           575:     pla = l*(K-1)+nprime+1; /* number of required limbs for p */
        !           576:     p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
        !           577:     MPN_ZERO(p, pla);
        !           578:     sqr=0; /* will accumulate the (signed) carry at p[pla] */
        !           579:     for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
        !           580:         mp_ptr n = p+sh;
        !           581:        j = (K-i)%K;
        !           582:        if (mpn_add_n(n,n,Ap[j],nprime+1))
        !           583:          sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
        !           584:        T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
        !           585:        if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
        !           586:          sqr -= mpn_sub_1(n,n,pla-sh,1);
        !           587:          sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
        !           588:        }
        !           589:     }
        !           590:     if (sqr==-1) {
        !           591:       if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
        !           592:        /* p[pla-pl]...p[pla-1] are all zero */
        !           593:         mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
        !           594:        mpn_sub_1(p+pla-1,p+pla-1,1,1);
        !           595:       }
        !           596:     }
        !           597:     else if (sqr==1) {
        !           598:            if (pla>=2*pl)
        !           599:              while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
        !           600:            else {
        !           601:              sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
        !           602:               ASSERT (sqr == 0);
        !           603:            }
        !           604:     }
        !           605:     else
        !           606:       ASSERT (sqr == 0);
        !           607:
        !           608:     /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
        !           609:               < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
        !           610:              < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
        !           611:     mpn_fft_norm_modF(op,p,pl,pla);
        !           612: }
        !           613:
        !           614:
        !           615: /* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
        !           616:    n and m have respectively nl and ml limbs
        !           617:    op must have space for pl+1 limbs
        !           618:    One must have pl = mpn_fft_next_size(pl, k).
        !           619: */
        !           620:
        !           621: void
        !           622: #if __STDC__
        !           623: mpn_mul_fft (mp_ptr op, mp_size_t pl,
        !           624:              mp_srcptr n, mp_size_t nl,
        !           625:              mp_srcptr m, mp_size_t ml,
        !           626:              int k)
        !           627: #else
        !           628: mpn_mul_fft (op, pl, n, nl, m, ml, k)
        !           629:      mp_ptr    op;
        !           630:      mp_size_t pl;
        !           631:      mp_srcptr n;
        !           632:      mp_size_t nl;
        !           633:      mp_srcptr m;
        !           634:      mp_size_t ml;
        !           635:      int k;
        !           636: #endif
        !           637: {
        !           638:     int        K,maxLK,i,j;
        !           639:     mp_size_t  N,Nprime,nprime,M,Mp,l;
        !           640:     mp_limb_t  **Ap,**Bp,*A,*T,*B;
        !           641:     int        **_fft_l;
        !           642:     int        sqr = (n==m && nl==ml);
        !           643:     TMP_DECL(marker);
        !           644:
        !           645:     TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
        !           646:                    pl, nl, ml, k));
        !           647:     ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);
        !           648:
        !           649:     TMP_MARK(marker);
        !           650:     N = pl*BITS_PER_MP_LIMB;
        !           651:     _fft_l = TMP_ALLOC_TYPE (k+1, int*);
        !           652:     for (i=0;i<=k;i++)
        !           653:       _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
        !           654:     mpn_fft_initl(_fft_l, k);
        !           655:     K = 1<<k;
        !           656:     M = N/K;   /* N = 2^k M */
        !           657:     l = M/BITS_PER_MP_LIMB;
        !           658:     maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
        !           659:
        !           660:     Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
        !           661:     nprime = Nprime/BITS_PER_MP_LIMB;
        !           662:     TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
        !           663:                    N, K, M, l, maxLK, Nprime, nprime));
        !           664:     if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
        !           665:       maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
        !           666:       if (Nprime % maxLK) {
        !           667:        Nprime=((Nprime/maxLK)+1)*maxLK;
        !           668:        nprime = Nprime/BITS_PER_MP_LIMB;
        !           669:       }
        !           670:       TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
        !           671:     }
        !           672:
        !           673:     T = TMP_ALLOC_LIMBS (nprime+1);
        !           674:     Mp = Nprime/K;
        !           675:
        !           676:     TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
        !           677:                   pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
        !           678:            printf("   temp space %ld\n", 2*K*(nprime+1)));
        !           679:
        !           680:     A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
        !           681:     B = A+K*(nprime+1);
        !           682:     Ap = TMP_ALLOC_MP_PTRS (K);
        !           683:     Bp = TMP_ALLOC_MP_PTRS (K);
        !           684:     /* special decomposition for main call */
        !           685:     for (i=0;i<K;i++) {
        !           686:       Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
        !           687:       /* store the next M bits of n into A[i] */
        !           688:       /* supposes that M is a multiple of BITS_PER_MP_LIMB */
        !           689:       if (nl>0) {
        !           690:        j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
        !           691:        MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
        !           692:        mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
        !           693:       }
        !           694:       else MPN_ZERO(Ap[i], nprime+1);
        !           695:       nl -= l;
        !           696:       if (n!=m) {
        !           697:        if (ml>0) {
        !           698:          j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
        !           699:          MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
        !           700:          mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
        !           701:        }
        !           702:        else MPN_ZERO(Bp[i], nprime+1);
        !           703:       }
        !           704:       ml -= l;
        !           705:     }
        !           706:     mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
        !           707:     TMP_FREE(marker);
        !           708:     _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
        !           709: }
        !           710:
        !           711:
        !           712: #if WANT_ASSERT
        !           713: static int
        !           714: #if __STDC__
        !           715: mpn_zero_p (mp_ptr p, mp_size_t n)
        !           716: #else
        !           717:      mpn_zero_p (p, n)
        !           718:      mp_ptr p;
        !           719:      mp_size_t n;
        !           720: #endif
        !           721: {
        !           722:   mp_size_t i;
        !           723:
        !           724:   for (i = 0; i < n; i++)
        !           725:     {
        !           726:       if (p[i] != 0)
        !           727:         return 0;
        !           728:     }
        !           729:
        !           730:   return 1;
        !           731: }
        !           732: #endif
        !           733:
        !           734:
        !           735: /* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
        !           736:
        !           737:    FIXME: Duplicating the result like this is wasteful, do something better
        !           738:    perhaps at the norm_modF stage above. */
        !           739:
        !           740: void
        !           741: #if __STDC__
        !           742: mpn_mul_fft_full (mp_ptr op,
        !           743:                   mp_srcptr n, mp_size_t nl,
        !           744:                   mp_srcptr m, mp_size_t ml)
        !           745: #else
        !           746: mpn_mul_fft_full (op, n, nl, m, ml)
        !           747:      mp_ptr    op;
        !           748:      mp_srcptr n;
        !           749:      mp_size_t nl;
        !           750:      mp_srcptr m;
        !           751:      mp_size_t ml;
        !           752: #endif
        !           753: {
        !           754:   mp_ptr     pad_op;
        !           755:   mp_size_t  pl;
        !           756:   int        k;
        !           757:   int        sqr = (n==m && nl==ml);
        !           758:
        !           759:   k = mpn_fft_best_k (nl+ml, sqr);
        !           760:   pl = mpn_fft_next_size (nl+ml, k);
        !           761:
        !           762:   TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
        !           763:                  nl, ml, pl, k));
        !           764:
        !           765:   pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1);
        !           766:   mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
        !           767:
        !           768:   ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml)));
        !           769:   MPN_COPY (op, pad_op, nl+ml);
        !           770:
        !           771:   _MP_FREE_FUNC_LIMBS (pad_op, pl+1);
        !           772: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>