[BACK]Return to mul_n.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / generic

Annotation of OpenXM_contrib/gmp/mpn/generic/mul_n.c, Revision 1.1.1.2

1.1.1.2 ! maekawa     1: /* mpn_mul_n and helper function -- Multiply/square natural numbers.
1.1       maekawa     2:
1.1.1.2 ! maekawa     3:    THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
        !             4:    ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH
        !             5:    THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
        !             6:    THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
        !             7:
        !             8:
        !             9: Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
        !            10: Foundation, Inc.
1.1       maekawa    11:
                     12: This file is part of the GNU MP Library.
                     13:
                     14: The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2 ! maekawa    15: it under the terms of the GNU Lesser General Public License as published by
        !            16: the Free Software Foundation; either version 2.1 of the License, or (at your
1.1       maekawa    17: option) any later version.
                     18:
                     19: The GNU MP Library is distributed in the hope that it will be useful, but
                     20: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2 ! maekawa    21: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
1.1       maekawa    22: License for more details.
                     23:
1.1.1.2 ! maekawa    24: You should have received a copy of the GNU Lesser General Public License
1.1       maekawa    25: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     26: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     27: MA 02111-1307, USA. */
                     28:
                     29: #include "gmp.h"
                     30: #include "gmp-impl.h"
1.1.1.2 ! maekawa    31: #include "longlong.h"
1.1       maekawa    32:
                     33:
1.1.1.2 ! maekawa    34: /* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
        !            35:    0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
        !            36: #define INVERSE_3      ((MP_LIMB_T_MAX / 3) * 2 + 1)
1.1       maekawa    37:
1.1.1.2 ! maekawa    38: #if !defined (__alpha) && !defined (__mips)
        !            39: /* For all other machines, we want to call mpn functions for the compund
        !            40:    operations instead of open-coding them.  */
        !            41: #define USE_MORE_MPN
1.1       maekawa    42: #endif
                     43:
1.1.1.2 ! maekawa    44: /*== Function declarations =================================================*/
1.1       maekawa    45:
1.1.1.2 ! maekawa    46: static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
        !            47:                                mp_ptr, mp_ptr, mp_ptr,
        !            48:                                mp_srcptr, mp_srcptr, mp_srcptr,
        !            49:                                mp_size_t, mp_size_t));
        !            50: static void interpolate3 _PROTO ((mp_srcptr,
        !            51:                                   mp_ptr, mp_ptr, mp_ptr,
        !            52:                                   mp_srcptr,
        !            53:                                   mp_ptr, mp_ptr, mp_ptr,
        !            54:                                   mp_size_t, mp_size_t));
        !            55: static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
        !            56:
        !            57:
        !            58: /*-- mpn_kara_mul_n ---------------------------------------------------------------*/
        !            59:
        !            60: /* Multiplies using 3 half-sized mults and so on recursively.
        !            61:  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
        !            62:  * No overlap of p[...] with a[...] or b[...].
        !            63:  * ws is workspace.
        !            64:  */
1.1       maekawa    65:
                     66: void
                     67: #if __STDC__
1.1.1.2 ! maekawa    68: mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
1.1       maekawa    69: #else
1.1.1.2 ! maekawa    70: mpn_kara_mul_n(p, a, b, n, ws)
        !            71:      mp_ptr    p;
        !            72:      mp_srcptr a;
        !            73:      mp_srcptr b;
        !            74:      mp_size_t n;
        !            75:      mp_ptr    ws;
1.1       maekawa    76: #endif
                     77: {
1.1.1.2 ! maekawa    78:   mp_limb_t i, sign, w, w0, w1;
        !            79:   mp_size_t n2;
        !            80:   mp_srcptr x, y;
1.1       maekawa    81:
1.1.1.2 ! maekawa    82:   n2 = n >> 1;
        !            83:   ASSERT (n2 > 0);
        !            84:
        !            85:   if (n & 1)
1.1       maekawa    86:     {
1.1.1.2 ! maekawa    87:       /* Odd length. */
        !            88:       mp_size_t n1, n3, nm1;
        !            89:
        !            90:       n3 = n - n2;
        !            91:
        !            92:       sign = 0;
        !            93:       w = a[n2];
        !            94:       if (w != 0)
        !            95:        w -= mpn_sub_n (p, a, a + n3, n2);
        !            96:       else
        !            97:        {
        !            98:          i = n2;
        !            99:          do
        !           100:            {
        !           101:              --i;
        !           102:              w0 = a[i];
        !           103:              w1 = a[n3+i];
        !           104:            }
        !           105:          while (w0 == w1 && i != 0);
        !           106:          if (w0 < w1)
        !           107:            {
        !           108:              x = a + n3;
        !           109:              y = a;
        !           110:              sign = 1;
        !           111:            }
        !           112:          else
        !           113:            {
        !           114:              x = a;
        !           115:              y = a + n3;
        !           116:            }
        !           117:          mpn_sub_n (p, x, y, n2);
        !           118:        }
        !           119:       p[n2] = w;
        !           120:
        !           121:       w = b[n2];
        !           122:       if (w != 0)
        !           123:        w -= mpn_sub_n (p + n3, b, b + n3, n2);
        !           124:       else
        !           125:        {
        !           126:          i = n2;
        !           127:          do
        !           128:            {
        !           129:              --i;
        !           130:              w0 = b[i];
        !           131:              w1 = b[n3+i];
        !           132:            }
        !           133:          while (w0 == w1 && i != 0);
        !           134:          if (w0 < w1)
        !           135:            {
        !           136:              x = b + n3;
        !           137:              y = b;
        !           138:              sign ^= 1;
        !           139:            }
        !           140:          else
        !           141:            {
        !           142:              x = b;
        !           143:              y = b + n3;
        !           144:            }
        !           145:          mpn_sub_n (p + n3, x, y, n2);
        !           146:        }
        !           147:       p[n] = w;
        !           148:
        !           149:       n1 = n + 1;
        !           150:       if (n2 < KARATSUBA_MUL_THRESHOLD)
        !           151:        {
        !           152:          if (n3 < KARATSUBA_MUL_THRESHOLD)
        !           153:            {
        !           154:              mpn_mul_basecase (ws, p, n3, p + n3, n3);
        !           155:              mpn_mul_basecase (p, a, n3, b, n3);
        !           156:            }
        !           157:          else
        !           158:            {
        !           159:              mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
        !           160:              mpn_kara_mul_n (p, a, b, n3, ws + n1);
        !           161:            }
        !           162:          mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
        !           163:        }
        !           164:       else
        !           165:        {
        !           166:          mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
        !           167:          mpn_kara_mul_n (p, a, b, n3, ws + n1);
        !           168:          mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
        !           169:        }
        !           170:
        !           171:       if (sign)
        !           172:        mpn_add_n (ws, p, ws, n1);
1.1       maekawa   173:       else
1.1.1.2 ! maekawa   174:        mpn_sub_n (ws, p, ws, n1);
        !           175:
        !           176:       nm1 = n - 1;
        !           177:       if (mpn_add_n (ws, p + n1, ws, nm1))
        !           178:        {
        !           179:          mp_limb_t x = ws[nm1] + 1;
        !           180:          ws[nm1] = x;
        !           181:          if (x == 0)
        !           182:            ++ws[n];
        !           183:        }
        !           184:       if (mpn_add_n (p + n3, p + n3, ws, n1))
        !           185:        {
        !           186:          mp_limb_t x;
        !           187:          i = n1 + n3;
        !           188:          do
        !           189:            {
        !           190:              x = p[i] + 1;
        !           191:              p[i] = x;
        !           192:              ++i;
        !           193:            } while (x == 0);
        !           194:        }
1.1       maekawa   195:     }
                    196:   else
1.1.1.2 ! maekawa   197:     {
        !           198:       /* Even length. */
        !           199:       mp_limb_t t;
        !           200:
        !           201:       i = n2;
        !           202:       do
        !           203:        {
        !           204:          --i;
        !           205:          w0 = a[i];
        !           206:          w1 = a[n2+i];
        !           207:        }
        !           208:       while (w0 == w1 && i != 0);
        !           209:       sign = 0;
        !           210:       if (w0 < w1)
        !           211:        {
        !           212:          x = a + n2;
        !           213:          y = a;
        !           214:          sign = 1;
        !           215:        }
        !           216:       else
        !           217:        {
        !           218:          x = a;
        !           219:          y = a + n2;
        !           220:        }
        !           221:       mpn_sub_n (p, x, y, n2);
1.1       maekawa   222:
1.1.1.2 ! maekawa   223:       i = n2;
        !           224:       do
        !           225:        {
        !           226:          --i;
        !           227:          w0 = b[i];
        !           228:          w1 = b[n2+i];
        !           229:        }
        !           230:       while (w0 == w1 && i != 0);
        !           231:       if (w0 < w1)
        !           232:        {
        !           233:          x = b + n2;
        !           234:          y = b;
        !           235:          sign ^= 1;
        !           236:        }
        !           237:       else
        !           238:        {
        !           239:          x = b;
        !           240:          y = b + n2;
        !           241:        }
        !           242:       mpn_sub_n (p + n2, x, y, n2);
1.1       maekawa   243:
1.1.1.2 ! maekawa   244:       /* Pointwise products. */
        !           245:       if (n2 < KARATSUBA_MUL_THRESHOLD)
1.1       maekawa   246:        {
1.1.1.2 ! maekawa   247:          mpn_mul_basecase (ws, p, n2, p + n2, n2);
        !           248:          mpn_mul_basecase (p, a, n2, b, n2);
        !           249:          mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
1.1       maekawa   250:        }
                    251:       else
1.1.1.2 ! maekawa   252:        {
        !           253:          mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
        !           254:          mpn_kara_mul_n (p, a, b, n2, ws + n);
        !           255:          mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
        !           256:        }
1.1       maekawa   257:
1.1.1.2 ! maekawa   258:       /* Interpolate. */
        !           259:       if (sign)
        !           260:        w = mpn_add_n (ws, p, ws, n);
        !           261:       else
        !           262:        w = -mpn_sub_n (ws, p, ws, n);
        !           263:       w += mpn_add_n (ws, p + n, ws, n);
        !           264:       w += mpn_add_n (p + n2, p + n2, ws, n);
        !           265:       /* TO DO: could put "if (w) { ... }" here.
        !           266:        * Less work but badly predicted branch.
        !           267:        * No measurable difference in speed on Alpha.
        !           268:        */
        !           269:       i = n + n2;
        !           270:       t = p[i] + w;
        !           271:       p[i] = t;
        !           272:       if (t < w)
        !           273:        {
        !           274:          do
        !           275:            {
        !           276:              ++i;
        !           277:              w = p[i] + 1;
        !           278:              p[i] = w;
        !           279:            }
        !           280:          while (w == 0);
        !           281:        }
1.1       maekawa   282:     }
                    283: }
                    284:
                    285: void
                    286: #if __STDC__
1.1.1.2 ! maekawa   287: mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1.1       maekawa   288: #else
1.1.1.2 ! maekawa   289: mpn_kara_sqr_n (p, a, n, ws)
        !           290:      mp_ptr    p;
        !           291:      mp_srcptr a;
        !           292:      mp_size_t n;
        !           293:      mp_ptr    ws;
        !           294: #endif
        !           295: {
        !           296:   mp_limb_t i, sign, w, w0, w1;
        !           297:   mp_size_t n2;
        !           298:   mp_srcptr x, y;
1.1       maekawa   299:
1.1.1.2 ! maekawa   300:   n2 = n >> 1;
        !           301:   ASSERT (n2 > 0);
        !           302:
        !           303:   if (n & 1)
1.1       maekawa   304:     {
1.1.1.2 ! maekawa   305:       /* Odd length. */
        !           306:       mp_size_t n1, n3, nm1;
1.1       maekawa   307:
1.1.1.2 ! maekawa   308:       n3 = n - n2;
1.1       maekawa   309:
1.1.1.2 ! maekawa   310:       sign = 0;
        !           311:       w = a[n2];
        !           312:       if (w != 0)
        !           313:        w -= mpn_sub_n (p, a, a + n3, n2);
        !           314:       else
        !           315:        {
        !           316:          i = n2;
        !           317:          do
        !           318:            {
        !           319:              --i;
        !           320:              w0 = a[i];
        !           321:              w1 = a[n3+i];
        !           322:            }
        !           323:          while (w0 == w1 && i != 0);
        !           324:          if (w0 < w1)
        !           325:            {
        !           326:              x = a + n3;
        !           327:              y = a;
        !           328:              sign = 1;
        !           329:            }
        !           330:          else
        !           331:            {
        !           332:              x = a;
        !           333:              y = a + n3;
        !           334:            }
        !           335:          mpn_sub_n (p, x, y, n2);
        !           336:        }
        !           337:       p[n2] = w;
1.1       maekawa   338:
1.1.1.2 ! maekawa   339:       w = a[n2];
        !           340:       if (w != 0)
        !           341:        w -= mpn_sub_n (p + n3, a, a + n3, n2);
        !           342:       else
        !           343:        {
        !           344:          i = n2;
        !           345:          do
        !           346:            {
        !           347:              --i;
        !           348:              w0 = a[i];
        !           349:              w1 = a[n3+i];
        !           350:            }
        !           351:          while (w0 == w1 && i != 0);
        !           352:          if (w0 < w1)
        !           353:            {
        !           354:              x = a + n3;
        !           355:              y = a;
        !           356:              sign ^= 1;
        !           357:            }
        !           358:          else
        !           359:            {
        !           360:              x = a;
        !           361:              y = a + n3;
        !           362:            }
        !           363:          mpn_sub_n (p + n3, x, y, n2);
        !           364:        }
        !           365:       p[n] = w;
1.1       maekawa   366:
1.1.1.2 ! maekawa   367:       n1 = n + 1;
        !           368:       if (n2 < KARATSUBA_SQR_THRESHOLD)
        !           369:        {
        !           370:          if (n3 < KARATSUBA_SQR_THRESHOLD)
        !           371:            {
        !           372:              mpn_sqr_basecase (ws, p, n3);
        !           373:              mpn_sqr_basecase (p, a, n3);
        !           374:            }
        !           375:          else
        !           376:            {
        !           377:              mpn_kara_sqr_n (ws, p, n3, ws + n1);
        !           378:              mpn_kara_sqr_n (p, a, n3, ws + n1);
        !           379:            }
        !           380:          mpn_sqr_basecase (p + n1, a + n3, n2);
        !           381:        }
        !           382:       else
        !           383:        {
        !           384:          mpn_kara_sqr_n (ws, p, n3, ws + n1);
        !           385:          mpn_kara_sqr_n (p, a, n3, ws + n1);
        !           386:          mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
        !           387:        }
1.1       maekawa   388:
1.1.1.2 ! maekawa   389:       if (sign)
        !           390:        mpn_add_n (ws, p, ws, n1);
        !           391:       else
        !           392:        mpn_sub_n (ws, p, ws, n1);
1.1       maekawa   393:
1.1.1.2 ! maekawa   394:       nm1 = n - 1;
        !           395:       if (mpn_add_n (ws, p + n1, ws, nm1))
        !           396:        {
        !           397:          mp_limb_t x = ws[nm1] + 1;
        !           398:          ws[nm1] = x;
        !           399:          if (x == 0)
        !           400:            ++ws[n];
        !           401:        }
        !           402:       if (mpn_add_n (p + n3, p + n3, ws, n1))
        !           403:        {
        !           404:          mp_limb_t x;
        !           405:          i = n1 + n3;
        !           406:          do
        !           407:            {
        !           408:              x = p[i] + 1;
        !           409:              p[i] = x;
        !           410:              ++i;
        !           411:            } while (x == 0);
        !           412:        }
        !           413:     }
        !           414:   else
        !           415:     {
        !           416:       /* Even length. */
        !           417:       mp_limb_t t;
1.1       maekawa   418:
1.1.1.2 ! maekawa   419:       i = n2;
        !           420:       do
        !           421:        {
        !           422:          --i;
        !           423:          w0 = a[i];
        !           424:          w1 = a[n2+i];
        !           425:        }
        !           426:       while (w0 == w1 && i != 0);
        !           427:       sign = 0;
        !           428:       if (w0 < w1)
1.1       maekawa   429:        {
1.1.1.2 ! maekawa   430:          x = a + n2;
        !           431:          y = a;
        !           432:          sign = 1;
1.1       maekawa   433:        }
                    434:       else
                    435:        {
1.1.1.2 ! maekawa   436:          x = a;
        !           437:          y = a + n2;
        !           438:        }
        !           439:       mpn_sub_n (p, x, y, n2);
        !           440:
        !           441:       i = n2;
        !           442:       do
        !           443:        {
        !           444:          --i;
        !           445:          w0 = a[i];
        !           446:          w1 = a[n2+i];
1.1       maekawa   447:        }
1.1.1.2 ! maekawa   448:       while (w0 == w1 && i != 0);
        !           449:       if (w0 < w1)
1.1       maekawa   450:        {
1.1.1.2 ! maekawa   451:          x = a + n2;
        !           452:          y = a;
        !           453:          sign ^= 1;
1.1       maekawa   454:        }
                    455:       else
                    456:        {
1.1.1.2 ! maekawa   457:          x = a;
        !           458:          y = a + n2;
1.1       maekawa   459:        }
1.1.1.2 ! maekawa   460:       mpn_sub_n (p + n2, x, y, n2);
1.1       maekawa   461:
1.1.1.2 ! maekawa   462:       /* Pointwise products. */
        !           463:       if (n2 < KARATSUBA_SQR_THRESHOLD)
        !           464:        {
        !           465:          mpn_sqr_basecase (ws, p, n2);
        !           466:          mpn_sqr_basecase (p, a, n2);
        !           467:          mpn_sqr_basecase (p + n, a + n2, n2);
        !           468:        }
        !           469:       else
        !           470:        {
        !           471:          mpn_kara_sqr_n (ws, p, n2, ws + n);
        !           472:          mpn_kara_sqr_n (p, a, n2, ws + n);
        !           473:          mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
        !           474:        }
1.1       maekawa   475:
1.1.1.2 ! maekawa   476:       /* Interpolate. */
        !           477:       if (sign)
        !           478:        w = mpn_add_n (ws, p, ws, n);
1.1       maekawa   479:       else
1.1.1.2 ! maekawa   480:        w = -mpn_sub_n (ws, p, ws, n);
        !           481:       w += mpn_add_n (ws, p + n, ws, n);
        !           482:       w += mpn_add_n (p + n2, p + n2, ws, n);
        !           483:       /* TO DO: could put "if (w) { ... }" here.
        !           484:        * Less work but badly predicted branch.
        !           485:        * No measurable difference in speed on Alpha.
        !           486:        */
        !           487:       i = n + n2;
        !           488:       t = p[i] + w;
        !           489:       p[i] = t;
        !           490:       if (t < w)
        !           491:        {
        !           492:          do
        !           493:            {
        !           494:              ++i;
        !           495:              w = p[i] + 1;
        !           496:              p[i] = w;
        !           497:            }
        !           498:          while (w == 0);
        !           499:        }
        !           500:     }
        !           501: }
1.1       maekawa   502:
1.1.1.2 ! maekawa   503: /*-- add2Times -------------------------------------------------------------*/
1.1       maekawa   504:
1.1.1.2 ! maekawa   505: /* z[] = x[] + 2 * y[]
        !           506:    Note that z and x might point to the same vectors. */
        !           507: #ifdef USE_MORE_MPN
        !           508: static inline mp_limb_t
        !           509: #if __STDC__
        !           510: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
        !           511: #else
        !           512: add2Times (z, x, y, n)
        !           513:      mp_ptr    z;
        !           514:      mp_srcptr x;
        !           515:      mp_srcptr y;
        !           516:      mp_size_t n;
        !           517: #endif
        !           518: {
        !           519:   mp_ptr t;
        !           520:   mp_limb_t c;
        !           521:   TMP_DECL (marker);
        !           522:   TMP_MARK (marker);
        !           523:   t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
        !           524:   c = mpn_lshift (t, y, n, 1);
        !           525:   c += mpn_add_n (z, x, t, n);
        !           526:   TMP_FREE (marker);
        !           527:   return c;
        !           528: }
        !           529: #else
1.1       maekawa   530:
1.1.1.2 ! maekawa   531: static mp_limb_t
        !           532: #if __STDC__
        !           533: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
        !           534: #else
        !           535: add2Times (z, x, y, n)
        !           536:      mp_ptr    z;
        !           537:      mp_srcptr x;
        !           538:      mp_srcptr y;
        !           539:      mp_size_t n;
        !           540: #endif
        !           541: {
        !           542:   mp_limb_t c, v, w;
1.1       maekawa   543:
1.1.1.2 ! maekawa   544:   ASSERT (n > 0);
        !           545:   v = *x; w = *y;
        !           546:   c = w >> (BITS_PER_MP_LIMB - 1);
        !           547:   w <<= 1;
        !           548:   v += w;
        !           549:   c += v < w;
        !           550:   *z = v;
        !           551:   ++x; ++y; ++z;
        !           552:   while (--n)
        !           553:     {
        !           554:       v = *x;
        !           555:       w = *y;
        !           556:       v += c;
        !           557:       c = v < c;
        !           558:       c += w >> (BITS_PER_MP_LIMB - 1);
        !           559:       w <<= 1;
        !           560:       v += w;
        !           561:       c += v < w;
        !           562:       *z = v;
        !           563:       ++x; ++y; ++z;
1.1       maekawa   564:     }
1.1.1.2 ! maekawa   565:
        !           566:   return c;
1.1       maekawa   567: }
1.1.1.2 ! maekawa   568: #endif
1.1       maekawa   569:
1.1.1.2 ! maekawa   570: /*-- evaluate3 -------------------------------------------------------------*/
        !           571:
        !           572: /* Evaluates:
        !           573:  *   ph := 4*A+2*B+C
        !           574:  *   p1 := A+B+C
        !           575:  *   p2 := A+2*B+4*C
        !           576:  * where:
        !           577:  *   ph[], p1[], p2[], A[] and B[] all have length len,
        !           578:  *   C[] has length len2 with len-len2 = 0, 1 or 2.
        !           579:  * Returns top words (overflow) at pth, pt1 and pt2 respectively.
        !           580:  */
        !           581: #ifdef USE_MORE_MPN
        !           582: static void
1.1       maekawa   583: #if __STDC__
1.1.1.2 ! maekawa   584: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
        !           585:           mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
1.1       maekawa   586: #else
1.1.1.2 ! maekawa   587: evaluate3 (ph, p1, p2, pth, pt1, pt2,
        !           588:            A, B, C, len, len2)
        !           589:      mp_ptr    ph;
        !           590:      mp_ptr    p1;
        !           591:      mp_ptr    p2;
        !           592:      mp_ptr    pth;
        !           593:      mp_ptr    pt1;
        !           594:      mp_ptr    pt2;
        !           595:      mp_srcptr A;
        !           596:      mp_srcptr B;
        !           597:      mp_srcptr C;
        !           598:      mp_size_t len;
        !           599:      mp_size_t len2;
1.1       maekawa   600: #endif
                    601: {
1.1.1.2 ! maekawa   602:   mp_limb_t c, d, e;
        !           603:
        !           604:   ASSERT (len - len2 <= 2);
        !           605:
        !           606:   e = mpn_lshift (p1, B, len, 1);
        !           607:
        !           608:   c = mpn_lshift (ph, A, len, 2);
        !           609:   c += e + mpn_add_n (ph, ph, p1, len);
        !           610:   d = mpn_add_n (ph, ph, C, len2);
        !           611:   if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
        !           612:   ASSERT (c < 7);
        !           613:   *pth = c;
        !           614:
        !           615:   c = mpn_lshift (p2, C, len2, 2);
        !           616: #if 1
        !           617:   if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
        !           618:   c += e + mpn_add_n (p2, p2, p1, len);
        !           619: #else
        !           620:   d = mpn_add_n (p2, p2, p1, len2);
        !           621:   c += d;
        !           622:   if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
        !           623:   c += e;
        !           624: #endif
        !           625:   c += mpn_add_n (p2, p2, A, len);
        !           626:   ASSERT (c < 7);
        !           627:   *pt2 = c;
        !           628:
        !           629:   c = mpn_add_n (p1, A, B, len);
        !           630:   d = mpn_add_n (p1, p1, C, len2);
        !           631:   if (len2 == len) c += d;
        !           632:   else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
        !           633:   ASSERT (c < 3);
        !           634:   *pt1 = c;
1.1       maekawa   635:
1.1.1.2 ! maekawa   636: }
        !           637:
        !           638: #else
1.1       maekawa   639:
1.1.1.2 ! maekawa   640: static void
        !           641: #if __STDC__
        !           642: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
        !           643:           mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
        !           644: #else
        !           645: evaluate3 (ph, p1, p2, pth, pt1, pt2,
        !           646:            A, B, C, l, ls)
        !           647:      mp_ptr    ph;
        !           648:      mp_ptr    p1;
        !           649:      mp_ptr    p2;
        !           650:      mp_ptr    pth;
        !           651:      mp_ptr    pt1;
        !           652:      mp_ptr    pt2;
        !           653:      mp_srcptr A;
        !           654:      mp_srcptr B;
        !           655:      mp_srcptr C;
        !           656:      mp_size_t l;
        !           657:      mp_size_t ls;
        !           658: #endif
        !           659: {
        !           660:   mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
        !           661:
        !           662:   ASSERT (l - ls <= 2);
1.1       maekawa   663:
1.1.1.2 ! maekawa   664:   th = t1 = t2 = 0;
        !           665:   for (i = 0; i < l; ++i)
1.1       maekawa   666:     {
1.1.1.2 ! maekawa   667:       a = *A;
        !           668:       b = *B;
        !           669:       c = i < ls ? *C : 0;
        !           670:
        !           671:       /* TO DO: choose one of the following alternatives. */
        !           672: #if 0
        !           673:       t = a << 2;
        !           674:       vh = th + t;
        !           675:       th = vh < t;
        !           676:       th += a >> (BITS_PER_MP_LIMB - 2);
        !           677:       t = b << 1;
        !           678:       vh += t;
        !           679:       th += vh < t;
        !           680:       th += b >> (BITS_PER_MP_LIMB - 1);
        !           681:       vh += c;
        !           682:       th += vh < c;
        !           683: #else
        !           684:       vh = th + c;
        !           685:       th = vh < c;
        !           686:       t = b << 1;
        !           687:       vh += t;
        !           688:       th += vh < t;
        !           689:       th += b >> (BITS_PER_MP_LIMB - 1);
        !           690:       t = a << 2;
        !           691:       vh += t;
        !           692:       th += vh < t;
        !           693:       th += a >> (BITS_PER_MP_LIMB - 2);
        !           694: #endif
1.1       maekawa   695:
1.1.1.2 ! maekawa   696:       v1 = t1 + a;
        !           697:       t1 = v1 < a;
        !           698:       v1 += b;
        !           699:       t1 += v1 < b;
        !           700:       v1 += c;
        !           701:       t1 += v1 < c;
        !           702:
        !           703:       v2 = t2 + a;
        !           704:       t2 = v2 < a;
        !           705:       t = b << 1;
        !           706:       v2 += t;
        !           707:       t2 += v2 < t;
        !           708:       t2 += b >> (BITS_PER_MP_LIMB - 1);
        !           709:       t = c << 2;
        !           710:       v2 += t;
        !           711:       t2 += v2 < t;
        !           712:       t2 += c >> (BITS_PER_MP_LIMB - 2);
        !           713:
        !           714:       *ph = vh;
        !           715:       *p1 = v1;
        !           716:       *p2 = v2;
        !           717:
        !           718:       ++A; ++B; ++C;
        !           719:       ++ph; ++p1; ++p2;
1.1       maekawa   720:     }
1.1.1.2 ! maekawa   721:
        !           722:   ASSERT (th < 7);
        !           723:   ASSERT (t1 < 3);
        !           724:   ASSERT (t2 < 7);
        !           725:
        !           726:   *pth = th;
        !           727:   *pt1 = t1;
        !           728:   *pt2 = t2;
1.1       maekawa   729: }
1.1.1.2 ! maekawa   730: #endif
1.1       maekawa   731:
1.1.1.2 ! maekawa   732:
        !           733: /*-- interpolate3 ----------------------------------------------------------*/
        !           734:
        !           735: /* Interpolates B, C, D (in-place) from:
        !           736:  *   16*A+8*B+4*C+2*D+E
        !           737:  *   A+B+C+D+E
        !           738:  *   A+2*B+4*C+8*D+16*E
        !           739:  * where:
        !           740:  *   A[], B[], C[] and D[] all have length l,
        !           741:  *   E[] has length ls with l-ls = 0, 2 or 4.
        !           742:  *
        !           743:  * Reads top words (from earlier overflow) from ptb, ptc and ptd,
        !           744:  * and returns new top words there.
        !           745:  */
        !           746:
        !           747: #ifdef USE_MORE_MPN
        !           748: static void
1.1       maekawa   749: #if __STDC__
1.1.1.2 ! maekawa   750: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
        !           751:               mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
        !           752: #else
        !           753: interpolate3 (A, B, C, D, E,
        !           754:               ptb, ptc, ptd, len, len2)
        !           755:      mp_srcptr A;
        !           756:      mp_ptr    B;
        !           757:      mp_ptr    C;
        !           758:      mp_ptr    D;
        !           759:      mp_srcptr E;
        !           760:      mp_ptr    ptb;
        !           761:      mp_ptr    ptc;
        !           762:      mp_ptr    ptd;
        !           763:      mp_size_t len;
        !           764:      mp_size_t len2;
        !           765: #endif
        !           766: {
        !           767:   mp_ptr ws;
        !           768:   mp_limb_t t, tb,tc,td;
        !           769:   TMP_DECL (marker);
        !           770:   TMP_MARK (marker);
        !           771:
        !           772:   ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
        !           773:
        !           774:   /* Let x1, x2, x3 be the values to interpolate.  We have:
        !           775:    *         b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
        !           776:    *         c =    a +   x1 +   x2 +   x3 +    e
        !           777:    *         d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
        !           778:    */
        !           779:
        !           780:   ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
        !           781:
        !           782:   tb = *ptb; tc = *ptc; td = *ptd;
        !           783:
        !           784:
        !           785:   /* b := b - 16*a -    e
        !           786:    * c := c -    a -    e
        !           787:    * d := d -    a - 16*e
        !           788:    */
        !           789:
        !           790:   t = mpn_lshift (ws, A, len, 4);
        !           791:   tb -= t + mpn_sub_n (B, B, ws, len);
        !           792:   t = mpn_sub_n (B, B, E, len2);
        !           793:   if (len2 == len) tb -= t;
        !           794:   else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
        !           795:
        !           796:   tc -= mpn_sub_n (C, C, A, len);
        !           797:   t = mpn_sub_n (C, C, E, len2);
        !           798:   if (len2 == len) tc -= t;
        !           799:   else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
        !           800:
        !           801:   t = mpn_lshift (ws, E, len2, 4);
        !           802:   t += mpn_add_n (ws, ws, A, len2);
        !           803: #if 1
        !           804:   if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
        !           805:   td -= t + mpn_sub_n (D, D, ws, len);
        !           806: #else
        !           807:   t += mpn_sub_n (D, D, ws, len2);
        !           808:   if (len2 != len) {
        !           809:     t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
        !           810:     t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
        !           811:   } /* end if/else */
        !           812:   td -= t;
        !           813: #endif
        !           814:
        !           815:
        !           816:   /* b, d := b + d, b - d */
        !           817:
        !           818: #ifdef HAVE_MPN_ADD_SUB_N
        !           819:   /* #error TO DO ... */
        !           820: #else
        !           821:   t = tb + td + mpn_add_n (ws, B, D, len);
        !           822:   td = tb - td - mpn_sub_n (D, B, D, len);
        !           823:   tb = t;
        !           824:   MPN_COPY (B, ws, len);
        !           825: #endif
        !           826:
        !           827:   /* b := b-8*c */
        !           828:   t = 8 * tc + mpn_lshift (ws, C, len, 3);
        !           829:   tb -= t + mpn_sub_n (B, B, ws, len);
        !           830:
        !           831:   /* c := 2*c - b */
        !           832:   tc = 2 * tc + mpn_lshift (C, C, len, 1);
        !           833:   tc -= tb + mpn_sub_n (C, C, B, len);
        !           834:
        !           835:   /* d := d/3 */
        !           836:   td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
        !           837:
        !           838:   /* b, d := b + d, b - d */
        !           839: #ifdef HAVE_MPN_ADD_SUB_N
        !           840:   /* #error TO DO ... */
1.1       maekawa   841: #else
1.1.1.2 ! maekawa   842:   t = tb + td + mpn_add_n (ws, B, D, len);
        !           843:   td = tb - td - mpn_sub_n (D, B, D, len);
        !           844:   tb = t;
        !           845:   MPN_COPY (B, ws, len);
        !           846: #endif
1.1       maekawa   847:
1.1.1.2 ! maekawa   848:       /* Now:
        !           849:        *        b = 4*x1
        !           850:        *        c = 2*x2
        !           851:        *        d = 4*x3
        !           852:        */
        !           853:
        !           854:   ASSERT(!(*B & 3));
        !           855:   mpn_rshift (B, B, len, 2);
        !           856:   B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
        !           857:   ASSERT((long)tb >= 0);
        !           858:   tb >>= 2;
        !           859:
        !           860:   ASSERT(!(*C & 1));
        !           861:   mpn_rshift (C, C, len, 1);
        !           862:   C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
        !           863:   ASSERT((long)tc >= 0);
        !           864:   tc >>= 1;
        !           865:
        !           866:   ASSERT(!(*D & 3));
        !           867:   mpn_rshift (D, D, len, 2);
        !           868:   D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
        !           869:   ASSERT((long)td >= 0);
        !           870:   td >>= 2;
        !           871:
        !           872: #if WANT_ASSERT
        !           873:   ASSERT (tb < 2);
        !           874:   if (len == len2)
        !           875:     {
        !           876:       ASSERT (tc < 3);
        !           877:       ASSERT (td < 2);
1.1       maekawa   878:     }
                    879:   else
                    880:     {
1.1.1.2 ! maekawa   881:       ASSERT (tc < 2);
        !           882:       ASSERT (!td);
        !           883:     }
        !           884: #endif
1.1       maekawa   885:
1.1.1.2 ! maekawa   886:   *ptb = tb;
        !           887:   *ptc = tc;
        !           888:   *ptd = td;
1.1       maekawa   889:
1.1.1.2 ! maekawa   890:   TMP_FREE (marker);
        !           891: }
        !           892:
        !           893: #else
        !           894:
        !           895: static void
        !           896: #if __STDC__
        !           897: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
        !           898:              mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
        !           899: #else
        !           900: interpolate3 (A, B, C, D, E,
        !           901:               ptb, ptc, ptd, l, ls)
        !           902:      mp_srcptr A;
        !           903:      mp_ptr    B;
        !           904:      mp_ptr    C;
        !           905:      mp_ptr    D;
        !           906:      mp_srcptr E;
        !           907:      mp_ptr    ptb;
        !           908:      mp_ptr    ptc;
        !           909:      mp_ptr    ptd;
        !           910:      mp_size_t l;
        !           911:      mp_size_t ls;
        !           912: #endif
        !           913: {
        !           914:   mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
        !           915:   const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
        !           916:
        !           917: #if WANT_ASSERT
        !           918:   t = l - ls;
        !           919:   ASSERT (t == 0 || t == 2 || t == 4);
        !           920: #endif
        !           921:
        !           922:   sb = sc = sd = 0;
        !           923:   for (i = 0; i < l; ++i)
        !           924:     {
        !           925:       mp_limb_t tb, tc, td, tt;
        !           926:
        !           927:       a = *A;
        !           928:       b = *B;
        !           929:       c = *C;
        !           930:       d = *D;
        !           931:       e = i < ls ? *E : 0;
        !           932:
        !           933:       /* Let x1, x2, x3 be the values to interpolate.  We have:
        !           934:        *        b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
        !           935:        *        c =    a +   x1 +   x2 +   x3 +    e
        !           936:        *        d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
        !           937:        */
        !           938:
        !           939:       /* b := b - 16*a -    e
        !           940:        * c := c -    a -    e
        !           941:        * d := d -    a - 16*e
        !           942:        */
        !           943:       t = a << 4;
        !           944:       tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
        !           945:       b -= t;
        !           946:       tb -= b < e;
        !           947:       b -= e;
        !           948:       tc = -(c < a);
        !           949:       c -= a;
        !           950:       tc -= c < e;
        !           951:       c -= e;
        !           952:       td = -(d < a);
        !           953:       d -= a;
        !           954:       t = e << 4;
        !           955:       td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
        !           956:       d -= t;
        !           957:
        !           958:       /* b, d := b + d, b - d */
        !           959:       t = b + d;
        !           960:       tt = tb + td + (t < b);
        !           961:       td = tb - td - (b < d);
        !           962:       d = b - d;
        !           963:       b = t;
        !           964:       tb = tt;
        !           965:
        !           966:       /* b := b-8*c */
        !           967:       t = c << 3;
        !           968:       tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
        !           969:       b -= t;
        !           970:
        !           971:       /* c := 2*c - b */
        !           972:       t = c << 1;
        !           973:       tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
        !           974:       c = t - b;
        !           975:
        !           976:       /* d := d/3 */
        !           977:       d *= INVERSE_3;
        !           978:       td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
        !           979:       td *= INVERSE_3;
        !           980:
        !           981:       /* b, d := b + d, b - d */
        !           982:       t = b + d;
        !           983:       tt = tb + td + (t < b);
        !           984:       td = tb - td - (b < d);
        !           985:       d = b - d;
        !           986:       b = t;
        !           987:       tb = tt;
        !           988:
        !           989:       /* Now:
        !           990:        *        b = 4*x1
        !           991:        *        c = 2*x2
        !           992:        *        d = 4*x3
        !           993:        */
        !           994:
        !           995:       /* sb has period 2. */
        !           996:       b += sb;
        !           997:       tb += b < sb;
        !           998:       sb &= maskOffHalf;
        !           999:       sb |= sb >> (BITS_PER_MP_LIMB >> 1);
        !          1000:       sb += tb;
        !          1001:
        !          1002:       /* sc has period 1. */
        !          1003:       c += sc;
        !          1004:       tc += c < sc;
        !          1005:       /* TO DO: choose one of the following alternatives. */
        !          1006: #if 1
        !          1007:       sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
        !          1008:       sc += tc;
        !          1009: #else
        !          1010:       sc = tc - ((long)sc < 0L);
        !          1011: #endif
        !          1012:
        !          1013:       /* sd has period 2. */
        !          1014:       d += sd;
        !          1015:       td += d < sd;
        !          1016:       sd &= maskOffHalf;
        !          1017:       sd |= sd >> (BITS_PER_MP_LIMB >> 1);
        !          1018:       sd += td;
        !          1019:
        !          1020:       if (i != 0)
1.1       maekawa  1021:        {
1.1.1.2 ! maekawa  1022:          B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
        !          1023:          C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
        !          1024:          D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1.1       maekawa  1025:        }
1.1.1.2 ! maekawa  1026:       ob = b >> 2;
        !          1027:       oc = c >> 1;
        !          1028:       od = d >> 2;
1.1       maekawa  1029:
1.1.1.2 ! maekawa  1030:       ++A; ++B; ++C; ++D; ++E;
        !          1031:     }
1.1       maekawa  1032:
1.1.1.2 ! maekawa  1033:   /* Handle top words. */
        !          1034:   b = *ptb;
        !          1035:   c = *ptc;
        !          1036:   d = *ptd;
        !          1037:
        !          1038:   t = b + d;
        !          1039:   d = b - d;
        !          1040:   b = t;
        !          1041:   b -= c << 3;
        !          1042:   c = (c << 1) - b;
        !          1043:   d *= INVERSE_3;
        !          1044:   t = b + d;
        !          1045:   d = b - d;
        !          1046:   b = t;
        !          1047:
        !          1048:   b += sb;
        !          1049:   c += sc;
        !          1050:   d += sd;
        !          1051:
        !          1052:   B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
        !          1053:   C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
        !          1054:   D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
        !          1055:
        !          1056:   b >>= 2;
        !          1057:   c >>= 1;
        !          1058:   d >>= 2;
        !          1059:
        !          1060: #if WANT_ASSERT
        !          1061:   ASSERT (b < 2);
        !          1062:   if (l == ls)
        !          1063:     {
        !          1064:       ASSERT (c < 3);
        !          1065:       ASSERT (d < 2);
        !          1066:     }
        !          1067:   else
        !          1068:     {
        !          1069:       ASSERT (c < 2);
        !          1070:       ASSERT (!d);
        !          1071:     }
        !          1072: #endif
1.1       maekawa  1073:
1.1.1.2 ! maekawa  1074:   *ptb = b;
        !          1075:   *ptc = c;
        !          1076:   *ptd = d;
        !          1077: }
        !          1078: #endif
1.1       maekawa  1079:
                   1080:
1.1.1.2 ! maekawa  1081: /*-- mpn_toom3_mul_n --------------------------------------------------------------*/
1.1       maekawa  1082:
1.1.1.2 ! maekawa  1083: /* Multiplies using 5 mults of one third size and so on recursively.
        !          1084:  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
        !          1085:  * No overlap of p[...] with a[...] or b[...].
        !          1086:  * ws is workspace.
        !          1087:  */
        !          1088:
        !          1089: /* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
        !          1090:  *        recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
        !          1091:  *        because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
        !          1092:  */
        !          1093:
        !          1094: #define TOOM3_MUL_REC(p, a, b, n, ws) \
        !          1095:   do {                                                         \
        !          1096:     if (n < KARATSUBA_MUL_THRESHOLD)                           \
        !          1097:       mpn_mul_basecase (p, a, n, b, n);                                \
        !          1098:     else if (n < TOOM3_MUL_THRESHOLD)                          \
        !          1099:       mpn_kara_mul_n (p, a, b, n, ws);                         \
        !          1100:     else                                                       \
        !          1101:       mpn_toom3_mul_n (p, a, b, n, ws);                                \
        !          1102:   } while (0)
1.1       maekawa  1103:
1.1.1.2 ! maekawa  1104: void
        !          1105: #if __STDC__
        !          1106: mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
        !          1107: #else
        !          1108: mpn_toom3_mul_n (p, a, b, n, ws)
        !          1109:      mp_ptr    p;
        !          1110:      mp_srcptr a;
        !          1111:      mp_srcptr b;
        !          1112:      mp_size_t n;
        !          1113:      mp_ptr    ws;
        !          1114: #endif
        !          1115: {
        !          1116:   mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
        !          1117:   mp_limb_t *A,*B,*C,*D,*E, *W;
        !          1118:   mp_size_t l,l2,l3,l4,l5,ls;
        !          1119:
        !          1120:   /* Break n words into chunks of size l, l and ls.
        !          1121:    * n = 3*k   => l = k,   ls = k
        !          1122:    * n = 3*k+1 => l = k+1, ls = k-1
        !          1123:    * n = 3*k+2 => l = k+1, ls = k
        !          1124:    */
        !          1125:   {
        !          1126:     mp_limb_t m;
        !          1127:
        !          1128:     ASSERT (n >= TOOM3_MUL_THRESHOLD);
        !          1129:     l = ls = n / 3;
        !          1130:     m = n - l * 3;
        !          1131:     if (m != 0)
        !          1132:       ++l;
        !          1133:     if (m == 1)
        !          1134:       --ls;
        !          1135:
        !          1136:     l2 = l * 2;
        !          1137:     l3 = l * 3;
        !          1138:     l4 = l * 4;
        !          1139:     l5 = l * 5;
        !          1140:     A = p;
        !          1141:     B = ws;
        !          1142:     C = p + l2;
        !          1143:     D = ws + l2;
        !          1144:     E = p + l4;
        !          1145:     W = ws + l4;
        !          1146:   }
        !          1147:
        !          1148:   /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
        !          1149:   evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
        !          1150:   evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
        !          1151:
        !          1152:   /** Second stage: pointwise multiplies. **/
        !          1153:   TOOM3_MUL_REC(D, C, C + l, l, W);
        !          1154:   tD = cD*dD;
        !          1155:   if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
        !          1156:   if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
        !          1157:   ASSERT (tD < 49);
        !          1158:   TOOM3_MUL_REC(C, B, B + l, l, W);
        !          1159:   tC = cC*dC;
        !          1160:   /* TO DO: choose one of the following alternatives. */
        !          1161: #if 0
        !          1162:   if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
        !          1163:   if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
        !          1164: #else
        !          1165:   if (cC)
        !          1166:     {
        !          1167:       if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
        !          1168:       else tC += add2Times (C + l, C + l, B + l, l);
1.1       maekawa  1169:     }
1.1.1.2 ! maekawa  1170:   if (dC)
        !          1171:     {
        !          1172:       if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
        !          1173:       else tC += add2Times (C + l, C + l, B, l);
        !          1174:     }
        !          1175: #endif
        !          1176:   ASSERT (tC < 9);
        !          1177:   TOOM3_MUL_REC(B, A, A + l, l, W);
        !          1178:   tB = cB*dB;
        !          1179:   if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
        !          1180:   if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
        !          1181:   ASSERT (tB < 49);
        !          1182:   TOOM3_MUL_REC(A, a, b, l, W);
        !          1183:   TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
        !          1184:
        !          1185:   /** Third stage: interpolation. **/
        !          1186:   interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
        !          1187:
        !          1188:   /** Final stage: add up the coefficients. **/
        !          1189:   {
        !          1190:     mp_limb_t i, x, y;
        !          1191:     tB += mpn_add_n (p + l, p + l, B, l2);
        !          1192:     tD += mpn_add_n (p + l3, p + l3, D, l2);
        !          1193:     mpn_incr_u (p + l3, tB);
        !          1194:     mpn_incr_u (p + l4, tC);
        !          1195:     mpn_incr_u (p + l5, tD);
        !          1196:   }
1.1       maekawa  1197: }
                   1198:
1.1.1.2 ! maekawa  1199: /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
        !          1200:
        !          1201: /* Like previous function but for squaring */
        !          1202:
        !          1203: #define TOOM3_SQR_REC(p, a, n, ws) \
        !          1204:   do {                                                         \
        !          1205:     if (n < KARATSUBA_SQR_THRESHOLD)                           \
        !          1206:       mpn_sqr_basecase (p, a, n);                              \
        !          1207:     else if (n < TOOM3_SQR_THRESHOLD)                          \
        !          1208:       mpn_kara_sqr_n (p, a, n, ws);                            \
        !          1209:     else                                                       \
        !          1210:       mpn_toom3_sqr_n (p, a, n, ws);                           \
        !          1211:   } while (0)
        !          1212:
        !          1213: void
1.1       maekawa  1214: #if __STDC__
1.1.1.2 ! maekawa  1215: mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1.1       maekawa  1216: #else
1.1.1.2 ! maekawa  1217: mpn_toom3_sqr_n (p, a, n, ws)
        !          1218:      mp_ptr    p;
        !          1219:      mp_srcptr a;
        !          1220:      mp_size_t n;
        !          1221:      mp_ptr    ws;
1.1       maekawa  1222: #endif
                   1223: {
1.1.1.2 ! maekawa  1224:   mp_limb_t cB,cC,cD, tB,tC,tD;
        !          1225:   mp_limb_t *A,*B,*C,*D,*E, *W;
        !          1226:   mp_size_t l,l2,l3,l4,l5,ls;
        !          1227:
        !          1228:   /* Break n words into chunks of size l, l and ls.
        !          1229:    * n = 3*k   => l = k,   ls = k
        !          1230:    * n = 3*k+1 => l = k+1, ls = k-1
        !          1231:    * n = 3*k+2 => l = k+1, ls = k
        !          1232:    */
        !          1233:   {
        !          1234:     mp_limb_t m;
        !          1235:
        !          1236:     ASSERT (n >= TOOM3_MUL_THRESHOLD);
        !          1237:     l = ls = n / 3;
        !          1238:     m = n - l * 3;
        !          1239:     if (m != 0)
        !          1240:       ++l;
        !          1241:     if (m == 1)
        !          1242:       --ls;
        !          1243:
        !          1244:     l2 = l * 2;
        !          1245:     l3 = l * 3;
        !          1246:     l4 = l * 4;
        !          1247:     l5 = l * 5;
        !          1248:     A = p;
        !          1249:     B = ws;
        !          1250:     C = p + l2;
        !          1251:     D = ws + l2;
        !          1252:     E = p + l4;
        !          1253:     W = ws + l4;
        !          1254:   }
        !          1255:
        !          1256:   /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
        !          1257:   evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
        !          1258:
        !          1259:   /** Second stage: pointwise multiplies. **/
        !          1260:   TOOM3_SQR_REC(D, C, l, W);
        !          1261:   tD = cD*cD;
        !          1262:   if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
        !          1263:   ASSERT (tD < 49);
        !          1264:   TOOM3_SQR_REC(C, B, l, W);
        !          1265:   tC = cC*cC;
        !          1266:   /* TO DO: choose one of the following alternatives. */
        !          1267: #if 0
        !          1268:   if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
        !          1269: #else
        !          1270:   if (cC >= 1)
1.1       maekawa  1271:     {
1.1.1.2 ! maekawa  1272:       tC += add2Times (C + l, C + l, B, l);
        !          1273:       if (cC == 2)
        !          1274:         tC += add2Times (C + l, C + l, B, l);
        !          1275:     }
        !          1276: #endif
        !          1277:   ASSERT (tC < 9);
        !          1278:   TOOM3_SQR_REC(B, A, l, W);
        !          1279:   tB = cB*cB;
        !          1280:   if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
        !          1281:   ASSERT (tB < 49);
        !          1282:   TOOM3_SQR_REC(A, a, l, W);
        !          1283:   TOOM3_SQR_REC(E, a + l2, ls, W);
        !          1284:
        !          1285:   /** Third stage: interpolation. **/
        !          1286:   interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
        !          1287:
        !          1288:   /** Final stage: add up the coefficients. **/
        !          1289:   {
        !          1290:     mp_limb_t i, x, y;
        !          1291:     tB += mpn_add_n (p + l, p + l, B, l2);
        !          1292:     tD += mpn_add_n (p + l3, p + l3, D, l2);
        !          1293:     mpn_incr_u (p + l3, tB);
        !          1294:     mpn_incr_u (p + l4, tC);
        !          1295:     mpn_incr_u (p + l5, tD);
        !          1296:   }
        !          1297: }
        !          1298:
        !          1299: void
        !          1300: #if __STDC__
        !          1301: mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
        !          1302: #else
        !          1303: mpn_mul_n (p, a, b, n)
        !          1304:      mp_ptr    p;
        !          1305:      mp_srcptr a;
        !          1306:      mp_srcptr b;
        !          1307:      mp_size_t n;
        !          1308: #endif
        !          1309: {
        !          1310:   if (n < KARATSUBA_MUL_THRESHOLD)
        !          1311:     mpn_mul_basecase (p, a, n, b, n);
        !          1312:   else if (n < TOOM3_MUL_THRESHOLD)
        !          1313:     {
        !          1314:       /* Allocate workspace of fixed size on stack: fast! */
        !          1315: #if TUNE_PROGRAM_BUILD
        !          1316:       mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
        !          1317: #else
        !          1318:       mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
        !          1319: #endif
        !          1320:       mpn_kara_mul_n (p, a, b, n, ws);
1.1       maekawa  1321:     }
1.1.1.2 ! maekawa  1322: #if WANT_FFT || TUNE_PROGRAM_BUILD
        !          1323:   else if (n < FFT_MUL_THRESHOLD)
        !          1324: #else
1.1       maekawa  1325:   else
1.1.1.2 ! maekawa  1326: #endif
1.1       maekawa  1327:     {
1.1.1.2 ! maekawa  1328:       /* Use workspace of unknown size in heap, as stack space may
        !          1329:        * be limited.  Since n is at least TOOM3_MUL_THRESHOLD, the
        !          1330:        * multiplication will take much longer than malloc()/free().  */
        !          1331:       mp_limb_t wsLen, *ws;
        !          1332:       wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
        !          1333:       ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
        !          1334:       mpn_toom3_mul_n (p, a, b, n, ws);
        !          1335:       (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
1.1       maekawa  1336:     }
1.1.1.2 ! maekawa  1337: #if WANT_FFT || TUNE_PROGRAM_BUILD
        !          1338:   else
        !          1339:     {
        !          1340:       mpn_mul_fft_full (p, a, n, b, n);
        !          1341:     }
        !          1342: #endif
1.1       maekawa  1343: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>