[BACK]Return to mul_n.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / generic

Annotation of OpenXM_contrib/gmp/mpn/generic/mul_n.c, Revision 1.1.1.3

1.1.1.2   maekawa     1: /* mpn_mul_n and helper function -- Multiply/square natural numbers.
1.1       maekawa     2:
1.1.1.2   maekawa     3:    THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
                      4:    ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY SAFE TO REACH
                      5:    THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
                      6:    THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
                      7:
                      8:
1.1.1.3 ! ohara       9: Copyright 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free
        !            10: Software Foundation, Inc.
1.1       maekawa    11:
                     12: This file is part of the GNU MP Library.
                     13:
                     14: The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2   maekawa    15: it under the terms of the GNU Lesser General Public License as published by
                     16: the Free Software Foundation; either version 2.1 of the License, or (at your
1.1       maekawa    17: option) any later version.
                     18:
                     19: The GNU MP Library is distributed in the hope that it will be useful, but
                     20: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2   maekawa    21: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
1.1       maekawa    22: License for more details.
                     23:
1.1.1.2   maekawa    24: You should have received a copy of the GNU Lesser General Public License
1.1       maekawa    25: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     26: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     27: MA 02111-1307, USA. */
                     28:
                     29: #include "gmp.h"
                     30: #include "gmp-impl.h"
1.1.1.2   maekawa    31: #include "longlong.h"
1.1       maekawa    32:
                     33:
1.1.1.3 ! ohara      34: #if GMP_NAIL_BITS != 0
        !            35: /* The open-coded interpolate3 stuff has not been generalized for nails.  */
        !            36: #define USE_MORE_MPN 1
        !            37: #endif
1.1       maekawa    38:
1.1.1.3 ! ohara      39: #ifndef USE_MORE_MPN
1.1.1.2   maekawa    40: #if !defined (__alpha) && !defined (__mips)
                     41: /* For all other machines, we want to call mpn functions for the compund
                     42:    operations instead of open-coding them.  */
1.1.1.3 ! ohara      43: #define USE_MORE_MPN 1
        !            44: #endif
1.1       maekawa    45: #endif
                     46:
1.1.1.2   maekawa    47: /*== Function declarations =================================================*/
1.1       maekawa    48:
1.1.1.2   maekawa    49: static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
1.1.1.3 ! ohara      50:                               mp_ptr, mp_ptr, mp_ptr,
        !            51:                               mp_srcptr, mp_srcptr, mp_srcptr,
        !            52:                               mp_size_t, mp_size_t));
1.1.1.2   maekawa    53: static void interpolate3 _PROTO ((mp_srcptr,
1.1.1.3 ! ohara      54:                                  mp_ptr, mp_ptr, mp_ptr,
        !            55:                                  mp_srcptr,
        !            56:                                  mp_ptr, mp_ptr, mp_ptr,
        !            57:                                  mp_size_t, mp_size_t));
1.1.1.2   maekawa    58: static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
                     59:
                     60:
                     61: /*-- mpn_kara_mul_n ---------------------------------------------------------------*/
                     62:
                     63: /* Multiplies using 3 half-sized mults and so on recursively.
                     64:  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
                     65:  * No overlap of p[...] with a[...] or b[...].
                     66:  * ws is workspace.
                     67:  */
1.1       maekawa    68:
                     69: void
1.1.1.2   maekawa    70: mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
1.1       maekawa    71: {
1.1.1.3 ! ohara      72:   mp_limb_t w, w0, w1;
1.1.1.2   maekawa    73:   mp_size_t n2;
                     74:   mp_srcptr x, y;
1.1.1.3 ! ohara      75:   mp_size_t i;
        !            76:   int sign;
1.1       maekawa    77:
1.1.1.2   maekawa    78:   n2 = n >> 1;
                     79:   ASSERT (n2 > 0);
                     80:
1.1.1.3 ! ohara      81:   if ((n & 1) != 0)
1.1       maekawa    82:     {
1.1.1.2   maekawa    83:       /* Odd length. */
                     84:       mp_size_t n1, n3, nm1;
                     85:
                     86:       n3 = n - n2;
                     87:
                     88:       sign = 0;
                     89:       w = a[n2];
                     90:       if (w != 0)
                     91:        w -= mpn_sub_n (p, a, a + n3, n2);
                     92:       else
                     93:        {
                     94:          i = n2;
                     95:          do
                     96:            {
                     97:              --i;
                     98:              w0 = a[i];
1.1.1.3 ! ohara      99:              w1 = a[n3 + i];
1.1.1.2   maekawa   100:            }
                    101:          while (w0 == w1 && i != 0);
                    102:          if (w0 < w1)
                    103:            {
                    104:              x = a + n3;
                    105:              y = a;
1.1.1.3 ! ohara     106:              sign = ~0;
1.1.1.2   maekawa   107:            }
                    108:          else
                    109:            {
                    110:              x = a;
                    111:              y = a + n3;
                    112:            }
                    113:          mpn_sub_n (p, x, y, n2);
                    114:        }
                    115:       p[n2] = w;
                    116:
                    117:       w = b[n2];
                    118:       if (w != 0)
                    119:        w -= mpn_sub_n (p + n3, b, b + n3, n2);
                    120:       else
                    121:        {
                    122:          i = n2;
1.1.1.3 ! ohara     123:          do
1.1.1.2   maekawa   124:            {
                    125:              --i;
1.1.1.3 ! ohara     126:              w0 = b[i];
        !           127:              w1 = b[n3 + i];
1.1.1.2   maekawa   128:            }
                    129:          while (w0 == w1 && i != 0);
                    130:          if (w0 < w1)
                    131:            {
                    132:              x = b + n3;
                    133:              y = b;
1.1.1.3 ! ohara     134:              sign = ~sign;
1.1.1.2   maekawa   135:            }
                    136:          else
                    137:            {
                    138:              x = b;
                    139:              y = b + n3;
                    140:            }
                    141:          mpn_sub_n (p + n3, x, y, n2);
                    142:        }
                    143:       p[n] = w;
                    144:
                    145:       n1 = n + 1;
1.1.1.3 ! ohara     146:       if (n2 < MUL_KARATSUBA_THRESHOLD)
1.1.1.2   maekawa   147:        {
1.1.1.3 ! ohara     148:          if (n3 < MUL_KARATSUBA_THRESHOLD)
1.1.1.2   maekawa   149:            {
                    150:              mpn_mul_basecase (ws, p, n3, p + n3, n3);
                    151:              mpn_mul_basecase (p, a, n3, b, n3);
                    152:            }
                    153:          else
                    154:            {
                    155:              mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
                    156:              mpn_kara_mul_n (p, a, b, n3, ws + n1);
                    157:            }
                    158:          mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
                    159:        }
                    160:       else
                    161:        {
                    162:          mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
                    163:          mpn_kara_mul_n (p, a, b, n3, ws + n1);
                    164:          mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
                    165:        }
                    166:
                    167:       if (sign)
                    168:        mpn_add_n (ws, p, ws, n1);
1.1       maekawa   169:       else
1.1.1.2   maekawa   170:        mpn_sub_n (ws, p, ws, n1);
                    171:
                    172:       nm1 = n - 1;
                    173:       if (mpn_add_n (ws, p + n1, ws, nm1))
                    174:        {
1.1.1.3 ! ohara     175:          mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
1.1.1.2   maekawa   176:          ws[nm1] = x;
                    177:          if (x == 0)
1.1.1.3 ! ohara     178:            ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
1.1.1.2   maekawa   179:        }
                    180:       if (mpn_add_n (p + n3, p + n3, ws, n1))
                    181:        {
1.1.1.3 ! ohara     182:          mpn_incr_u (p + n1 + n3, 1);
1.1.1.2   maekawa   183:        }
1.1       maekawa   184:     }
                    185:   else
1.1.1.2   maekawa   186:     {
                    187:       /* Even length. */
                    188:       i = n2;
                    189:       do
                    190:        {
                    191:          --i;
                    192:          w0 = a[i];
1.1.1.3 ! ohara     193:          w1 = a[n2 + i];
1.1.1.2   maekawa   194:        }
                    195:       while (w0 == w1 && i != 0);
                    196:       sign = 0;
                    197:       if (w0 < w1)
                    198:        {
                    199:          x = a + n2;
                    200:          y = a;
1.1.1.3 ! ohara     201:          sign = ~0;
1.1.1.2   maekawa   202:        }
                    203:       else
                    204:        {
                    205:          x = a;
                    206:          y = a + n2;
                    207:        }
                    208:       mpn_sub_n (p, x, y, n2);
1.1       maekawa   209:
1.1.1.2   maekawa   210:       i = n2;
1.1.1.3 ! ohara     211:       do
1.1.1.2   maekawa   212:        {
                    213:          --i;
                    214:          w0 = b[i];
1.1.1.3 ! ohara     215:          w1 = b[n2 + i];
1.1.1.2   maekawa   216:        }
                    217:       while (w0 == w1 && i != 0);
                    218:       if (w0 < w1)
                    219:        {
                    220:          x = b + n2;
                    221:          y = b;
1.1.1.3 ! ohara     222:          sign = ~sign;
1.1.1.2   maekawa   223:        }
                    224:       else
                    225:        {
                    226:          x = b;
                    227:          y = b + n2;
                    228:        }
                    229:       mpn_sub_n (p + n2, x, y, n2);
1.1       maekawa   230:
1.1.1.2   maekawa   231:       /* Pointwise products. */
1.1.1.3 ! ohara     232:       if (n2 < MUL_KARATSUBA_THRESHOLD)
1.1       maekawa   233:        {
1.1.1.2   maekawa   234:          mpn_mul_basecase (ws, p, n2, p + n2, n2);
                    235:          mpn_mul_basecase (p, a, n2, b, n2);
                    236:          mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
1.1       maekawa   237:        }
                    238:       else
1.1.1.2   maekawa   239:        {
                    240:          mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
                    241:          mpn_kara_mul_n (p, a, b, n2, ws + n);
                    242:          mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
                    243:        }
1.1       maekawa   244:
1.1.1.2   maekawa   245:       /* Interpolate. */
                    246:       if (sign)
                    247:        w = mpn_add_n (ws, p, ws, n);
                    248:       else
                    249:        w = -mpn_sub_n (ws, p, ws, n);
                    250:       w += mpn_add_n (ws, p + n, ws, n);
                    251:       w += mpn_add_n (p + n2, p + n2, ws, n);
1.1.1.3 ! ohara     252:       MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
1.1       maekawa   253:     }
                    254: }
                    255:
                    256: void
1.1.1.2   maekawa   257: mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
                    258: {
1.1.1.3 ! ohara     259:   mp_limb_t w, w0, w1;
1.1.1.2   maekawa   260:   mp_size_t n2;
                    261:   mp_srcptr x, y;
1.1.1.3 ! ohara     262:   mp_size_t i;
1.1       maekawa   263:
1.1.1.2   maekawa   264:   n2 = n >> 1;
                    265:   ASSERT (n2 > 0);
                    266:
1.1.1.3 ! ohara     267:   if ((n & 1) != 0)
1.1       maekawa   268:     {
1.1.1.2   maekawa   269:       /* Odd length. */
                    270:       mp_size_t n1, n3, nm1;
1.1       maekawa   271:
1.1.1.2   maekawa   272:       n3 = n - n2;
1.1       maekawa   273:
1.1.1.2   maekawa   274:       w = a[n2];
                    275:       if (w != 0)
                    276:        w -= mpn_sub_n (p, a, a + n3, n2);
                    277:       else
                    278:        {
                    279:          i = n2;
                    280:          do
                    281:            {
                    282:              --i;
                    283:              w0 = a[i];
1.1.1.3 ! ohara     284:              w1 = a[n3 + i];
1.1.1.2   maekawa   285:            }
                    286:          while (w0 == w1 && i != 0);
                    287:          if (w0 < w1)
                    288:            {
                    289:              x = a + n3;
                    290:              y = a;
                    291:            }
                    292:          else
                    293:            {
                    294:              x = a;
                    295:              y = a + n3;
                    296:            }
                    297:          mpn_sub_n (p, x, y, n2);
                    298:        }
                    299:       p[n2] = w;
1.1       maekawa   300:
1.1.1.3 ! ohara     301:       n1 = n + 1;
        !           302:
        !           303:       /* n2 is always either n3 or n3-1 so maybe the two sets of tests here
        !           304:         could be combined.  But that's not important, since the tests will
        !           305:         take a miniscule amount of time compared to the function calls.  */
        !           306:       if (BELOW_THRESHOLD (n3, SQR_BASECASE_THRESHOLD))
1.1.1.2   maekawa   307:        {
1.1.1.3 ! ohara     308:          mpn_mul_basecase (ws, p, n3, p, n3);
        !           309:          mpn_mul_basecase (p,  a, n3, a, n3);
1.1.1.2   maekawa   310:        }
1.1.1.3 ! ohara     311:       else if (BELOW_THRESHOLD (n3, SQR_KARATSUBA_THRESHOLD))
1.1.1.2   maekawa   312:        {
1.1.1.3 ! ohara     313:          mpn_sqr_basecase (ws, p, n3);
        !           314:          mpn_sqr_basecase (p,  a, n3);
1.1.1.2   maekawa   315:        }
                    316:       else
                    317:        {
1.1.1.3 ! ohara     318:          mpn_kara_sqr_n   (ws, p, n3, ws + n1);         /* (x-y)^2 */
        !           319:          mpn_kara_sqr_n   (p,  a, n3, ws + n1);         /* x^2     */
1.1.1.2   maekawa   320:        }
1.1.1.3 ! ohara     321:       if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
        !           322:        mpn_mul_basecase (p + n1, a + n3, n2, a + n3, n2);
        !           323:       else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
        !           324:        mpn_sqr_basecase (p + n1, a + n3, n2);
1.1.1.2   maekawa   325:       else
1.1.1.3 ! ohara     326:        mpn_kara_sqr_n   (p + n1, a + n3, n2, ws + n1);  /* y^2     */
        !           327:
        !           328:       /* Since x^2+y^2-(x-y)^2 = 2xy >= 0 there's no need to track the
        !           329:         borrow from mpn_sub_n.  If it occurs then it'll be cancelled by a
        !           330:         carry from ws[n].  Further, since 2xy fits in n1 limbs there won't
        !           331:         be any carry out of ws[n] other than cancelling that borrow. */
        !           332:
        !           333:       mpn_sub_n (ws, p, ws, n1);            /* x^2-(x-y)^2 */
1.1       maekawa   334:
1.1.1.2   maekawa   335:       nm1 = n - 1;
1.1.1.3 ! ohara     336:       if (mpn_add_n (ws, p + n1, ws, nm1))   /* x^2+y^2-(x-y)^2 = 2xy */
1.1.1.2   maekawa   337:        {
1.1.1.3 ! ohara     338:          mp_limb_t x = (ws[nm1] + 1) & GMP_NUMB_MASK;
1.1.1.2   maekawa   339:          ws[nm1] = x;
                    340:          if (x == 0)
1.1.1.3 ! ohara     341:            ws[n] = (ws[n] + 1) & GMP_NUMB_MASK;
1.1.1.2   maekawa   342:        }
                    343:       if (mpn_add_n (p + n3, p + n3, ws, n1))
                    344:        {
1.1.1.3 ! ohara     345:          mpn_incr_u (p + n1 + n3, 1);
1.1.1.2   maekawa   346:        }
                    347:     }
                    348:   else
                    349:     {
                    350:       /* Even length. */
                    351:       i = n2;
                    352:       do
                    353:        {
                    354:          --i;
                    355:          w0 = a[i];
1.1.1.3 ! ohara     356:          w1 = a[n2 + i];
1.1.1.2   maekawa   357:        }
                    358:       while (w0 == w1 && i != 0);
                    359:       if (w0 < w1)
1.1       maekawa   360:        {
1.1.1.2   maekawa   361:          x = a + n2;
                    362:          y = a;
1.1       maekawa   363:        }
                    364:       else
                    365:        {
1.1.1.2   maekawa   366:          x = a;
                    367:          y = a + n2;
                    368:        }
                    369:       mpn_sub_n (p, x, y, n2);
                    370:
1.1.1.3 ! ohara     371:       /* Pointwise products. */
        !           372:       if (BELOW_THRESHOLD (n2, SQR_BASECASE_THRESHOLD))
1.1       maekawa   373:        {
1.1.1.3 ! ohara     374:          mpn_mul_basecase (ws,    p,      n2, p,      n2);
        !           375:          mpn_mul_basecase (p,     a,      n2, a,      n2);
        !           376:          mpn_mul_basecase (p + n, a + n2, n2, a + n2, n2);
1.1       maekawa   377:        }
1.1.1.3 ! ohara     378:       else if (BELOW_THRESHOLD (n2, SQR_KARATSUBA_THRESHOLD))
1.1       maekawa   379:        {
1.1.1.3 ! ohara     380:          mpn_sqr_basecase (ws,    p,      n2);
        !           381:          mpn_sqr_basecase (p,     a,      n2);
1.1.1.2   maekawa   382:          mpn_sqr_basecase (p + n, a + n2, n2);
                    383:        }
                    384:       else
                    385:        {
1.1.1.3 ! ohara     386:          mpn_kara_sqr_n (ws,    p,      n2, ws + n);
        !           387:          mpn_kara_sqr_n (p,     a,      n2, ws + n);
1.1.1.2   maekawa   388:          mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
                    389:        }
1.1       maekawa   390:
1.1.1.2   maekawa   391:       /* Interpolate. */
1.1.1.3 ! ohara     392:       w = -mpn_sub_n (ws, p, ws, n);
1.1.1.2   maekawa   393:       w += mpn_add_n (ws, p + n, ws, n);
                    394:       w += mpn_add_n (p + n2, p + n2, ws, n);
1.1.1.3 ! ohara     395:       MPN_INCR_U (p + n2 + n, 2 * n - (n2 + n), w);
1.1.1.2   maekawa   396:     }
                    397: }
1.1       maekawa   398:
1.1.1.2   maekawa   399: /*-- add2Times -------------------------------------------------------------*/
1.1       maekawa   400:
1.1.1.2   maekawa   401: /* z[] = x[] + 2 * y[]
1.1.1.3 ! ohara     402:    Note that z and x might point to the same vectors.
        !           403:    FIXME: gcc won't inline this because it uses alloca. */
        !           404: #if USE_MORE_MPN
        !           405:
1.1.1.2   maekawa   406: static inline mp_limb_t
                    407: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
                    408: {
                    409:   mp_ptr t;
                    410:   mp_limb_t c;
                    411:   TMP_DECL (marker);
                    412:   TMP_MARK (marker);
                    413:   t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
                    414:   c = mpn_lshift (t, y, n, 1);
                    415:   c += mpn_add_n (z, x, t, n);
                    416:   TMP_FREE (marker);
                    417:   return c;
                    418: }
1.1.1.3 ! ohara     419:
1.1.1.2   maekawa   420: #else
1.1       maekawa   421:
1.1.1.2   maekawa   422: static mp_limb_t
                    423: add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
                    424: {
                    425:   mp_limb_t c, v, w;
1.1       maekawa   426:
1.1.1.2   maekawa   427:   ASSERT (n > 0);
1.1.1.3 ! ohara     428:   v = *x;
        !           429:   w = *y;
1.1.1.2   maekawa   430:   c = w >> (BITS_PER_MP_LIMB - 1);
                    431:   w <<= 1;
                    432:   v += w;
                    433:   c += v < w;
                    434:   *z = v;
                    435:   ++x; ++y; ++z;
                    436:   while (--n)
                    437:     {
                    438:       v = *x;
                    439:       w = *y;
                    440:       v += c;
                    441:       c = v < c;
                    442:       c += w >> (BITS_PER_MP_LIMB - 1);
                    443:       w <<= 1;
                    444:       v += w;
                    445:       c += v < w;
                    446:       *z = v;
                    447:       ++x; ++y; ++z;
1.1       maekawa   448:     }
1.1.1.2   maekawa   449:
                    450:   return c;
1.1       maekawa   451: }
1.1.1.2   maekawa   452: #endif
1.1       maekawa   453:
1.1.1.2   maekawa   454: /*-- evaluate3 -------------------------------------------------------------*/
                    455:
                    456: /* Evaluates:
                    457:  *   ph := 4*A+2*B+C
                    458:  *   p1 := A+B+C
                    459:  *   p2 := A+2*B+4*C
                    460:  * where:
                    461:  *   ph[], p1[], p2[], A[] and B[] all have length len,
                    462:  *   C[] has length len2 with len-len2 = 0, 1 or 2.
                    463:  * Returns top words (overflow) at pth, pt1 and pt2 respectively.
                    464:  */
1.1.1.3 ! ohara     465: #if USE_MORE_MPN
        !           466:
1.1.1.2   maekawa   467: static void
                    468: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
1.1.1.3 ! ohara     469:           mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len,mp_size_t len2)
1.1       maekawa   470: {
1.1.1.2   maekawa   471:   mp_limb_t c, d, e;
1.1.1.3 ! ohara     472:
1.1.1.2   maekawa   473:   ASSERT (len - len2 <= 2);
                    474:
                    475:   e = mpn_lshift (p1, B, len, 1);
                    476:
                    477:   c = mpn_lshift (ph, A, len, 2);
                    478:   c += e + mpn_add_n (ph, ph, p1, len);
                    479:   d = mpn_add_n (ph, ph, C, len2);
1.1.1.3 ! ohara     480:   if (len2 == len)
        !           481:     c += d;
        !           482:   else
        !           483:     c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
1.1.1.2   maekawa   484:   ASSERT (c < 7);
                    485:   *pth = c;
                    486:
                    487:   c = mpn_lshift (p2, C, len2, 2);
                    488: #if 1
1.1.1.3 ! ohara     489:   if (len2 != len)
        !           490:     {
        !           491:       p2[len-1] = 0;
        !           492:       p2[len2] = c;
        !           493:       c = 0;
        !           494:     }
1.1.1.2   maekawa   495:   c += e + mpn_add_n (p2, p2, p1, len);
                    496: #else
                    497:   d = mpn_add_n (p2, p2, p1, len2);
                    498:   c += d;
1.1.1.3 ! ohara     499:   if (len2 != len)
        !           500:     c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
1.1.1.2   maekawa   501:   c += e;
                    502: #endif
                    503:   c += mpn_add_n (p2, p2, A, len);
                    504:   ASSERT (c < 7);
                    505:   *pt2 = c;
                    506:
                    507:   c = mpn_add_n (p1, A, B, len);
                    508:   d = mpn_add_n (p1, p1, C, len2);
1.1.1.3 ! ohara     509:   if (len2 == len)
        !           510:     c += d;
        !           511:   else
        !           512:     c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
1.1.1.2   maekawa   513:   ASSERT (c < 3);
                    514:   *pt1 = c;
                    515: }
                    516:
                    517: #else
1.1       maekawa   518:
1.1.1.2   maekawa   519: static void
                    520: evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
                    521:           mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
                    522: {
                    523:   mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
                    524:
                    525:   ASSERT (l - ls <= 2);
1.1       maekawa   526:
1.1.1.2   maekawa   527:   th = t1 = t2 = 0;
                    528:   for (i = 0; i < l; ++i)
1.1       maekawa   529:     {
1.1.1.2   maekawa   530:       a = *A;
                    531:       b = *B;
                    532:       c = i < ls ? *C : 0;
                    533:
                    534:       /* TO DO: choose one of the following alternatives. */
                    535: #if 0
                    536:       t = a << 2;
                    537:       vh = th + t;
                    538:       th = vh < t;
                    539:       th += a >> (BITS_PER_MP_LIMB - 2);
                    540:       t = b << 1;
                    541:       vh += t;
                    542:       th += vh < t;
                    543:       th += b >> (BITS_PER_MP_LIMB - 1);
                    544:       vh += c;
                    545:       th += vh < c;
                    546: #else
                    547:       vh = th + c;
                    548:       th = vh < c;
                    549:       t = b << 1;
                    550:       vh += t;
                    551:       th += vh < t;
                    552:       th += b >> (BITS_PER_MP_LIMB - 1);
                    553:       t = a << 2;
                    554:       vh += t;
                    555:       th += vh < t;
                    556:       th += a >> (BITS_PER_MP_LIMB - 2);
                    557: #endif
1.1       maekawa   558:
1.1.1.2   maekawa   559:       v1 = t1 + a;
                    560:       t1 = v1 < a;
                    561:       v1 += b;
                    562:       t1 += v1 < b;
                    563:       v1 += c;
                    564:       t1 += v1 < c;
                    565:
                    566:       v2 = t2 + a;
                    567:       t2 = v2 < a;
                    568:       t = b << 1;
                    569:       v2 += t;
                    570:       t2 += v2 < t;
                    571:       t2 += b >> (BITS_PER_MP_LIMB - 1);
                    572:       t = c << 2;
                    573:       v2 += t;
                    574:       t2 += v2 < t;
                    575:       t2 += c >> (BITS_PER_MP_LIMB - 2);
                    576:
                    577:       *ph = vh;
                    578:       *p1 = v1;
                    579:       *p2 = v2;
                    580:
                    581:       ++A; ++B; ++C;
                    582:       ++ph; ++p1; ++p2;
1.1       maekawa   583:     }
1.1.1.2   maekawa   584:
                    585:   ASSERT (th < 7);
                    586:   ASSERT (t1 < 3);
                    587:   ASSERT (t2 < 7);
                    588:
                    589:   *pth = th;
                    590:   *pt1 = t1;
                    591:   *pt2 = t2;
1.1       maekawa   592: }
1.1.1.2   maekawa   593: #endif
1.1       maekawa   594:
1.1.1.2   maekawa   595:
                    596: /*-- interpolate3 ----------------------------------------------------------*/
                    597:
                    598: /* Interpolates B, C, D (in-place) from:
                    599:  *   16*A+8*B+4*C+2*D+E
                    600:  *   A+B+C+D+E
                    601:  *   A+2*B+4*C+8*D+16*E
                    602:  * where:
                    603:  *   A[], B[], C[] and D[] all have length l,
                    604:  *   E[] has length ls with l-ls = 0, 2 or 4.
                    605:  *
                    606:  * Reads top words (from earlier overflow) from ptb, ptc and ptd,
                    607:  * and returns new top words there.
                    608:  */
                    609:
1.1.1.3 ! ohara     610: #if USE_MORE_MPN
        !           611:
1.1.1.2   maekawa   612: static void
                    613: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
1.1.1.3 ! ohara     614:              mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2)
1.1.1.2   maekawa   615: {
                    616:   mp_ptr ws;
                    617:   mp_limb_t t, tb,tc,td;
                    618:   TMP_DECL (marker);
                    619:   TMP_MARK (marker);
                    620:
                    621:   ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
                    622:
                    623:   /* Let x1, x2, x3 be the values to interpolate.  We have:
1.1.1.3 ! ohara     624:    *        b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
        !           625:    *        c =    a +   x1 +   x2 +   x3 +    e
        !           626:    *        d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
1.1.1.2   maekawa   627:    */
                    628:
                    629:   ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
                    630:
                    631:   tb = *ptb; tc = *ptc; td = *ptd;
                    632:
                    633:
1.1.1.3 ! ohara     634:   /* b := b - 16*a -   e
        !           635:    * c := c -   a -    e
        !           636:    * d := d -   a - 16*e
1.1.1.2   maekawa   637:    */
                    638:
                    639:   t = mpn_lshift (ws, A, len, 4);
                    640:   tb -= t + mpn_sub_n (B, B, ws, len);
                    641:   t = mpn_sub_n (B, B, E, len2);
1.1.1.3 ! ohara     642:   if (len2 == len)
        !           643:     tb -= t;
        !           644:   else
        !           645:     tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
1.1.1.2   maekawa   646:
                    647:   tc -= mpn_sub_n (C, C, A, len);
                    648:   t = mpn_sub_n (C, C, E, len2);
1.1.1.3 ! ohara     649:   if (len2 == len)
        !           650:     tc -= t;
        !           651:   else
        !           652:     tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
1.1.1.2   maekawa   653:
                    654:   t = mpn_lshift (ws, E, len2, 4);
                    655:   t += mpn_add_n (ws, ws, A, len2);
                    656: #if 1
1.1.1.3 ! ohara     657:   if (len2 != len)
        !           658:     t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
1.1.1.2   maekawa   659:   td -= t + mpn_sub_n (D, D, ws, len);
                    660: #else
                    661:   t += mpn_sub_n (D, D, ws, len2);
1.1.1.3 ! ohara     662:   if (len2 != len)
        !           663:     {
        !           664:       t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
        !           665:       t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
        !           666:     }
1.1.1.2   maekawa   667:   td -= t;
                    668: #endif
                    669:
                    670:
                    671:   /* b, d := b + d, b - d */
                    672:
                    673: #ifdef HAVE_MPN_ADD_SUB_N
                    674:   /* #error TO DO ... */
                    675: #else
1.1.1.3 ! ohara     676:   t = tb + td + mpn_add_n (ws, B, D, len);
        !           677:   td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK;
1.1.1.2   maekawa   678:   tb = t;
                    679:   MPN_COPY (B, ws, len);
                    680: #endif
1.1.1.3 ! ohara     681:
1.1.1.2   maekawa   682:   /* b := b-8*c */
                    683:   t = 8 * tc + mpn_lshift (ws, C, len, 3);
                    684:   tb -= t + mpn_sub_n (B, B, ws, len);
                    685:
                    686:   /* c := 2*c - b */
                    687:   tc = 2 * tc + mpn_lshift (C, C, len, 1);
                    688:   tc -= tb + mpn_sub_n (C, C, B, len);
                    689:
                    690:   /* d := d/3 */
1.1.1.3 ! ohara     691:   td = ((td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
1.1.1.2   maekawa   692:
                    693:   /* b, d := b + d, b - d */
                    694: #ifdef HAVE_MPN_ADD_SUB_N
                    695:   /* #error TO DO ... */
1.1       maekawa   696: #else
1.1.1.3 ! ohara     697:   t = (tb + td + mpn_add_n (ws, B, D, len)) & GMP_NUMB_MASK;
        !           698:   td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK;
1.1.1.2   maekawa   699:   tb = t;
                    700:   MPN_COPY (B, ws, len);
                    701: #endif
1.1       maekawa   702:
1.1.1.2   maekawa   703:       /* Now:
                    704:        *        b = 4*x1
                    705:        *        c = 2*x2
                    706:        *        d = 4*x3
                    707:        */
                    708:
                    709:   ASSERT(!(*B & 3));
                    710:   mpn_rshift (B, B, len, 2);
1.1.1.3 ! ohara     711:   B[len-1] |= (tb << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK;
        !           712:   ASSERT((mp_limb_signed_t)tb >= 0);
1.1.1.2   maekawa   713:   tb >>= 2;
                    714:
                    715:   ASSERT(!(*C & 1));
                    716:   mpn_rshift (C, C, len, 1);
1.1.1.3 ! ohara     717:   C[len-1] |= (tc << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
        !           718:   ASSERT((mp_limb_signed_t)tc >= 0);
1.1.1.2   maekawa   719:   tc >>= 1;
                    720:
                    721:   ASSERT(!(*D & 3));
                    722:   mpn_rshift (D, D, len, 2);
1.1.1.3 ! ohara     723:   D[len-1] |= (td << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK;
        !           724:   ASSERT((mp_limb_signed_t)td >= 0);
1.1.1.2   maekawa   725:   td >>= 2;
                    726:
                    727: #if WANT_ASSERT
                    728:   ASSERT (tb < 2);
                    729:   if (len == len2)
                    730:     {
                    731:       ASSERT (tc < 3);
                    732:       ASSERT (td < 2);
1.1       maekawa   733:     }
                    734:   else
                    735:     {
1.1.1.2   maekawa   736:       ASSERT (tc < 2);
                    737:       ASSERT (!td);
                    738:     }
                    739: #endif
1.1       maekawa   740:
1.1.1.2   maekawa   741:   *ptb = tb;
                    742:   *ptc = tc;
                    743:   *ptd = td;
1.1       maekawa   744:
1.1.1.2   maekawa   745:   TMP_FREE (marker);
                    746: }
                    747:
                    748: #else
                    749:
                    750: static void
                    751: interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
                    752:              mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
                    753: {
                    754:   mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
                    755:   const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
                    756:
                    757: #if WANT_ASSERT
                    758:   t = l - ls;
                    759:   ASSERT (t == 0 || t == 2 || t == 4);
                    760: #endif
                    761:
                    762:   sb = sc = sd = 0;
                    763:   for (i = 0; i < l; ++i)
                    764:     {
                    765:       mp_limb_t tb, tc, td, tt;
                    766:
                    767:       a = *A;
                    768:       b = *B;
                    769:       c = *C;
                    770:       d = *D;
                    771:       e = i < ls ? *E : 0;
                    772:
                    773:       /* Let x1, x2, x3 be the values to interpolate.  We have:
                    774:        *        b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
                    775:        *        c =    a +   x1 +   x2 +   x3 +    e
                    776:        *        d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
                    777:        */
                    778:
                    779:       /* b := b - 16*a -    e
                    780:        * c := c -    a -    e
                    781:        * d := d -    a - 16*e
                    782:        */
                    783:       t = a << 4;
                    784:       tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
                    785:       b -= t;
                    786:       tb -= b < e;
                    787:       b -= e;
                    788:       tc = -(c < a);
                    789:       c -= a;
                    790:       tc -= c < e;
                    791:       c -= e;
                    792:       td = -(d < a);
                    793:       d -= a;
                    794:       t = e << 4;
                    795:       td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
                    796:       d -= t;
                    797:
                    798:       /* b, d := b + d, b - d */
                    799:       t = b + d;
                    800:       tt = tb + td + (t < b);
                    801:       td = tb - td - (b < d);
                    802:       d = b - d;
                    803:       b = t;
                    804:       tb = tt;
                    805:
                    806:       /* b := b-8*c */
                    807:       t = c << 3;
                    808:       tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
                    809:       b -= t;
                    810:
                    811:       /* c := 2*c - b */
                    812:       t = c << 1;
                    813:       tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
                    814:       c = t - b;
                    815:
                    816:       /* d := d/3 */
1.1.1.3 ! ohara     817:       d *= MODLIMB_INVERSE_3;
1.1.1.2   maekawa   818:       td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
1.1.1.3 ! ohara     819:       td *= MODLIMB_INVERSE_3;
1.1.1.2   maekawa   820:
                    821:       /* b, d := b + d, b - d */
                    822:       t = b + d;
                    823:       tt = tb + td + (t < b);
                    824:       td = tb - td - (b < d);
                    825:       d = b - d;
                    826:       b = t;
                    827:       tb = tt;
                    828:
                    829:       /* Now:
                    830:        *        b = 4*x1
                    831:        *        c = 2*x2
                    832:        *        d = 4*x3
                    833:        */
                    834:
                    835:       /* sb has period 2. */
                    836:       b += sb;
                    837:       tb += b < sb;
                    838:       sb &= maskOffHalf;
                    839:       sb |= sb >> (BITS_PER_MP_LIMB >> 1);
                    840:       sb += tb;
                    841:
                    842:       /* sc has period 1. */
                    843:       c += sc;
                    844:       tc += c < sc;
                    845:       /* TO DO: choose one of the following alternatives. */
                    846: #if 1
1.1.1.3 ! ohara     847:       sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1);
1.1.1.2   maekawa   848:       sc += tc;
                    849: #else
1.1.1.3 ! ohara     850:       sc = tc - ((mp_limb_signed_t) sc < 0L);
1.1.1.2   maekawa   851: #endif
                    852:
                    853:       /* sd has period 2. */
                    854:       d += sd;
                    855:       td += d < sd;
                    856:       sd &= maskOffHalf;
                    857:       sd |= sd >> (BITS_PER_MP_LIMB >> 1);
                    858:       sd += td;
                    859:
                    860:       if (i != 0)
1.1       maekawa   861:        {
1.1.1.2   maekawa   862:          B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
                    863:          C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
                    864:          D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
1.1       maekawa   865:        }
1.1.1.2   maekawa   866:       ob = b >> 2;
                    867:       oc = c >> 1;
                    868:       od = d >> 2;
1.1       maekawa   869:
1.1.1.2   maekawa   870:       ++A; ++B; ++C; ++D; ++E;
                    871:     }
1.1       maekawa   872:
1.1.1.2   maekawa   873:   /* Handle top words. */
                    874:   b = *ptb;
                    875:   c = *ptc;
                    876:   d = *ptd;
                    877:
                    878:   t = b + d;
                    879:   d = b - d;
                    880:   b = t;
                    881:   b -= c << 3;
                    882:   c = (c << 1) - b;
1.1.1.3 ! ohara     883:   d *= MODLIMB_INVERSE_3;
1.1.1.2   maekawa   884:   t = b + d;
                    885:   d = b - d;
                    886:   b = t;
                    887:
                    888:   b += sb;
                    889:   c += sc;
                    890:   d += sd;
                    891:
                    892:   B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
                    893:   C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
                    894:   D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
                    895:
                    896:   b >>= 2;
                    897:   c >>= 1;
                    898:   d >>= 2;
                    899:
                    900: #if WANT_ASSERT
                    901:   ASSERT (b < 2);
                    902:   if (l == ls)
                    903:     {
                    904:       ASSERT (c < 3);
                    905:       ASSERT (d < 2);
                    906:     }
                    907:   else
                    908:     {
                    909:       ASSERT (c < 2);
                    910:       ASSERT (!d);
                    911:     }
                    912: #endif
1.1       maekawa   913:
1.1.1.2   maekawa   914:   *ptb = b;
                    915:   *ptc = c;
                    916:   *ptd = d;
                    917: }
                    918: #endif
1.1       maekawa   919:
                    920:
1.1.1.2   maekawa   921: /*-- mpn_toom3_mul_n --------------------------------------------------------------*/
1.1       maekawa   922:
1.1.1.2   maekawa   923: /* Multiplies using 5 mults of one third size and so on recursively.
                    924:  * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
                    925:  * No overlap of p[...] with a[...] or b[...].
                    926:  * ws is workspace.
                    927:  */
                    928:
1.1.1.3 ! ohara     929: /* TO DO: If MUL_TOOM3_THRESHOLD is much bigger than MUL_KARATSUBA_THRESHOLD then the
        !           930:  *       recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
        !           931:  *       because the "n < MUL_KARATSUBA_THRESHOLD" test here will always be false.
1.1.1.2   maekawa   932:  */
                    933:
                    934: #define TOOM3_MUL_REC(p, a, b, n, ws) \
                    935:   do {                                                         \
1.1.1.3 ! ohara     936:     if (n < MUL_KARATSUBA_THRESHOLD)                           \
1.1.1.2   maekawa   937:       mpn_mul_basecase (p, a, n, b, n);                                \
1.1.1.3 ! ohara     938:     else if (n < MUL_TOOM3_THRESHOLD)                          \
1.1.1.2   maekawa   939:       mpn_kara_mul_n (p, a, b, n, ws);                         \
                    940:     else                                                       \
                    941:       mpn_toom3_mul_n (p, a, b, n, ws);                                \
                    942:   } while (0)
1.1       maekawa   943:
1.1.1.2   maekawa   944: void
                    945: mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
                    946: {
                    947:   mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
                    948:   mp_limb_t *A,*B,*C,*D,*E, *W;
                    949:   mp_size_t l,l2,l3,l4,l5,ls;
                    950:
                    951:   /* Break n words into chunks of size l, l and ls.
                    952:    * n = 3*k   => l = k,   ls = k
                    953:    * n = 3*k+1 => l = k+1, ls = k-1
                    954:    * n = 3*k+2 => l = k+1, ls = k
                    955:    */
                    956:   {
                    957:     mp_limb_t m;
                    958:
1.1.1.3 ! ohara     959:     /* this is probably unnecessarily strict */
        !           960:     ASSERT (n >= MUL_TOOM3_THRESHOLD);
        !           961:
1.1.1.2   maekawa   962:     l = ls = n / 3;
                    963:     m = n - l * 3;
                    964:     if (m != 0)
                    965:       ++l;
                    966:     if (m == 1)
                    967:       --ls;
                    968:
                    969:     l2 = l * 2;
                    970:     l3 = l * 3;
                    971:     l4 = l * 4;
                    972:     l5 = l * 5;
                    973:     A = p;
                    974:     B = ws;
                    975:     C = p + l2;
                    976:     D = ws + l2;
                    977:     E = p + l4;
                    978:     W = ws + l4;
                    979:   }
                    980:
1.1.1.3 ! ohara     981:   ASSERT (l >= 1);
        !           982:   ASSERT (ls >= 1);
        !           983:
1.1.1.2   maekawa   984:   /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
                    985:   evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
                    986:   evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
                    987:
                    988:   /** Second stage: pointwise multiplies. **/
                    989:   TOOM3_MUL_REC(D, C, C + l, l, W);
                    990:   tD = cD*dD;
                    991:   if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
                    992:   if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
                    993:   ASSERT (tD < 49);
                    994:   TOOM3_MUL_REC(C, B, B + l, l, W);
                    995:   tC = cC*dC;
                    996:   /* TO DO: choose one of the following alternatives. */
                    997: #if 0
                    998:   if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
                    999:   if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
                   1000: #else
                   1001:   if (cC)
                   1002:     {
                   1003:       if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
                   1004:       else tC += add2Times (C + l, C + l, B + l, l);
1.1       maekawa  1005:     }
1.1.1.2   maekawa  1006:   if (dC)
                   1007:     {
                   1008:       if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
                   1009:       else tC += add2Times (C + l, C + l, B, l);
                   1010:     }
                   1011: #endif
                   1012:   ASSERT (tC < 9);
                   1013:   TOOM3_MUL_REC(B, A, A + l, l, W);
                   1014:   tB = cB*dB;
                   1015:   if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
                   1016:   if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
                   1017:   ASSERT (tB < 49);
                   1018:   TOOM3_MUL_REC(A, a, b, l, W);
                   1019:   TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
                   1020:
                   1021:   /** Third stage: interpolation. **/
                   1022:   interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
                   1023:
                   1024:   /** Final stage: add up the coefficients. **/
1.1.1.3 ! ohara    1025:   tB += mpn_add_n (p + l, p + l, B, l2);
        !          1026:   tD += mpn_add_n (p + l3, p + l3, D, l2);
        !          1027:   MPN_INCR_U (p + l3, 2 * n - l3, tB);
        !          1028:   MPN_INCR_U (p + l4, 2 * n - l4, tC);
        !          1029:   MPN_INCR_U (p + l5, 2 * n - l5, tD);
1.1       maekawa  1030: }
                   1031:
1.1.1.2   maekawa  1032: /*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
                   1033:
                   1034: /* Like previous function but for squaring */
                   1035:
1.1.1.3 ! ohara    1036: /* FIXME: If SQR_TOOM3_THRESHOLD is big enough it might never get into the
        !          1037:    basecase range.  Try to arrange those conditonals go dead.  */
        !          1038: #define TOOM3_SQR_REC(p, a, n, ws)                              \
        !          1039:   do {                                                          \
        !          1040:     if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))            \
        !          1041:       mpn_mul_basecase (p, a, n, a, n);                         \
        !          1042:     else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))      \
        !          1043:       mpn_sqr_basecase (p, a, n);                               \
        !          1044:     else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))          \
        !          1045:       mpn_kara_sqr_n (p, a, n, ws);                             \
        !          1046:     else                                                        \
        !          1047:       mpn_toom3_sqr_n (p, a, n, ws);                            \
1.1.1.2   maekawa  1048:   } while (0)
                   1049:
                   1050: void
                   1051: mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
1.1       maekawa  1052: {
1.1.1.2   maekawa  1053:   mp_limb_t cB,cC,cD, tB,tC,tD;
                   1054:   mp_limb_t *A,*B,*C,*D,*E, *W;
                   1055:   mp_size_t l,l2,l3,l4,l5,ls;
                   1056:
                   1057:   /* Break n words into chunks of size l, l and ls.
                   1058:    * n = 3*k   => l = k,   ls = k
                   1059:    * n = 3*k+1 => l = k+1, ls = k-1
                   1060:    * n = 3*k+2 => l = k+1, ls = k
                   1061:    */
                   1062:   {
                   1063:     mp_limb_t m;
                   1064:
1.1.1.3 ! ohara    1065:     /* this is probably unnecessarily strict */
        !          1066:     ASSERT (n >= SQR_TOOM3_THRESHOLD);
        !          1067:
1.1.1.2   maekawa  1068:     l = ls = n / 3;
                   1069:     m = n - l * 3;
                   1070:     if (m != 0)
                   1071:       ++l;
                   1072:     if (m == 1)
                   1073:       --ls;
                   1074:
                   1075:     l2 = l * 2;
                   1076:     l3 = l * 3;
                   1077:     l4 = l * 4;
                   1078:     l5 = l * 5;
                   1079:     A = p;
                   1080:     B = ws;
                   1081:     C = p + l2;
                   1082:     D = ws + l2;
                   1083:     E = p + l4;
                   1084:     W = ws + l4;
                   1085:   }
                   1086:
1.1.1.3 ! ohara    1087:   ASSERT (l >= 1);
        !          1088:   ASSERT (ls >= 1);
        !          1089:
1.1.1.2   maekawa  1090:   /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
                   1091:   evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
                   1092:
                   1093:   /** Second stage: pointwise multiplies. **/
                   1094:   TOOM3_SQR_REC(D, C, l, W);
                   1095:   tD = cD*cD;
                   1096:   if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
                   1097:   ASSERT (tD < 49);
                   1098:   TOOM3_SQR_REC(C, B, l, W);
                   1099:   tC = cC*cC;
                   1100:   /* TO DO: choose one of the following alternatives. */
                   1101: #if 0
                   1102:   if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
                   1103: #else
                   1104:   if (cC >= 1)
1.1       maekawa  1105:     {
1.1.1.2   maekawa  1106:       tC += add2Times (C + l, C + l, B, l);
                   1107:       if (cC == 2)
1.1.1.3 ! ohara    1108:        tC += add2Times (C + l, C + l, B, l);
1.1.1.2   maekawa  1109:     }
                   1110: #endif
                   1111:   ASSERT (tC < 9);
                   1112:   TOOM3_SQR_REC(B, A, l, W);
                   1113:   tB = cB*cB;
                   1114:   if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
                   1115:   ASSERT (tB < 49);
                   1116:   TOOM3_SQR_REC(A, a, l, W);
                   1117:   TOOM3_SQR_REC(E, a + l2, ls, W);
                   1118:
                   1119:   /** Third stage: interpolation. **/
                   1120:   interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
                   1121:
                   1122:   /** Final stage: add up the coefficients. **/
1.1.1.3 ! ohara    1123:   tB += mpn_add_n (p + l, p + l, B, l2);
        !          1124:   tD += mpn_add_n (p + l3, p + l3, D, l2);
        !          1125:   MPN_INCR_U (p + l3, 2 * n - l3, tB);
        !          1126:   MPN_INCR_U (p + l4, 2 * n - l4, tC);
        !          1127:   MPN_INCR_U (p + l5, 2 * n - l5, tD);
1.1.1.2   maekawa  1128: }
                   1129:
                   1130: void
                   1131: mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
                   1132: {
1.1.1.3 ! ohara    1133:   ASSERT (n >= 1);
        !          1134:   ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
        !          1135:   ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));
        !          1136:
        !          1137:   if (n < MUL_KARATSUBA_THRESHOLD)
1.1.1.2   maekawa  1138:     mpn_mul_basecase (p, a, n, b, n);
1.1.1.3 ! ohara    1139:   else if (n < MUL_TOOM3_THRESHOLD)
1.1.1.2   maekawa  1140:     {
                   1141:       /* Allocate workspace of fixed size on stack: fast! */
                   1142: #if TUNE_PROGRAM_BUILD
1.1.1.3 ! ohara    1143:       mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];
1.1.1.2   maekawa  1144: #else
1.1.1.3 ! ohara    1145:       mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD-1)];
1.1.1.2   maekawa  1146: #endif
                   1147:       mpn_kara_mul_n (p, a, b, n, ws);
1.1       maekawa  1148:     }
1.1.1.2   maekawa  1149: #if WANT_FFT || TUNE_PROGRAM_BUILD
1.1.1.3 ! ohara    1150:   else if (n < MUL_FFT_THRESHOLD)
1.1.1.2   maekawa  1151: #else
1.1       maekawa  1152:   else
1.1.1.2   maekawa  1153: #endif
1.1       maekawa  1154:     {
1.1.1.2   maekawa  1155:       /* Use workspace of unknown size in heap, as stack space may
1.1.1.3 ! ohara    1156:        * be limited.  Since n is at least MUL_TOOM3_THRESHOLD, the
1.1.1.2   maekawa  1157:        * multiplication will take much longer than malloc()/free().  */
                   1158:       mp_limb_t wsLen, *ws;
1.1.1.3 ! ohara    1159:       wsLen = MPN_TOOM3_MUL_N_TSIZE (n);
        !          1160:       ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen);
1.1.1.2   maekawa  1161:       mpn_toom3_mul_n (p, a, b, n, ws);
1.1.1.3 ! ohara    1162:       __GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen);
1.1       maekawa  1163:     }
1.1.1.2   maekawa  1164: #if WANT_FFT || TUNE_PROGRAM_BUILD
                   1165:   else
                   1166:     {
1.1.1.3 ! ohara    1167:       mpn_mul_fft_full (p, a, n, b, n);
1.1.1.2   maekawa  1168:     }
                   1169: #endif
1.1       maekawa  1170: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>