[BACK]Return to tuneup.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / tune

Annotation of OpenXM_contrib/gmp/tune/tuneup.c, Revision 1.1.1.2

1.1.1.2 ! ohara       1: /* Create tuned thresholds for various algorithms.
1.1       maekawa     2:
1.1.1.2 ! ohara       3: Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4:
                      5: This file is part of the GNU MP Library.
                      6:
                      7: The GNU MP Library is free software; you can redistribute it and/or modify
                      8: it under the terms of the GNU Lesser General Public License as published by
                      9: the Free Software Foundation; either version 2.1 of the License, or (at your
                     10: option) any later version.
                     11:
                     12: The GNU MP Library is distributed in the hope that it will be useful, but
                     13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     14: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     15: License for more details.
                     16:
                     17: You should have received a copy of the GNU Lesser General Public License
                     18: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
1.1.1.2 ! ohara      20: MA 02111-1307, USA. */
        !            21:
1.1       maekawa    22:
                     23: /* Usage: tune [-t] [-t] [-p precision]
                     24:
                     25:    -t turns on some diagnostic traces, a second -t turns on more traces.
                     26:
                     27:    The thresholds are determined as follows.  A crossover may not be a
                     28:    single size but rather a range where it oscillates between method A or
                     29:    method B faster.  If the threshold is set making B used where A is faster
                     30:    (or vice versa) that's bad.  Badness is the percentage time lost and
                     31:    total badness is the sum of this over all sizes measured.  The threshold
                     32:    is set to minimize total badness.
                     33:
                     34:    Suppose, as sizes increase, method B becomes faster than method A.  The
                     35:    effect of the rule is that, as you look at increasing sizes, isolated
                     36:    points where B is faster are ignored, but when it's consistently faster,
                     37:    or faster on balance, then the threshold is set there.  The same result
                     38:    is obtained thinking in the other direction of A becoming faster at
                     39:    smaller sizes.
                     40:
                     41:    In practice the thresholds tend to be chosen to bring on the next
                     42:    algorithm fairly quickly.
                     43:
                     44:    This rule is attractive because it's got a basis in reason and is fairly
                     45:    easy to implement, but no work has been done to actually compare it in
                     46:    absolute terms to other possibilities.
                     47:
                     48:    Sometimes running the program twice produces slightly different results.
                     49:    This is probably because there's so little separating algorithms near
                     50:    their crossover, and on that basis it should make little or no difference
                     51:    to the final speed of the relevant routines, but nothing has been done to
                     52:    check that carefully.
                     53:
1.1.1.2 ! ohara      54:    Remarks:
        !            55:
        !            56:    The code here isn't a vision of loveliness, mainly because it's subject
        !            57:    to ongoing modifications according to new things wanting to be tuned and
        !            58:    practical requirements of systems tested.
        !            59:
        !            60:    The way parts of the library are recompiled to insinuate the tuning
        !            61:    variables is a bit subtle, but unavoidable since of course the main
        !            62:    library has fixed thresholds compiled-in but we want to vary them here.
        !            63:    Most of the nonsense for this can be found in tune/Makefile.am and under
        !            64:    TUNE_PROGRAM_BUILD in gmp-impl.h.
        !            65:
        !            66:    The dirty hack which the "second_start_min" feature could perhaps be done
        !            67:    more generally, so if say karatsuba is never better than toom3 then it
        !            68:    can be detected and omitted.  Currently we're hoping very hard that this
        !            69:    doesn't arise in practice, and if it does then it indicates something
        !            70:    badly sub-optimal in the karatsuba implementation.
        !            71:
1.1       maekawa    72:    Limitations:
1.1.1.2 ! ohara      73:
1.1       maekawa    74:    The FFTs aren't subject to the same badness rule as the other thresholds,
                     75:    so each k is probably being brought on a touch early.  This isn't likely
                     76:    to make a difference, and the simpler probing means fewer tests.
                     77:
                     78: */
                     79:
1.1.1.2 ! ohara      80: #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
        !            81:
        !            82: #include "config.h"
1.1       maekawa    83:
                     84: #include <math.h>
                     85: #include <stdio.h>
                     86: #include <stdlib.h>
                     87: #include <time.h>
1.1.1.2 ! ohara      88: #if HAVE_UNISTD_H
1.1       maekawa    89: #include <unistd.h>
1.1.1.2 ! ohara      90: #endif
1.1       maekawa    91:
                     92: #include "gmp.h"
                     93: #include "gmp-impl.h"
1.1.1.2 ! ohara      94: #include "longlong.h"
1.1       maekawa    95:
1.1.1.2 ! ohara      96: #include "tests.h"
1.1       maekawa    97: #include "speed.h"
                     98:
                     99: #if !HAVE_DECL_OPTARG
                    100: extern char *optarg;
                    101: extern int optind, opterr;
                    102: #endif
                    103:
                    104:
1.1.1.2 ! ohara     105: #define DEFAULT_MAX_SIZE   1000  /* limbs */
        !           106: #define MAX_TABLE             5  /* entries */
1.1       maekawa   107:
                    108: #if WANT_FFT
                    109: mp_size_t  option_fft_max_size = 50000;  /* limbs */
                    110: #else
                    111: mp_size_t  option_fft_max_size = 0;
                    112: #endif
                    113: int        option_trace = 0;
                    114: int        option_fft_trace = 0;
                    115: struct speed_params  s;
                    116:
                    117: struct dat_t {
                    118:   mp_size_t  size;
                    119:   double     d;
                    120: } *dat = NULL;
                    121: int  ndat = 0;
                    122: int  allocdat = 0;
                    123:
                    124:
                    125: /* Each "_threshold" array must be 1 bigger than the number of thresholds
1.1.1.2 ! ohara     126:    being tuned in a set, because one() stores a value in the entry above
1.1       maekawa   127:    the one it's determining. */
                    128:
                    129: mp_size_t  mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
                    130: mp_size_t  sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
1.1.1.2 ! ohara     131: mp_size_t  sb_preinv_threshold[2] = { MP_SIZE_T_MAX };
        !           132: mp_size_t  dc_threshold[2] = { MP_SIZE_T_MAX };
1.1       maekawa   133: mp_size_t  powm_threshold[2] = { MP_SIZE_T_MAX };
                    134: mp_size_t  gcd_accel_threshold[2] = { MP_SIZE_T_MAX };
                    135: mp_size_t  gcdext_threshold[2] = { MP_SIZE_T_MAX };
1.1.1.2 ! ohara     136: mp_size_t  divexact_1_threshold[2] = { MP_SIZE_T_MAX };
        !           137: mp_size_t  divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX };
        !           138: mp_size_t  divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
        !           139: mp_size_t  divrem_2_threshold[2] = { MP_SIZE_T_MAX };
        !           140: mp_size_t  mod_1_norm_threshold[2] = { MP_SIZE_T_MAX };
        !           141: mp_size_t  mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
        !           142: mp_size_t  modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX };
        !           143: mp_size_t  get_str_basecase_threshold[2] = { MP_SIZE_T_MAX };
        !           144: mp_size_t  get_str_precompute_threshold[2] = { MP_SIZE_T_MAX };
        !           145: mp_size_t  set_str_threshold[2] = { MP_SIZE_T_MAX };
1.1       maekawa   146:
1.1.1.2 ! ohara     147: mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
        !           148: mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
1.1       maekawa   149:
1.1.1.2 ! ohara     150: #ifndef TUNE_SQR_KARATSUBA_MAX
        !           151: #define TUNE_SQR_KARATSUBA_MAX  0 /* meaning no limit */
1.1       maekawa   152: #endif
                    153:
                    154: struct param_t {
1.1.1.2 ! ohara     155:   const char        *name[MAX_TABLE];
        !           156:   speed_function_t  function;
        !           157:   speed_function_t  function2;
        !           158:   double            step_factor;    /* how much to step sizes (rounded down) */
        !           159:   double            function_fudge; /* multiplier for "function" speeds */
        !           160:   int               stop_since_change;
        !           161:   double            stop_factor;
        !           162:   mp_size_t         min_size[MAX_TABLE];
        !           163:   int               min_is_always;
        !           164:   int               second_start_min;
        !           165:   mp_size_t         max_size[MAX_TABLE];
        !           166:   mp_size_t         check_size;
        !           167:   mp_size_t         size_extra;
        !           168:
        !           169: #define DATA_HIGH_LT_R  1
        !           170: #define DATA_HIGH_GE_R  2
        !           171:   int               data_high;
        !           172:
        !           173:   int               noprint;
1.1       maekawa   174: };
                    175:
1.1.1.2 ! ohara     176: #ifndef UDIV_PREINV_ALWAYS
        !           177: #define UDIV_PREINV_ALWAYS 0
        !           178: #endif
        !           179:
        !           180:
        !           181: mp_limb_t
        !           182: randlimb_norm (void)
        !           183: {
        !           184:   mp_limb_t  n;
        !           185:   mpn_random (&n, 1);
        !           186:   n |= GMP_LIMB_HIGHBIT;
        !           187:   return n;
        !           188: }
        !           189:
        !           190: #define MP_LIMB_T_HALFMASK  ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1)
        !           191:
        !           192: mp_limb_t
        !           193: randlimb_half (void)
        !           194: {
        !           195:   mp_limb_t  n;
        !           196:   mpn_random (&n, 1);
        !           197:   n &= MP_LIMB_T_HALFMASK;
        !           198:   n += (n==0);
        !           199:   return n;
        !           200: }
        !           201:
1.1       maekawa   202:
                    203: /* Add an entry to the end of the dat[] array, reallocing to make it bigger
                    204:    if necessary.  */
                    205: void
                    206: add_dat (mp_size_t size, double d)
                    207: {
                    208: #define ALLOCDAT_STEP  500
                    209:
                    210:   ASSERT_ALWAYS (ndat <= allocdat);
                    211:
                    212:   if (ndat == allocdat)
                    213:     {
1.1.1.2 ! ohara     214:       dat = (struct dat_t *) __gmp_allocate_or_reallocate
1.1       maekawa   215:         (dat, allocdat * sizeof(dat[0]),
                    216:          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
                    217:       allocdat += ALLOCDAT_STEP;
                    218:     }
                    219:
                    220:   dat[ndat].size = size;
                    221:   dat[ndat].d = d;
                    222:   ndat++;
                    223: }
                    224:
                    225:
                    226: /* Return the threshold size based on the data accumulated. */
                    227: mp_size_t
                    228: analyze_dat (int i, int final)
                    229: {
                    230:   double  x, min_x;
                    231:   int     j, min_j;
                    232:
                    233:   /* If the threshold is set at dat[0].size, any positive values are bad. */
                    234:   x = 0.0;
                    235:   for (j = 0; j < ndat; j++)
                    236:     if (dat[j].d > 0.0)
                    237:       x += dat[j].d;
                    238:
                    239:   if (option_trace >= 2 && final)
                    240:     {
                    241:       printf ("\n");
                    242:       printf ("x is the sum of the badness from setting thresh at given size\n");
                    243:       printf ("  (minimum x is sought)\n");
                    244:       printf ("i=%d size=%ld  first x=%.4f\n", i, dat[j].size, x);
                    245:     }
                    246:
                    247:   min_x = x;
                    248:   min_j = 0;
                    249:
                    250:
                    251:   /* When stepping to the next dat[j].size, positive values are no longer
                    252:      bad (so subtracted), negative values become bad (so add the absolute
                    253:      value, meaning subtract). */
                    254:   for (j = 0; j < ndat; x -= dat[j].d, j++)
                    255:     {
                    256:       if (option_trace >= 2 && final)
                    257:         printf ("i=%d size=%ld  x=%.4f\n", i, dat[j].size, x);
                    258:
                    259:       if (x < min_x)
                    260:         {
                    261:           min_x = x;
                    262:           min_j = j;
                    263:         }
                    264:     }
1.1.1.2 ! ohara     265:
1.1       maekawa   266:   return min_j;
                    267: }
                    268:
                    269:
1.1.1.2 ! ohara     270: /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
        !           271:
        !           272: mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
        !           273:                                     mp_srcptr ap, mp_size_t size,
        !           274:                                     mp_limb_t d));
        !           275: mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
        !           276:
        !           277: double
        !           278: speed_mpn_mod_1_tune (struct speed_params *s)
        !           279: {
        !           280:   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
        !           281: }
        !           282: double
        !           283: speed_mpn_divrem_1_tune (struct speed_params *s)
        !           284: {
        !           285:   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
        !           286: }
        !           287:
        !           288:
1.1       maekawa   289: double
1.1.1.2 ! ohara     290: tuneup_measure (speed_function_t fun,
        !           291:                 const struct param_t *param,
        !           292:                 struct speed_params *s)
1.1       maekawa   293: {
1.1.1.2 ! ohara     294:   static struct param_t  dummy;
1.1       maekawa   295:   double   t;
                    296:   TMP_DECL (marker);
                    297:
1.1.1.2 ! ohara     298:   if (! param)
        !           299:     param = &dummy;
        !           300:
        !           301:   s->size += param->size_extra;
        !           302:
1.1       maekawa   303:   TMP_MARK (marker);
                    304:   s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
                    305:   s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
                    306:
                    307:   mpn_random (s->xp, s->size);
                    308:   mpn_random (s->yp, s->size);
                    309:
1.1.1.2 ! ohara     310:   switch (param->data_high) {
        !           311:   case DATA_HIGH_LT_R:
        !           312:     s->xp[s->size-1] %= s->r;
        !           313:     s->yp[s->size-1] %= s->r;
        !           314:     break;
        !           315:   case DATA_HIGH_GE_R:
        !           316:     s->xp[s->size-1] |= s->r;
        !           317:     s->yp[s->size-1] |= s->r;
        !           318:     break;
        !           319:   }
        !           320:
1.1       maekawa   321:   t = speed_measure (fun, s);
                    322:
1.1.1.2 ! ohara     323:   s->size -= param->size_extra;
        !           324:
1.1       maekawa   325:   TMP_FREE (marker);
                    326:   return t;
1.1.1.2 ! ohara     327: }
1.1       maekawa   328:
                    329:
1.1.1.2 ! ohara     330: #define PRINT_WIDTH  28
        !           331:
1.1       maekawa   332: void
1.1.1.2 ! ohara     333: print_define_start (const char *name)
        !           334: {
        !           335:   printf ("#define %-*s  ", PRINT_WIDTH, name);
        !           336:   if (option_trace)
        !           337:     printf ("...\n");
        !           338: }
        !           339:
        !           340: void
        !           341: print_define_end_remark (const char *name, mp_size_t value, const char *remark)
        !           342: {
        !           343:   if (option_trace)
        !           344:     printf ("#define %-*s  ", PRINT_WIDTH, name);
        !           345:
        !           346:   if (value == MP_SIZE_T_MAX)
        !           347:     printf ("MP_SIZE_T_MAX");
        !           348:   else
        !           349:     printf ("%5ld", value);
        !           350:
        !           351:   if (remark != NULL)
        !           352:     printf ("  /* %s */", remark);
        !           353:   printf ("\n");
        !           354: }
        !           355:
        !           356: void
        !           357: print_define_end (const char *name, mp_size_t value)
1.1       maekawa   358: {
1.1.1.2 ! ohara     359:   const char  *remark;
1.1       maekawa   360:   if (value == MP_SIZE_T_MAX)
1.1.1.2 ! ohara     361:     remark = "never";
        !           362:   else if (value == 0)
        !           363:     remark = "always";
1.1       maekawa   364:   else
1.1.1.2 ! ohara     365:     remark = NULL;
        !           366:   print_define_end_remark (name, value, remark);
        !           367: }
        !           368:
        !           369: void
        !           370: print_define (const char *name, mp_size_t value)
        !           371: {
        !           372:   print_define_start (name);
        !           373:   print_define_end (name, value);
        !           374: }
        !           375:
        !           376: void
        !           377: print_define_remark (const char *name, mp_size_t value, const char *remark)
        !           378: {
        !           379:   print_define_start (name);
        !           380:   print_define_end_remark (name, value, remark);
1.1       maekawa   381: }
                    382:
                    383:
                    384: /* table[i+1] needs to be set to a sensible value when testing method i+1
1.1.1.2 ! ohara     385:    because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary
1.1       maekawa   386:    workspace for mpn_kara_mul_n. */
                    387:
                    388: void
1.1.1.2 ! ohara     389: one (mp_size_t table[], size_t max_table, struct param_t *param)
1.1       maekawa   390: {
1.1.1.2 ! ohara     391:   mp_size_t  table_save0 = 0;
        !           392:   int  since_positive, since_thresh_change;
        !           393:   int  thresh_idx, new_thresh_idx;
1.1       maekawa   394:   int  i;
                    395:
1.1.1.2 ! ohara     396:   ASSERT_ALWAYS (max_table <= MAX_TABLE);
1.1       maekawa   397:
1.1.1.2 ! ohara     398: #define DEFAULT(x,n)  if (! (param->x))  param->x = (n);
1.1       maekawa   399:
1.1.1.2 ! ohara     400:   DEFAULT (function_fudge, 1.0);
        !           401:   DEFAULT (function2, param->function);
        !           402:   DEFAULT (step_factor, 0.01);  /* small steps by default */
1.1       maekawa   403:   DEFAULT (stop_since_change, 80);
1.1.1.2 ! ohara     404:   DEFAULT (stop_factor, 1.2);
        !           405:   for (i = 0; i < max_table; i++)
        !           406:     DEFAULT (min_size[i], 10);
        !           407:   for (i = 0; i < max_table; i++)
        !           408:     DEFAULT (max_size[i], DEFAULT_MAX_SIZE);
        !           409:
        !           410:   if (param->check_size != 0)
        !           411:     {
        !           412:       double   t1, t2;
        !           413:       s.size = param->check_size;
1.1       maekawa   414:
1.1.1.2 ! ohara     415:       table[0] = s.size+1;
        !           416:       table[1] = param->max_size[0];
        !           417:       t1 = tuneup_measure (param->function, param, &s);
        !           418:
        !           419:       table[0] = s.size;
        !           420:       table[1] = s.size+1;
        !           421:       t2 = tuneup_measure (param->function2, param, &s);
        !           422:       if (t1 == -1.0 || t2 == -1.0)
        !           423:         {
        !           424:           printf ("Oops, can't run both functions at size %ld\n", s.size);
        !           425:           abort ();
        !           426:         }
        !           427:       t1 *= param->function_fudge;
1.1       maekawa   428:
1.1.1.2 ! ohara     429:       /* ask that t2 is at least 4% below t1 */
        !           430:       if (t1 < t2*1.04)
        !           431:         {
        !           432:           if (option_trace)
        !           433:             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
        !           434:           table[0] = MP_SIZE_T_MAX;
        !           435:           if (! param->noprint)
        !           436:             print_define (param->name[0], table[0]);
        !           437:           return;
        !           438:         }
        !           439:
        !           440:       if (option_trace >= 2)
        !           441:         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
        !           442:                 s.size, t1, t2);
        !           443:     }
        !           444:
        !           445:   for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++)
1.1       maekawa   446:     {
1.1.1.2 ! ohara     447:       if (i == 1 && param->second_start_min)
        !           448:         s.size = 1;
        !           449:
        !           450:       if (s.size < param->min_size[i])
        !           451:         s.size = param->min_size[i];
        !           452:
        !           453:       if (! (param->noprint || (i == 1 && param->second_start_min)))
        !           454:         print_define_start (param->name[i]);
1.1       maekawa   455:
                    456:       ndat = 0;
                    457:       since_positive = 0;
                    458:       since_thresh_change = 0;
                    459:       thresh_idx = 0;
                    460:
                    461:       if (option_trace >= 2)
                    462:         {
                    463:           printf ("             algorithm-A  algorithm-B   ratio  possible\n");
                    464:           printf ("              (seconds)    (seconds)    diff    thresh\n");
                    465:         }
                    466:
1.1.1.2 ! ohara     467:       for (;
        !           468:            s.size < param->max_size[i];
        !           469:            s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
1.1       maekawa   470:         {
                    471:           double   ti, tiplus1, d;
                    472:
                    473:           /* If there's a size limit and it's reached then it should still
                    474:              be sensible to analyze the data since we want the threshold put
                    475:              either at or near the limit.  */
                    476:           if (s.size >= param->max_size[i])
                    477:             {
                    478:               if (option_trace)
                    479:                 printf ("Reached maximum size (%ld) without otherwise stopping\n",
                    480:                         param->max_size[i]);
                    481:               break;
                    482:             }
                    483:
                    484:           /*
                    485:             FIXME: check minimum size requirements are met, possibly by just
                    486:             checking for the -1 returns from the speed functions.
                    487:           */
                    488:
1.1.1.2 ! ohara     489:           /* under this hack, don't let method 0 get used at s.size */
        !           490:           if (i == 1 && param->second_start_min)
        !           491:             table[0] = MIN (s.size-1, table_save0);
        !           492:
1.1       maekawa   493:           /* using method i at this size */
                    494:           table[i] = s.size+1;
1.1.1.2 ! ohara     495:           table[i+1] = param->max_size[i];
        !           496:           ti = tuneup_measure (param->function, param, &s);
1.1       maekawa   497:           if (ti == -1.0)
                    498:             abort ();
1.1.1.2 ! ohara     499:           ti *= param->function_fudge;
1.1       maekawa   500:
                    501:           /* using method i+1 at this size */
                    502:           table[i] = s.size;
                    503:           table[i+1] = s.size+1;
1.1.1.2 ! ohara     504:           tiplus1 = tuneup_measure (param->function2, param, &s);
1.1       maekawa   505:           if (tiplus1 == -1.0)
                    506:             abort ();
                    507:
                    508:           /* Calculate the fraction by which the one or the other routine is
                    509:              slower.  */
                    510:           if (tiplus1 >= ti)
                    511:             d = (tiplus1 - ti) / tiplus1;  /* negative */
                    512:           else
                    513:             d = (tiplus1 - ti) / ti;       /* positive */
                    514:
                    515:           add_dat (s.size, d);
                    516:
                    517:           new_thresh_idx = analyze_dat (i, 0);
                    518:
                    519:
                    520:           if (option_trace >= 2)
1.1.1.2 ! ohara     521:             printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
1.1       maekawa   522:                     i, s.size, ti, tiplus1, d,
                    523:                     ti > tiplus1 ? '#' : ' ',
                    524:                     dat[new_thresh_idx].size);
                    525:
                    526:           /* Stop if the last time method i was faster was more than a
                    527:              certain number of measurements ago.  */
                    528: #define STOP_SINCE_POSITIVE  200
                    529:           if (d >= 0)
1.1.1.2 ! ohara     530:             since_positive = 0;
1.1       maekawa   531:           else
                    532:             if (++since_positive > STOP_SINCE_POSITIVE)
                    533:               {
                    534:                 if (option_trace >= 1)
                    535:                   printf ("i=%d stopped due to since_positive (%d)\n",
                    536:                           i, STOP_SINCE_POSITIVE);
                    537:                 break;
                    538:               }
                    539:
                    540:           /* Stop if method i has become slower by a certain factor. */
1.1.1.2 ! ohara     541:           if (ti >= tiplus1 * param->stop_factor)
1.1       maekawa   542:             {
                    543:               if (option_trace >= 1)
                    544:                 printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",
1.1.1.2 ! ohara     545:                         i, param->stop_factor);
1.1       maekawa   546:               break;
                    547:             }
                    548:
                    549:           /* Stop if the threshold implied hasn't changed in a certain
                    550:              number of measurements.  (It's this condition that ususally
                    551:              stops the loop.) */
                    552:           if (thresh_idx != new_thresh_idx)
                    553:             since_thresh_change = 0, thresh_idx = new_thresh_idx;
                    554:           else
                    555:             if (++since_thresh_change > param->stop_since_change)
                    556:               {
                    557:                 if (option_trace >= 1)
                    558:                   printf ("i=%d stopped due to since_thresh_change (%d)\n",
                    559:                           i, param->stop_since_change);
                    560:                 break;
                    561:               }
                    562:
                    563:           /* Stop if the threshold implied is more than a certain number of
                    564:              measurements ago.  */
                    565: #define STOP_SINCE_AFTER   500
                    566:           if (ndat - thresh_idx > STOP_SINCE_AFTER)
                    567:             {
                    568:               if (option_trace >= 1)
                    569:                 printf ("i=%d stopped due to ndat - thresh_idx > amount (%d)\n",
                    570:                         i, STOP_SINCE_AFTER);
                    571:               break;
                    572:             }
                    573:         }
                    574:
                    575:       /* Stop when the size limit is reached before the end of the
1.1.1.2 ! ohara     576:          crossover, but only show this as an error for >= the default max
        !           577:          size.  FIXME: Maybe should make it a param choice whether this is
        !           578:          an error.  */
        !           579:       if (s.size >= param->max_size[i]
        !           580:           && param->max_size[i] >= DEFAULT_MAX_SIZE)
1.1       maekawa   581:         {
                    582:           fprintf (stderr, "%s\n", param->name[i]);
                    583:           fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",
                    584:                    i, dat[0].size, dat[ndat-1].size, ndat);
                    585:           fprintf (stderr, "    max size reached before end of crossover\n");
                    586:           break;
                    587:         }
                    588:
                    589:       if (option_trace >= 1)
                    590:         printf ("i=%d sizes %ld to %ld total %d measurements\n",
                    591:                 i, dat[0].size, dat[ndat-1].size, ndat);
                    592:
                    593:       if (ndat == 0)
                    594:         break;
                    595:
                    596:       table[i] = dat[analyze_dat (i, 1)].size;
                    597:
1.1.1.2 ! ohara     598:       /* fudge here, let min_is_always apply only to i==0, that's what the
        !           599:          sqr_n thresholds want */
        !           600:       if (i == 0 && param->min_is_always && table[i] == param->min_size[i])
        !           601:         table[i] = 0;
        !           602:
        !           603:       /* under the second_start_min fudge, if the second threshold turns out
        !           604:          to be lower than the first, then the second method is unwanted, we
        !           605:          should go straight from algorithm 1 to algorithm 3.  */
        !           606:       if (param->second_start_min)
        !           607:         {
        !           608:           if (i == 0)
        !           609:             {
        !           610:               table_save0 = table[0];
        !           611:               table[0] = 0;
        !           612:             }
        !           613:           else if (i == 1)
        !           614:             {
        !           615:               table[0] = table_save0;
        !           616:               if (table[1] <= table[0])
        !           617:                 {
        !           618:                   table[0] = table[1];
        !           619:                   table[1] = 0;
        !           620:                 }
        !           621:             }
        !           622:           s.size = MAX (table[0], table[1]) + 1;
        !           623:         }
        !           624:
        !           625:       if (! (param->noprint || (i == 0 && param->second_start_min)))
        !           626:         {
        !           627:           if (i == 1 && param->second_start_min)
        !           628:             {
        !           629:               print_define_end (param->name[0], table[0]);
        !           630:               print_define_start (param->name[1]);
        !           631:             }
1.1       maekawa   632:
1.1.1.2 ! ohara     633:           print_define_end (param->name[i], table[i]);
        !           634:         }
        !           635:
        !           636:       /* Look for the next threshold starting from the current one. */
1.1       maekawa   637:       s.size = table[i]+1;
1.1.1.2 ! ohara     638:
        !           639:       /* Take a MAX of all to allow for second_start_min producing a 0. */
        !           640:       {
        !           641:         int  j;
        !           642:         for (j = 0; j < i; j++)
        !           643:           s.size = MAX (s.size, table[j]+1);
        !           644:       }
        !           645:     }
1.1       maekawa   646: }
                    647:
                    648:
                    649: /* Special probing for the fft thresholds.  The size restrictions on the
                    650:    FFTs mean the graph of time vs size has a step effect.  See this for
                    651:    example using
                    652:
                    653:        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
                    654:        gnuplot foo.gnuplot
                    655:
                    656:    The current approach is to compare routines at the midpoint of relevant
                    657:    steps.  Arguably a more sophisticated system of threshold data is wanted
                    658:    if this step effect remains. */
                    659:
                    660: struct fft_param_t {
                    661:   const char        *table_name;
                    662:   const char        *threshold_name;
                    663:   const char        *modf_threshold_name;
                    664:   mp_size_t         *p_threshold;
                    665:   mp_size_t         *p_modf_threshold;
                    666:   mp_size_t         first_size;
                    667:   mp_size_t         max_size;
                    668:   speed_function_t  function;
                    669:   speed_function_t  mul_function;
                    670:   mp_size_t         sqr;
                    671: };
                    672:
1.1.1.2 ! ohara     673:
1.1       maekawa   674: /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
                    675:    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
                    676:    of 2^(k-1) bits. */
                    677:
                    678: mp_size_t
                    679: fft_step_size (int k)
                    680: {
1.1.1.2 ! ohara     681:   mp_size_t  step;
        !           682:
        !           683:   step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
        !           684:   step *= (mp_size_t) 1 << k;
        !           685:
        !           686:   if (step <= 0)
1.1       maekawa   687:     {
                    688:       printf ("Can't handle k=%d\n", k);
                    689:       abort ();
                    690:     }
1.1.1.2 ! ohara     691:
        !           692:   return step;
1.1       maekawa   693: }
                    694:
                    695: mp_size_t
                    696: fft_next_size (mp_size_t pl, int k)
                    697: {
                    698:   mp_size_t  m = fft_step_size (k);
                    699:
                    700: /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
                    701:
                    702:   if (pl == 0 || (pl & (m-1)) != 0)
                    703:     pl = (pl | (m-1)) + 1;
                    704:
                    705: /*    printf (" %ld\n", pl); */
                    706:   return pl;
                    707: }
                    708:
                    709: void
                    710: fft (struct fft_param_t *p)
                    711: {
                    712:   mp_size_t  size;
                    713:   int        i, k;
                    714:
                    715:   for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++)
                    716:     mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX;
                    717:
                    718:   *p->p_threshold = MP_SIZE_T_MAX;
                    719:   *p->p_modf_threshold = MP_SIZE_T_MAX;
                    720:
                    721:   option_trace = MAX (option_trace, option_fft_trace);
                    722:
                    723:   printf ("#define %s  {", p->table_name);
                    724:   if (option_trace >= 2)
                    725:     printf ("\n");
                    726:
                    727:   k = FFT_FIRST_K;
                    728:   size = p->first_size;
                    729:   for (;;)
                    730:     {
                    731:       double  tk, tk1;
                    732:
                    733:       size = fft_next_size (size+1, k+1);
                    734:
                    735:       if (size >= p->max_size)
                    736:         break;
                    737:       if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
                    738:         break;
                    739:
                    740:       /* compare k to k+1 in the middle of the current k+1 step */
                    741:       s.size = size + fft_step_size (k+1) / 2;
                    742:       s.r = k;
1.1.1.2 ! ohara     743:       tk = tuneup_measure (p->function, NULL, &s);
1.1       maekawa   744:       if (tk == -1.0)
                    745:         abort ();
                    746:
                    747:       s.r = k+1;
1.1.1.2 ! ohara     748:       tk1 = tuneup_measure (p->function, NULL, &s);
1.1       maekawa   749:       if (tk1 == -1.0)
                    750:         abort ();
                    751:
                    752:       if (option_trace >= 2)
1.1.1.2 ! ohara     753:         printf ("at %ld   size=%ld  k=%d  %.9f   k=%d %.9f\n",
1.1       maekawa   754:                 size, s.size, k, tk, k+1, tk1);
                    755:
                    756:       /* declare the k+1 threshold as soon as it's faster at its midpoint */
                    757:       if (tk1 < tk)
                    758:         {
                    759:           mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size;
                    760:           printf (" %ld,", s.size);
                    761:           if (option_trace >= 2) printf ("\n");
                    762:           k++;
                    763:         }
                    764:     }
                    765:
                    766:   mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
                    767:   printf (" 0 }\n");
                    768:
                    769:
                    770:   size = p->first_size;
1.1.1.2 ! ohara     771:
1.1       maekawa   772:   /* Declare an FFT faster than a plain toom3 etc multiplication found as
                    773:      soon as one faster measurement obtained.  A multiplication in the
                    774:      middle of the FFT step is tested.  */
                    775:   for (;;)
                    776:     {
                    777:       int     modf = (*p->p_modf_threshold == MP_SIZE_T_MAX);
                    778:       double  tk, tm;
                    779:
                    780:       /* k=7 should be the first FFT which can beat toom3 on a full
                    781:          multiply, so jump to that threshold and save some probing after the
                    782:          modf threshold is found.  */
                    783:       if (!modf && size < mpn_fft_table[p->sqr][2])
                    784:         {
                    785:           size = mpn_fft_table[p->sqr][2];
                    786:           if (option_trace >= 2)
                    787:             printf ("jump to size=%ld\n", size);
                    788:         }
                    789:
                    790:       size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr));
                    791:       k = mpn_fft_best_k (size, p->sqr);
                    792:
                    793:       if (size >= p->max_size)
                    794:         break;
                    795:
                    796:       s.size = size + fft_step_size (k) / 2;
                    797:       s.r = k;
1.1.1.2 ! ohara     798:       tk = tuneup_measure (p->function, NULL, &s);
1.1       maekawa   799:       if (tk == -1.0)
                    800:         abort ();
                    801:
                    802:       if (!modf)  s.size /= 2;
1.1.1.2 ! ohara     803:       tm = tuneup_measure (p->mul_function, NULL, &s);
1.1       maekawa   804:       if (tm == -1.0)
                    805:         abort ();
                    806:
                    807:       if (option_trace >= 2)
1.1.1.2 ! ohara     808:         printf ("at %ld   size=%ld   k=%d  %.9f   size=%ld %s mul %.9f\n",
1.1       maekawa   809:                 size,
                    810:                 size + fft_step_size (k) / 2, k, tk,
                    811:                 s.size, modf ? "modf" : "full", tm);
                    812:
                    813:       if (tk < tm)
                    814:         {
                    815:           if (modf)
                    816:             {
                    817:               *p->p_modf_threshold = s.size;
                    818:               print_define (p->modf_threshold_name, *p->p_modf_threshold);
                    819:             }
                    820:           else
                    821:             {
                    822:               *p->p_threshold = s.size;
                    823:               print_define (p->threshold_name,      *p->p_threshold);
                    824:               break;
                    825:             }
                    826:         }
                    827:     }
                    828:
                    829: }
                    830:
                    831:
1.1.1.2 ! ohara     832:
        !           833: /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
        !           834:    giving wrong results.  */
1.1       maekawa   835: void
1.1.1.2 ! ohara     836: tune_mul (void)
1.1       maekawa   837: {
1.1.1.2 ! ohara     838:   static struct param_t  param;
        !           839:   param.name[0] = "MUL_KARATSUBA_THRESHOLD";
        !           840:   param.name[1] = "MUL_TOOM3_THRESHOLD";
        !           841:   param.function = speed_mpn_mul_n;
        !           842:   param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE);
        !           843:   param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1;
        !           844:   param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1;
        !           845:   one (mul_threshold, 2, &param);
1.1       maekawa   846:
1.1.1.2 ! ohara     847:   /* disabled until tuned */
        !           848:   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
        !           849: }
1.1       maekawa   850:
                    851:
1.1.1.2 ! ohara     852: /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
        !           853:    is faster only at size==2 then we don't want to bother with extra code
        !           854:    just for that.  Start karatsuba from 4 same as MUL above.  */
        !           855: void
        !           856: tune_sqr (void)
        !           857: {
        !           858:   static struct param_t  param;
        !           859:   param.name[0] = "SQR_BASECASE_THRESHOLD";
        !           860:   param.name[1] = "SQR_KARATSUBA_THRESHOLD";
        !           861:   param.name[2] = "SQR_TOOM3_THRESHOLD";
        !           862:   param.function = speed_mpn_sqr_n;
        !           863:   param.min_is_always = 1;
        !           864:   param.second_start_min = 1;
        !           865:   param.min_size[0] = 3;
        !           866:   param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE);
        !           867:   param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE;
        !           868:   param.max_size[0] = TUNE_SQR_KARATSUBA_MAX;
        !           869:   param.max_size[1] = TUNE_SQR_KARATSUBA_MAX;
        !           870:   one (sqr_threshold, 3, &param);
        !           871:
        !           872:   /* disabled until tuned */
        !           873:   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
        !           874: }
        !           875:
        !           876:
        !           877: void
        !           878: tune_sb_preinv (void)
        !           879: {
        !           880:   static struct param_t  param;
        !           881:
        !           882:   if (UDIV_PREINV_ALWAYS)
        !           883:     {
        !           884:       print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always");
        !           885:       return;
        !           886:     }
        !           887:
        !           888:   param.check_size = 256;
        !           889:   param.min_size[0] = 3;
        !           890:   param.min_is_always = 1;
        !           891:   param.size_extra = 3;
        !           892:   param.stop_factor = 2.0;
        !           893:   param.name[0] = "DIV_SB_PREINV_THRESHOLD";
        !           894:   param.function = speed_mpn_sb_divrem_m3;
        !           895:   one (sb_preinv_threshold, 1, &param);
        !           896: }
        !           897:
        !           898:
        !           899: void
        !           900: tune_dc (void)
        !           901: {
        !           902:   static struct param_t  param;
        !           903:   param.name[0] = "DIV_DC_THRESHOLD";
        !           904:   param.function = speed_mpn_dc_tdiv_qr;
        !           905:   one (dc_threshold, 1, &param);
        !           906: }
        !           907:
        !           908:
        !           909: /* This is an indirect determination, based on a comparison between redc and
        !           910:    mpz_mod.  A fudge factor of 1.04 is applied to redc, to represent
        !           911:    additional overheads it gets in mpz_powm.
        !           912:
        !           913:    stop_factor is 1.1 to hopefully help cray vector systems, where otherwise
        !           914:    currently it hits the 1000 limb limit with only a factor of about 1.18
        !           915:    (threshold should be around 650).  */
        !           916:
        !           917: void
        !           918: tune_powm (void)
        !           919: {
        !           920:   static struct param_t  param;
        !           921:   param.name[0] = "POWM_THRESHOLD";
        !           922:   param.function = speed_redc;
        !           923:   param.function2 = speed_mpz_mod;
        !           924:   param.step_factor = 0.03;
        !           925:   param.stop_factor = 1.1;
        !           926:   param.function_fudge = 1.04;
        !           927:   one (powm_threshold, 1, &param);
        !           928: }
        !           929:
        !           930:
        !           931: void
        !           932: tune_gcd_accel (void)
        !           933: {
        !           934:   static struct param_t  param;
        !           935:   param.name[0] = "GCD_ACCEL_THRESHOLD";
        !           936:   param.function = speed_mpn_gcd;
        !           937:   param.min_size[0] = 1;
        !           938:   one (gcd_accel_threshold, 1, &param);
        !           939: }
        !           940:
        !           941:
        !           942:
        !           943: /* A comparison between the speed of a single limb step and a double limb
        !           944:    step is made.  On a 32-bit limb the ratio is about 2.2 single steps to
        !           945:    equal a double step, or on a 64-bit limb about 2.09.  (These were found
        !           946:    from counting the steps on a 10000 limb gcdext.  */
        !           947: void
        !           948: tune_gcdext (void)
        !           949: {
        !           950:   static struct param_t  param;
        !           951:   param.name[0] = "GCDEXT_THRESHOLD";
        !           952:   param.function = speed_mpn_gcdext_one_single;
        !           953:   param.function2 = speed_mpn_gcdext_one_double;
        !           954:   switch (BITS_PER_MP_LIMB) {
        !           955:   case 32: param.function_fudge = 2.2; break;
        !           956:   case 64: param.function_fudge = 2.09; break;
        !           957:   default:
        !           958:     printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n",
        !           959:             BITS_PER_MP_LIMB);
        !           960:     abort ();
1.1       maekawa   961:   }
1.1.1.2 ! ohara     962:   param.min_size[0] = 5;
        !           963:   param.min_is_always = 1;
        !           964:   param.max_size[0] = 300;
        !           965:   param.check_size = 300;
        !           966:   one (gcdext_threshold, 1, &param);
        !           967: }
        !           968:
        !           969:
        !           970: /* size_extra==1 reflects the fact that with high<divisor one division is
        !           971:    always skipped.  Forcing high<divisor while testing ensures consistency
        !           972:    while stepping through sizes, ie. that size-1 divides will be done each
        !           973:    time.
        !           974:
        !           975:    min_size==2 and min_is_always are used so that if plain division is only
        !           976:    better at size==1 then don't bother including that code just for that
        !           977:    case, instead go with preinv always and get a size saving.  */
        !           978:
        !           979: #define DIV_1_PARAMS                    \
        !           980:   param.check_size = 256;               \
        !           981:   param.min_size[0] = 2;                   \
        !           982:   param.min_is_always = 1;              \
        !           983:   param.data_high = DATA_HIGH_LT_R;     \
        !           984:   param.size_extra = 1;                 \
        !           985:   param.stop_factor = 2.0;
        !           986:
        !           987:
        !           988: double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
        !           989:
        !           990: void
        !           991: tune_divrem_1 (void)
        !           992: {
        !           993:   /* plain version by default */
        !           994:   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
        !           995:
        !           996: #ifndef HAVE_NATIVE_mpn_divrem_1
        !           997: #define HAVE_NATIVE_mpn_divrem_1 0
        !           998: #endif
        !           999:
        !          1000:   /* No support for tuning native assembler code, do that by hand and put
        !          1001:      the results in the .asm file, there's no need for such thresholds to
        !          1002:      appear in gmp-mparam.h.  */
        !          1003:   if (HAVE_NATIVE_mpn_divrem_1)
        !          1004:     return;
        !          1005:
        !          1006:   if (UDIV_PREINV_ALWAYS)
        !          1007:     {
        !          1008:       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
        !          1009:       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
        !          1010:       return;
        !          1011:     }
        !          1012:
        !          1013:   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1.1       maekawa  1014:
1.1.1.2 ! ohara    1015:   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
        !          1016:      a bit out for the fractional part, but that's too bad, the integer part
        !          1017:      is more important. */
1.1       maekawa  1018:   {
                   1019:     static struct param_t  param;
1.1.1.2 ! ohara    1020:     param.name[0] = "DIVREM_1_NORM_THRESHOLD";
        !          1021:     DIV_1_PARAMS;
        !          1022:     s.r = randlimb_norm ();
        !          1023:     param.function = speed_mpn_divrem_1_tune;
        !          1024:     one (divrem_1_norm_threshold, 1, &param);
1.1       maekawa  1025:   }
                   1026:   {
                   1027:     static struct param_t  param;
1.1.1.2 ! ohara    1028:     param.name[0] = "DIVREM_1_UNNORM_THRESHOLD";
        !          1029:     DIV_1_PARAMS;
        !          1030:     s.r = randlimb_half ();
        !          1031:     param.function = speed_mpn_divrem_1_tune;
        !          1032:     one (divrem_1_unnorm_threshold, 1, &param);
1.1       maekawa  1033:   }
1.1.1.2 ! ohara    1034: }
        !          1035:
        !          1036:
        !          1037: double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
        !          1038:
        !          1039: void
        !          1040: tune_mod_1 (void)
        !          1041: {
        !          1042:   /* plain version by default */
        !          1043:   tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
        !          1044:
        !          1045: #ifndef HAVE_NATIVE_mpn_mod_1
        !          1046: #define HAVE_NATIVE_mpn_mod_1 0
        !          1047: #endif
        !          1048:
        !          1049:   /* No support for tuning native assembler code, do that by hand and put
        !          1050:      the results in the .asm file, there's no need for such thresholds to
        !          1051:      appear in gmp-mparam.h.  */
        !          1052:   if (HAVE_NATIVE_mpn_mod_1)
        !          1053:     return;
        !          1054:
        !          1055:   if (UDIV_PREINV_ALWAYS)
        !          1056:     {
        !          1057:       print_define ("MOD_1_NORM_THRESHOLD", 0L);
        !          1058:       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
        !          1059:       return;
        !          1060:     }
        !          1061:
        !          1062:   tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
1.1       maekawa  1063:
                   1064:   {
                   1065:     static struct param_t  param;
1.1.1.2 ! ohara    1066:     param.name[0] = "MOD_1_NORM_THRESHOLD";
        !          1067:     DIV_1_PARAMS;
        !          1068:     s.r = randlimb_norm ();
        !          1069:     param.function = speed_mpn_mod_1_tune;
        !          1070:     one (mod_1_norm_threshold, 1, &param);
1.1       maekawa  1071:   }
                   1072:   {
                   1073:     static struct param_t  param;
1.1.1.2 ! ohara    1074:     param.name[0] = "MOD_1_UNNORM_THRESHOLD";
        !          1075:     DIV_1_PARAMS;
        !          1076:     s.r = randlimb_half ();
        !          1077:     param.function = speed_mpn_mod_1_tune;
        !          1078:     one (mod_1_unnorm_threshold, 1, &param);
1.1       maekawa  1079:   }
1.1.1.2 ! ohara    1080: }
        !          1081:
        !          1082:
        !          1083: /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
        !          1084:    imply that udiv_qrnnd_preinv is worth using, but it seems most
        !          1085:    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
        !          1086:    directly.  */
        !          1087:
        !          1088: void
        !          1089: tune_preinv_divrem_1 (void)
        !          1090: {
        !          1091:   static struct param_t  param;
        !          1092:   speed_function_t  divrem_1;
        !          1093:   const char        *divrem_1_name;
        !          1094:   double            t1, t2;
        !          1095:
        !          1096: #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
        !          1097: #define HAVE_NATIVE_mpn_preinv_divrem_1 0
        !          1098: #endif
        !          1099:
        !          1100:   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
        !          1101:      it's faster than mpn_divrem_1.  */
        !          1102:   if (HAVE_NATIVE_mpn_preinv_divrem_1)
        !          1103:     {
        !          1104:       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
        !          1105:       return;
        !          1106:     }
        !          1107:
        !          1108:   /* If udiv_qrnnd_preinv is the only division method then of course
        !          1109:      mpn_preinv_divrem_1 should be used.  */
        !          1110:   if (UDIV_PREINV_ALWAYS)
        !          1111:     {
        !          1112:       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
        !          1113:       return;
        !          1114:     }
        !          1115:
        !          1116:   /* If we've got an assembler version of mpn_divrem_1, then compare against
        !          1117:      that, not the mpn_divrem_1_div generic C.  */
        !          1118:   if (HAVE_NATIVE_mpn_divrem_1)
        !          1119:     {
        !          1120:       divrem_1 = speed_mpn_divrem_1;
        !          1121:       divrem_1_name = "mpn_divrem_1";
        !          1122:     }
        !          1123:   else
        !          1124:     {
        !          1125:       divrem_1 = speed_mpn_divrem_1_div;
        !          1126:       divrem_1_name = "mpn_divrem_1_div";
        !          1127:     }
        !          1128:
        !          1129:   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
        !          1130:   s.size = 200;                     /* generous but not too big */
        !          1131:   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
        !          1132:      since in general that's probably most common, though in fact for a
        !          1133:      64-bit limb mp_bases[10].big_base is normalized.  */
        !          1134:   s.r = urandom() & (MP_LIMB_T_MAX >> 4);
        !          1135:   if (s.r == 0) s.r = 123;
        !          1136:
        !          1137:   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
        !          1138:   t2 = tuneup_measure (divrem_1, &param, &s);
        !          1139:   if (t1 == -1.0 || t2 == -1.0)
        !          1140:     {
        !          1141:       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
        !          1142:               divrem_1_name, s.size);
        !          1143:       abort ();
        !          1144:     }
        !          1145:   if (option_trace >= 1)
        !          1146:     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
        !          1147:             s.size, t1, divrem_1_name, t2);
        !          1148:
        !          1149:   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
        !          1150: }
        !          1151:
        !          1152:
        !          1153: /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
        !          1154:    that udiv_qrnnd_preinv is worth using, but it seems most straightforward
        !          1155:    to compare mpn_preinv_mod_1 and mpn_mod_1_div directly.  */
        !          1156:
        !          1157: void
        !          1158: tune_preinv_mod_1 (void)
        !          1159: {
        !          1160:   static struct param_t  param;
        !          1161:   speed_function_t  mod_1;
        !          1162:   const char        *mod_1_name;
        !          1163:   double            t1, t2;
        !          1164:
        !          1165: #ifndef HAVE_NATIVE_mpn_preinv_mod_1
        !          1166: #define HAVE_NATIVE_mpn_preinv_mod_1 0
        !          1167: #endif
        !          1168:
        !          1169:   /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
        !          1170:      faster than mpn_mod_1.  */
        !          1171:   if (HAVE_NATIVE_mpn_preinv_mod_1)
        !          1172:     {
        !          1173:       print_define_remark ("USE_PREINV_MOD_1", 1, "native");
        !          1174:       return;
        !          1175:     }
        !          1176:
        !          1177:   /* If udiv_qrnnd_preinv is the only division method then of course
        !          1178:      mpn_preinv_mod_1 should be used.  */
        !          1179:   if (UDIV_PREINV_ALWAYS)
        !          1180:     {
        !          1181:       print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
        !          1182:       return;
        !          1183:     }
        !          1184:
        !          1185:   /* If we've got an assembler version of mpn_mod_1, then compare against
        !          1186:      that, not the mpn_mod_1_div generic C.  */
        !          1187:   if (HAVE_NATIVE_mpn_mod_1)
        !          1188:     {
        !          1189:       mod_1 = speed_mpn_mod_1;
        !          1190:       mod_1_name = "mpn_mod_1";
        !          1191:     }
        !          1192:   else
        !          1193:     {
        !          1194:       mod_1 = speed_mpn_mod_1_div;
        !          1195:       mod_1_name = "mpn_mod_1_div";
        !          1196:     }
        !          1197:
        !          1198:   param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
        !          1199:   s.size = 200;                     /* generous but not too big */
        !          1200:   s.r = randlimb_norm();            /* divisor */
        !          1201:
        !          1202:   t1 = tuneup_measure (speed_mpn_preinv_mod_1, &param, &s);
        !          1203:   t2 = tuneup_measure (mod_1, &param, &s);
        !          1204:   if (t1 == -1.0 || t2 == -1.0)
        !          1205:     {
        !          1206:       printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
        !          1207:               mod_1_name, s.size);
        !          1208:       abort ();
        !          1209:     }
        !          1210:   if (option_trace >= 1)
        !          1211:     printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
        !          1212:             s.size, t1, mod_1_name, t2);
        !          1213:
        !          1214:   print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
        !          1215: }
        !          1216:
        !          1217:
        !          1218: void
        !          1219: tune_divrem_2 (void)
        !          1220: {
        !          1221:   static struct param_t  param;
        !          1222:
        !          1223: #ifndef HAVE_NATIVE_mpn_divrem_2
        !          1224: #define HAVE_NATIVE_mpn_divrem_2 0
        !          1225: #endif
        !          1226:
        !          1227:   /* No support for tuning native assembler code, do that by hand and put
        !          1228:      the results in the .asm file, and there's no need for such thresholds
        !          1229:      to appear in gmp-mparam.h.  */
        !          1230:   if (HAVE_NATIVE_mpn_divrem_2)
        !          1231:     return;
        !          1232:
        !          1233:   if (UDIV_PREINV_ALWAYS)
        !          1234:     {
        !          1235:       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
        !          1236:       return;
        !          1237:     }
        !          1238:
        !          1239:   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
        !          1240:      a bit out for the fractional part, but that's too bad, the integer part
        !          1241:      is more important.
        !          1242:
        !          1243:      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
        !          1244:      code space if plain division is better only at size==2 or size==3. */
        !          1245:   param.name[0] = "DIVREM_2_THRESHOLD";
        !          1246:   param.check_size = 256;
        !          1247:   param.min_size[0] = 4;
        !          1248:   param.min_is_always = 1;
        !          1249:   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
        !          1250:   param.stop_factor = 2.0;
        !          1251:
        !          1252:   s.r = randlimb_norm ();
        !          1253:   param.function = speed_mpn_divrem_2;
        !          1254:   one (divrem_2_threshold, 1, &param);
        !          1255: }
        !          1256:
        !          1257:
        !          1258: /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
        !          1259:    tune for that.  Its speed can differ on odd or even divisor, so take an
        !          1260:    average threshold for the two.
        !          1261:
        !          1262:    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
        !          1263:    might not vary that way, but don't test this since high<divisor isn't
        !          1264:    expected to occur often with small divisors.  */
        !          1265:
        !          1266: void
        !          1267: tune_divexact_1 (void)
        !          1268: {
        !          1269:   static struct param_t  param;
        !          1270:   mp_size_t  thresh[2], average;
        !          1271:   int        low, i;
        !          1272:
        !          1273:   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
        !          1274:
        !          1275:   param.name[0] = "DIVEXACT_1_THRESHOLD";
        !          1276:   param.data_high = DATA_HIGH_GE_R;
        !          1277:   param.check_size = 256;
        !          1278:   param.min_size[0] = 2;
        !          1279:   param.stop_factor = 1.5;
        !          1280:   param.function  = tuned_speed_mpn_divrem_1;
        !          1281:   param.function2 = speed_mpn_divexact_1;
        !          1282:   param.noprint = 1;
        !          1283:
        !          1284:   print_define_start (param.name[0]);
        !          1285:
        !          1286:   for (low = 0; low <= 1; low++)
        !          1287:     {
        !          1288:       s.r = randlimb_half();
        !          1289:       if (low == 0)
        !          1290:         s.r |= 1;
        !          1291:       else
        !          1292:         s.r &= ~CNST_LIMB(7);
        !          1293:
        !          1294:       one (divexact_1_threshold, 1, &param);
        !          1295:       if (option_trace)
        !          1296:         printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]);
        !          1297:
        !          1298:       if (divexact_1_threshold[0] == MP_SIZE_T_MAX)
        !          1299:         {
        !          1300:           average = MP_SIZE_T_MAX;
        !          1301:           goto divexact_1_done;
        !          1302:         }
        !          1303:
        !          1304:       thresh[low] = divexact_1_threshold[0];
        !          1305:     }
        !          1306:
        !          1307:   if (option_trace)
        !          1308:     {
        !          1309:       printf ("average of:");
        !          1310:       for (i = 0; i < numberof(thresh); i++)
        !          1311:         printf (" %ld", thresh[i]);
        !          1312:       printf ("\n");
        !          1313:     }
        !          1314:
        !          1315:   average = 0;
        !          1316:   for (i = 0; i < numberof(thresh); i++)
        !          1317:     average += thresh[i];
        !          1318:   average /= numberof(thresh);
        !          1319:
        !          1320:   /* If divexact turns out to be better as early as 3 limbs, then use it
        !          1321:      always, so as to reduce code size and conditional jumps.  */
        !          1322:   if (average <= 3)
        !          1323:     average = 0;
        !          1324:
        !          1325:  divexact_1_done:
        !          1326:   print_define_end (param.name[0], average);
        !          1327: }
        !          1328:
        !          1329:
        !          1330: /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
        !          1331:    same as mpn_mod_1, but this might not be true of an assembler
        !          1332:    implementation.  The threshold used is an average based on data where a
        !          1333:    divide can be skipped and where it can't.
        !          1334:
        !          1335:    If modexact turns out to be better as early as 3 limbs, then use it
        !          1336:    always, so as to reduce code size and conditional jumps.  */
        !          1337:
        !          1338: void
        !          1339: tune_modexact_1_odd (void)
        !          1340: {
        !          1341:   static struct param_t  param;
        !          1342:   mp_size_t  thresh_lt;
        !          1343:
        !          1344:   ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
        !          1345:
        !          1346:   param.name[0] = "MODEXACT_1_ODD_THRESHOLD";
        !          1347:   param.check_size = 256;
        !          1348:   param.min_size[0] = 2;
        !          1349:   param.stop_factor = 1.5;
        !          1350:   param.function  = tuned_speed_mpn_mod_1;
        !          1351:   param.function2 = speed_mpn_modexact_1c_odd;
        !          1352:   param.noprint = 1;
        !          1353:   s.r = randlimb_half () | 1;
        !          1354:
        !          1355:   print_define_start (param.name[0]);
        !          1356:
        !          1357:   param.data_high = DATA_HIGH_LT_R;
        !          1358:   one (modexact_1_odd_threshold, 1, &param);
        !          1359:   if (option_trace)
        !          1360:     printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]);
        !          1361:
        !          1362:   thresh_lt = modexact_1_odd_threshold[0];
        !          1363:   if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
        !          1364:     {
        !          1365:       param.data_high = DATA_HIGH_GE_R;
        !          1366:       one (modexact_1_odd_threshold, 1, &param);
        !          1367:       if (option_trace)
        !          1368:         printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]);
        !          1369:
        !          1370:       if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
        !          1371:         {
        !          1372:           modexact_1_odd_threshold[0]
        !          1373:             = (modexact_1_odd_threshold[0] + thresh_lt) / 2;
        !          1374:           if (modexact_1_odd_threshold[0] <= 3)
        !          1375:             modexact_1_odd_threshold[0] = 0;
        !          1376:         }
        !          1377:     }
        !          1378:
        !          1379:   print_define_end (param.name[0], modexact_1_odd_threshold[0]);
        !          1380: }
        !          1381:
        !          1382:
        !          1383: void
        !          1384: tune_jacobi_base (void)
        !          1385: {
        !          1386:   static struct param_t  param;
        !          1387:   double   t1, t2, t3;
        !          1388:   int      method;
        !          1389:
        !          1390:   s.size = BITS_PER_MP_LIMB * 3 / 4;
        !          1391:
        !          1392:   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
        !          1393:   if (option_trace >= 1)
        !          1394:     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1);
        !          1395:
        !          1396:   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
        !          1397:   if (option_trace >= 1)
        !          1398:     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2);
        !          1399:
        !          1400:   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
        !          1401:   if (option_trace >= 1)
        !          1402:     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3);
        !          1403:
        !          1404:   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
        !          1405:     {
        !          1406:       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
        !          1407:               s.size);
        !          1408:       abort ();
        !          1409:     }
        !          1410:
        !          1411:   if (t1 < t2 && t1 < t3)
        !          1412:     method = 1;
        !          1413:   else if (t2 < t3)
        !          1414:     method = 2;
        !          1415:   else
        !          1416:     method = 3;
        !          1417:
        !          1418:   print_define ("JACOBI_BASE_METHOD", method);
        !          1419: }
        !          1420:
1.1       maekawa  1421:
1.1.1.2 ! ohara    1422: void
        !          1423: tune_get_str (void)
        !          1424: {
        !          1425:   /* Tune for decimal, it being most common.  Some rough testing suggests
        !          1426:      other bases are different, but not by very much.  */
        !          1427:   s.r = 10;
1.1       maekawa  1428:   {
                   1429:     static struct param_t  param;
1.1.1.2 ! ohara    1430:     get_str_precompute_threshold[0] = 0;
        !          1431:     param.name[0] = "GET_STR_DC_THRESHOLD";
        !          1432:     param.function = speed_mpn_get_str;
        !          1433:     param.min_size[0] = 2;
        !          1434:     param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
        !          1435:     one (get_str_basecase_threshold, 1, &param);
1.1       maekawa  1436:   }
                   1437:   {
                   1438:     static struct param_t  param;
1.1.1.2 ! ohara    1439:     param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD";
        !          1440:     param.function = speed_mpn_get_str;
        !          1441:     param.min_size[0] = get_str_basecase_threshold[0];
        !          1442:     param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
        !          1443:     one (get_str_precompute_threshold, 1, &param);
1.1       maekawa  1444:   }
1.1.1.2 ! ohara    1445: }
        !          1446:
        !          1447:
        !          1448: void
        !          1449: tune_set_str (void)
        !          1450: {
        !          1451:   static struct param_t  param;
        !          1452:
        !          1453:   s.r = 10;  /* decimal */
        !          1454:   param.step_factor = 0.04;
        !          1455:   param.name[0] = "SET_STR_THRESHOLD";
        !          1456:   param.function = speed_mpn_set_str_basecase;
        !          1457:   param.function2 = speed_mpn_set_str_subquad;
        !          1458:   param.min_size[0] = 100;
        !          1459:   param.max_size[0] = 150000;
        !          1460:   one (set_str_threshold, 1, &param);
        !          1461: }
        !          1462:
        !          1463:
        !          1464: void
        !          1465: tune_fft_mul (void)
        !          1466: {
        !          1467:   static struct fft_param_t  param;
        !          1468:
        !          1469:   if (option_fft_max_size == 0)
        !          1470:     return;
        !          1471:
        !          1472:   param.table_name          = "MUL_FFT_TABLE";
        !          1473:   param.threshold_name      = "MUL_FFT_THRESHOLD";
        !          1474:   param.p_threshold         = &MUL_FFT_THRESHOLD;
        !          1475:   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
        !          1476:   param.p_modf_threshold    = &MUL_FFT_MODF_THRESHOLD;
        !          1477:   param.first_size          = MUL_TOOM3_THRESHOLD / 2;
        !          1478:   param.max_size            = option_fft_max_size;
        !          1479:   param.function            = speed_mpn_mul_fft;
        !          1480:   param.mul_function        = speed_mpn_mul_n;
        !          1481:   param.sqr = 0;
        !          1482:   fft (&param);
        !          1483: }
        !          1484:
        !          1485:
        !          1486: void
        !          1487: tune_fft_sqr (void)
        !          1488: {
        !          1489:   static struct fft_param_t  param;
        !          1490:
        !          1491:   if (option_fft_max_size == 0)
        !          1492:     return;
        !          1493:
        !          1494:   param.table_name          = "SQR_FFT_TABLE";
        !          1495:   param.threshold_name      = "SQR_FFT_THRESHOLD";
        !          1496:   param.p_threshold         = &SQR_FFT_THRESHOLD;
        !          1497:   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
        !          1498:   param.p_modf_threshold    = &SQR_FFT_MODF_THRESHOLD;
        !          1499:   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
        !          1500:   param.max_size            = option_fft_max_size;
        !          1501:   param.function            = speed_mpn_mul_fft_sqr;
        !          1502:   param.mul_function        = speed_mpn_sqr_n;
        !          1503:   param.sqr = 0;
        !          1504:   fft (&param);
        !          1505: }
        !          1506:
        !          1507:
        !          1508: void
        !          1509: all (void)
        !          1510: {
        !          1511:   time_t  start_time, end_time;
        !          1512:   TMP_DECL (marker);
        !          1513:
        !          1514:   TMP_MARK (marker);
        !          1515:   s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
        !          1516:   s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
        !          1517:
        !          1518:   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
        !          1519:   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
        !          1520:
        !          1521:   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
        !          1522:
        !          1523:   speed_time_init ();
        !          1524:   fprintf (stderr, "Using: %s\n", speed_time_string);
        !          1525:
        !          1526:   fprintf (stderr, "speed_precision %d", speed_precision);
        !          1527:   if (speed_unittime == 1.0)
        !          1528:     fprintf (stderr, ", speed_unittime 1 cycle");
        !          1529:   else
        !          1530:     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
        !          1531:   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
        !          1532:     fprintf (stderr, ", CPU freq unknown\n");
        !          1533:   else
        !          1534:     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
        !          1535:
        !          1536:   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
        !          1537:            DEFAULT_MAX_SIZE, option_fft_max_size);
        !          1538:   fprintf (stderr, "\n");
        !          1539:
        !          1540:   time (&start_time);
1.1       maekawa  1541:   {
1.1.1.2 ! ohara    1542:     struct tm  *tp;
        !          1543:     tp = localtime (&start_time);
        !          1544:     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
        !          1545:             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
        !          1546:
        !          1547: #ifdef __GNUC__
        !          1548:     /* gcc sub-minor version doesn't seem to come through as a define */
        !          1549:     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
        !          1550: #define PRINTED_COMPILER
        !          1551: #endif
        !          1552: #if defined (__SUNPRO_C)
        !          1553:     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
        !          1554: #define PRINTED_COMPILER
        !          1555: #endif
        !          1556: #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
        !          1557:     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
        !          1558:     printf ("MIPSpro C %d.%d.%d */\n",
        !          1559:            _COMPILER_VERSION / 100,
        !          1560:            _COMPILER_VERSION / 10 % 10,
        !          1561:            _COMPILER_VERSION % 10);
        !          1562: #define PRINTED_COMPILER
        !          1563: #endif
        !          1564: #if defined (__DECC) && defined (__DECC_VER)
        !          1565:     printf ("DEC C %d */\n", __DECC_VER);
        !          1566: #define PRINTED_COMPILER
        !          1567: #endif
        !          1568: #if ! defined (PRINTED_COMPILER)
        !          1569:     printf ("system compiler */\n");
        !          1570: #endif
1.1       maekawa  1571:   }
1.1.1.2 ! ohara    1572:   printf ("\n");
        !          1573:
        !          1574:   tune_mul ();
1.1       maekawa  1575:   printf("\n");
                   1576:
1.1.1.2 ! ohara    1577:   tune_sqr ();
        !          1578:   printf("\n");
        !          1579:
        !          1580:   tune_sb_preinv ();
        !          1581:   tune_dc ();
        !          1582:   tune_powm ();
        !          1583:   printf("\n");
        !          1584:
        !          1585:   tune_gcd_accel ();
        !          1586:   tune_gcdext ();
        !          1587:   tune_jacobi_base ();
        !          1588:   printf("\n");
        !          1589:
        !          1590:   tune_divrem_1 ();
        !          1591:   tune_mod_1 ();
        !          1592:   tune_preinv_divrem_1 ();
        !          1593:   tune_preinv_mod_1 ();
        !          1594:   tune_divrem_2 ();
        !          1595:   tune_divexact_1 ();
        !          1596:   tune_modexact_1_odd ();
        !          1597:   printf("\n");
        !          1598:
        !          1599:   tune_get_str ();
        !          1600:   tune_set_str ();
        !          1601:   printf("\n");
        !          1602:
        !          1603:   tune_fft_mul ();
        !          1604:   printf("\n");
        !          1605:
        !          1606:   tune_fft_sqr ();
        !          1607:   printf ("\n");
        !          1608:
        !          1609:   time (&end_time);
        !          1610:   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
        !          1611:           end_time - start_time);
1.1       maekawa  1612:
                   1613:   TMP_FREE (marker);
                   1614: }
                   1615:
                   1616:
                   1617: int
                   1618: main (int argc, char *argv[])
                   1619: {
                   1620:   int  opt;
                   1621:
                   1622:   /* Unbuffered so if output is redirected to a file it isn't lost if the
                   1623:      program is killed part way through.  */
                   1624:   setbuf (stdout, NULL);
                   1625:   setbuf (stderr, NULL);
                   1626:
                   1627:   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
                   1628:     {
                   1629:       switch (opt) {
                   1630:       case 'f':
                   1631:         if (optarg[0] == 't')
                   1632:           option_fft_trace = 2;
                   1633:         else
                   1634:           option_fft_max_size = atol (optarg);
                   1635:         break;
                   1636:       case 'o':
                   1637:         speed_option_set (optarg);
                   1638:         break;
                   1639:       case 'p':
                   1640:         speed_precision = atoi (optarg);
                   1641:         break;
                   1642:       case 't':
                   1643:         option_trace++;
                   1644:         break;
                   1645:       case '?':
                   1646:         exit(1);
                   1647:       }
                   1648:     }
1.1.1.2 ! ohara    1649:
1.1       maekawa  1650:   all ();
1.1.1.2 ! ohara    1651:   exit (0);
1.1       maekawa  1652: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>