===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/tune/Attic/tuneup.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.1 -r1.1.1.2
--- OpenXM_contrib/gmp/tune/Attic/tuneup.c	2000/09/09 14:13:19	1.1.1.1
+++ OpenXM_contrib/gmp/tune/Attic/tuneup.c	2003/08/25 16:06:38	1.1.1.2
@@ -1,7 +1,6 @@
-/* Create tuned thresholds for various algorithms. */
+/* Create tuned thresholds for various algorithms.
 
-/*
-Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -18,9 +17,9 @@ License for more details.
 You should have received a copy of the GNU Lesser General Public License
 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-MA 02111-1307, USA.
-*/
+MA 02111-1307, USA. */
 
+
 /* Usage: tune [-t] [-t] [-p precision]
 
    -t turns on some diagnostic traces, a second -t turns on more traces.
@@ -52,27 +51,50 @@ MA 02111-1307, USA.
    to the final speed of the relevant routines, but nothing has been done to
    check that carefully.
 
+   Remarks:
+
+   The code here isn't a vision of loveliness, mainly because it's subject
+   to ongoing modifications according to new things wanting to be tuned and
+   practical requirements of systems tested.
+
+   The way parts of the library are recompiled to insinuate the tuning
+   variables is a bit subtle, but unavoidable since of course the main
+   library has fixed thresholds compiled-in but we want to vary them here.
+   Most of the nonsense for this can be found in tune/Makefile.am and under
+   TUNE_PROGRAM_BUILD in gmp-impl.h.
+
+   The dirty hack which the "second_start_min" feature could perhaps be done
+   more generally, so if say karatsuba is never better than toom3 then it
+   can be detected and omitted.  Currently we're hoping very hard that this
+   doesn't arise in practice, and if it does then it indicates something
+   badly sub-optimal in the karatsuba implementation.
+
    Limitations:
-   
+
    The FFTs aren't subject to the same badness rule as the other thresholds,
    so each k is probably being brought on a touch early.  This isn't likely
    to make a difference, and the simpler probing means fewer tests.
 
 */
 
-#define TUNE_PROGRAM_BUILD  1
+#define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
 
+#include "config.h"
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
+#if HAVE_UNISTD_H
 #include <unistd.h>
+#endif
 
 #include "gmp.h"
 #include "gmp-impl.h"
+#include "longlong.h"
 
+#include "tests.h"
 #include "speed.h"
-#include "sqr_basecase.h"
 
 #if !HAVE_DECL_OPTARG
 extern char *optarg;
@@ -80,11 +102,9 @@ extern int optind, opterr;
 #endif
 
 
-#define MAX_SIZE        1000  /* limbs */
-#define STEP_FACTOR     0.01  /* how much to step sizes by (rounded down) */
-#define MAX_TABLE       2     /* threshold entries */
+#define DEFAULT_MAX_SIZE   1000  /* limbs */
+#define MAX_TABLE             5  /* entries */
 
-
 #if WANT_FFT
 mp_size_t  option_fft_max_size = 50000;  /* limbs */
 #else
@@ -103,32 +123,83 @@ int  allocdat = 0;
 
 
 /* Each "_threshold" array must be 1 bigger than the number of thresholds
-   being tuned in a set, because one() stores an value in the entry above
+   being tuned in a set, because one() stores a value in the entry above
    the one it's determining. */
 
 mp_size_t  mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
-mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
 mp_size_t  sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
-mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
-mp_size_t  bz_threshold[2] = { MP_SIZE_T_MAX };
-mp_size_t  fib_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  sb_preinv_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  dc_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  powm_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  gcd_accel_threshold[2] = { MP_SIZE_T_MAX };
 mp_size_t  gcdext_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  divexact_1_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  divrem_2_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  mod_1_norm_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  get_str_basecase_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  get_str_precompute_threshold[2] = { MP_SIZE_T_MAX };
+mp_size_t  set_str_threshold[2] = { MP_SIZE_T_MAX };
 
+mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
+mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
 
-#ifndef KARATSUBA_SQR_MAX
-#define KARATSUBA_SQR_MAX  0 /* meaning no limit */
+#ifndef TUNE_SQR_KARATSUBA_MAX
+#define TUNE_SQR_KARATSUBA_MAX  0 /* meaning no limit */
 #endif
 
 struct param_t {
-  const char  *name[MAX_TABLE];
-  int         stop_since_change;
-  mp_size_t   min_size;
-  mp_size_t   max_size[MAX_TABLE];
+  const char        *name[MAX_TABLE];
+  speed_function_t  function;
+  speed_function_t  function2;
+  double            step_factor;    /* how much to step sizes (rounded down) */
+  double            function_fudge; /* multiplier for "function" speeds */
+  int               stop_since_change;
+  double            stop_factor;
+  mp_size_t         min_size[MAX_TABLE];
+  int               min_is_always;
+  int               second_start_min;
+  mp_size_t         max_size[MAX_TABLE];
+  mp_size_t         check_size;
+  mp_size_t         size_extra;
+
+#define DATA_HIGH_LT_R  1
+#define DATA_HIGH_GE_R  2
+  int               data_high;
+
+  int               noprint;
 };
 
+#ifndef UDIV_PREINV_ALWAYS
+#define UDIV_PREINV_ALWAYS 0
+#endif
 
+
+mp_limb_t
+randlimb_norm (void)
+{
+  mp_limb_t  n;
+  mpn_random (&n, 1);
+  n |= GMP_LIMB_HIGHBIT;
+  return n;
+}
+
+#define MP_LIMB_T_HALFMASK  ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1)
+
+mp_limb_t
+randlimb_half (void)
+{
+  mp_limb_t  n;
+  mpn_random (&n, 1);
+  n &= MP_LIMB_T_HALFMASK;
+  n += (n==0);
+  return n;
+}
+
+
 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
    if necessary.  */
 void
@@ -140,7 +211,7 @@ add_dat (mp_size_t size, double d)
 
   if (ndat == allocdat)
     {
-      dat = (struct dat_t *) _mp_allocate_or_reallocate
+      dat = (struct dat_t *) __gmp_allocate_or_reallocate
         (dat, allocdat * sizeof(dat[0]),
          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
       allocdat += ALLOCDAT_STEP;
@@ -191,18 +262,44 @@ analyze_dat (int i, int final)
           min_j = j;
         }
     }
-   
+
   return min_j;
 }
 
 
+/* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
+
+mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
+                                    mp_srcptr ap, mp_size_t size,
+                                    mp_limb_t d));
+mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
+
 double
-tuneup_measure (speed_function_t fun, struct speed_params *s)
+speed_mpn_mod_1_tune (struct speed_params *s)
 {
-  static mp_ptr  xp, yp;
+  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
+}
+double
+speed_mpn_divrem_1_tune (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
+}
+
+
+double
+tuneup_measure (speed_function_t fun,
+                const struct param_t *param,
+                struct speed_params *s)
+{
+  static struct param_t  dummy;
   double   t;
   TMP_DECL (marker);
 
+  if (! param)
+    param = &dummy;
+
+  s->size += param->size_extra;
+
   TMP_MARK (marker);
   s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
   s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
@@ -210,53 +307,152 @@ tuneup_measure (speed_function_t fun, struct speed_par
   mpn_random (s->xp, s->size);
   mpn_random (s->yp, s->size);
 
+  switch (param->data_high) {
+  case DATA_HIGH_LT_R:
+    s->xp[s->size-1] %= s->r;
+    s->yp[s->size-1] %= s->r;
+    break;
+  case DATA_HIGH_GE_R:
+    s->xp[s->size-1] |= s->r;
+    s->yp[s->size-1] |= s->r;
+    break;
+  }
+
   t = speed_measure (fun, s);
 
+  s->size -= param->size_extra;
+
   TMP_FREE (marker);
   return t;
-}  
+}
 
 
+#define PRINT_WIDTH  28
+
 void
-print_define (const char *name, mp_size_t value)
+print_define_start (const char *name)
 {
-  printf ("#ifndef %s\n", name);
-  printf ("#define %-23s  ", name);
+  printf ("#define %-*s  ", PRINT_WIDTH, name);
+  if (option_trace)
+    printf ("...\n");
+}
+
+void
+print_define_end_remark (const char *name, mp_size_t value, const char *remark)
+{
+  if (option_trace)
+    printf ("#define %-*s  ", PRINT_WIDTH, name);
+
   if (value == MP_SIZE_T_MAX)
-    printf ("MP_SIZE_T_MAX\n");
+    printf ("MP_SIZE_T_MAX");
   else
-    printf ("%5ld\n", value);
-  printf ("#endif\n");
+    printf ("%5ld", value);
+
+  if (remark != NULL)
+    printf ("  /* %s */", remark);
+  printf ("\n");
 }
 
+void
+print_define_end (const char *name, mp_size_t value)
+{
+  const char  *remark;
+  if (value == MP_SIZE_T_MAX)
+    remark = "never";
+  else if (value == 0)
+    remark = "always";
+  else
+    remark = NULL;
+  print_define_end_remark (name, value, remark);
+}
 
+void
+print_define (const char *name, mp_size_t value)
+{
+  print_define_start (name);
+  print_define_end (name, value);
+}
+
+void
+print_define_remark (const char *name, mp_size_t value, const char *remark)
+{
+  print_define_start (name);
+  print_define_end_remark (name, value, remark);
+}
+
+
 /* table[i+1] needs to be set to a sensible value when testing method i+1
-   because mpn_mul_n uses TOOM3_MUL_THRESHOLD to size the temporary
+   because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary
    workspace for mpn_kara_mul_n. */
 
 void
-one (speed_function_t function, mp_size_t table[], size_t max_table,
-     struct param_t *param)
+one (mp_size_t table[], size_t max_table, struct param_t *param)
 {
-  static struct param_t  dummy;
+  mp_size_t  table_save0 = 0;
+  int  since_positive, since_thresh_change;
+  int  thresh_idx, new_thresh_idx;
   int  i;
 
-  if (param == NULL)  param = &dummy;
+  ASSERT_ALWAYS (max_table <= MAX_TABLE);
 
-#define DEFAULT(x,n)  if (param->x == 0)  param->x = (n);
+#define DEFAULT(x,n)  if (! (param->x))  param->x = (n);
 
+  DEFAULT (function_fudge, 1.0);
+  DEFAULT (function2, param->function);
+  DEFAULT (step_factor, 0.01);  /* small steps by default */
   DEFAULT (stop_since_change, 80);
-  DEFAULT (min_size, 10);
-  for (i = 0; i < numberof (param->max_size); i++)
-    DEFAULT (max_size[i], MAX_SIZE);
+  DEFAULT (stop_factor, 1.2);
+  for (i = 0; i < max_table; i++)
+    DEFAULT (min_size[i], 10);
+  for (i = 0; i < max_table; i++)
+    DEFAULT (max_size[i], DEFAULT_MAX_SIZE);
 
-  s.size = param->min_size;
+  if (param->check_size != 0)
+    {
+      double   t1, t2;
+      s.size = param->check_size;
 
-  for (i = 0; i < max_table && s.size < MAX_SIZE; i++)
+      table[0] = s.size+1;
+      table[1] = param->max_size[0];
+      t1 = tuneup_measure (param->function, param, &s);
+
+      table[0] = s.size;
+      table[1] = s.size+1;
+      t2 = tuneup_measure (param->function2, param, &s);
+      if (t1 == -1.0 || t2 == -1.0)
+        {
+          printf ("Oops, can't run both functions at size %ld\n", s.size);
+          abort ();
+        }
+      t1 *= param->function_fudge;
+
+      /* ask that t2 is at least 4% below t1 */
+      if (t1 < t2*1.04)
+        {
+          if (option_trace)
+            printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
+          table[0] = MP_SIZE_T_MAX;
+          if (! param->noprint)
+            print_define (param->name[0], table[0]);
+          return;
+        }
+
+      if (option_trace >= 2)
+        printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
+                s.size, t1, t2);
+    }
+
+  for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++)
     {
-      int  since_positive, since_thresh_change;
-      int  thresh_idx, new_thresh_idx;
+      if (i == 1 && param->second_start_min)
+        s.size = 1;
 
+      if (s.size < param->min_size[i])
+        s.size = param->min_size[i];
+
+      if (! (param->noprint || (i == 1 && param->second_start_min)))
+        print_define_start (param->name[i]);
+
       ndat = 0;
       since_positive = 0;
       since_thresh_change = 0;
@@ -268,8 +464,9 @@ one (speed_function_t function, mp_size_t table[], siz
           printf ("              (seconds)    (seconds)    diff    thresh\n");
         }
 
-      for ( ; s.size < MAX_SIZE; 
-            s.size += MAX ((mp_size_t) floor (s.size * STEP_FACTOR), 1))
+      for (;
+           s.size < param->max_size[i];
+           s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
         {
           double   ti, tiplus1, d;
 
@@ -287,21 +484,24 @@ one (speed_function_t function, mp_size_t table[], siz
           /*
             FIXME: check minimum size requirements are met, possibly by just
             checking for the -1 returns from the speed functions.
-            if (s.size < MPN_TOOM_TABLE_TO_MINSIZE (i))
-            continue;
           */
 
+          /* under this hack, don't let method 0 get used at s.size */
+          if (i == 1 && param->second_start_min)
+            table[0] = MIN (s.size-1, table_save0);
+
           /* using method i at this size */
           table[i] = s.size+1;
-          table[i+1] = MAX_SIZE;
-          ti = tuneup_measure (function, &s);
+          table[i+1] = param->max_size[i];
+          ti = tuneup_measure (param->function, param, &s);
           if (ti == -1.0)
             abort ();
+          ti *= param->function_fudge;
 
           /* using method i+1 at this size */
           table[i] = s.size;
           table[i+1] = s.size+1;
-          tiplus1 = tuneup_measure (function, &s);
+          tiplus1 = tuneup_measure (param->function2, param, &s);
           if (tiplus1 == -1.0)
             abort ();
 
@@ -318,7 +518,7 @@ one (speed_function_t function, mp_size_t table[], siz
 
 
           if (option_trace >= 2)
-            printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %d\n",
+            printf ("i=%d size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
                     i, s.size, ti, tiplus1, d,
                     ti > tiplus1 ? '#' : ' ',
                     dat[new_thresh_idx].size);
@@ -327,7 +527,7 @@ one (speed_function_t function, mp_size_t table[], siz
              certain number of measurements ago.  */
 #define STOP_SINCE_POSITIVE  200
           if (d >= 0)
-            since_positive = 0;            
+            since_positive = 0;
           else
             if (++since_positive > STOP_SINCE_POSITIVE)
               {
@@ -338,12 +538,11 @@ one (speed_function_t function, mp_size_t table[], siz
               }
 
           /* Stop if method i has become slower by a certain factor. */
-#define STOP_FACTOR   1.2
-          if (ti >= tiplus1 * STOP_FACTOR)
+          if (ti >= tiplus1 * param->stop_factor)
             {
               if (option_trace >= 1)
                 printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",
-                        i, STOP_FACTOR);
+                        i, param->stop_factor);
               break;
             }
 
@@ -374,8 +573,11 @@ one (speed_function_t function, mp_size_t table[], siz
         }
 
       /* Stop when the size limit is reached before the end of the
-         crossover, without a specified param->max_size[i]. */
-      if (s.size >= MAX_SIZE)
+         crossover, but only show this as an error for >= the default max
+         size.  FIXME: Maybe should make it a param choice whether this is
+         an error.  */
+      if (s.size >= param->max_size[i]
+          && param->max_size[i] >= DEFAULT_MAX_SIZE)
         {
           fprintf (stderr, "%s\n", param->name[i]);
           fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",
@@ -393,12 +595,54 @@ one (speed_function_t function, mp_size_t table[], siz
 
       table[i] = dat[analyze_dat (i, 1)].size;
 
-      print_define (param->name[i], table[i]);
+      /* fudge here, let min_is_always apply only to i==0, that's what the
+         sqr_n thresholds want */
+      if (i == 0 && param->min_is_always && table[i] == param->min_size[i])
+        table[i] = 0;
 
-      /* Look for the next threshold starting from the current one, but back
-         a bit. */
+      /* under the second_start_min fudge, if the second threshold turns out
+         to be lower than the first, then the second method is unwanted, we
+         should go straight from algorithm 1 to algorithm 3.  */
+      if (param->second_start_min)
+        {
+          if (i == 0)
+            {
+              table_save0 = table[0];
+              table[0] = 0;
+            }
+          else if (i == 1)
+            {
+              table[0] = table_save0;
+              if (table[1] <= table[0])
+                {
+                  table[0] = table[1];
+                  table[1] = 0;
+                }
+            }
+          s.size = MAX (table[0], table[1]) + 1;
+        }
+
+      if (! (param->noprint || (i == 0 && param->second_start_min)))
+        {
+          if (i == 1 && param->second_start_min)
+            {
+              print_define_end (param->name[0], table[0]);
+              print_define_start (param->name[1]);
+            }
+
+          print_define_end (param->name[i], table[i]);
+        }
+
+      /* Look for the next threshold starting from the current one. */
       s.size = table[i]+1;
-    }      
+
+      /* Take a MAX of all to allow for second_start_min producing a 0. */
+      {
+        int  j;
+        for (j = 0; j < i; j++)
+          s.size = MAX (s.size, table[j]+1);
+      }
+    }
 }
 
 
@@ -426,6 +670,7 @@ struct fft_param_t {
   mp_size_t         sqr;
 };
 
+
 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
    of 2^(k-1) bits. */
@@ -433,12 +678,18 @@ struct fft_param_t {
 mp_size_t
 fft_step_size (int k)
 {
-  if (2*k-1 > BITS_PER_INT)
+  mp_size_t  step;
+
+  step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
+  step *= (mp_size_t) 1 << k;
+
+  if (step <= 0)
     {
       printf ("Can't handle k=%d\n", k);
       abort ();
     }
-  return (1<<k) * (MAX (1<<(k-1), BITS_PER_MP_LIMB)) / BITS_PER_MP_LIMB;
+
+  return step;
 }
 
 mp_size_t
@@ -469,7 +720,6 @@ fft (struct fft_param_t *p)
 
   option_trace = MAX (option_trace, option_fft_trace);
 
-  printf ("#ifndef %s\n", p->table_name);
   printf ("#define %s  {", p->table_name);
   if (option_trace >= 2)
     printf ("\n");
@@ -487,24 +737,20 @@ fft (struct fft_param_t *p)
       if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
         break;
 
-      usleep(10000);
-
       /* compare k to k+1 in the middle of the current k+1 step */
       s.size = size + fft_step_size (k+1) / 2;
       s.r = k;
-      tk = tuneup_measure (p->function, &s);
+      tk = tuneup_measure (p->function, NULL, &s);
       if (tk == -1.0)
         abort ();
 
-      usleep(10000);
-
       s.r = k+1;
-      tk1 = tuneup_measure (p->function, &s);
+      tk1 = tuneup_measure (p->function, NULL, &s);
       if (tk1 == -1.0)
         abort ();
 
       if (option_trace >= 2)
-        printf ("at %ld   size=%ld  k=%d  %.9lf   k=%d %.9lf\n",
+        printf ("at %ld   size=%ld  k=%d  %.9f   k=%d %.9f\n",
                 size, s.size, k, tk, k+1, tk1);
 
       /* declare the k+1 threshold as soon as it's faster at its midpoint */
@@ -519,11 +765,10 @@ fft (struct fft_param_t *p)
 
   mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
   printf (" 0 }\n");
-  printf ("#endif\n");
 
 
   size = p->first_size;
-  
+
   /* Declare an FFT faster than a plain toom3 etc multiplication found as
      soon as one faster measurement obtained.  A multiplication in the
      middle of the FFT step is tested.  */
@@ -548,23 +793,19 @@ fft (struct fft_param_t *p)
       if (size >= p->max_size)
         break;
 
-      usleep(10000);
-
       s.size = size + fft_step_size (k) / 2;
       s.r = k;
-      tk = tuneup_measure (p->function, &s);
+      tk = tuneup_measure (p->function, NULL, &s);
       if (tk == -1.0)
         abort ();
 
-      usleep(10000);
-
       if (!modf)  s.size /= 2;
-      tm = tuneup_measure (p->mul_function, &s);
+      tm = tuneup_measure (p->mul_function, NULL, &s);
       if (tm == -1.0)
         abort ();
 
       if (option_trace >= 2)
-        printf ("at %ld   size=%ld   k=%d  %.9lf   size=%ld %s mul %.9lf\n",
+        printf ("at %ld   size=%ld   k=%d  %.9f   size=%ld %s mul %.9f\n",
                 size,
                 size + fft_step_size (k) / 2, k, tk,
                 s.size, modf ? "modf" : "full", tm);
@@ -588,122 +829,787 @@ fft (struct fft_param_t *p)
 }
 
 
+
+/* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
+   giving wrong results.  */
 void
-all (void)
+tune_mul (void)
 {
-  TMP_DECL (marker);
+  static struct param_t  param;
+  param.name[0] = "MUL_KARATSUBA_THRESHOLD";
+  param.name[1] = "MUL_TOOM3_THRESHOLD";
+  param.function = speed_mpn_mul_n;
+  param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE);
+  param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1;
+  param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1;
+  one (mul_threshold, 2, &param);
 
-  TMP_MARK (marker);
-  s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
-  s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+  /* disabled until tuned */
+  MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
+}
 
-  speed_time_init ();
-  fprintf (stderr, "speed_precision %d, speed_unittime %.2e\n",
-           speed_precision, speed_unittime);
-  fprintf (stderr, "MAX_SIZE %ld, fft_max_size %ld, STEP_FACTOR %.3f\n",
-           MAX_SIZE, option_fft_max_size, STEP_FACTOR);
-  fprintf (stderr, "\n");
 
-  {
-    struct tm  *tp;
-    time_t     t;
-    time (&t);
-    tp = localtime (&t);
-    printf ("/* Generated by tuneup.c, %d-%02d-%02d. */\n\n",
-            tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
+/* Start the basecase from 3, since 1 is a special case, and if mul_basecase
+   is faster only at size==2 then we don't want to bother with extra code
+   just for that.  Start karatsuba from 4 same as MUL above.  */
+void
+tune_sqr (void)
+{
+  static struct param_t  param;
+  param.name[0] = "SQR_BASECASE_THRESHOLD";
+  param.name[1] = "SQR_KARATSUBA_THRESHOLD";
+  param.name[2] = "SQR_TOOM3_THRESHOLD";
+  param.function = speed_mpn_sqr_n;
+  param.min_is_always = 1;
+  param.second_start_min = 1;
+  param.min_size[0] = 3;
+  param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE);
+  param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE;
+  param.max_size[0] = TUNE_SQR_KARATSUBA_MAX;
+  param.max_size[1] = TUNE_SQR_KARATSUBA_MAX;
+  one (sqr_threshold, 3, &param);
+
+  /* disabled until tuned */
+  SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
+}
+
+
+void
+tune_sb_preinv (void)
+{
+  static struct param_t  param;
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always");
+      return;
+    }
+
+  param.check_size = 256;
+  param.min_size[0] = 3;
+  param.min_is_always = 1;
+  param.size_extra = 3;
+  param.stop_factor = 2.0;
+  param.name[0] = "DIV_SB_PREINV_THRESHOLD";
+  param.function = speed_mpn_sb_divrem_m3;
+  one (sb_preinv_threshold, 1, &param);
+}
+
+
+void
+tune_dc (void)
+{
+  static struct param_t  param;
+  param.name[0] = "DIV_DC_THRESHOLD";
+  param.function = speed_mpn_dc_tdiv_qr;
+  one (dc_threshold, 1, &param);
+}
+
+
+/* This is an indirect determination, based on a comparison between redc and
+   mpz_mod.  A fudge factor of 1.04 is applied to redc, to represent
+   additional overheads it gets in mpz_powm.
+
+   stop_factor is 1.1 to hopefully help cray vector systems, where otherwise
+   currently it hits the 1000 limb limit with only a factor of about 1.18
+   (threshold should be around 650).  */
+
+void
+tune_powm (void)
+{
+  static struct param_t  param;
+  param.name[0] = "POWM_THRESHOLD";
+  param.function = speed_redc;
+  param.function2 = speed_mpz_mod;
+  param.step_factor = 0.03;
+  param.stop_factor = 1.1;
+  param.function_fudge = 1.04;
+  one (powm_threshold, 1, &param);
+}
+
+
+void
+tune_gcd_accel (void)
+{
+  static struct param_t  param;
+  param.name[0] = "GCD_ACCEL_THRESHOLD";
+  param.function = speed_mpn_gcd;
+  param.min_size[0] = 1;
+  one (gcd_accel_threshold, 1, &param);
+}
+
+
+
+/* A comparison between the speed of a single limb step and a double limb
+   step is made.  On a 32-bit limb the ratio is about 2.2 single steps to
+   equal a double step, or on a 64-bit limb about 2.09.  (These were found
+   from counting the steps on a 10000 limb gcdext.  */
+void
+tune_gcdext (void)
+{
+  static struct param_t  param;
+  param.name[0] = "GCDEXT_THRESHOLD";
+  param.function = speed_mpn_gcdext_one_single;
+  param.function2 = speed_mpn_gcdext_one_double;
+  switch (BITS_PER_MP_LIMB) {
+  case 32: param.function_fudge = 2.2; break;
+  case 64: param.function_fudge = 2.09; break;
+  default:
+    printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n",
+            BITS_PER_MP_LIMB);
+    abort ();
   }
+  param.min_size[0] = 5;
+  param.min_is_always = 1;
+  param.max_size[0] = 300;
+  param.check_size = 300;
+  one (gcdext_threshold, 1, &param);
+}
 
+
+/* size_extra==1 reflects the fact that with high<divisor one division is
+   always skipped.  Forcing high<divisor while testing ensures consistency
+   while stepping through sizes, ie. that size-1 divides will be done each
+   time.
+
+   min_size==2 and min_is_always are used so that if plain division is only
+   better at size==1 then don't bother including that code just for that
+   case, instead go with preinv always and get a size saving.  */
+
+#define DIV_1_PARAMS                    \
+  param.check_size = 256;               \
+  param.min_size[0] = 2;                   \
+  param.min_is_always = 1;              \
+  param.data_high = DATA_HIGH_LT_R;     \
+  param.size_extra = 1;                 \
+  param.stop_factor = 2.0;
+
+
+double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
+
+void
+tune_divrem_1 (void)
+{
+  /* plain version by default */
+  tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
+
+#ifndef HAVE_NATIVE_mpn_divrem_1
+#define HAVE_NATIVE_mpn_divrem_1 0
+#endif
+
+  /* No support for tuning native assembler code, do that by hand and put
+     the results in the .asm file, there's no need for such thresholds to
+     appear in gmp-mparam.h.  */
+  if (HAVE_NATIVE_mpn_divrem_1)
+    return;
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
+      print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
+      return;
+    }
+
+  tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
+
+  /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
+     a bit out for the fractional part, but that's too bad, the integer part
+     is more important. */
   {
     static struct param_t  param;
-    param.name[0] = "KARATSUBA_MUL_THRESHOLD";
-    param.name[1] = "TOOM3_MUL_THRESHOLD";
-    param.max_size[1] = TOOM3_MUL_THRESHOLD_LIMIT;
-    one (speed_mpn_mul_n, mul_threshold, numberof(mul_threshold)-1, &param);
+    param.name[0] = "DIVREM_1_NORM_THRESHOLD";
+    DIV_1_PARAMS;
+    s.r = randlimb_norm ();
+    param.function = speed_mpn_divrem_1_tune;
+    one (divrem_1_norm_threshold, 1, &param);
   }
-  printf("\n");
-
   {
     static struct param_t  param;
-    param.name[0] = "KARATSUBA_SQR_THRESHOLD";
-    param.name[1] = "TOOM3_SQR_THRESHOLD";
-    param.max_size[0] = KARATSUBA_SQR_MAX;
-    one (speed_mpn_sqr_n, sqr_threshold, numberof(sqr_threshold)-1, &param);
+    param.name[0] = "DIVREM_1_UNNORM_THRESHOLD";
+    DIV_1_PARAMS;
+    s.r = randlimb_half ();
+    param.function = speed_mpn_divrem_1_tune;
+    one (divrem_1_unnorm_threshold, 1, &param);
   }
-  printf("\n");
+}
 
+
+double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
+
+void
+tune_mod_1 (void)
+{
+  /* plain version by default */
+  tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
+
+#ifndef HAVE_NATIVE_mpn_mod_1
+#define HAVE_NATIVE_mpn_mod_1 0
+#endif
+
+  /* No support for tuning native assembler code, do that by hand and put
+     the results in the .asm file, there's no need for such thresholds to
+     appear in gmp-mparam.h.  */
+  if (HAVE_NATIVE_mpn_mod_1)
+    return;
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define ("MOD_1_NORM_THRESHOLD", 0L);
+      print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
+      return;
+    }
+
+  tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
+
   {
     static struct param_t  param;
-    param.name[0] = "BZ_THRESHOLD";
-    one (speed_mpn_bz_tdiv_qr, bz_threshold, 1, &param);
+    param.name[0] = "MOD_1_NORM_THRESHOLD";
+    DIV_1_PARAMS;
+    s.r = randlimb_norm ();
+    param.function = speed_mpn_mod_1_tune;
+    one (mod_1_norm_threshold, 1, &param);
   }
-  printf("\n");
-
   {
     static struct param_t  param;
-    param.name[0] = "FIB_THRESHOLD";
-    one (speed_mpz_fib_ui, fib_threshold, 1, &param);
+    param.name[0] = "MOD_1_UNNORM_THRESHOLD";
+    DIV_1_PARAMS;
+    s.r = randlimb_half ();
+    param.function = speed_mpn_mod_1_tune;
+    one (mod_1_unnorm_threshold, 1, &param);
   }
-  printf("\n");
+}
 
-  /* mpz_powm becomes slow before long, so stop soon after the determined
-     threshold stops changing. */
+
+/* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
+   imply that udiv_qrnnd_preinv is worth using, but it seems most
+   straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
+   directly.  */
+
+void
+tune_preinv_divrem_1 (void)
+{
+  static struct param_t  param;
+  speed_function_t  divrem_1;
+  const char        *divrem_1_name;
+  double            t1, t2;
+
+#ifndef HAVE_NATIVE_mpn_preinv_divrem_1
+#define HAVE_NATIVE_mpn_preinv_divrem_1 0
+#endif
+
+  /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
+     it's faster than mpn_divrem_1.  */
+  if (HAVE_NATIVE_mpn_preinv_divrem_1)
+    {
+      print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
+      return;
+    }
+
+  /* If udiv_qrnnd_preinv is the only division method then of course
+     mpn_preinv_divrem_1 should be used.  */
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
+      return;
+    }
+
+  /* If we've got an assembler version of mpn_divrem_1, then compare against
+     that, not the mpn_divrem_1_div generic C.  */
+  if (HAVE_NATIVE_mpn_divrem_1)
+    {
+      divrem_1 = speed_mpn_divrem_1;
+      divrem_1_name = "mpn_divrem_1";
+    }
+  else
+    {
+      divrem_1 = speed_mpn_divrem_1_div;
+      divrem_1_name = "mpn_divrem_1_div";
+    }
+
+  param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
+  s.size = 200;                     /* generous but not too big */
+  /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
+     since in general that's probably most common, though in fact for a
+     64-bit limb mp_bases[10].big_base is normalized.  */
+  s.r = urandom() & (MP_LIMB_T_MAX >> 4);
+  if (s.r == 0) s.r = 123;
+
+  t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
+  t2 = tuneup_measure (divrem_1, &param, &s);
+  if (t1 == -1.0 || t2 == -1.0)
+    {
+      printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
+              divrem_1_name, s.size);
+      abort ();
+    }
+  if (option_trace >= 1)
+    printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
+            s.size, t1, divrem_1_name, t2);
+
+  print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
+}
+
+
+/* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
+   that udiv_qrnnd_preinv is worth using, but it seems most straightforward
+   to compare mpn_preinv_mod_1 and mpn_mod_1_div directly.  */
+
+void
+tune_preinv_mod_1 (void)
+{
+  static struct param_t  param;
+  speed_function_t  mod_1;
+  const char        *mod_1_name;
+  double            t1, t2;
+
+#ifndef HAVE_NATIVE_mpn_preinv_mod_1
+#define HAVE_NATIVE_mpn_preinv_mod_1 0
+#endif
+
+  /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
+     faster than mpn_mod_1.  */
+  if (HAVE_NATIVE_mpn_preinv_mod_1)
+    {
+      print_define_remark ("USE_PREINV_MOD_1", 1, "native");
+      return;
+    }
+
+  /* If udiv_qrnnd_preinv is the only division method then of course
+     mpn_preinv_mod_1 should be used.  */
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
+      return;
+    }
+
+  /* If we've got an assembler version of mpn_mod_1, then compare against
+     that, not the mpn_mod_1_div generic C.  */
+  if (HAVE_NATIVE_mpn_mod_1)
+    {
+      mod_1 = speed_mpn_mod_1;
+      mod_1_name = "mpn_mod_1";
+    }
+  else
+    {
+      mod_1 = speed_mpn_mod_1_div;
+      mod_1_name = "mpn_mod_1_div";
+    }
+
+  param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
+  s.size = 200;                     /* generous but not too big */
+  s.r = randlimb_norm();            /* divisor */
+
+  t1 = tuneup_measure (speed_mpn_preinv_mod_1, &param, &s);
+  t2 = tuneup_measure (mod_1, &param, &s);
+  if (t1 == -1.0 || t2 == -1.0)
+    {
+      printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
+              mod_1_name, s.size);
+      abort ();
+    }
+  if (option_trace >= 1)
+    printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
+            s.size, t1, mod_1_name, t2);
+
+  print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
+}
+
+
+void
+tune_divrem_2 (void)
+{
+  static struct param_t  param;
+
+#ifndef HAVE_NATIVE_mpn_divrem_2
+#define HAVE_NATIVE_mpn_divrem_2 0
+#endif
+
+  /* No support for tuning native assembler code, do that by hand and put
+     the results in the .asm file, and there's no need for such thresholds
+     to appear in gmp-mparam.h.  */
+  if (HAVE_NATIVE_mpn_divrem_2)
+    return;
+
+  if (UDIV_PREINV_ALWAYS)
+    {
+      print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
+      return;
+    }
+
+  /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
+     a bit out for the fractional part, but that's too bad, the integer part
+     is more important.
+
+     min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
+     code space if plain division is better only at size==2 or size==3. */
+  param.name[0] = "DIVREM_2_THRESHOLD";
+  param.check_size = 256;
+  param.min_size[0] = 4;
+  param.min_is_always = 1;
+  param.size_extra = 2;      /* does qsize==nsize-2 divisions */
+  param.stop_factor = 2.0;
+
+  s.r = randlimb_norm ();
+  param.function = speed_mpn_divrem_2;
+  one (divrem_2_threshold, 1, &param);
+}
+
+
+/* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
+   tune for that.  Its speed can differ on odd or even divisor, so take an
+   average threshold for the two.
+
+   mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
+   might not vary that way, but don't test this since high<divisor isn't
+   expected to occur often with small divisors.  */
+
+void
+tune_divexact_1 (void)
+{
+  static struct param_t  param;
+  mp_size_t  thresh[2], average;
+  int        low, i;
+
+  ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
+
+  param.name[0] = "DIVEXACT_1_THRESHOLD";
+  param.data_high = DATA_HIGH_GE_R;
+  param.check_size = 256;
+  param.min_size[0] = 2;
+  param.stop_factor = 1.5;
+  param.function  = tuned_speed_mpn_divrem_1;
+  param.function2 = speed_mpn_divexact_1;
+  param.noprint = 1;
+
+  print_define_start (param.name[0]);
+
+  for (low = 0; low <= 1; low++)
+    {
+      s.r = randlimb_half();
+      if (low == 0)
+        s.r |= 1;
+      else
+        s.r &= ~CNST_LIMB(7);
+
+      one (divexact_1_threshold, 1, &param);
+      if (option_trace)
+        printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]);
+
+      if (divexact_1_threshold[0] == MP_SIZE_T_MAX)
+        {
+          average = MP_SIZE_T_MAX;
+          goto divexact_1_done;
+        }
+
+      thresh[low] = divexact_1_threshold[0];
+    }
+
+  if (option_trace)
+    {
+      printf ("average of:");
+      for (i = 0; i < numberof(thresh); i++)
+        printf (" %ld", thresh[i]);
+      printf ("\n");
+    }
+
+  average = 0;
+  for (i = 0; i < numberof(thresh); i++)
+    average += thresh[i];
+  average /= numberof(thresh);
+
+  /* If divexact turns out to be better as early as 3 limbs, then use it
+     always, so as to reduce code size and conditional jumps.  */
+  if (average <= 3)
+    average = 0;
+
+ divexact_1_done:
+  print_define_end (param.name[0], average);
+}
+
+
+/* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
+   same as mpn_mod_1, but this might not be true of an assembler
+   implementation.  The threshold used is an average based on data where a
+   divide can be skipped and where it can't.
+
+   If modexact turns out to be better as early as 3 limbs, then use it
+   always, so as to reduce code size and conditional jumps.  */
+
+void
+tune_modexact_1_odd (void)
+{
+  static struct param_t  param;
+  mp_size_t  thresh_lt;
+
+  ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
+
+  param.name[0] = "MODEXACT_1_ODD_THRESHOLD";
+  param.check_size = 256;
+  param.min_size[0] = 2;
+  param.stop_factor = 1.5;
+  param.function  = tuned_speed_mpn_mod_1;
+  param.function2 = speed_mpn_modexact_1c_odd;
+  param.noprint = 1;
+  s.r = randlimb_half () | 1;
+
+  print_define_start (param.name[0]);
+
+  param.data_high = DATA_HIGH_LT_R;
+  one (modexact_1_odd_threshold, 1, &param);
+  if (option_trace)
+    printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]);
+
+  thresh_lt = modexact_1_odd_threshold[0];
+  if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
+    {
+      param.data_high = DATA_HIGH_GE_R;
+      one (modexact_1_odd_threshold, 1, &param);
+      if (option_trace)
+        printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]);
+
+      if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
+        {
+          modexact_1_odd_threshold[0]
+            = (modexact_1_odd_threshold[0] + thresh_lt) / 2;
+          if (modexact_1_odd_threshold[0] <= 3)
+            modexact_1_odd_threshold[0] = 0;
+        }
+    }
+
+  print_define_end (param.name[0], modexact_1_odd_threshold[0]);
+}
+
+
+void
+tune_jacobi_base (void)
+{
+  static struct param_t  param;
+  double   t1, t2, t3;
+  int      method;
+
+  s.size = BITS_PER_MP_LIMB * 3 / 4;
+
+  t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
+  if (option_trace >= 1)
+    printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1);
+
+  t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
+  if (option_trace >= 1)
+    printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2);
+
+  t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
+  if (option_trace >= 1)
+    printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3);
+
+  if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
+    {
+      printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
+              s.size);
+      abort ();
+    }
+
+  if (t1 < t2 && t1 < t3)
+    method = 1;
+  else if (t2 < t3)
+    method = 2;
+  else
+    method = 3;
+
+  print_define ("JACOBI_BASE_METHOD", method);
+}
+
+
+void
+tune_get_str (void)
+{
+  /* Tune for decimal, it being most common.  Some rough testing suggests
+     other bases are different, but not by very much.  */
+  s.r = 10;
   {
     static struct param_t  param;
-    param.name[0] = "POWM_THRESHOLD";
-    param.stop_since_change = 15;
-    one (speed_mpz_powm, powm_threshold, 1, &param);
+    get_str_precompute_threshold[0] = 0;
+    param.name[0] = "GET_STR_DC_THRESHOLD";
+    param.function = speed_mpn_get_str;
+    param.min_size[0] = 2;
+    param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
+    one (get_str_basecase_threshold, 1, &param);
   }
-  printf("\n");
-
   {
     static struct param_t  param;
-    param.name[0] = "GCD_ACCEL_THRESHOLD";
-    param.min_size = 1;
-    one (speed_mpn_gcd, gcd_accel_threshold, 1, &param);
+    param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD";
+    param.function = speed_mpn_get_str;
+    param.min_size[0] = get_str_basecase_threshold[0];
+    param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
+    one (get_str_precompute_threshold, 1, &param);
   }
+}
+
+
+void
+tune_set_str (void)
+{
+  static struct param_t  param;
+
+  s.r = 10;  /* decimal */
+  param.step_factor = 0.04;
+  param.name[0] = "SET_STR_THRESHOLD";
+  param.function = speed_mpn_set_str_basecase;
+  param.function2 = speed_mpn_set_str_subquad;
+  param.min_size[0] = 100;
+  param.max_size[0] = 150000;
+  one (set_str_threshold, 1, &param);
+}
+
+
+void
+tune_fft_mul (void)
+{
+  static struct fft_param_t  param;
+
+  if (option_fft_max_size == 0)
+    return;
+
+  param.table_name          = "MUL_FFT_TABLE";
+  param.threshold_name      = "MUL_FFT_THRESHOLD";
+  param.p_threshold         = &MUL_FFT_THRESHOLD;
+  param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
+  param.p_modf_threshold    = &MUL_FFT_MODF_THRESHOLD;
+  param.first_size          = MUL_TOOM3_THRESHOLD / 2;
+  param.max_size            = option_fft_max_size;
+  param.function            = speed_mpn_mul_fft;
+  param.mul_function        = speed_mpn_mul_n;
+  param.sqr = 0;
+  fft (&param);
+}
+
+
+void
+tune_fft_sqr (void)
+{
+  static struct fft_param_t  param;
+
+  if (option_fft_max_size == 0)
+    return;
+
+  param.table_name          = "SQR_FFT_TABLE";
+  param.threshold_name      = "SQR_FFT_THRESHOLD";
+  param.p_threshold         = &SQR_FFT_THRESHOLD;
+  param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
+  param.p_modf_threshold    = &SQR_FFT_MODF_THRESHOLD;
+  param.first_size          = SQR_TOOM3_THRESHOLD / 2;
+  param.max_size            = option_fft_max_size;
+  param.function            = speed_mpn_mul_fft_sqr;
+  param.mul_function        = speed_mpn_sqr_n;
+  param.sqr = 0;
+  fft (&param);
+}
+
+
+void
+all (void)
+{
+  time_t  start_time, end_time;
+  TMP_DECL (marker);
+
+  TMP_MARK (marker);
+  s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+  s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
+
+  mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
+  mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
+
+  fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
+
+  speed_time_init ();
+  fprintf (stderr, "Using: %s\n", speed_time_string);
+
+  fprintf (stderr, "speed_precision %d", speed_precision);
+  if (speed_unittime == 1.0)
+    fprintf (stderr, ", speed_unittime 1 cycle");
+  else
+    fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
+  if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
+    fprintf (stderr, ", CPU freq unknown\n");
+  else
+    fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
+
+  fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
+           DEFAULT_MAX_SIZE, option_fft_max_size);
+  fprintf (stderr, "\n");
+
+  time (&start_time);
   {
-    static struct param_t  param;
-    param.name[0] = "GCDEXT_THRESHOLD";
-    param.min_size = 1;
-    param.max_size[0] = 200;
-    one (speed_mpn_gcdext, gcdext_threshold, 1, &param);
+    struct tm  *tp;
+    tp = localtime (&start_time);
+    printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
+            tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
+
+#ifdef __GNUC__
+    /* gcc sub-minor version doesn't seem to come through as a define */
+    printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
+#define PRINTED_COMPILER
+#endif
+#if defined (__SUNPRO_C)
+    printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
+#define PRINTED_COMPILER
+#endif
+#if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
+    /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
+    printf ("MIPSpro C %d.%d.%d */\n",
+	    _COMPILER_VERSION / 100,
+	    _COMPILER_VERSION / 10 % 10,
+	    _COMPILER_VERSION % 10);
+#define PRINTED_COMPILER
+#endif
+#if defined (__DECC) && defined (__DECC_VER)
+    printf ("DEC C %d */\n", __DECC_VER);
+#define PRINTED_COMPILER
+#endif
+#if ! defined (PRINTED_COMPILER)
+    printf ("system compiler */\n");
+#endif
   }
+  printf ("\n");
+
+  tune_mul ();
   printf("\n");
 
-  if (option_fft_max_size != 0)
-    {
-      {
-        static struct fft_param_t  param;
-        param.table_name          = "FFT_MUL_TABLE";
-        param.threshold_name      = "FFT_MUL_THRESHOLD";
-        param.p_threshold         = &FFT_MUL_THRESHOLD;
-        param.modf_threshold_name = "FFT_MODF_MUL_THRESHOLD";
-        param.p_modf_threshold    = &FFT_MODF_MUL_THRESHOLD;
-        param.first_size          = TOOM3_MUL_THRESHOLD / 2;
-        param.max_size            = option_fft_max_size;
-        param.function            = speed_mpn_mul_fft;
-        param.mul_function        = speed_mpn_mul_n;
-        param.sqr = 0;
-        fft (&param);
-      }
-      printf("\n");
-      {
-        static struct fft_param_t  param;
-        param.table_name          = "FFT_SQR_TABLE";
-        param.threshold_name      = "FFT_SQR_THRESHOLD";
-        param.p_threshold         = &FFT_SQR_THRESHOLD;
-        param.modf_threshold_name = "FFT_MODF_SQR_THRESHOLD";
-        param.p_modf_threshold    = &FFT_MODF_SQR_THRESHOLD;
-        param.first_size          = TOOM3_SQR_THRESHOLD / 2;
-        param.max_size            = option_fft_max_size;
-        param.function            = speed_mpn_mul_fft_sqr;
-        param.mul_function        = speed_mpn_sqr_n;
-        param.sqr = 0;
-        fft (&param);
-      }
-      printf ("\n");
-    }
+  tune_sqr ();
+  printf("\n");
 
+  tune_sb_preinv ();
+  tune_dc ();
+  tune_powm ();
+  printf("\n");
+
+  tune_gcd_accel ();
+  tune_gcdext ();
+  tune_jacobi_base ();
+  printf("\n");
+
+  tune_divrem_1 ();
+  tune_mod_1 ();
+  tune_preinv_divrem_1 ();
+  tune_preinv_mod_1 ();
+  tune_divrem_2 ();
+  tune_divexact_1 ();
+  tune_modexact_1_odd ();
+  printf("\n");
+
+  tune_get_str ();
+  tune_set_str ();
+  printf("\n");
+
+  tune_fft_mul ();
+  printf("\n");
+
+  tune_fft_sqr ();
+  printf ("\n");
+
+  time (&end_time);
+  printf ("/* Tuneup completed successfully, took %ld seconds */\n",
+          end_time - start_time);
+
   TMP_FREE (marker);
 }
 
@@ -740,7 +1646,7 @@ main (int argc, char *argv[])
         exit(1);
       }
     }
-		
+
   all ();
-  return 0;
+  exit (0);
 }