Annotation of OpenXM_contrib/gmp/tune/tuneup.c, Revision 1.1.1.2
1.1.1.2 ! ohara 1: /* Create tuned thresholds for various algorithms.
1.1 maekawa 2:
1.1.1.2 ! ohara 3: Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4:
5: This file is part of the GNU MP Library.
6:
7: The GNU MP Library is free software; you can redistribute it and/or modify
8: it under the terms of the GNU Lesser General Public License as published by
9: the Free Software Foundation; either version 2.1 of the License, or (at your
10: option) any later version.
11:
12: The GNU MP Library is distributed in the hope that it will be useful, but
13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: License for more details.
16:
17: You should have received a copy of the GNU Lesser General Public License
18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
1.1.1.2 ! ohara 20: MA 02111-1307, USA. */
! 21:
1.1 maekawa 22:
23: /* Usage: tune [-t] [-t] [-p precision]
24:
25: -t turns on some diagnostic traces, a second -t turns on more traces.
26:
27: The thresholds are determined as follows. A crossover may not be a
28: single size but rather a range where it oscillates between method A or
29: method B faster. If the threshold is set making B used where A is faster
30: (or vice versa) that's bad. Badness is the percentage time lost and
31: total badness is the sum of this over all sizes measured. The threshold
32: is set to minimize total badness.
33:
34: Suppose, as sizes increase, method B becomes faster than method A. The
35: effect of the rule is that, as you look at increasing sizes, isolated
36: points where B is faster are ignored, but when it's consistently faster,
37: or faster on balance, then the threshold is set there. The same result
38: is obtained thinking in the other direction of A becoming faster at
39: smaller sizes.
40:
41: In practice the thresholds tend to be chosen to bring on the next
42: algorithm fairly quickly.
43:
44: This rule is attractive because it's got a basis in reason and is fairly
45: easy to implement, but no work has been done to actually compare it in
46: absolute terms to other possibilities.
47:
48: Sometimes running the program twice produces slightly different results.
49: This is probably because there's so little separating algorithms near
50: their crossover, and on that basis it should make little or no difference
51: to the final speed of the relevant routines, but nothing has been done to
52: check that carefully.
53:
1.1.1.2 ! ohara 54: Remarks:
! 55:
! 56: The code here isn't a vision of loveliness, mainly because it's subject
! 57: to ongoing modifications according to new things wanting to be tuned and
! 58: practical requirements of systems tested.
! 59:
! 60: The way parts of the library are recompiled to insinuate the tuning
! 61: variables is a bit subtle, but unavoidable since of course the main
! 62: library has fixed thresholds compiled-in but we want to vary them here.
! 63: Most of the nonsense for this can be found in tune/Makefile.am and under
! 64: TUNE_PROGRAM_BUILD in gmp-impl.h.
! 65:
! 66: The dirty hack which the "second_start_min" feature could perhaps be done
! 67: more generally, so if say karatsuba is never better than toom3 then it
! 68: can be detected and omitted. Currently we're hoping very hard that this
! 69: doesn't arise in practice, and if it does then it indicates something
! 70: badly sub-optimal in the karatsuba implementation.
! 71:
1.1 maekawa 72: Limitations:
1.1.1.2 ! ohara 73:
1.1 maekawa 74: The FFTs aren't subject to the same badness rule as the other thresholds,
75: so each k is probably being brought on a touch early. This isn't likely
76: to make a difference, and the simpler probing means fewer tests.
77:
78: */
79:
1.1.1.2 ! ohara 80: #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
! 81:
! 82: #include "config.h"
1.1 maekawa 83:
84: #include <math.h>
85: #include <stdio.h>
86: #include <stdlib.h>
87: #include <time.h>
1.1.1.2 ! ohara 88: #if HAVE_UNISTD_H
1.1 maekawa 89: #include <unistd.h>
1.1.1.2 ! ohara 90: #endif
1.1 maekawa 91:
92: #include "gmp.h"
93: #include "gmp-impl.h"
1.1.1.2 ! ohara 94: #include "longlong.h"
1.1 maekawa 95:
1.1.1.2 ! ohara 96: #include "tests.h"
1.1 maekawa 97: #include "speed.h"
98:
99: #if !HAVE_DECL_OPTARG
100: extern char *optarg;
101: extern int optind, opterr;
102: #endif
103:
104:
1.1.1.2 ! ohara 105: #define DEFAULT_MAX_SIZE 1000 /* limbs */
! 106: #define MAX_TABLE 5 /* entries */
1.1 maekawa 107:
108: #if WANT_FFT
109: mp_size_t option_fft_max_size = 50000; /* limbs */
110: #else
111: mp_size_t option_fft_max_size = 0;
112: #endif
113: int option_trace = 0;
114: int option_fft_trace = 0;
115: struct speed_params s;
116:
117: struct dat_t {
118: mp_size_t size;
119: double d;
120: } *dat = NULL;
121: int ndat = 0;
122: int allocdat = 0;
123:
124:
125: /* Each "_threshold" array must be 1 bigger than the number of thresholds
1.1.1.2 ! ohara 126: being tuned in a set, because one() stores a value in the entry above
1.1 maekawa 127: the one it's determining. */
128:
129: mp_size_t mul_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
130: mp_size_t sqr_threshold[MAX_TABLE+1] = { MP_SIZE_T_MAX };
1.1.1.2 ! ohara 131: mp_size_t sb_preinv_threshold[2] = { MP_SIZE_T_MAX };
! 132: mp_size_t dc_threshold[2] = { MP_SIZE_T_MAX };
1.1 maekawa 133: mp_size_t powm_threshold[2] = { MP_SIZE_T_MAX };
134: mp_size_t gcd_accel_threshold[2] = { MP_SIZE_T_MAX };
135: mp_size_t gcdext_threshold[2] = { MP_SIZE_T_MAX };
1.1.1.2 ! ohara 136: mp_size_t divexact_1_threshold[2] = { MP_SIZE_T_MAX };
! 137: mp_size_t divrem_1_norm_threshold[2] = { MP_SIZE_T_MAX };
! 138: mp_size_t divrem_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
! 139: mp_size_t divrem_2_threshold[2] = { MP_SIZE_T_MAX };
! 140: mp_size_t mod_1_norm_threshold[2] = { MP_SIZE_T_MAX };
! 141: mp_size_t mod_1_unnorm_threshold[2] = { MP_SIZE_T_MAX };
! 142: mp_size_t modexact_1_odd_threshold[2] = { MP_SIZE_T_MAX };
! 143: mp_size_t get_str_basecase_threshold[2] = { MP_SIZE_T_MAX };
! 144: mp_size_t get_str_precompute_threshold[2] = { MP_SIZE_T_MAX };
! 145: mp_size_t set_str_threshold[2] = { MP_SIZE_T_MAX };
1.1 maekawa 146:
1.1.1.2 ! ohara 147: mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
! 148: mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
1.1 maekawa 149:
1.1.1.2 ! ohara 150: #ifndef TUNE_SQR_KARATSUBA_MAX
! 151: #define TUNE_SQR_KARATSUBA_MAX 0 /* meaning no limit */
1.1 maekawa 152: #endif
153:
154: struct param_t {
1.1.1.2 ! ohara 155: const char *name[MAX_TABLE];
! 156: speed_function_t function;
! 157: speed_function_t function2;
! 158: double step_factor; /* how much to step sizes (rounded down) */
! 159: double function_fudge; /* multiplier for "function" speeds */
! 160: int stop_since_change;
! 161: double stop_factor;
! 162: mp_size_t min_size[MAX_TABLE];
! 163: int min_is_always;
! 164: int second_start_min;
! 165: mp_size_t max_size[MAX_TABLE];
! 166: mp_size_t check_size;
! 167: mp_size_t size_extra;
! 168:
! 169: #define DATA_HIGH_LT_R 1
! 170: #define DATA_HIGH_GE_R 2
! 171: int data_high;
! 172:
! 173: int noprint;
1.1 maekawa 174: };
175:
1.1.1.2 ! ohara 176: #ifndef UDIV_PREINV_ALWAYS
! 177: #define UDIV_PREINV_ALWAYS 0
! 178: #endif
! 179:
! 180:
! 181: mp_limb_t
! 182: randlimb_norm (void)
! 183: {
! 184: mp_limb_t n;
! 185: mpn_random (&n, 1);
! 186: n |= GMP_LIMB_HIGHBIT;
! 187: return n;
! 188: }
! 189:
! 190: #define MP_LIMB_T_HALFMASK ((CNST_LIMB(1) << (BITS_PER_MP_LIMB/2)) - 1)
! 191:
! 192: mp_limb_t
! 193: randlimb_half (void)
! 194: {
! 195: mp_limb_t n;
! 196: mpn_random (&n, 1);
! 197: n &= MP_LIMB_T_HALFMASK;
! 198: n += (n==0);
! 199: return n;
! 200: }
! 201:
1.1 maekawa 202:
203: /* Add an entry to the end of the dat[] array, reallocing to make it bigger
204: if necessary. */
205: void
206: add_dat (mp_size_t size, double d)
207: {
208: #define ALLOCDAT_STEP 500
209:
210: ASSERT_ALWAYS (ndat <= allocdat);
211:
212: if (ndat == allocdat)
213: {
1.1.1.2 ! ohara 214: dat = (struct dat_t *) __gmp_allocate_or_reallocate
1.1 maekawa 215: (dat, allocdat * sizeof(dat[0]),
216: (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
217: allocdat += ALLOCDAT_STEP;
218: }
219:
220: dat[ndat].size = size;
221: dat[ndat].d = d;
222: ndat++;
223: }
224:
225:
226: /* Return the threshold size based on the data accumulated. */
227: mp_size_t
228: analyze_dat (int i, int final)
229: {
230: double x, min_x;
231: int j, min_j;
232:
233: /* If the threshold is set at dat[0].size, any positive values are bad. */
234: x = 0.0;
235: for (j = 0; j < ndat; j++)
236: if (dat[j].d > 0.0)
237: x += dat[j].d;
238:
239: if (option_trace >= 2 && final)
240: {
241: printf ("\n");
242: printf ("x is the sum of the badness from setting thresh at given size\n");
243: printf (" (minimum x is sought)\n");
244: printf ("i=%d size=%ld first x=%.4f\n", i, dat[j].size, x);
245: }
246:
247: min_x = x;
248: min_j = 0;
249:
250:
251: /* When stepping to the next dat[j].size, positive values are no longer
252: bad (so subtracted), negative values become bad (so add the absolute
253: value, meaning subtract). */
254: for (j = 0; j < ndat; x -= dat[j].d, j++)
255: {
256: if (option_trace >= 2 && final)
257: printf ("i=%d size=%ld x=%.4f\n", i, dat[j].size, x);
258:
259: if (x < min_x)
260: {
261: min_x = x;
262: min_j = j;
263: }
264: }
1.1.1.2 ! ohara 265:
1.1 maekawa 266: return min_j;
267: }
268:
269:
1.1.1.2 ! ohara 270: /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
! 271:
! 272: mp_limb_t mpn_divrem_1_tune _PROTO ((mp_ptr qp, mp_size_t xsize,
! 273: mp_srcptr ap, mp_size_t size,
! 274: mp_limb_t d));
! 275: mp_limb_t mpn_mod_1_tune _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d));
! 276:
! 277: double
! 278: speed_mpn_mod_1_tune (struct speed_params *s)
! 279: {
! 280: SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
! 281: }
! 282: double
! 283: speed_mpn_divrem_1_tune (struct speed_params *s)
! 284: {
! 285: SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
! 286: }
! 287:
! 288:
1.1 maekawa 289: double
1.1.1.2 ! ohara 290: tuneup_measure (speed_function_t fun,
! 291: const struct param_t *param,
! 292: struct speed_params *s)
1.1 maekawa 293: {
1.1.1.2 ! ohara 294: static struct param_t dummy;
1.1 maekawa 295: double t;
296: TMP_DECL (marker);
297:
1.1.1.2 ! ohara 298: if (! param)
! 299: param = &dummy;
! 300:
! 301: s->size += param->size_extra;
! 302:
1.1 maekawa 303: TMP_MARK (marker);
304: s->xp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
305: s->yp = SPEED_TMP_ALLOC_LIMBS (s->size, 0);
306:
307: mpn_random (s->xp, s->size);
308: mpn_random (s->yp, s->size);
309:
1.1.1.2 ! ohara 310: switch (param->data_high) {
! 311: case DATA_HIGH_LT_R:
! 312: s->xp[s->size-1] %= s->r;
! 313: s->yp[s->size-1] %= s->r;
! 314: break;
! 315: case DATA_HIGH_GE_R:
! 316: s->xp[s->size-1] |= s->r;
! 317: s->yp[s->size-1] |= s->r;
! 318: break;
! 319: }
! 320:
1.1 maekawa 321: t = speed_measure (fun, s);
322:
1.1.1.2 ! ohara 323: s->size -= param->size_extra;
! 324:
1.1 maekawa 325: TMP_FREE (marker);
326: return t;
1.1.1.2 ! ohara 327: }
1.1 maekawa 328:
329:
1.1.1.2 ! ohara 330: #define PRINT_WIDTH 28
! 331:
1.1 maekawa 332: void
1.1.1.2 ! ohara 333: print_define_start (const char *name)
! 334: {
! 335: printf ("#define %-*s ", PRINT_WIDTH, name);
! 336: if (option_trace)
! 337: printf ("...\n");
! 338: }
! 339:
! 340: void
! 341: print_define_end_remark (const char *name, mp_size_t value, const char *remark)
! 342: {
! 343: if (option_trace)
! 344: printf ("#define %-*s ", PRINT_WIDTH, name);
! 345:
! 346: if (value == MP_SIZE_T_MAX)
! 347: printf ("MP_SIZE_T_MAX");
! 348: else
! 349: printf ("%5ld", value);
! 350:
! 351: if (remark != NULL)
! 352: printf (" /* %s */", remark);
! 353: printf ("\n");
! 354: }
! 355:
! 356: void
! 357: print_define_end (const char *name, mp_size_t value)
1.1 maekawa 358: {
1.1.1.2 ! ohara 359: const char *remark;
1.1 maekawa 360: if (value == MP_SIZE_T_MAX)
1.1.1.2 ! ohara 361: remark = "never";
! 362: else if (value == 0)
! 363: remark = "always";
1.1 maekawa 364: else
1.1.1.2 ! ohara 365: remark = NULL;
! 366: print_define_end_remark (name, value, remark);
! 367: }
! 368:
! 369: void
! 370: print_define (const char *name, mp_size_t value)
! 371: {
! 372: print_define_start (name);
! 373: print_define_end (name, value);
! 374: }
! 375:
! 376: void
! 377: print_define_remark (const char *name, mp_size_t value, const char *remark)
! 378: {
! 379: print_define_start (name);
! 380: print_define_end_remark (name, value, remark);
1.1 maekawa 381: }
382:
383:
384: /* table[i+1] needs to be set to a sensible value when testing method i+1
1.1.1.2 ! ohara 385: because mpn_mul_n uses MUL_TOOM3_THRESHOLD to size the temporary
1.1 maekawa 386: workspace for mpn_kara_mul_n. */
387:
388: void
1.1.1.2 ! ohara 389: one (mp_size_t table[], size_t max_table, struct param_t *param)
1.1 maekawa 390: {
1.1.1.2 ! ohara 391: mp_size_t table_save0 = 0;
! 392: int since_positive, since_thresh_change;
! 393: int thresh_idx, new_thresh_idx;
1.1 maekawa 394: int i;
395:
1.1.1.2 ! ohara 396: ASSERT_ALWAYS (max_table <= MAX_TABLE);
1.1 maekawa 397:
1.1.1.2 ! ohara 398: #define DEFAULT(x,n) if (! (param->x)) param->x = (n);
1.1 maekawa 399:
1.1.1.2 ! ohara 400: DEFAULT (function_fudge, 1.0);
! 401: DEFAULT (function2, param->function);
! 402: DEFAULT (step_factor, 0.01); /* small steps by default */
1.1 maekawa 403: DEFAULT (stop_since_change, 80);
1.1.1.2 ! ohara 404: DEFAULT (stop_factor, 1.2);
! 405: for (i = 0; i < max_table; i++)
! 406: DEFAULT (min_size[i], 10);
! 407: for (i = 0; i < max_table; i++)
! 408: DEFAULT (max_size[i], DEFAULT_MAX_SIZE);
! 409:
! 410: if (param->check_size != 0)
! 411: {
! 412: double t1, t2;
! 413: s.size = param->check_size;
1.1 maekawa 414:
1.1.1.2 ! ohara 415: table[0] = s.size+1;
! 416: table[1] = param->max_size[0];
! 417: t1 = tuneup_measure (param->function, param, &s);
! 418:
! 419: table[0] = s.size;
! 420: table[1] = s.size+1;
! 421: t2 = tuneup_measure (param->function2, param, &s);
! 422: if (t1 == -1.0 || t2 == -1.0)
! 423: {
! 424: printf ("Oops, can't run both functions at size %ld\n", s.size);
! 425: abort ();
! 426: }
! 427: t1 *= param->function_fudge;
1.1 maekawa 428:
1.1.1.2 ! ohara 429: /* ask that t2 is at least 4% below t1 */
! 430: if (t1 < t2*1.04)
! 431: {
! 432: if (option_trace)
! 433: printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
! 434: table[0] = MP_SIZE_T_MAX;
! 435: if (! param->noprint)
! 436: print_define (param->name[0], table[0]);
! 437: return;
! 438: }
! 439:
! 440: if (option_trace >= 2)
! 441: printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
! 442: s.size, t1, t2);
! 443: }
! 444:
! 445: for (i = 0, s.size = 1; i < max_table && s.size < param->max_size[i]; i++)
1.1 maekawa 446: {
1.1.1.2 ! ohara 447: if (i == 1 && param->second_start_min)
! 448: s.size = 1;
! 449:
! 450: if (s.size < param->min_size[i])
! 451: s.size = param->min_size[i];
! 452:
! 453: if (! (param->noprint || (i == 1 && param->second_start_min)))
! 454: print_define_start (param->name[i]);
1.1 maekawa 455:
456: ndat = 0;
457: since_positive = 0;
458: since_thresh_change = 0;
459: thresh_idx = 0;
460:
461: if (option_trace >= 2)
462: {
463: printf (" algorithm-A algorithm-B ratio possible\n");
464: printf (" (seconds) (seconds) diff thresh\n");
465: }
466:
1.1.1.2 ! ohara 467: for (;
! 468: s.size < param->max_size[i];
! 469: s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1))
1.1 maekawa 470: {
471: double ti, tiplus1, d;
472:
473: /* If there's a size limit and it's reached then it should still
474: be sensible to analyze the data since we want the threshold put
475: either at or near the limit. */
476: if (s.size >= param->max_size[i])
477: {
478: if (option_trace)
479: printf ("Reached maximum size (%ld) without otherwise stopping\n",
480: param->max_size[i]);
481: break;
482: }
483:
484: /*
485: FIXME: check minimum size requirements are met, possibly by just
486: checking for the -1 returns from the speed functions.
487: */
488:
1.1.1.2 ! ohara 489: /* under this hack, don't let method 0 get used at s.size */
! 490: if (i == 1 && param->second_start_min)
! 491: table[0] = MIN (s.size-1, table_save0);
! 492:
1.1 maekawa 493: /* using method i at this size */
494: table[i] = s.size+1;
1.1.1.2 ! ohara 495: table[i+1] = param->max_size[i];
! 496: ti = tuneup_measure (param->function, param, &s);
1.1 maekawa 497: if (ti == -1.0)
498: abort ();
1.1.1.2 ! ohara 499: ti *= param->function_fudge;
1.1 maekawa 500:
501: /* using method i+1 at this size */
502: table[i] = s.size;
503: table[i+1] = s.size+1;
1.1.1.2 ! ohara 504: tiplus1 = tuneup_measure (param->function2, param, &s);
1.1 maekawa 505: if (tiplus1 == -1.0)
506: abort ();
507:
508: /* Calculate the fraction by which the one or the other routine is
509: slower. */
510: if (tiplus1 >= ti)
511: d = (tiplus1 - ti) / tiplus1; /* negative */
512: else
513: d = (tiplus1 - ti) / ti; /* positive */
514:
515: add_dat (s.size, d);
516:
517: new_thresh_idx = analyze_dat (i, 0);
518:
519:
520: if (option_trace >= 2)
1.1.1.2 ! ohara 521: printf ("i=%d size=%ld %.9f %.9f % .4f %c %ld\n",
1.1 maekawa 522: i, s.size, ti, tiplus1, d,
523: ti > tiplus1 ? '#' : ' ',
524: dat[new_thresh_idx].size);
525:
526: /* Stop if the last time method i was faster was more than a
527: certain number of measurements ago. */
528: #define STOP_SINCE_POSITIVE 200
529: if (d >= 0)
1.1.1.2 ! ohara 530: since_positive = 0;
1.1 maekawa 531: else
532: if (++since_positive > STOP_SINCE_POSITIVE)
533: {
534: if (option_trace >= 1)
535: printf ("i=%d stopped due to since_positive (%d)\n",
536: i, STOP_SINCE_POSITIVE);
537: break;
538: }
539:
540: /* Stop if method i has become slower by a certain factor. */
1.1.1.2 ! ohara 541: if (ti >= tiplus1 * param->stop_factor)
1.1 maekawa 542: {
543: if (option_trace >= 1)
544: printf ("i=%d stopped due to ti >= tiplus1 * factor (%.1f)\n",
1.1.1.2 ! ohara 545: i, param->stop_factor);
1.1 maekawa 546: break;
547: }
548:
549: /* Stop if the threshold implied hasn't changed in a certain
550: number of measurements. (It's this condition that ususally
551: stops the loop.) */
552: if (thresh_idx != new_thresh_idx)
553: since_thresh_change = 0, thresh_idx = new_thresh_idx;
554: else
555: if (++since_thresh_change > param->stop_since_change)
556: {
557: if (option_trace >= 1)
558: printf ("i=%d stopped due to since_thresh_change (%d)\n",
559: i, param->stop_since_change);
560: break;
561: }
562:
563: /* Stop if the threshold implied is more than a certain number of
564: measurements ago. */
565: #define STOP_SINCE_AFTER 500
566: if (ndat - thresh_idx > STOP_SINCE_AFTER)
567: {
568: if (option_trace >= 1)
569: printf ("i=%d stopped due to ndat - thresh_idx > amount (%d)\n",
570: i, STOP_SINCE_AFTER);
571: break;
572: }
573: }
574:
575: /* Stop when the size limit is reached before the end of the
1.1.1.2 ! ohara 576: crossover, but only show this as an error for >= the default max
! 577: size. FIXME: Maybe should make it a param choice whether this is
! 578: an error. */
! 579: if (s.size >= param->max_size[i]
! 580: && param->max_size[i] >= DEFAULT_MAX_SIZE)
1.1 maekawa 581: {
582: fprintf (stderr, "%s\n", param->name[i]);
583: fprintf (stderr, "i=%d sizes %ld to %ld total %d measurements\n",
584: i, dat[0].size, dat[ndat-1].size, ndat);
585: fprintf (stderr, " max size reached before end of crossover\n");
586: break;
587: }
588:
589: if (option_trace >= 1)
590: printf ("i=%d sizes %ld to %ld total %d measurements\n",
591: i, dat[0].size, dat[ndat-1].size, ndat);
592:
593: if (ndat == 0)
594: break;
595:
596: table[i] = dat[analyze_dat (i, 1)].size;
597:
1.1.1.2 ! ohara 598: /* fudge here, let min_is_always apply only to i==0, that's what the
! 599: sqr_n thresholds want */
! 600: if (i == 0 && param->min_is_always && table[i] == param->min_size[i])
! 601: table[i] = 0;
! 602:
! 603: /* under the second_start_min fudge, if the second threshold turns out
! 604: to be lower than the first, then the second method is unwanted, we
! 605: should go straight from algorithm 1 to algorithm 3. */
! 606: if (param->second_start_min)
! 607: {
! 608: if (i == 0)
! 609: {
! 610: table_save0 = table[0];
! 611: table[0] = 0;
! 612: }
! 613: else if (i == 1)
! 614: {
! 615: table[0] = table_save0;
! 616: if (table[1] <= table[0])
! 617: {
! 618: table[0] = table[1];
! 619: table[1] = 0;
! 620: }
! 621: }
! 622: s.size = MAX (table[0], table[1]) + 1;
! 623: }
! 624:
! 625: if (! (param->noprint || (i == 0 && param->second_start_min)))
! 626: {
! 627: if (i == 1 && param->second_start_min)
! 628: {
! 629: print_define_end (param->name[0], table[0]);
! 630: print_define_start (param->name[1]);
! 631: }
1.1 maekawa 632:
1.1.1.2 ! ohara 633: print_define_end (param->name[i], table[i]);
! 634: }
! 635:
! 636: /* Look for the next threshold starting from the current one. */
1.1 maekawa 637: s.size = table[i]+1;
1.1.1.2 ! ohara 638:
! 639: /* Take a MAX of all to allow for second_start_min producing a 0. */
! 640: {
! 641: int j;
! 642: for (j = 0; j < i; j++)
! 643: s.size = MAX (s.size, table[j]+1);
! 644: }
! 645: }
1.1 maekawa 646: }
647:
648:
649: /* Special probing for the fft thresholds. The size restrictions on the
650: FFTs mean the graph of time vs size has a step effect. See this for
651: example using
652:
653: ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
654: gnuplot foo.gnuplot
655:
656: The current approach is to compare routines at the midpoint of relevant
657: steps. Arguably a more sophisticated system of threshold data is wanted
658: if this step effect remains. */
659:
660: struct fft_param_t {
661: const char *table_name;
662: const char *threshold_name;
663: const char *modf_threshold_name;
664: mp_size_t *p_threshold;
665: mp_size_t *p_modf_threshold;
666: mp_size_t first_size;
667: mp_size_t max_size;
668: speed_function_t function;
669: speed_function_t mul_function;
670: mp_size_t sqr;
671: };
672:
1.1.1.2 ! ohara 673:
1.1 maekawa 674: /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
675: N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
676: of 2^(k-1) bits. */
677:
678: mp_size_t
679: fft_step_size (int k)
680: {
1.1.1.2 ! ohara 681: mp_size_t step;
! 682:
! 683: step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB;
! 684: step *= (mp_size_t) 1 << k;
! 685:
! 686: if (step <= 0)
1.1 maekawa 687: {
688: printf ("Can't handle k=%d\n", k);
689: abort ();
690: }
1.1.1.2 ! ohara 691:
! 692: return step;
1.1 maekawa 693: }
694:
695: mp_size_t
696: fft_next_size (mp_size_t pl, int k)
697: {
698: mp_size_t m = fft_step_size (k);
699:
700: /* printf ("[k=%d %ld] %ld ->", k, m, pl); */
701:
702: if (pl == 0 || (pl & (m-1)) != 0)
703: pl = (pl | (m-1)) + 1;
704:
705: /* printf (" %ld\n", pl); */
706: return pl;
707: }
708:
709: void
710: fft (struct fft_param_t *p)
711: {
712: mp_size_t size;
713: int i, k;
714:
715: for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++)
716: mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX;
717:
718: *p->p_threshold = MP_SIZE_T_MAX;
719: *p->p_modf_threshold = MP_SIZE_T_MAX;
720:
721: option_trace = MAX (option_trace, option_fft_trace);
722:
723: printf ("#define %s {", p->table_name);
724: if (option_trace >= 2)
725: printf ("\n");
726:
727: k = FFT_FIRST_K;
728: size = p->first_size;
729: for (;;)
730: {
731: double tk, tk1;
732:
733: size = fft_next_size (size+1, k+1);
734:
735: if (size >= p->max_size)
736: break;
737: if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr]))
738: break;
739:
740: /* compare k to k+1 in the middle of the current k+1 step */
741: s.size = size + fft_step_size (k+1) / 2;
742: s.r = k;
1.1.1.2 ! ohara 743: tk = tuneup_measure (p->function, NULL, &s);
1.1 maekawa 744: if (tk == -1.0)
745: abort ();
746:
747: s.r = k+1;
1.1.1.2 ! ohara 748: tk1 = tuneup_measure (p->function, NULL, &s);
1.1 maekawa 749: if (tk1 == -1.0)
750: abort ();
751:
752: if (option_trace >= 2)
1.1.1.2 ! ohara 753: printf ("at %ld size=%ld k=%d %.9f k=%d %.9f\n",
1.1 maekawa 754: size, s.size, k, tk, k+1, tk1);
755:
756: /* declare the k+1 threshold as soon as it's faster at its midpoint */
757: if (tk1 < tk)
758: {
759: mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size;
760: printf (" %ld,", s.size);
761: if (option_trace >= 2) printf ("\n");
762: k++;
763: }
764: }
765:
766: mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0;
767: printf (" 0 }\n");
768:
769:
770: size = p->first_size;
1.1.1.2 ! ohara 771:
1.1 maekawa 772: /* Declare an FFT faster than a plain toom3 etc multiplication found as
773: soon as one faster measurement obtained. A multiplication in the
774: middle of the FFT step is tested. */
775: for (;;)
776: {
777: int modf = (*p->p_modf_threshold == MP_SIZE_T_MAX);
778: double tk, tm;
779:
780: /* k=7 should be the first FFT which can beat toom3 on a full
781: multiply, so jump to that threshold and save some probing after the
782: modf threshold is found. */
783: if (!modf && size < mpn_fft_table[p->sqr][2])
784: {
785: size = mpn_fft_table[p->sqr][2];
786: if (option_trace >= 2)
787: printf ("jump to size=%ld\n", size);
788: }
789:
790: size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr));
791: k = mpn_fft_best_k (size, p->sqr);
792:
793: if (size >= p->max_size)
794: break;
795:
796: s.size = size + fft_step_size (k) / 2;
797: s.r = k;
1.1.1.2 ! ohara 798: tk = tuneup_measure (p->function, NULL, &s);
1.1 maekawa 799: if (tk == -1.0)
800: abort ();
801:
802: if (!modf) s.size /= 2;
1.1.1.2 ! ohara 803: tm = tuneup_measure (p->mul_function, NULL, &s);
1.1 maekawa 804: if (tm == -1.0)
805: abort ();
806:
807: if (option_trace >= 2)
1.1.1.2 ! ohara 808: printf ("at %ld size=%ld k=%d %.9f size=%ld %s mul %.9f\n",
1.1 maekawa 809: size,
810: size + fft_step_size (k) / 2, k, tk,
811: s.size, modf ? "modf" : "full", tm);
812:
813: if (tk < tm)
814: {
815: if (modf)
816: {
817: *p->p_modf_threshold = s.size;
818: print_define (p->modf_threshold_name, *p->p_modf_threshold);
819: }
820: else
821: {
822: *p->p_threshold = s.size;
823: print_define (p->threshold_name, *p->p_threshold);
824: break;
825: }
826: }
827: }
828:
829: }
830:
831:
1.1.1.2 ! ohara 832:
! 833: /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
! 834: giving wrong results. */
1.1 maekawa 835: void
1.1.1.2 ! ohara 836: tune_mul (void)
1.1 maekawa 837: {
1.1.1.2 ! ohara 838: static struct param_t param;
! 839: param.name[0] = "MUL_KARATSUBA_THRESHOLD";
! 840: param.name[1] = "MUL_TOOM3_THRESHOLD";
! 841: param.function = speed_mpn_mul_n;
! 842: param.min_size[0] = MAX (4, MPN_KARA_MUL_N_MINSIZE);
! 843: param.max_size[0] = MUL_TOOM3_THRESHOLD_LIMIT-1;
! 844: param.max_size[1] = MUL_TOOM3_THRESHOLD_LIMIT-1;
! 845: one (mul_threshold, 2, ¶m);
1.1 maekawa 846:
1.1.1.2 ! ohara 847: /* disabled until tuned */
! 848: MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
! 849: }
1.1 maekawa 850:
851:
1.1.1.2 ! ohara 852: /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
! 853: is faster only at size==2 then we don't want to bother with extra code
! 854: just for that. Start karatsuba from 4 same as MUL above. */
! 855: void
! 856: tune_sqr (void)
! 857: {
! 858: static struct param_t param;
! 859: param.name[0] = "SQR_BASECASE_THRESHOLD";
! 860: param.name[1] = "SQR_KARATSUBA_THRESHOLD";
! 861: param.name[2] = "SQR_TOOM3_THRESHOLD";
! 862: param.function = speed_mpn_sqr_n;
! 863: param.min_is_always = 1;
! 864: param.second_start_min = 1;
! 865: param.min_size[0] = 3;
! 866: param.min_size[1] = MAX (4, MPN_KARA_SQR_N_MINSIZE);
! 867: param.min_size[2] = MPN_TOOM3_SQR_N_MINSIZE;
! 868: param.max_size[0] = TUNE_SQR_KARATSUBA_MAX;
! 869: param.max_size[1] = TUNE_SQR_KARATSUBA_MAX;
! 870: one (sqr_threshold, 3, ¶m);
! 871:
! 872: /* disabled until tuned */
! 873: SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
! 874: }
! 875:
! 876:
! 877: void
! 878: tune_sb_preinv (void)
! 879: {
! 880: static struct param_t param;
! 881:
! 882: if (UDIV_PREINV_ALWAYS)
! 883: {
! 884: print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always");
! 885: return;
! 886: }
! 887:
! 888: param.check_size = 256;
! 889: param.min_size[0] = 3;
! 890: param.min_is_always = 1;
! 891: param.size_extra = 3;
! 892: param.stop_factor = 2.0;
! 893: param.name[0] = "DIV_SB_PREINV_THRESHOLD";
! 894: param.function = speed_mpn_sb_divrem_m3;
! 895: one (sb_preinv_threshold, 1, ¶m);
! 896: }
! 897:
! 898:
! 899: void
! 900: tune_dc (void)
! 901: {
! 902: static struct param_t param;
! 903: param.name[0] = "DIV_DC_THRESHOLD";
! 904: param.function = speed_mpn_dc_tdiv_qr;
! 905: one (dc_threshold, 1, ¶m);
! 906: }
! 907:
! 908:
! 909: /* This is an indirect determination, based on a comparison between redc and
! 910: mpz_mod. A fudge factor of 1.04 is applied to redc, to represent
! 911: additional overheads it gets in mpz_powm.
! 912:
! 913: stop_factor is 1.1 to hopefully help cray vector systems, where otherwise
! 914: currently it hits the 1000 limb limit with only a factor of about 1.18
! 915: (threshold should be around 650). */
! 916:
! 917: void
! 918: tune_powm (void)
! 919: {
! 920: static struct param_t param;
! 921: param.name[0] = "POWM_THRESHOLD";
! 922: param.function = speed_redc;
! 923: param.function2 = speed_mpz_mod;
! 924: param.step_factor = 0.03;
! 925: param.stop_factor = 1.1;
! 926: param.function_fudge = 1.04;
! 927: one (powm_threshold, 1, ¶m);
! 928: }
! 929:
! 930:
! 931: void
! 932: tune_gcd_accel (void)
! 933: {
! 934: static struct param_t param;
! 935: param.name[0] = "GCD_ACCEL_THRESHOLD";
! 936: param.function = speed_mpn_gcd;
! 937: param.min_size[0] = 1;
! 938: one (gcd_accel_threshold, 1, ¶m);
! 939: }
! 940:
! 941:
! 942:
! 943: /* A comparison between the speed of a single limb step and a double limb
! 944: step is made. On a 32-bit limb the ratio is about 2.2 single steps to
! 945: equal a double step, or on a 64-bit limb about 2.09. (These were found
! 946: from counting the steps on a 10000 limb gcdext. */
! 947: void
! 948: tune_gcdext (void)
! 949: {
! 950: static struct param_t param;
! 951: param.name[0] = "GCDEXT_THRESHOLD";
! 952: param.function = speed_mpn_gcdext_one_single;
! 953: param.function2 = speed_mpn_gcdext_one_double;
! 954: switch (BITS_PER_MP_LIMB) {
! 955: case 32: param.function_fudge = 2.2; break;
! 956: case 64: param.function_fudge = 2.09; break;
! 957: default:
! 958: printf ("Don't know GCDEXT_THERSHOLD factor for BITS_PER_MP_LIMB == %d\n",
! 959: BITS_PER_MP_LIMB);
! 960: abort ();
1.1 maekawa 961: }
1.1.1.2 ! ohara 962: param.min_size[0] = 5;
! 963: param.min_is_always = 1;
! 964: param.max_size[0] = 300;
! 965: param.check_size = 300;
! 966: one (gcdext_threshold, 1, ¶m);
! 967: }
! 968:
! 969:
! 970: /* size_extra==1 reflects the fact that with high<divisor one division is
! 971: always skipped. Forcing high<divisor while testing ensures consistency
! 972: while stepping through sizes, ie. that size-1 divides will be done each
! 973: time.
! 974:
! 975: min_size==2 and min_is_always are used so that if plain division is only
! 976: better at size==1 then don't bother including that code just for that
! 977: case, instead go with preinv always and get a size saving. */
! 978:
! 979: #define DIV_1_PARAMS \
! 980: param.check_size = 256; \
! 981: param.min_size[0] = 2; \
! 982: param.min_is_always = 1; \
! 983: param.data_high = DATA_HIGH_LT_R; \
! 984: param.size_extra = 1; \
! 985: param.stop_factor = 2.0;
! 986:
! 987:
! 988: double (*tuned_speed_mpn_divrem_1) _PROTO ((struct speed_params *s));
! 989:
! 990: void
! 991: tune_divrem_1 (void)
! 992: {
! 993: /* plain version by default */
! 994: tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
! 995:
! 996: #ifndef HAVE_NATIVE_mpn_divrem_1
! 997: #define HAVE_NATIVE_mpn_divrem_1 0
! 998: #endif
! 999:
! 1000: /* No support for tuning native assembler code, do that by hand and put
! 1001: the results in the .asm file, there's no need for such thresholds to
! 1002: appear in gmp-mparam.h. */
! 1003: if (HAVE_NATIVE_mpn_divrem_1)
! 1004: return;
! 1005:
! 1006: if (UDIV_PREINV_ALWAYS)
! 1007: {
! 1008: print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
! 1009: print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
! 1010: return;
! 1011: }
! 1012:
! 1013: tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1.1 maekawa 1014:
1.1.1.2 ! ohara 1015: /* Tune for the integer part of mpn_divrem_1. This will very possibly be
! 1016: a bit out for the fractional part, but that's too bad, the integer part
! 1017: is more important. */
1.1 maekawa 1018: {
1019: static struct param_t param;
1.1.1.2 ! ohara 1020: param.name[0] = "DIVREM_1_NORM_THRESHOLD";
! 1021: DIV_1_PARAMS;
! 1022: s.r = randlimb_norm ();
! 1023: param.function = speed_mpn_divrem_1_tune;
! 1024: one (divrem_1_norm_threshold, 1, ¶m);
1.1 maekawa 1025: }
1026: {
1027: static struct param_t param;
1.1.1.2 ! ohara 1028: param.name[0] = "DIVREM_1_UNNORM_THRESHOLD";
! 1029: DIV_1_PARAMS;
! 1030: s.r = randlimb_half ();
! 1031: param.function = speed_mpn_divrem_1_tune;
! 1032: one (divrem_1_unnorm_threshold, 1, ¶m);
1.1 maekawa 1033: }
1.1.1.2 ! ohara 1034: }
! 1035:
! 1036:
! 1037: double (*tuned_speed_mpn_mod_1) _PROTO ((struct speed_params *s));
! 1038:
! 1039: void
! 1040: tune_mod_1 (void)
! 1041: {
! 1042: /* plain version by default */
! 1043: tuned_speed_mpn_mod_1 = speed_mpn_mod_1;
! 1044:
! 1045: #ifndef HAVE_NATIVE_mpn_mod_1
! 1046: #define HAVE_NATIVE_mpn_mod_1 0
! 1047: #endif
! 1048:
! 1049: /* No support for tuning native assembler code, do that by hand and put
! 1050: the results in the .asm file, there's no need for such thresholds to
! 1051: appear in gmp-mparam.h. */
! 1052: if (HAVE_NATIVE_mpn_mod_1)
! 1053: return;
! 1054:
! 1055: if (UDIV_PREINV_ALWAYS)
! 1056: {
! 1057: print_define ("MOD_1_NORM_THRESHOLD", 0L);
! 1058: print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
! 1059: return;
! 1060: }
! 1061:
! 1062: tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune;
1.1 maekawa 1063:
1064: {
1065: static struct param_t param;
1.1.1.2 ! ohara 1066: param.name[0] = "MOD_1_NORM_THRESHOLD";
! 1067: DIV_1_PARAMS;
! 1068: s.r = randlimb_norm ();
! 1069: param.function = speed_mpn_mod_1_tune;
! 1070: one (mod_1_norm_threshold, 1, ¶m);
1.1 maekawa 1071: }
1072: {
1073: static struct param_t param;
1.1.1.2 ! ohara 1074: param.name[0] = "MOD_1_UNNORM_THRESHOLD";
! 1075: DIV_1_PARAMS;
! 1076: s.r = randlimb_half ();
! 1077: param.function = speed_mpn_mod_1_tune;
! 1078: one (mod_1_unnorm_threshold, 1, ¶m);
1.1 maekawa 1079: }
1.1.1.2 ! ohara 1080: }
! 1081:
! 1082:
! 1083: /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
! 1084: imply that udiv_qrnnd_preinv is worth using, but it seems most
! 1085: straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
! 1086: directly. */
! 1087:
! 1088: void
! 1089: tune_preinv_divrem_1 (void)
! 1090: {
! 1091: static struct param_t param;
! 1092: speed_function_t divrem_1;
! 1093: const char *divrem_1_name;
! 1094: double t1, t2;
! 1095:
! 1096: #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
! 1097: #define HAVE_NATIVE_mpn_preinv_divrem_1 0
! 1098: #endif
! 1099:
! 1100: /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
! 1101: it's faster than mpn_divrem_1. */
! 1102: if (HAVE_NATIVE_mpn_preinv_divrem_1)
! 1103: {
! 1104: print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
! 1105: return;
! 1106: }
! 1107:
! 1108: /* If udiv_qrnnd_preinv is the only division method then of course
! 1109: mpn_preinv_divrem_1 should be used. */
! 1110: if (UDIV_PREINV_ALWAYS)
! 1111: {
! 1112: print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
! 1113: return;
! 1114: }
! 1115:
! 1116: /* If we've got an assembler version of mpn_divrem_1, then compare against
! 1117: that, not the mpn_divrem_1_div generic C. */
! 1118: if (HAVE_NATIVE_mpn_divrem_1)
! 1119: {
! 1120: divrem_1 = speed_mpn_divrem_1;
! 1121: divrem_1_name = "mpn_divrem_1";
! 1122: }
! 1123: else
! 1124: {
! 1125: divrem_1 = speed_mpn_divrem_1_div;
! 1126: divrem_1_name = "mpn_divrem_1_div";
! 1127: }
! 1128:
! 1129: param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
! 1130: s.size = 200; /* generous but not too big */
! 1131: /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
! 1132: since in general that's probably most common, though in fact for a
! 1133: 64-bit limb mp_bases[10].big_base is normalized. */
! 1134: s.r = urandom() & (MP_LIMB_T_MAX >> 4);
! 1135: if (s.r == 0) s.r = 123;
! 1136:
! 1137: t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s);
! 1138: t2 = tuneup_measure (divrem_1, ¶m, &s);
! 1139: if (t1 == -1.0 || t2 == -1.0)
! 1140: {
! 1141: printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
! 1142: divrem_1_name, s.size);
! 1143: abort ();
! 1144: }
! 1145: if (option_trace >= 1)
! 1146: printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
! 1147: s.size, t1, divrem_1_name, t2);
! 1148:
! 1149: print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
! 1150: }
! 1151:
! 1152:
! 1153: /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply
! 1154: that udiv_qrnnd_preinv is worth using, but it seems most straightforward
! 1155: to compare mpn_preinv_mod_1 and mpn_mod_1_div directly. */
! 1156:
! 1157: void
! 1158: tune_preinv_mod_1 (void)
! 1159: {
! 1160: static struct param_t param;
! 1161: speed_function_t mod_1;
! 1162: const char *mod_1_name;
! 1163: double t1, t2;
! 1164:
! 1165: #ifndef HAVE_NATIVE_mpn_preinv_mod_1
! 1166: #define HAVE_NATIVE_mpn_preinv_mod_1 0
! 1167: #endif
! 1168:
! 1169: /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's
! 1170: faster than mpn_mod_1. */
! 1171: if (HAVE_NATIVE_mpn_preinv_mod_1)
! 1172: {
! 1173: print_define_remark ("USE_PREINV_MOD_1", 1, "native");
! 1174: return;
! 1175: }
! 1176:
! 1177: /* If udiv_qrnnd_preinv is the only division method then of course
! 1178: mpn_preinv_mod_1 should be used. */
! 1179: if (UDIV_PREINV_ALWAYS)
! 1180: {
! 1181: print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always");
! 1182: return;
! 1183: }
! 1184:
! 1185: /* If we've got an assembler version of mpn_mod_1, then compare against
! 1186: that, not the mpn_mod_1_div generic C. */
! 1187: if (HAVE_NATIVE_mpn_mod_1)
! 1188: {
! 1189: mod_1 = speed_mpn_mod_1;
! 1190: mod_1_name = "mpn_mod_1";
! 1191: }
! 1192: else
! 1193: {
! 1194: mod_1 = speed_mpn_mod_1_div;
! 1195: mod_1_name = "mpn_mod_1_div";
! 1196: }
! 1197:
! 1198: param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */
! 1199: s.size = 200; /* generous but not too big */
! 1200: s.r = randlimb_norm(); /* divisor */
! 1201:
! 1202: t1 = tuneup_measure (speed_mpn_preinv_mod_1, ¶m, &s);
! 1203: t2 = tuneup_measure (mod_1, ¶m, &s);
! 1204: if (t1 == -1.0 || t2 == -1.0)
! 1205: {
! 1206: printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n",
! 1207: mod_1_name, s.size);
! 1208: abort ();
! 1209: }
! 1210: if (option_trace >= 1)
! 1211: printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n",
! 1212: s.size, t1, mod_1_name, t2);
! 1213:
! 1214: print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL);
! 1215: }
! 1216:
! 1217:
! 1218: void
! 1219: tune_divrem_2 (void)
! 1220: {
! 1221: static struct param_t param;
! 1222:
! 1223: #ifndef HAVE_NATIVE_mpn_divrem_2
! 1224: #define HAVE_NATIVE_mpn_divrem_2 0
! 1225: #endif
! 1226:
! 1227: /* No support for tuning native assembler code, do that by hand and put
! 1228: the results in the .asm file, and there's no need for such thresholds
! 1229: to appear in gmp-mparam.h. */
! 1230: if (HAVE_NATIVE_mpn_divrem_2)
! 1231: return;
! 1232:
! 1233: if (UDIV_PREINV_ALWAYS)
! 1234: {
! 1235: print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
! 1236: return;
! 1237: }
! 1238:
! 1239: /* Tune for the integer part of mpn_divrem_2. This will very possibly be
! 1240: a bit out for the fractional part, but that's too bad, the integer part
! 1241: is more important.
! 1242:
! 1243: min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
! 1244: code space if plain division is better only at size==2 or size==3. */
! 1245: param.name[0] = "DIVREM_2_THRESHOLD";
! 1246: param.check_size = 256;
! 1247: param.min_size[0] = 4;
! 1248: param.min_is_always = 1;
! 1249: param.size_extra = 2; /* does qsize==nsize-2 divisions */
! 1250: param.stop_factor = 2.0;
! 1251:
! 1252: s.r = randlimb_norm ();
! 1253: param.function = speed_mpn_divrem_2;
! 1254: one (divrem_2_threshold, 1, ¶m);
! 1255: }
! 1256:
! 1257:
! 1258: /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
! 1259: tune for that. Its speed can differ on odd or even divisor, so take an
! 1260: average threshold for the two.
! 1261:
! 1262: mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
! 1263: might not vary that way, but don't test this since high<divisor isn't
! 1264: expected to occur often with small divisors. */
! 1265:
! 1266: void
! 1267: tune_divexact_1 (void)
! 1268: {
! 1269: static struct param_t param;
! 1270: mp_size_t thresh[2], average;
! 1271: int low, i;
! 1272:
! 1273: ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
! 1274:
! 1275: param.name[0] = "DIVEXACT_1_THRESHOLD";
! 1276: param.data_high = DATA_HIGH_GE_R;
! 1277: param.check_size = 256;
! 1278: param.min_size[0] = 2;
! 1279: param.stop_factor = 1.5;
! 1280: param.function = tuned_speed_mpn_divrem_1;
! 1281: param.function2 = speed_mpn_divexact_1;
! 1282: param.noprint = 1;
! 1283:
! 1284: print_define_start (param.name[0]);
! 1285:
! 1286: for (low = 0; low <= 1; low++)
! 1287: {
! 1288: s.r = randlimb_half();
! 1289: if (low == 0)
! 1290: s.r |= 1;
! 1291: else
! 1292: s.r &= ~CNST_LIMB(7);
! 1293:
! 1294: one (divexact_1_threshold, 1, ¶m);
! 1295: if (option_trace)
! 1296: printf ("low=%d thresh %ld\n", low, divexact_1_threshold[0]);
! 1297:
! 1298: if (divexact_1_threshold[0] == MP_SIZE_T_MAX)
! 1299: {
! 1300: average = MP_SIZE_T_MAX;
! 1301: goto divexact_1_done;
! 1302: }
! 1303:
! 1304: thresh[low] = divexact_1_threshold[0];
! 1305: }
! 1306:
! 1307: if (option_trace)
! 1308: {
! 1309: printf ("average of:");
! 1310: for (i = 0; i < numberof(thresh); i++)
! 1311: printf (" %ld", thresh[i]);
! 1312: printf ("\n");
! 1313: }
! 1314:
! 1315: average = 0;
! 1316: for (i = 0; i < numberof(thresh); i++)
! 1317: average += thresh[i];
! 1318: average /= numberof(thresh);
! 1319:
! 1320: /* If divexact turns out to be better as early as 3 limbs, then use it
! 1321: always, so as to reduce code size and conditional jumps. */
! 1322: if (average <= 3)
! 1323: average = 0;
! 1324:
! 1325: divexact_1_done:
! 1326: print_define_end (param.name[0], average);
! 1327: }
! 1328:
! 1329:
! 1330: /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
! 1331: same as mpn_mod_1, but this might not be true of an assembler
! 1332: implementation. The threshold used is an average based on data where a
! 1333: divide can be skipped and where it can't.
! 1334:
! 1335: If modexact turns out to be better as early as 3 limbs, then use it
! 1336: always, so as to reduce code size and conditional jumps. */
! 1337:
! 1338: void
! 1339: tune_modexact_1_odd (void)
! 1340: {
! 1341: static struct param_t param;
! 1342: mp_size_t thresh_lt;
! 1343:
! 1344: ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL);
! 1345:
! 1346: param.name[0] = "MODEXACT_1_ODD_THRESHOLD";
! 1347: param.check_size = 256;
! 1348: param.min_size[0] = 2;
! 1349: param.stop_factor = 1.5;
! 1350: param.function = tuned_speed_mpn_mod_1;
! 1351: param.function2 = speed_mpn_modexact_1c_odd;
! 1352: param.noprint = 1;
! 1353: s.r = randlimb_half () | 1;
! 1354:
! 1355: print_define_start (param.name[0]);
! 1356:
! 1357: param.data_high = DATA_HIGH_LT_R;
! 1358: one (modexact_1_odd_threshold, 1, ¶m);
! 1359: if (option_trace)
! 1360: printf ("lt thresh %ld\n", modexact_1_odd_threshold[0]);
! 1361:
! 1362: thresh_lt = modexact_1_odd_threshold[0];
! 1363: if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
! 1364: {
! 1365: param.data_high = DATA_HIGH_GE_R;
! 1366: one (modexact_1_odd_threshold, 1, ¶m);
! 1367: if (option_trace)
! 1368: printf ("ge thresh %ld\n", modexact_1_odd_threshold[0]);
! 1369:
! 1370: if (modexact_1_odd_threshold[0] != MP_SIZE_T_MAX)
! 1371: {
! 1372: modexact_1_odd_threshold[0]
! 1373: = (modexact_1_odd_threshold[0] + thresh_lt) / 2;
! 1374: if (modexact_1_odd_threshold[0] <= 3)
! 1375: modexact_1_odd_threshold[0] = 0;
! 1376: }
! 1377: }
! 1378:
! 1379: print_define_end (param.name[0], modexact_1_odd_threshold[0]);
! 1380: }
! 1381:
! 1382:
! 1383: void
! 1384: tune_jacobi_base (void)
! 1385: {
! 1386: static struct param_t param;
! 1387: double t1, t2, t3;
! 1388: int method;
! 1389:
! 1390: s.size = BITS_PER_MP_LIMB * 3 / 4;
! 1391:
! 1392: t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s);
! 1393: if (option_trace >= 1)
! 1394: printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", s.size, t1);
! 1395:
! 1396: t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s);
! 1397: if (option_trace >= 1)
! 1398: printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", s.size, t2);
! 1399:
! 1400: t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s);
! 1401: if (option_trace >= 1)
! 1402: printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", s.size, t3);
! 1403:
! 1404: if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
! 1405: {
! 1406: printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
! 1407: s.size);
! 1408: abort ();
! 1409: }
! 1410:
! 1411: if (t1 < t2 && t1 < t3)
! 1412: method = 1;
! 1413: else if (t2 < t3)
! 1414: method = 2;
! 1415: else
! 1416: method = 3;
! 1417:
! 1418: print_define ("JACOBI_BASE_METHOD", method);
! 1419: }
! 1420:
1.1 maekawa 1421:
1.1.1.2 ! ohara 1422: void
! 1423: tune_get_str (void)
! 1424: {
! 1425: /* Tune for decimal, it being most common. Some rough testing suggests
! 1426: other bases are different, but not by very much. */
! 1427: s.r = 10;
1.1 maekawa 1428: {
1429: static struct param_t param;
1.1.1.2 ! ohara 1430: get_str_precompute_threshold[0] = 0;
! 1431: param.name[0] = "GET_STR_DC_THRESHOLD";
! 1432: param.function = speed_mpn_get_str;
! 1433: param.min_size[0] = 2;
! 1434: param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
! 1435: one (get_str_basecase_threshold, 1, ¶m);
1.1 maekawa 1436: }
1437: {
1438: static struct param_t param;
1.1.1.2 ! ohara 1439: param.name[0] = "GET_STR_PRECOMPUTE_THRESHOLD";
! 1440: param.function = speed_mpn_get_str;
! 1441: param.min_size[0] = get_str_basecase_threshold[0];
! 1442: param.max_size[0] = GET_STR_THRESHOLD_LIMIT;
! 1443: one (get_str_precompute_threshold, 1, ¶m);
1.1 maekawa 1444: }
1.1.1.2 ! ohara 1445: }
! 1446:
! 1447:
! 1448: void
! 1449: tune_set_str (void)
! 1450: {
! 1451: static struct param_t param;
! 1452:
! 1453: s.r = 10; /* decimal */
! 1454: param.step_factor = 0.04;
! 1455: param.name[0] = "SET_STR_THRESHOLD";
! 1456: param.function = speed_mpn_set_str_basecase;
! 1457: param.function2 = speed_mpn_set_str_subquad;
! 1458: param.min_size[0] = 100;
! 1459: param.max_size[0] = 150000;
! 1460: one (set_str_threshold, 1, ¶m);
! 1461: }
! 1462:
! 1463:
! 1464: void
! 1465: tune_fft_mul (void)
! 1466: {
! 1467: static struct fft_param_t param;
! 1468:
! 1469: if (option_fft_max_size == 0)
! 1470: return;
! 1471:
! 1472: param.table_name = "MUL_FFT_TABLE";
! 1473: param.threshold_name = "MUL_FFT_THRESHOLD";
! 1474: param.p_threshold = &MUL_FFT_THRESHOLD;
! 1475: param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
! 1476: param.p_modf_threshold = &MUL_FFT_MODF_THRESHOLD;
! 1477: param.first_size = MUL_TOOM3_THRESHOLD / 2;
! 1478: param.max_size = option_fft_max_size;
! 1479: param.function = speed_mpn_mul_fft;
! 1480: param.mul_function = speed_mpn_mul_n;
! 1481: param.sqr = 0;
! 1482: fft (¶m);
! 1483: }
! 1484:
! 1485:
! 1486: void
! 1487: tune_fft_sqr (void)
! 1488: {
! 1489: static struct fft_param_t param;
! 1490:
! 1491: if (option_fft_max_size == 0)
! 1492: return;
! 1493:
! 1494: param.table_name = "SQR_FFT_TABLE";
! 1495: param.threshold_name = "SQR_FFT_THRESHOLD";
! 1496: param.p_threshold = &SQR_FFT_THRESHOLD;
! 1497: param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
! 1498: param.p_modf_threshold = &SQR_FFT_MODF_THRESHOLD;
! 1499: param.first_size = SQR_TOOM3_THRESHOLD / 2;
! 1500: param.max_size = option_fft_max_size;
! 1501: param.function = speed_mpn_mul_fft_sqr;
! 1502: param.mul_function = speed_mpn_sqr_n;
! 1503: param.sqr = 0;
! 1504: fft (¶m);
! 1505: }
! 1506:
! 1507:
! 1508: void
! 1509: all (void)
! 1510: {
! 1511: time_t start_time, end_time;
! 1512: TMP_DECL (marker);
! 1513:
! 1514: TMP_MARK (marker);
! 1515: s.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
! 1516: s.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, 0);
! 1517:
! 1518: mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
! 1519: mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
! 1520:
! 1521: fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
! 1522:
! 1523: speed_time_init ();
! 1524: fprintf (stderr, "Using: %s\n", speed_time_string);
! 1525:
! 1526: fprintf (stderr, "speed_precision %d", speed_precision);
! 1527: if (speed_unittime == 1.0)
! 1528: fprintf (stderr, ", speed_unittime 1 cycle");
! 1529: else
! 1530: fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
! 1531: if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
! 1532: fprintf (stderr, ", CPU freq unknown\n");
! 1533: else
! 1534: fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
! 1535:
! 1536: fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
! 1537: DEFAULT_MAX_SIZE, option_fft_max_size);
! 1538: fprintf (stderr, "\n");
! 1539:
! 1540: time (&start_time);
1.1 maekawa 1541: {
1.1.1.2 ! ohara 1542: struct tm *tp;
! 1543: tp = localtime (&start_time);
! 1544: printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
! 1545: tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
! 1546:
! 1547: #ifdef __GNUC__
! 1548: /* gcc sub-minor version doesn't seem to come through as a define */
! 1549: printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
! 1550: #define PRINTED_COMPILER
! 1551: #endif
! 1552: #if defined (__SUNPRO_C)
! 1553: printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
! 1554: #define PRINTED_COMPILER
! 1555: #endif
! 1556: #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
! 1557: /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
! 1558: printf ("MIPSpro C %d.%d.%d */\n",
! 1559: _COMPILER_VERSION / 100,
! 1560: _COMPILER_VERSION / 10 % 10,
! 1561: _COMPILER_VERSION % 10);
! 1562: #define PRINTED_COMPILER
! 1563: #endif
! 1564: #if defined (__DECC) && defined (__DECC_VER)
! 1565: printf ("DEC C %d */\n", __DECC_VER);
! 1566: #define PRINTED_COMPILER
! 1567: #endif
! 1568: #if ! defined (PRINTED_COMPILER)
! 1569: printf ("system compiler */\n");
! 1570: #endif
1.1 maekawa 1571: }
1.1.1.2 ! ohara 1572: printf ("\n");
! 1573:
! 1574: tune_mul ();
1.1 maekawa 1575: printf("\n");
1576:
1.1.1.2 ! ohara 1577: tune_sqr ();
! 1578: printf("\n");
! 1579:
! 1580: tune_sb_preinv ();
! 1581: tune_dc ();
! 1582: tune_powm ();
! 1583: printf("\n");
! 1584:
! 1585: tune_gcd_accel ();
! 1586: tune_gcdext ();
! 1587: tune_jacobi_base ();
! 1588: printf("\n");
! 1589:
! 1590: tune_divrem_1 ();
! 1591: tune_mod_1 ();
! 1592: tune_preinv_divrem_1 ();
! 1593: tune_preinv_mod_1 ();
! 1594: tune_divrem_2 ();
! 1595: tune_divexact_1 ();
! 1596: tune_modexact_1_odd ();
! 1597: printf("\n");
! 1598:
! 1599: tune_get_str ();
! 1600: tune_set_str ();
! 1601: printf("\n");
! 1602:
! 1603: tune_fft_mul ();
! 1604: printf("\n");
! 1605:
! 1606: tune_fft_sqr ();
! 1607: printf ("\n");
! 1608:
! 1609: time (&end_time);
! 1610: printf ("/* Tuneup completed successfully, took %ld seconds */\n",
! 1611: end_time - start_time);
1.1 maekawa 1612:
1613: TMP_FREE (marker);
1614: }
1615:
1616:
1617: int
1618: main (int argc, char *argv[])
1619: {
1620: int opt;
1621:
1622: /* Unbuffered so if output is redirected to a file it isn't lost if the
1623: program is killed part way through. */
1624: setbuf (stdout, NULL);
1625: setbuf (stderr, NULL);
1626:
1627: while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
1628: {
1629: switch (opt) {
1630: case 'f':
1631: if (optarg[0] == 't')
1632: option_fft_trace = 2;
1633: else
1634: option_fft_max_size = atol (optarg);
1635: break;
1636: case 'o':
1637: speed_option_set (optarg);
1638: break;
1639: case 'p':
1640: speed_precision = atoi (optarg);
1641: break;
1642: case 't':
1643: option_trace++;
1644: break;
1645: case '?':
1646: exit(1);
1647: }
1648: }
1.1.1.2 ! ohara 1649:
1.1 maekawa 1650: all ();
1.1.1.2 ! ohara 1651: exit (0);
1.1 maekawa 1652: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>