Annotation of OpenXM_contrib/gmp/tune/common.c, Revision 1.1
1.1 ! maekawa 1: /* Shared speed subroutines. */
! 2:
! 3: /*
! 4: Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 5:
! 6: This file is part of the GNU MP Library.
! 7:
! 8: The GNU MP Library is free software; you can redistribute it and/or modify
! 9: it under the terms of the GNU Lesser General Public License as published by
! 10: the Free Software Foundation; either version 2.1 of the License, or (at your
! 11: option) any later version.
! 12:
! 13: The GNU MP Library is distributed in the hope that it will be useful, but
! 14: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: License for more details.
! 17:
! 18: You should have received a copy of the GNU Lesser General Public License
! 19: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: MA 02111-1307, USA.
! 22: */
! 23:
! 24: #include <errno.h>
! 25: #include <fcntl.h>
! 26: #include <math.h>
! 27: #include <stdio.h>
! 28: #include <stdlib.h> /* for qsort */
! 29: #include <string.h>
! 30: #include <unistd.h>
! 31: #if 0
! 32: #include <sys/ioctl.h>
! 33: #endif
! 34:
! 35: #include "gmp.h"
! 36: #include "gmp-impl.h"
! 37: #include "longlong.h"
! 38:
! 39: #include "speed.h"
! 40:
/* Change this to "#define TRACE(x) x" to get traces.  As defined here the
   argument is discarded, so TRACE() statements compile to nothing. */
#define TRACE(x)


/* Pointer to a comparison function of the form qsort() takes.
   speed_measure() casts double_cmp_ptr to this type when sorting timings. */
typedef int (*qsort_function_t) _PROTO ((const void *, const void *));


/* Non-zero makes speed_cache_fill() print the operand addresses whenever
   they change.  Set by the "addrs" option in speed_option_set(). */
int   speed_option_addrs = 0;
! 49:
! 50:
/* Called by speed_cache_fill() when s->cache is 1.  Judging by the name and
   the disabled code below, the intent is to flush the CPU caches --
   NOTE(review): confirm.

   Both candidate implementations are inside "#if 0", so as compiled this
   function does nothing. */
void
pentium_wbinvd(void)
{
  /* Disabled: issue an ioctl on /dev/wbinvd, opening it on first use. */
#if 0
  {
    static int  fd = -2;   /* -2 is "not tried yet", -1 is "open failed" */

    if (fd == -2)
      {
        fd = open ("/dev/wbinvd", O_RDWR);
        if (fd == -1)
          perror ("open /dev/wbinvd");
      }

    if (fd != -1)
      ioctl (fd, 0, 0);
  }
#endif

  /* Disabled: read through a WBINVDSIZE byte block, handing the sum to
     mpn_cache_fill_dummy() so the reads have a use.  The loop which would
     initialize the block is itself disabled, and the malloc() result is not
     checked. */
#if 0
#define WBINVDSIZE  1024*1024*2
  {
    static char  *p = NULL;
    int   i, sum;

    if (p == NULL)
      p = malloc (WBINVDSIZE);

#if 0
    for (i = 0; i < WBINVDSIZE; i++)
      p[i] = i & 0xFF;
#endif

    sum = 0;
    for (i = 0; i < WBINVDSIZE; i++)
      sum += p[i];

    mpn_cache_fill_dummy (sum);
  }
#endif
}
! 92:
/* qsort() comparison function putting doubles into ascending order.

   p and q each point to a double.  The return is 1, -1 or 0 according to
   whether *p is greater than, less than, or equal to *q.

   The parameters are "const void *" so the function has exactly the type
   qsort() calls it through.  Calling a function through a pointer to an
   incompatible function type is undefined behaviour, and a cast at the call
   site only hides the mismatch. */
static int
double_cmp_ptr (const void *p, const void *q)
{
  const double  *a = (const double *) p;
  const double  *b = (const double *) q;

  if (*a > *b)  return 1;
  if (*a < *b)  return -1;
  return 0;
}
! 100:
! 101:
! 102: /* Measure the speed of a given routine.
! 103:
! 104: The routine is run with enough repetitions to make it take at least
! 105: speed_precision * speed_unittime. This aims to minimize the effects of a
! 106: limited accuracy time base and the overhead of the measuring itself.
! 107:
! 108: Measurements are made looking for 4 results within TOLERANCE of each
! 109: other (or 3 for routines taking longer than 2 seconds). This aims to get
! 110: an accurate reading even if some runs are bloated by interrupts or task
! 111: switches or whatever.
! 112:
! 113: The given (*fun)() is expected to run its function "s->reps" many times
! 114: and return the total elapsed time measured using speed_starttime() and
! 115: speed_endtime(). If the function doesn't support the given s->size or
! 116: s->r, -1.0 should be returned. See the various base routines below. */
! 117:
! 118: double
! 119: speed_measure (double (*fun) _PROTO ((struct speed_params *s)),
! 120: struct speed_params *s)
! 121: {
! 122: #define TOLERANCE 1.005 /* 0.5% */
! 123:
! 124: struct speed_params s_dummy;
! 125: int i, j, e;
! 126: double t[30];
! 127: double t_unsorted[30];
! 128:
! 129: /* Use dummy parameters if caller doesn't provide any. Only a few special
! 130: "fun"s will cope with this, speed_noop() is one. */
! 131: if (s == NULL)
! 132: {
! 133: memset (&s_dummy, '\0', sizeof (s_dummy));
! 134: s = &s_dummy;
! 135: }
! 136:
! 137: s->reps = 1;
! 138: s->time_divisor = 1.0;
! 139: for (i = 0; i < numberof (t); i++)
! 140: {
! 141: for (;;)
! 142: {
! 143: s->src_num = 0;
! 144: s->dst_num = 0;
! 145:
! 146: t[i] = (*fun) (s);
! 147: t_unsorted[i] = t[i];
! 148:
! 149: TRACE (printf("size=%ld reps=%u r=%d attempt=%d %.9f\n",
! 150: s->size, s->reps, s->r, i, t[i]));
! 151:
! 152: if (t[i] == -1.0)
! 153: return -1.0;
! 154:
! 155: if (t[i] >= speed_unittime * speed_precision)
! 156: break;
! 157:
! 158: /* go to a value of reps to make t[i] >= precision */
! 159: s->reps = (unsigned) ceil (1.1 * s->reps
! 160: * speed_unittime * speed_precision
! 161: / MAX (t[i], speed_unittime));
! 162: }
! 163: t[i] /= s->reps;
! 164:
! 165: if (speed_precision == 0)
! 166: return t[i];
! 167:
! 168: /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
! 169: if (t[0] >= 2.0)
! 170: e = 3;
! 171: else
! 172: e = 4;
! 173:
! 174: /* Look for e many t[]'s within TOLERANCE of each other to consider a
! 175: valid measurement. Return smallest among them. */
! 176: if (i >= e)
! 177: {
! 178: qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
! 179: for (j = e-1; j < i; j++)
! 180: if (t[j] <= t[j-e+1] * TOLERANCE)
! 181: return t[j-e+1] / s->time_divisor;
! 182: }
! 183: }
! 184:
! 185: fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
! 186: e, (TOLERANCE-1.0)*100.0);
! 187: fprintf (stderr, " %.12f is about 0.5%%\n", t[0]*(TOLERANCE-1.0));
! 188: for (i = 0; i < numberof (t); i++)
! 189: fprintf (stderr, " %.09f\n", t_unsorted[i]);
! 190:
! 191: return -1.0;
! 192: }
! 193:
! 194:
! 195: /* Read all of ptr,size to get it into the CPU memory cache.
! 196:
! 197: A call to mpn_cache_fill_dummy() is used to make sure the compiler
! 198: doesn't optimize away the whole loop. Using "volatile mp_limb_t sum"
! 199: would work too, but the function call means we don't rely on every
! 200: compiler actually implementing volatile properly.
! 201:
! 202: mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
! 203: it can inline it. */
! 204:
! 205: void
! 206: mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
! 207: {
! 208: mp_limb_t sum = 0;
! 209: mp_size_t i;
! 210:
! 211: for (i = 0; i < size; i++)
! 212: sum += ptr[i];
! 213:
! 214: mpn_cache_fill_dummy(sum);
! 215: }
! 216:
! 217:
/* Prepare ptr,size as a destination operand.  As compiled this only reads
   the block with mpn_cache_fill().  The two variants which would actually
   write to it (random data, or ptr[i]=i) are disabled with "#if 0".  */
void
mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
{
  mpn_cache_fill (ptr, size);

#if 0
  mpn_random (ptr, size);
#endif

#if 0
  mp_size_t  i;

  for (i = 0; i < size; i++)
    ptr[i] = i;
#endif
}
! 234:
! 235:
! 236: void
! 237: speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
! 238: {
! 239: if (s->src_num >= numberof (s->src))
! 240: {
! 241: fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
! 242: abort ();
! 243: }
! 244: s->src[s->src_num].ptr = ptr;
! 245: s->src[s->src_num].size = size;
! 246: s->src_num++;
! 247: }
! 248:
! 249:
! 250: void
! 251: speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
! 252: {
! 253: if (s->dst_num >= numberof (s->dst))
! 254: {
! 255: fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
! 256: abort ();
! 257: }
! 258: s->dst[s->dst_num].ptr = ptr;
! 259: s->dst[s->dst_num].size = size;
! 260: s->dst_num++;
! 261: }
! 262:
! 263:
! 264: void
! 265: speed_cache_fill (struct speed_params *s)
! 266: {
! 267: static struct speed_params prev;
! 268: int i;
! 269:
! 270: /* FIXME: need a better way to get the format string for a pointer */
! 271:
! 272: if (speed_option_addrs)
! 273: {
! 274: int different;
! 275:
! 276: different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
! 277: for (i = 0; i < s->dst_num; i++)
! 278: different |= (s->dst[i].ptr != prev.dst[i].ptr);
! 279: for (i = 0; i < s->src_num; i++)
! 280: different |= (s->src[i].ptr != prev.src[i].ptr);
! 281:
! 282: if (different)
! 283: {
! 284: if (s->dst_num != 0)
! 285: {
! 286: printf ("dst");
! 287: for (i = 0; i < s->dst_num; i++)
! 288: printf (" %08lX", (unsigned long) s->dst[i].ptr);
! 289: printf (" ");
! 290: }
! 291:
! 292: if (s->src_num != 0)
! 293: {
! 294: printf ("src");
! 295: for (i = 0; i < s->src_num; i++)
! 296: printf (" %08lX", (unsigned long) s->src[i].ptr);
! 297: printf (" ");
! 298: }
! 299: printf (" (cf sp approx %08lX)\n", (unsigned long) &different);
! 300:
! 301: }
! 302:
! 303: memcpy (&prev, s, sizeof(prev));
! 304: }
! 305:
! 306: switch (s->cache) {
! 307: case 0:
! 308: for (i = 0; i < s->dst_num; i++)
! 309: mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
! 310: for (i = 0; i < s->src_num; i++)
! 311: mpn_cache_fill (s->src[i].ptr, s->src[i].size);
! 312: break;
! 313: case 1:
! 314: pentium_wbinvd();
! 315: break;
! 316: }
! 317: }
! 318:
! 319:
/* Round p up to the next multiple of "align" bytes, returning p itself if
   it's already a multiple.  "align" must be a power of 2.

   Only the low bits of the address are inspected, so nothing depends on
   sizeof(int)==sizeof(pointer).  The cast is to "unsigned long" because
   that avoids a warning on hpux.  */
void *
align_pointer (void *p, size_t align)
{
  unsigned long  excess = ((unsigned long) p) & (align-1);

  if (excess == 0)
    return p;

  return (void *) (((char *) p) + (unsigned long) (align - excess));
}
! 331:
! 332: /* Note that memory allocated with this function can never be freed, because
! 333: the start address of the block allocated is discarded. */
! 334: void *
! 335: _mp_allocate_func_aligned (size_t bytes, size_t align)
! 336: {
! 337: return align_pointer ((*_mp_allocate_func) (bytes + align-1), align);
! 338: }
! 339:
! 340:
! 341: void *
! 342: _mp_allocate_or_reallocate (void *ptr, size_t oldsize, size_t newsize)
! 343: {
! 344: if (ptr == NULL)
! 345: return (*_mp_allocate_func) (newsize);
! 346: else
! 347: return (*_mp_reallocate_func) (ptr, oldsize, newsize);
! 348: }
! 349:
! 350:
/* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs.  ptr
   needs to have room for up to CACHE_LINE_SIZE-4 extra bytes.

   The return is ptr, as an mp_ptr, advanced by
   (align - (address >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK elements.

   NOTE(review): the ">> 2" and the "-4" above look like they assume 4-byte
   limbs -- confirm against SPEED_TMP_ALLOC_ADJUST_MASK (not visible here)
   before relying on this with 8-byte limbs.  */

mp_ptr
speed_tmp_alloc_adjust (void *ptr, mp_size_t align)
{
  /* Disabled debugging output showing the adjustment made. */
  /*
  printf("%p %ld -> %p %X %X\n", ptr, align,
         (mp_ptr) ptr
         + ((align - ((mp_size_t) ptr >> 2)) &
            SPEED_TMP_ALLOC_ADJUST_MASK),
         ((mp_size_t) ptr >> 2) & SPEED_TMP_ALLOC_ADJUST_MASK,
         SPEED_TMP_ALLOC_ADJUST_MASK);
  */

  return (mp_ptr) ptr
    + ((align - ((mp_size_t) ptr >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK);
}
! 369:
! 370:
/* Set z from the limbs at p,size, which must have size >= 0.

   size is first passed through MPN_NORMALIZE (presumably dropping high zero
   limbs -- confirm in gmp-impl.h), then z is given room for that many
   limbs, the limbs are copied in, and SIZ(z) is set to the reduced size.  */
void
mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size)
{
  ASSERT (size >= 0);
  MPN_NORMALIZE (p, size);
  MPZ_REALLOC (z, size);
  MPN_COPY (PTR(z), p, size);
  SIZ(z) = size;
}
! 380:
! 381:
! 382: /* Miscellanous options accepted by tune and speed programs under -o. */
! 383:
! 384: void
! 385: speed_option_set (const char *s)
! 386: {
! 387: if (strcmp (s, "addrs") == 0) speed_option_addrs = 1;
! 388: else
! 389: {
! 390: printf ("Unrecognised -o option: %s\n", s);
! 391: exit (1);
! 392: }
! 393: }
! 394:
! 395:
! 396: /* The following are basic speed running routines for various gmp functions.
! 397: Many are very similar and use speed.h macros.
! 398:
! 399: Each routine allocates its own destination space for the result of the
! 400: function, because only it can know what the function needs.
! 401:
! 402: speed_starttime() and speed_endtime() are put tight around the code to be
! 403: measured. Any setups are done outside the timed portion.
! 404:
! 405: Each routine is responsible for its own cache priming.
! 406: speed_cache_fill() is a good way to do this, see examples in speed.h.
! 407: One cache priming possibility, for CPUs with write-allocate cache, and
! 408: functions that don't take too long, is to do one dummy call before timing
! 409: so as to cache everything that gets used. But speed_measure() runs a
! 410: routine at least twice and will take the smaller time, so this might not
! 411: be necessary.
! 412:
! 413: Data alignment will be important, for source, destination and temporary
! 414: workspace. A routine can align its destination and workspace. Programs
! 415: using the routines will ensure s->xp and s->yp are aligned. Aligning
! 416: onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and
! 417: s->align_wp2 should be respected where it makes sense to do so.
! 418: SPEED_TMP_ALLOC_LIMBS is a good way to do this.
! 419:
! 420: A loop of the following form can be expected to turn into good assembler
! 421: code on most CPUs, thereby minimizing overhead in the measurement. It
! 422: can always be assumed s->reps >= 1.
! 423:
! 424: i = s->reps
! 425: do
! 426: foo();
! 427: while (--i != 0);
! 428:
! 429: Additional parameters might be added to "struct speed_params" in the
! 430: future. Routines should ignore anything they don't use.
! 431:
! 432: s->size can be used creatively, and s->xp and s->yp can be ignored. For
! 433: example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is
! 434: just a user-supplied parameter. speed_mpn_lshift() uses it as a shift,
! 435: speed_mpn_mul_1() uses it as a multiplier. */
! 436:
! 437:
/* MPN_COPY etc can be macros, so the _CALL forms are necessary.

   Each routine below hands the complete call expression to
   SPEED_ROUTINE_MPN_COPY_CALL, which isn't defined in this file (presumably
   speed.h).  "wp" isn't declared here either, so the macro is presumably
   what provides it, along with the timing loop and the return value.  */

/* Time MPN_COPY of s->size limbs from s->xp to wp. */
double
speed_MPN_COPY (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY (wp, s->xp, s->size));
}
/* Time MPN_COPY_INCR of s->size limbs from s->xp to wp. */
double
speed_MPN_COPY_INCR (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY_INCR (wp, s->xp, s->size));
}
/* Time MPN_COPY_DECR of s->size limbs from s->xp to wp. */
double
speed_MPN_COPY_DECR (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY_DECR (wp, s->xp, s->size));
}
/* Time the C library memcpy on the same amount of data, ie. s->size limbs
   expressed in bytes, for comparison with the MPN_COPY forms. */
double
speed_memcpy (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_COPY_CALL
    (memcpy (wp, s->xp, s->size * BYTES_PER_MP_LIMB));
}
! 460:
! 461:
/* The following five routines each pass one mpn function to
   SPEED_ROUTINE_MPN_UNARY_1, which isn't defined in this file (presumably
   speed.h) and supplies the whole function body.  */

/* Time mpn_addmul_1. */
double
speed_mpn_addmul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
}
/* Time mpn_submul_1. */
double
speed_mpn_submul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
}


/* Time mpn_mul_1.  Per the notes earlier in this file, s->r is used as the
   multiplier. */
double
speed_mpn_mul_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
}


/* Time mpn_lshift.  Per the notes earlier in this file, s->r is used as the
   shift. */
double
speed_mpn_lshift (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
}
/* Time mpn_rshift. */
double
speed_mpn_rshift (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
}
! 491:
! 492:
/* The carry-in variants (if available) are good for measuring because they
   won't skip a division if high<divisor.  Alternately, use -1 as a divisor
   with the plain _1 forms.

   The SPEED_ROUTINE_MPN_DIVREM_1* macros aren't defined in this file
   (presumably speed.h).  Each "f" routine gives the same function as its
   non-"f" partner to a differently named macro; what the "F" macros do
   differently can't be seen from here.  */

/* Time mpn_divrem_1. */
double
speed_mpn_divrem_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
}
/* Time mpn_divrem_1 through the DIVREM_1F macro. */
double
speed_mpn_divrem_1f (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
}
/* The carry-in forms are only built when HAVE_NATIVE_mpn_divrem_1c is set. */
#if HAVE_NATIVE_mpn_divrem_1c
/* Time mpn_divrem_1c. */
double
speed_mpn_divrem_1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
}
/* Time mpn_divrem_1c through the DIVREM_1CF macro. */
double
speed_mpn_divrem_1cf (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
}
#endif
! 518:
/* Time mpn_divrem_2, using SPEED_ROUTINE_MPN_DIVREM_2 (not defined in this
   file, presumably speed.h). */
double
speed_mpn_divrem_2 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
}

/* Time mpn_mod_1, using SPEED_ROUTINE_MPN_MOD_1 (not defined in this file,
   presumably speed.h). */
double
speed_mpn_mod_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
}
/* The carry-in form is only built when HAVE_NATIVE_mpn_mod_1c is set. */
#if HAVE_NATIVE_mpn_mod_1c
/* Time mpn_mod_1c. */
double
speed_mpn_mod_1c (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
}
#endif
! 537:
/* Time mpn_divexact_by3 on the s->size limbs at s->xp, with the result
   going to wp.  The same COPY_CALL harness as the MPN_COPY routines is
   used (not defined in this file, presumably speed.h). */
double
speed_mpn_divexact_by3 (struct speed_params *s)
{
  /* mpn_divexact_by3 is a macro, so the _CALL form is necessary */
  SPEED_ROUTINE_MPN_COPY_CALL(mpn_divexact_by3 (wp, s->xp, s->size));
}
! 544:
! 545:
/* Division routines, each built from a SPEED_ROUTINE_MPN_BZ_* macro which
   isn't defined in this file (presumably speed.h).  Note the function
   measured doesn't always have the name the speed_ routine suggests.  */

/* Time mpn_bz_divrem_n. */
double
speed_mpn_bz_divrem_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BZ_DIVREM_N (mpn_bz_divrem_n);
}
/* Time mpn_sb_divrem_mn, through the BZ_DIVREM_SB macro. */
double
speed_mpn_bz_divrem_sb (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BZ_DIVREM_SB (mpn_sb_divrem_mn);
}
/* Time mpn_tdiv_qr, through the BZ_TDIV_QR macro. */
double
speed_mpn_bz_tdiv_qr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BZ_TDIV_QR (mpn_tdiv_qr);
}
! 561:
! 562:
/* Time mpn_popcount, using SPEED_ROUTINE_MPN_POPCOUNT (not defined in this
   file, presumably speed.h). */
double
speed_mpn_popcount (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
}
/* Time mpn_hamdist, using SPEED_ROUTINE_MPN_HAMDIST (not defined in this
   file, presumably speed.h). */
double
speed_mpn_hamdist (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
}
! 573:
! 574:
/* Addition and subtraction, built from the SPEED_ROUTINE_MPN_BINARY_N
   family of macros, which aren't defined in this file (presumably speed.h).
   The _SELF and _INPLACE macros presumably vary which operands overlap --
   confirm against their definitions.  */

/* Time mpn_add_n. */
double
speed_mpn_add_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
}
/* Time mpn_sub_n. */
double
speed_mpn_sub_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
}
/* Time mpn_add_n through the BINARY_N_SELF macro. */
double
speed_mpn_add_n_self (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_SELF (mpn_add_n);
}
/* Time mpn_add_n through the BINARY_N_INPLACE macro. */
double
speed_mpn_add_n_inplace (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_INPLACE (mpn_add_n);
}
! 595:
! 596:
/* mpn_and_n etc can be macros and so have to be handled with
   SPEED_ROUTINE_MPN_BINARY_N_CALL forms.

   Each routine times one logical operation on the s->size limbs at s->xp
   and s->yp, with the result going to wp.  The macro isn't defined in this
   file (presumably speed.h), and since "wp" isn't declared here it's
   presumably the macro which provides it.  */
/* Time mpn_and_n. */
double
speed_mpn_and_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_andn_n. */
double
speed_mpn_andn_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_nand_n. */
double
speed_mpn_nand_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_ior_n. */
double
speed_mpn_ior_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_iorn_n. */
double
speed_mpn_iorn_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_nior_n. */
double
speed_mpn_nior_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_xor_n. */
double
speed_mpn_xor_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size));
}
/* Time mpn_xnor_n. */
double
speed_mpn_xnor_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size));
}
! 639:
! 640:
/* Multiplication and squaring.  The SPEED_ROUTINE_MPN_* macros used here
   aren't defined in this file (presumably speed.h).  In the _CALL forms
   "wp" isn't declared here, so it's presumably provided by the macro.  */

/* Time mpn_mul_n. */
double
speed_mpn_mul_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
}
/* Time mpn_sqr_n. */
double
speed_mpn_sqr_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQR (mpn_sqr_n);
}
/* Time mpn_mul_n used as a square, ie. with s->xp as both operands, for
   comparison with speed_mpn_sqr_n. */
double
speed_mpn_mul_n_sqr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
}

/* Time mpn_mul_basecase. */
double
speed_mpn_mul_basecase (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUL_BASECASE(mpn_mul_basecase);
}
/* Time mpn_sqr_basecase.  No size restriction is applied here, see the
   FIXME. */
double
speed_mpn_sqr_basecase (struct speed_params *s)
{
  /* FIXME: size restrictions on some versions of sqr_basecase */
  SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
}

/* Time mpn_kara_mul_n. */
double
speed_mpn_kara_mul_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n);
}
/* Time mpn_kara_sqr_n. */
double
speed_mpn_kara_sqr_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n);
}

/* Time mpn_toom3_mul_n. */
double
speed_mpn_toom3_mul_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n);
}
/* Time mpn_toom3_sqr_n. */
double
speed_mpn_toom3_sqr_n (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n);
}

/* Time mpn_mul_fft_full multiplying s->xp by s->yp, s->size limbs each. */
double
speed_mpn_mul_fft_full (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUL_N_CALL
    (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
}
/* Time mpn_mul_fft_full used as a square, ie. with s->xp as both
   operands. */
double
speed_mpn_mul_fft_full_sqr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_SQR_CALL
    (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
}
! 703:
! 704:
/* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
   used as k, otherwise the best k for the size is used.  If s->size isn't a
   multiple of 2^k it's rounded up to make the effective operation size.

   SPEED_ROUTINE_MPN_MUL_FFT_CALL expands to a complete function body for a
   routine taking "struct speed_params *s".  "call" is the expression to be
   timed and can refer to wp (the destination, pl+1 limbs), pl (the size
   from mpn_fft_next_size) and k.  "sqr" is non-zero for a square, in which
   case s->yp is not registered as a source operand and mpn_fft_best_k is
   asked for a squaring k.

   Sizes below 1 are refused through SPEED_RESTRICT_COND.  Operand setup and
   speed_cache_fill() are done before speed_starttime(), so only the
   s->reps many calls are timed.  */

#define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
  {                                                     \
    mp_ptr     wp;                                      \
    mp_size_t  pl;                                      \
    int        k;                                       \
    unsigned   i;                                       \
    double     t;                                       \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    if (s->r != 0)                                      \
      k = s->r;                                         \
    else                                                \
      k = mpn_fft_best_k (s->size, sqr);                \
                                                        \
    TMP_MARK (marker);                                  \
    pl = mpn_fft_next_size (s->size, k);                \
    wp = SPEED_TMP_ALLOC_LIMBS (pl+1, s->align_wp);     \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    if (!sqr)                                           \
      speed_operand_src (s, s->yp, s->size);            \
    speed_operand_dst (s, wp, pl+1);                    \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      call;                                             \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
! 745:
/* Time mpn_mul_fft multiplying s->xp by s->yp, s->size limbs each.  wp, pl
   and k come from SPEED_ROUTINE_MPN_MUL_FFT_CALL, and the final 0 is its
   "sqr" argument. */
double
speed_mpn_mul_fft (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUL_FFT_CALL
    (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
}

/* Time mpn_mul_fft used as a square, ie. with s->xp as both operands.  The
   final 1 is the "sqr" argument of SPEED_ROUTINE_MPN_MUL_FFT_CALL. */
double
speed_mpn_mul_fft_sqr (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_MUL_FFT_CALL
    (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
}
! 759:
! 760:
/* The routines below each pass one function to a SPEED_ROUTINE_* macro
   which supplies the whole function body.  The macros aren't defined in
   this file (presumably speed.h).  */

/* Time mpn_gcd. */
double
speed_mpn_gcd (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCD (mpn_gcd);
}
/* Time mpn_gcdext. */
double
speed_mpn_gcdext (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
}
/* Time mpn_gcd_1. */
double
speed_mpn_gcd_1 (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
}


/* Time mpn_jacobi_base. */
double
speed_mpn_jacobi_base (struct speed_params *s)
{
  SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
}


/* Time mpz_fac_ui.  Per the notes earlier in this file, s->size is used as
   the n for the factorial. */
double
speed_mpz_fac_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_UI (mpz_fac_ui);
}
/* Time mpz_fib_ui, through the same SPEED_ROUTINE_MPZ_UI macro as
   mpz_fac_ui. */
double
speed_mpz_fib_ui (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_UI (mpz_fib_ui);
}


/* Time mpz_powm. */
double
speed_mpz_powm (struct speed_params *s)
{
  SPEED_ROUTINE_MPZ_POWM (mpz_powm);
}


/* Time modlimb_invert. */
double
speed_modlimb_invert (struct speed_params *s)
{
  SPEED_ROUTINE_MODLIMB_INVERT (modlimb_invert);
}
! 809:
! 810:
/* Time s->reps calls to noop(), which gives the cost of the measuring loop
   and a call with no arguments.  Only s->reps is read, which is why
   speed_measure() can run this with its dummy parameters.

   The do/while form is the loop shape recommended in the notes above, and
   assumes s->reps >= 1.  */
double
speed_noop (struct speed_params *s)
{
  unsigned  i;

  speed_starttime ();
  i = s->reps;
  do
    noop ();
  while (--i != 0);
  return speed_endtime ();
}
! 823:
/* Time s->reps calls to noop_wxs(), passing a destination pointer, s->xp
   and s->size.  The destination is a single temporary limb, allocated
   outside the timed portion.  */
double
speed_noop_wxs (struct speed_params *s)
{
  mp_ptr   wp;
  unsigned i;
  double   t;
  TMP_DECL (marker);

  TMP_MARK (marker);
  wp = TMP_ALLOC_LIMBS (1);

  speed_starttime ();
  i = s->reps;
  do
    noop_wxs (wp, s->xp, s->size);
  while (--i != 0);
  t = speed_endtime ();

  /* release the temporary only after the clock has been stopped */
  TMP_FREE (marker);
  return t;
}
! 845:
/* Time s->reps calls to noop_wxys(), passing a destination pointer, s->xp,
   s->yp and s->size.  The destination is a single temporary limb, allocated
   outside the timed portion.  */
double
speed_noop_wxys (struct speed_params *s)
{
  mp_ptr   wp;
  unsigned i;
  double   t;
  TMP_DECL (marker);

  TMP_MARK (marker);
  wp = TMP_ALLOC_LIMBS (1);

  speed_starttime ();
  i = s->reps;
  do
    noop_wxys (wp, s->xp, s->yp, s->size);
  while (--i != 0);
  t = speed_endtime ();

  /* release the temporary only after the clock has been stopped */
  TMP_FREE (marker);
  return t;
}
! 867:
! 868:
/* Expand to a complete function body timing s->reps executions of "calls".

   "variables" is a declaration placed at the top of the body, for use by
   "calls".  "calls" is one or more statements separated by semicolons; it
   is a single macro argument, so any commas in it must be inside
   parentheses.  The enclosing function must have a "struct speed_params *s"
   parameter.  */
#define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
  {                                                     \
    unsigned  i;                                        \
    variables;                                          \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      {                                                 \
        calls;                                          \
      }                                                 \
    while (--i != 0);                                   \
    return speed_endtime ();                            \
  }
! 883:
! 884:
/* Compare these to see how much malloc/free costs and then how much
   _mp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
   mpq_init/clear will be doing a 1 limb allocate, so use that as the size
   when including them in comparisons.  */

/* Time a malloc of s->size limbs worth of bytes followed by a free.  The
   malloc result is not checked.  */
double
speed_malloc_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  SPEED_ROUTINE_ALLOC_FREE (void *p,
                            p = malloc (bytes);
                            free (p));
}
! 898:
/* Time a malloc of one limb, a realloc to s->size limbs, and a free.  The
   malloc and realloc results are not checked.  */
double
speed_malloc_realloc_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  SPEED_ROUTINE_ALLOC_FREE (void *p,
                            p = malloc (BYTES_PER_MP_LIMB);
                            p = realloc (p, bytes);
                            free (p));
}
! 908:
/* Time an allocate of s->size limbs worth of bytes followed by a free, both
   done through the _mp_allocate_func and _mp_free_func pointers.  */
double
speed_mp_allocate_free (struct speed_params *s)
{
  size_t  bytes = s->size * BYTES_PER_MP_LIMB;
  SPEED_ROUTINE_ALLOC_FREE (void *p,
                            p = (*_mp_allocate_func) (bytes);
                            (*_mp_free_func) (p, bytes));
}
! 917:
! 918: double
! 919: speed_mp_allocate_reallocate_free (struct speed_params *s)
! 920: {
! 921: size_t bytes = s->size * BYTES_PER_MP_LIMB;
! 922: SPEED_ROUTINE_ALLOC_FREE
! 923: (void *p,
! 924: p = (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
! 925: p = (*_mp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
! 926: (*_mp_free_func) (p, bytes));
! 927: }
! 928:
/* Time an mpz_init followed by an mpz_clear.  s->size is not used. */
double
speed_mpz_init_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
                            mpz_init (z);
                            mpz_clear (z));
}

/* Time an mpz_init, an _mpz_realloc to s->size limbs, and an mpz_clear. */
double
speed_mpz_init_realloc_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
                            mpz_init (z);
                            _mpz_realloc (z, s->size);
                            mpz_clear (z));
}

/* Time an mpq_init followed by an mpq_clear.  s->size is not used. */
double
speed_mpq_init_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
                            mpq_init (q);
                            mpq_clear (q));
}

/* Time an mpf_init followed by an mpf_clear.  s->size is not used. */
double
speed_mpf_init_clear (struct speed_params *s)
{
  SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
                            mpf_init (f);
                            mpf_clear (f));
}
! 961:
! 962:
/* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
   that repeatedly calling mpz_add with the same data gives branch
   prediction in it an advantage.

   x and y are set from the s->size limbs at s->xp and s->yp using
   mpz_set_n().  All setup and cleanup is outside the timed portion.  */

double
speed_mpz_add (struct speed_params *s)
{
  mpz_t     w, x, y;
  unsigned  i;
  double    t;

  mpz_init (w);
  mpz_init (x);
  mpz_init (y);

  mpz_set_n (x, s->xp, s->size);
  mpz_set_n (y, s->yp, s->size);

  /* One untimed call first, so any growing of w that mpz_add does happens
     before the clock starts.  */
  mpz_add (w, x, y);

  speed_starttime ();
  i = s->reps;
  do
    {
      mpz_add (w, x, y);
    }
  while (--i != 0);
  t = speed_endtime ();

  mpz_clear (w);
  mpz_clear (x);
  mpz_clear (y);
  return t;
}
! 996:
! 997:
/* Time mpz_bin_uiui, with s->size as n.
   If r==0, calculate (size,size/2),
   otherwise calculate (size,r).

   Unlike speed_mpz_add() there's no untimed call first, so the first timed
   call includes whatever growing of w mpz_bin_uiui does.  */

double
speed_mpz_bin_uiui (struct speed_params *s)
{
  mpz_t          w;
  unsigned long  k;
  unsigned  i;
  double    t;

  mpz_init (w);
  if (s->r != 0)
    k = s->r;
  else
    k = s->size/2;

  speed_starttime ();
  i = s->reps;
  do
    {
      mpz_bin_uiui (w, s->size, k);
    }
  while (--i != 0);
  t = speed_endtime ();

  mpz_clear (w);
  return t;
}
! 1027:
! 1028:
/* The multiplies are successively dependent so the latency is measured, not
   the issue rate.  There's only 10 per loop so the code doesn't get too big
   since umul_ppmm is several instructions on some cpus.

   Putting the arguments as "h,l,l,h" gets slightly better code from gcc
   2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
   will probably show up as a bogus extra cycle though.

   The measuring function macros are in three parts to avoid overflowing
   preprocessor expansion space if umul_ppmm is big.

   Limitations:

   Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
   generated first, especially on CPUs with low latency multipliers.

   The default umul_ppmm doing h*l will be getting increasing numbers of
   high zero bits in the calculation.  CPUs with data-dependent multipliers
   will want to use umul_ppmm.1 to get some randomization into the
   calculation.  The extra xors and fetches will be a slowdown of course.

   Usage, within a function taking "struct speed_params *s":

       SPEED_MACRO_UMUL_PPMM_A;
       { ten multiplies for the s->r == 1 case }
       SPEED_MACRO_UMUL_PPMM_B;
       { ten multiplies for all other s->r }
       SPEED_MACRO_UMUL_PPMM_C;

   The three parts together form the whole function body, so the function
   itself supplies only the outermost braces.  */

/* Part A: declare h, l, i and t, set s->time_divisor to 10 to match the ten
   multiplies per loop, load h and l from s->xp[0] and s->yp[0], and open
   the timed loop for the s->r == 1 case.  */
#define SPEED_MACRO_UMUL_PPMM_A \
  {                             \
    mp_limb_t  h, l;            \
    unsigned   i;               \
    double     t;               \
                                \
    s->time_divisor = 10;       \
                                \
    h = s->xp[0];               \
    l = s->yp[0];               \
                                \
    switch (s->r) {             \
    case 1:                     \
      speed_starttime ();       \
      i = s->reps;              \
      do                        \
        {

/* Part B: close the s->r == 1 loop and open the timed loop for the default
   case.  */
#define SPEED_MACRO_UMUL_PPMM_B \
        }                       \
      while (--i != 0);         \
      t = speed_endtime ();     \
      break;                    \
                                \
    default:                    \
      speed_starttime ();       \
      i = s->reps;              \
      do                        \
        {

/* Part C: close the default loop, give h and l to noop_1() so they have a
   use, and return the elapsed time.  */
#define SPEED_MACRO_UMUL_PPMM_C                                         \
        }                                                               \
      while (--i != 0);                                                 \
      t = speed_endtime ();                                             \
      break;                                                            \
    }                                                                   \
                                                                        \
    /* stop the compiler optimizing away the whole calculation! */      \
    noop_1 (h);                                                         \
    noop_1 (l);                                                         \
                                                                        \
    return t;                                                           \
  }
! 1093:
! 1094:
/* Time the umul_ppmm macro, as ten dependent multiplies per repetition.

   With s->r == 1 each product is xored with entries from s->xp_block and
   s->yp_block, to put some variation into the data.  With any other s->r
   the products are fed straight back in.  The SPEED_MACRO_UMUL_PPMM_A/B/C
   macros supply the loops, the s->time_divisor setting and the return.  */
double
speed_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    /* s->r == 1, randomized */
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[1]; l ^= s->yp_block[1];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[2]; l ^= s->yp_block[2];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[4]; l ^= s->yp_block[4];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[5]; l ^= s->yp_block[5];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[7]; l ^= s->yp_block[7];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[8]; l ^= s->yp_block[8];
    umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    /* default, multiplies only */
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
    umul_ppmm (h, l, l, h);
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
! 1126:
! 1127:
/* Only built when a native mpn_umul_ppmm exists. */
#if HAVE_NATIVE_mpn_umul_ppmm

/* One call to the native umul_ppmm function, taking h and l as inputs and
   replacing them with the result.  The function's return value goes to h
   and the other half is stored through the pointer to l.  The argument
   order is different on 64-bit hppa.  */
#if defined (__hppa) && W_TYPE_SIZE == 64
#define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (h, l, &l))
#else
#define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (&l, h, l))
#endif

/* Time the native mpn_umul_ppmm function, in the same way speed_umul_ppmm()
   times the umul_ppmm macro: ten dependent calls per repetition, with xors
   against s->xp_block and s->yp_block when s->r == 1.  */
double
speed_mpn_umul_ppmm (struct speed_params *s)
{
  SPEED_MACRO_UMUL_PPMM_A;
  {
    /* s->r == 1, randomized */
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[0]; l ^= s->yp_block[0];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[1]; l ^= s->yp_block[1];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[2]; l ^= s->yp_block[2];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[3]; l ^= s->yp_block[3];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[4]; l ^= s->yp_block[4];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[5]; l ^= s->yp_block[5];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[6]; l ^= s->yp_block[6];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[7]; l ^= s->yp_block[7];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[8]; l ^= s->yp_block[8];
    CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[9]; l ^= s->yp_block[9];
  }
  SPEED_MACRO_UMUL_PPMM_B;
  {
    /* default, calls only */
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
    CALL_MPN_UMUL_PPMM;
  }
  SPEED_MACRO_UMUL_PPMM_C;
}
#endif
! 1168:
! 1169:
! 1170: /* The divisions are successively dependent so latency is measured, not
! 1171: issue rate. There are only 10 per loop so the code doesn't get too big,
! 1172: especially for udiv_qrnnd_preinv and preinv2norm, which are several
! 1173: instructions each.
! 1174:
! 1175: Note that it's only the division which is measured here; there's no data
! 1176: fetching and no shifting if the divisor gets normalized.
! 1177:
! 1178: In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
! 1179: generate x86 div instructions with nothing in between.
! 1180:
! 1181: The measuring function macros are in two parts to avoid overflowing
! 1182: preprocessor expansion space if udiv_qrnnd etc are big.
! 1183:
! 1184: Limitations:
! 1185:
! 1186: Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
! 1187: generated first.
! 1188:
! 1189: CPUs with data-dependent divisions may want more attention paid to the
! 1190: randomness of the data used. Probably the measurement wanted is over
! 1191: uniformly distributed numbers, but what's here might not be giving that. */
! 1192:
/* First half of a udiv_qrnnd-style measuring function body; it must be
   followed by a brace block holding the divisions and then by
   SPEED_ROUTINE_UDIV_QRNND_B, which closes the loop and the function body
   opened here.

   Expects a "struct speed_params *s" in scope and declares t, i, q, r, d
   and dinv.  It:
     - sets s->time_divisor to 10, matching the 10 divisions each caller
       places in its loop body;
     - takes the divisor d from s->r, substituting 0x12345678 when s->r is
       zero;
     - when "normalize" is non-zero, shifts d left by its leading zero
       count and computes dinv with invert_limb.  When "normalize" is zero
       dinv is left unset, so only callers passing non-zero may use dinv;
     - seeds q from s->xp[0] and r from s->yp[0] % d, the latter taken
       after any normalization so that r < d;
     - starts the timer and opens a do-loop that runs s->reps times.  */
! 1193: #define SPEED_ROUTINE_UDIV_QRNND_A(normalize) \
! 1194: { \
! 1195: double t; \
! 1196: unsigned i; \
! 1197: mp_limb_t q, r, d; \
! 1198: mp_limb_t dinv; \
! 1199: \
! 1200: s->time_divisor = 10; \
! 1201: \
! 1202: /* divisor from "r" parameter, or a default */ \
! 1203: d = s->r; \
! 1204: if (d == 0) \
! 1205: d = 0x12345678; \
! 1206: \
! 1207: if (normalize) \
! 1208: { \
! 1209: unsigned norm; \
! 1210: count_leading_zeros (norm, d); \
! 1211: d <<= norm; \
! 1212: invert_limb (dinv, d); \
! 1213: } \
! 1214: \
! 1215: q = s->xp[0]; \
! 1216: r = s->yp[0] % d; \
! 1217: \
! 1218: speed_starttime (); \
! 1219: i = s->reps; \
! 1220: do \
! 1221: {
! 1222:
/* Second half of a udiv_qrnnd-style measuring function body, pairing with
   SPEED_ROUTINE_UDIV_QRNND_A.  Closes the do-loop (which repeats until the
   counter i reaches zero), stops the timer into t, passes q and r to
   noop_1 so the divisions cannot be discarded as dead code, and returns
   t.  */
! 1223: #define SPEED_ROUTINE_UDIV_QRNND_B \
! 1224: } \
! 1225: while (--i != 0); \
! 1226: t = speed_endtime (); \
! 1227: \
! 1228: /* stop the compiler optimizing away the whole calculation! */ \
! 1229: noop_1 (q); \
! 1230: noop_1 (r); \
! 1231: \
! 1232: return t; \
! 1233: }
! 1234:
/* Measure the udiv_qrnnd macro.

   The divisor is normalized only when UDIV_NEEDS_NORMALIZATION is
   non-zero.  The loop body is 10 divisions, each one dividing the r,q pair
   left by the previous division by d, so they form a dependent chain.
   Returns the elapsed time reported by speed_endtime.  */
! 1235: double
! 1236: speed_udiv_qrnnd (struct speed_params *s)
! 1237: {
! 1238: SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
! 1239: {
! 1240: udiv_qrnnd (q, r, r, q, d);
! 1241: udiv_qrnnd (q, r, r, q, d);
! 1242: udiv_qrnnd (q, r, r, q, d);
! 1243: udiv_qrnnd (q, r, r, q, d);
! 1244: udiv_qrnnd (q, r, r, q, d);
! 1245: udiv_qrnnd (q, r, r, q, d);
! 1246: udiv_qrnnd (q, r, r, q, d);
! 1247: udiv_qrnnd (q, r, r, q, d);
! 1248: udiv_qrnnd (q, r, r, q, d);
! 1249: udiv_qrnnd (q, r, r, q, d);
! 1250: }
! 1251: SPEED_ROUTINE_UDIV_QRNND_B;
! 1252: }
! 1253:
/* Measure the udiv_qrnnd_preinv macro.

   The divisor is always normalized (argument 1 to the _A macro), which
   also makes dinv, the invert_limb value for d, available.  The loop body
   is 10 successively dependent divisions, each passing d and dinv.
   Returns the elapsed time reported by speed_endtime.  */
! 1254: double
! 1255: speed_udiv_qrnnd_preinv (struct speed_params *s)
! 1256: {
! 1257: SPEED_ROUTINE_UDIV_QRNND_A (1);
! 1258: {
! 1259: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1260: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1261: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1262: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1263: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1264: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1265: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1266: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1267: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1268: udiv_qrnnd_preinv (q, r, r, q, d, dinv);
! 1269: }
! 1270: SPEED_ROUTINE_UDIV_QRNND_B;
! 1271: }
! 1272:
/* Measure the udiv_qrnnd_preinv2norm macro.

   Set up exactly as speed_udiv_qrnnd_preinv: the divisor is always
   normalized and dinv is computed, then the loop body runs 10 successively
   dependent divisions passing d and dinv.  Returns the elapsed time
   reported by speed_endtime.  */
! 1273: double
! 1274: speed_udiv_qrnnd_preinv2norm (struct speed_params *s)
! 1275: {
! 1276: SPEED_ROUTINE_UDIV_QRNND_A (1);
! 1277: {
! 1278: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1279: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1280: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1281: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1282: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1283: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1284: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1285: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1286: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1287: udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
! 1288: }
! 1289: SPEED_ROUTINE_UDIV_QRNND_B;
! 1290: }
! 1291:
/* Everything down to the matching #endif is compiled only when
   HAVE_NATIVE_mpn_udiv_qrnnd is set.  */
! 1292: #if HAVE_NATIVE_mpn_udiv_qrnnd
! 1293:
/* CALL_MPN_UDIV_QRNND performs one call of the mpn udiv_qrnnd function
   with r, q and d as the operands, assigning the return value to q and
   passing &r for the function to store through (presumably the
   remainder).  The 64-bit hppa build takes the pointer as the last
   argument; all other builds take it first.  */
! 1294: #if defined (__hppa) && W_TYPE_SIZE == 64
! 1295: #define CALL_MPN_UDIV_QRNND (q = __MPN (udiv_qrnnd) (r, q, d, &r))
! 1296: #else
! 1297: #define CALL_MPN_UDIV_QRNND (q = __MPN (udiv_qrnnd) (&r, r, q, d))
! 1298: #endif
! 1299:
/* Measure the mpn udiv_qrnnd function.

   The divisor is always normalized (argument 1 to the _A macro).  The loop
   body is 10 successively dependent calls through CALL_MPN_UDIV_QRNND.
   Returns the elapsed time reported by speed_endtime.  */
! 1300: double
! 1301: speed_mpn_udiv_qrnnd (struct speed_params *s)
! 1302: {
! 1303:
! 1304: SPEED_ROUTINE_UDIV_QRNND_A (1);
! 1305: {
! 1306: CALL_MPN_UDIV_QRNND;
! 1307: CALL_MPN_UDIV_QRNND;
! 1308: CALL_MPN_UDIV_QRNND;
! 1309: CALL_MPN_UDIV_QRNND;
! 1310: CALL_MPN_UDIV_QRNND;
! 1311: CALL_MPN_UDIV_QRNND;
! 1312: CALL_MPN_UDIV_QRNND;
! 1313: CALL_MPN_UDIV_QRNND;
! 1314: CALL_MPN_UDIV_QRNND;
! 1315: CALL_MPN_UDIV_QRNND;
! 1316: }
! 1317: SPEED_ROUTINE_UDIV_QRNND_B;
! 1318: }
! 1319: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>