[BACK]Return to common.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / tune

Annotation of OpenXM_contrib/gmp/tune/common.c, Revision 1.1.1.1

1.1       maekawa     1: /* Shared speed subroutines.  */
                      2:
                      3: /*
                      4: Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      5:
                      6: This file is part of the GNU MP Library.
                      7:
                      8: The GNU MP Library is free software; you can redistribute it and/or modify
                      9: it under the terms of the GNU Lesser General Public License as published by
                     10: the Free Software Foundation; either version 2.1 of the License, or (at your
                     11: option) any later version.
                     12:
                     13: The GNU MP Library is distributed in the hope that it will be useful, but
                     14: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     15: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     16: License for more details.
                     17:
                     18: You should have received a copy of the GNU Lesser General Public License
                     19: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: MA 02111-1307, USA.
                     22: */
                     23:
                     24: #include <errno.h>
                     25: #include <fcntl.h>
                     26: #include <math.h>
                     27: #include <stdio.h>
                     28: #include <stdlib.h> /* for qsort */
                     29: #include <string.h>
                     30: #include <unistd.h>
                     31: #if 0
                     32: #include <sys/ioctl.h>
                     33: #endif
                     34:
                     35: #include "gmp.h"
                     36: #include "gmp-impl.h"
                     37: #include "longlong.h"
                     38:
                     39: #include "speed.h"
                     40:
                     41: /* Change this to "#define TRACE(x) x" to get traces. */
                     42: #define TRACE(x)
                     43:
                     44:
                     45: typedef int (*qsort_function_t) _PROTO ((const void *, const void *));
                         /* qsort_function_t (above) is the comparison-callback signature
                            taken by qsort(); speed_measure() casts double_cmp_ptr to it.  */
                     46:
                     47:
                         /* Non-zero makes speed_cache_fill() print the operand addresses
                            whenever they change.  Set to 1 by speed_option_set ("addrs").  */
                     48: int   speed_option_addrs = 0;
                     49:
                     50:
                     51: void
                     52: pentium_wbinvd(void)
                     53: {
                           /* Called by speed_cache_fill() when s->cache is 1.  As written
                              this function is a no-op: both candidate implementations
                              below are compiled out with "#if 0".  */
                           /* Disabled variant 1: open /dev/wbinvd once (fd starts at -2
                              meaning "not tried yet"; -1 records a failed open, reported
                              with perror) and issue ioctl request 0 on the descriptor.  */
                     54: #if 0
                     55:   {
                     56:     static int  fd = -2;
                     57:
                     58:     if (fd == -2)
                     59:       {
                     60:         fd = open ("/dev/wbinvd", O_RDWR);
                     61:         if (fd == -1)
                     62:           perror ("open /dev/wbinvd");
                     63:       }
                     64:
                     65:     if (fd != -1)
                     66:       ioctl (fd, 0, 0);
                     67:   }
                     68: #endif
                     69:
                           /* Disabled variant 2: read every byte of a lazily malloc'ed
                              WBINVDSIZE (2 Mbyte) buffer and pass the sum to
                              mpn_cache_fill_dummy().  The malloc result is not checked,
                              and the loop initializing the buffer is itself disabled.  */
                     70: #if 0
                     71: #define WBINVDSIZE  1024*1024*2
                     72:   {
                     73:     static char  *p = NULL;
                     74:     int   i, sum;
                     75:
                     76:     if (p == NULL)
                     77:       p = malloc (WBINVDSIZE);
                     78:
                     79: #if 0
                     80:     for (i = 0; i < WBINVDSIZE; i++)
                     81:       p[i] = i & 0xFF;
                     82: #endif
                     83:
                     84:     sum = 0;
                     85:     for (i = 0; i < WBINVDSIZE; i++)
                     86:       sum += p[i];
                     87:
                     88:     mpn_cache_fill_dummy (sum);
                     89:   }
                     90: #endif
                     91: }
                     92:
                     /* qsort() comparison callback: order two doubles ascending.

                        The parameters are "const void *", exactly the type qsort() calls
                        its comparator with, so the function can be handed to qsort()
                        directly.  A comparator declared with "const double *" parameters
                        and merely cast to qsort_function_t would be called through an
                        incompatible function type, which is undefined behaviour in ISO C.
                        Existing casts to qsort_function_t remain valid.

                        Returns 1 if *p > *q, -1 if *p < *q, and 0 otherwise (which
                        includes the case where either value is a NaN, since both
                        comparisons are then false).  */
                     static int
                     double_cmp_ptr (const void *p, const void *q)
                     {
                       const double  *a = (const double *) p;
                       const double  *b = (const double *) q;

                       if (*a > *b)  return 1;
                       if (*a < *b)  return -1;
                       return 0;
                     }
                    100:
                    101:
                    102: /* Measure the speed of a given routine.
                    103:
                    104:    The routine is run with enough repetitions to make it take at least
                    105:    speed_precision * speed_unittime.  This aims to minimize the effects of a
                    106:    limited accuracy time base and the overhead of the measuring itself.
                    107:
                    108:    Measurements are made looking for 4 results within TOLERANCE of each
                    109:    other (or 3 for routines taking longer than 2 seconds).  This aims to get
                    110:    an accurate reading even if some runs are bloated by interrupts or task
                    111:    switches or whatever.
                    112:
                    113:    The given (*fun)() is expected to run its function "s->reps" many times
                    114:    and return the total elapsed time measured using speed_starttime() and
                    115:    speed_endtime().  If the function doesn't support the given s->size or
                    116:    s->r, -1.0 should be returned.  See the various base routines below.  */
                    117:
                    118: double
                    119: speed_measure (double (*fun) _PROTO ((struct speed_params *s)),
                    120:                struct speed_params *s)
                    121: {
                    122: #define TOLERANCE    1.005  /* 0.5% */
                    123:
                    124:   struct speed_params  s_dummy;
                    125:   int     i, j, e;
                    126:   double  t[30];
                    127:   double  t_unsorted[30];
                    128:
                    129:   /* Use dummy parameters if caller doesn't provide any.  Only a few special
                    130:      "fun"s will cope with this, speed_noop() is one.  */
                    131:   if (s == NULL)
                    132:     {
                    133:       memset (&s_dummy, '\0', sizeof (s_dummy));
                    134:       s = &s_dummy;
                    135:     }
                    136:
                    137:   s->reps = 1;
                    138:   s->time_divisor = 1.0;
                    139:   for (i = 0; i < numberof (t); i++)
                    140:     {
                    141:       for (;;)
                    142:         {
                    143:           s->src_num = 0;
                    144:           s->dst_num = 0;
                    145:
                    146:           t[i] = (*fun) (s);
                    147:           t_unsorted[i] = t[i];
                    148:
                    149:           TRACE (printf("size=%ld reps=%u r=%d attempt=%d  %.9f\n",
                    150:                         s->size, s->reps, s->r, i, t[i]));
                    151:
                    152:           if (t[i] == -1.0)
                    153:             return -1.0;
                    154:
                    155:           if (t[i] >= speed_unittime * speed_precision)
                    156:             break;
                    157:
                    158:           /* go to a value of reps to make t[i] >= precision */
                    159:           s->reps = (unsigned) ceil (1.1 * s->reps
                    160:                                      * speed_unittime * speed_precision
                    161:                                      / MAX (t[i], speed_unittime));
                    162:         }
                    163:       t[i] /= s->reps;
                    164:
                    165:       if (speed_precision == 0)
                    166:         return t[i];
                    167:
                    168:       /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
                    169:       if (t[0] >= 2.0)
                    170:         e = 3;
                    171:       else
                    172:         e = 4;
                    173:
                    174:       /* Look for e many t[]'s within TOLERANCE of each other to consider a
                    175:          valid measurement.  Return smallest among them.  */
                    176:       if (i >= e)
                    177:         {
                    178:           qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
                    179:           for (j = e-1; j < i; j++)
                    180:             if (t[j] <= t[j-e+1] * TOLERANCE)
                    181:               return t[j-e+1] / s->time_divisor;
                    182:         }
                    183:     }
                    184:
                    185:   fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
                    186:            e, (TOLERANCE-1.0)*100.0);
                    187:   fprintf (stderr, "  %.12f is about 0.5%%\n", t[0]*(TOLERANCE-1.0));
                    188:   for (i = 0; i < numberof (t); i++)
                    189:     fprintf (stderr, "  %.09f\n", t_unsorted[i]);
                    190:
                    191:   return -1.0;
                    192: }
                    193:
                    194:
                    195: /* Read all of ptr,size to get it into the CPU memory cache.
                    196:
                    197:    A call to mpn_cache_fill_dummy() is used to make sure the compiler
                    198:    doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
                    199:    would work too, but the function call means we don't rely on every
                    200:    compiler actually implementing volatile properly.
                    201:
                    202:    mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
                    203:    it can inline it.  */
                    204:
                    205: void
                    206: mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
                    207: {
                    208:   mp_limb_t  sum = 0;
                    209:   mp_size_t  i;
                    210:
                    211:   for (i = 0; i < size; i++)
                    212:     sum += ptr[i];
                    213:
                    214:   mpn_cache_fill_dummy(sum);
                    215: }
                    216:
                    217:
                    218: void
                    219: mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
                    220: {
                           /* Currently this only reads the region, exactly as
                              mpn_cache_fill() does.  The two alternatives below, which
                              would actually store to ptr[], are disabled with "#if 0".  */
                    221:   mpn_cache_fill (ptr, size);
                    222:
                           /* Disabled: overwrite the region using mpn_random().  */
                    223: #if 0
                    224:   mpn_random (ptr, size);
                    225: #endif
                    226:
                           /* Disabled: store the index i into each limb ptr[i].  */
                    227: #if 0
                    228:   mp_size_t  i;
                    229:
                    230:   for (i = 0; i < size; i++)
                    231:     ptr[i] = i;
                    232: #endif
                    233: }
                    234:
                    235:
                    236: void
                    237: speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
                    238: {
                    239:   if (s->src_num >= numberof (s->src))
                    240:     {
                    241:       fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
                    242:       abort ();
                    243:     }
                    244:   s->src[s->src_num].ptr = ptr;
                    245:   s->src[s->src_num].size = size;
                    246:   s->src_num++;
                    247: }
                    248:
                    249:
                    250: void
                    251: speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
                    252: {
                    253:   if (s->dst_num >= numberof (s->dst))
                    254:     {
                    255:       fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
                    256:       abort ();
                    257:     }
                    258:   s->dst[s->dst_num].ptr = ptr;
                    259:   s->dst[s->dst_num].size = size;
                    260:   s->dst_num++;
                    261: }
                    262:
                    263:
                    264: void
                    265: speed_cache_fill (struct speed_params *s)
                    266: {
                    267:   static struct speed_params  prev;
                    268:   int  i;
                    269:
                    270:   /* FIXME: need a better way to get the format string for a pointer */
                    271:
                    272:   if (speed_option_addrs)
                    273:     {
                    274:       int  different;
                    275:
                    276:       different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
                    277:       for (i = 0; i < s->dst_num; i++)
                    278:         different |= (s->dst[i].ptr != prev.dst[i].ptr);
                    279:       for (i = 0; i < s->src_num; i++)
                    280:         different |= (s->src[i].ptr != prev.src[i].ptr);
                    281:
                    282:       if (different)
                    283:         {
                    284:           if (s->dst_num != 0)
                    285:             {
                    286:               printf ("dst");
                    287:               for (i = 0; i < s->dst_num; i++)
                    288:                 printf (" %08lX", (unsigned long) s->dst[i].ptr);
                    289:               printf (" ");
                    290:             }
                    291:
                    292:           if (s->src_num != 0)
                    293:             {
                    294:               printf ("src");
                    295:               for (i = 0; i < s->src_num; i++)
                    296:                 printf (" %08lX", (unsigned long) s->src[i].ptr);
                    297:               printf (" ");
                    298:             }
                    299:           printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
                    300:
                    301:         }
                    302:
                    303:       memcpy (&prev, s, sizeof(prev));
                    304:     }
                    305:
                    306:   switch (s->cache) {
                    307:   case 0:
                    308:     for (i = 0; i < s->dst_num; i++)
                    309:       mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
                    310:     for (i = 0; i < s->src_num; i++)
                    311:       mpn_cache_fill (s->src[i].ptr, s->src[i].size);
                    312:     break;
                    313:   case 1:
                    314:     pentium_wbinvd();
                    315:     break;
                    316:   }
                    317: }
                    318:
                    319:
                    /* Round p up to the next address that is a multiple of "align" bytes
                       and return it; p is returned unchanged when already aligned.
                       "align" must be a power of 2, since the remainder is taken by
                       masking with align-1.

                       The pointer is converted to "unsigned long" rather than int so as
                       not to assume sizeof(int)==sizeof(pointer); using "unsigned long"
                       also avoids a warning on hpux.  */
                    void *
                    align_pointer (void *p, size_t align)
                    {
                      unsigned long  excess;
                      unsigned long  pad;

                      excess = ((unsigned long) p) & (align-1);
                      if (excess == 0)
                        return p;

                      pad = align - excess;
                      return (void *) (((char *) p) + pad);
                    }
                    331:
                    332: /* Note that memory allocated with this function can never be freed, because
                    333:    the start address of the block allocated is discarded. */
                    334: void *
                    335: _mp_allocate_func_aligned (size_t bytes, size_t align)
                    336: {
                    337:   return align_pointer ((*_mp_allocate_func) (bytes + align-1), align);
                    338: }
                    339:
                    340:
                    341: void *
                    342: _mp_allocate_or_reallocate (void *ptr, size_t oldsize, size_t newsize)
                    343: {
                    344:   if (ptr == NULL)
                    345:     return (*_mp_allocate_func) (newsize);
                    346:   else
                    347:     return (*_mp_reallocate_func) (ptr, oldsize, newsize);
                    348: }
                    349:
                    350:
                    351: /* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs.  ptr
                    352:    needs to have room for up to CACHE_LINE_SIZE-4 extra bytes.  */
                    353:
                    354: mp_ptr
                    355: speed_tmp_alloc_adjust (void *ptr, mp_size_t align)
                    356: {
                    357:   /*
                    358:   printf("%p %ld -> %p %X %X\n", ptr, align,
                    359:          (mp_ptr) ptr
                    360:          + ((align - ((mp_size_t) ptr >> 2)) &
                    361:             SPEED_TMP_ALLOC_ADJUST_MASK),
                    362:          ((mp_size_t) ptr >> 2) & SPEED_TMP_ALLOC_ADJUST_MASK,
                    363:          SPEED_TMP_ALLOC_ADJUST_MASK);
                    364:   */
                    365:
                    366:   return (mp_ptr) ptr
                    367:     + ((align - ((mp_size_t) ptr >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK);
                    368: }
                    369:
                    370:
                    371: void
                    372: mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size)
                    373: {
                           /* Set z from the limbs at p,size.  The ASSERT states the
                              requirement that size is non-negative.  */
                    374:   ASSERT (size >= 0);
                           /* MPN_NORMALIZE is defined outside this file; presumably it
                              lowers "size" past any high zero limbs -- confirm in
                              gmp-impl.h.  The statements below use the adjusted size.  */
                    375:   MPN_NORMALIZE (p, size);
                           /* Make room in z, copy the limbs in, then store the size.  */
                    376:   MPZ_REALLOC (z, size);
                    377:   MPN_COPY (PTR(z), p, size);
                    378:   SIZ(z) = size;
                    379: }
                    380:
                    381:
                    382: /* Miscellaneous options accepted by tune and speed programs under -o. */
                    383:
                    384: void
                    385: speed_option_set (const char *s)
                    386: {
                    387:   if (strcmp (s, "addrs") == 0)  speed_option_addrs = 1;
                    388:   else
                    389:     {
                    390:       printf ("Unrecognised -o option: %s\n", s);
                    391:       exit (1);
                    392:     }
                    393: }
                    394:
                    395:
                    396: /* The following are basic speed running routines for various gmp functions.
                    397:    Many are very similar and use speed.h macros.
                    398:
                    399:    Each routine allocates its own destination space for the result of the
                    400:    function, because only it can know what the function needs.
                    401:
                    402:    speed_starttime() and speed_endtime() are put tight around the code to be
                    403:    measured.  Any setups are done outside the timed portion.
                    404:
                    405:    Each routine is responsible for its own cache priming.
                    406:    speed_cache_fill() is a good way to do this, see examples in speed.h.
                    407:    One cache priming possibility, for CPUs with write-allocate cache, and
                    408:    functions that don't take too long, is to do one dummy call before timing
                    409:    so as to cache everything that gets used.  But speed_measure() runs a
                    410:    routine at least twice and will take the smaller time, so this might not
                    411:    be necessary.
                    412:
                    413:    Data alignment will be important, for source, destination and temporary
                    414:    workspace.  A routine can align its destination and workspace.  Programs
                    415:    using the routines will ensure s->xp and s->yp are aligned.  Aligning
                    416:    onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
                    417:    s->align_wp2 should be respected where it makes sense to do so.
                    418:    SPEED_TMP_ALLOC_LIMBS is a good way to do this.
                    419:
                    420:    A loop of the following form can be expected to turn into good assembler
                    421:    code on most CPUs, thereby minimizing overhead in the measurement.  It
                    422:    can always be assumed s->reps >= 1.
                    423:
                    424:           i = s->reps
                    425:           do
                    426:             foo();
                    427:           while (--i != 0);
                    428:
                    429:    Additional parameters might be added to "struct speed_params" in the
                    430:    future.  Routines should ignore anything they don't use.
                    431:
                    432:    s->size can be used creatively, and s->xp and s->yp can be ignored.  For
                    433:    example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
                    434:    just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
                    435:    speed_mpn_mul_1() uses it as a multiplier.  */
                    436:
                    437:
                    438: /* MPN_COPY etc can be macros, so the _CALL forms are necessary */
                    439: double
                    440: speed_MPN_COPY (struct speed_params *s)
                    441: {
                           /* Speed routine for MPN_COPY.  The complete call expression is
                              handed to SPEED_ROUTINE_MPN_COPY_CALL, which is defined outside
                              this file and is expected to supply the rest of the body,
                              including "wp" and the return statement.  */
                    442:   SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY (wp, s->xp, s->size));
                    443: }
                    444: double
                    445: speed_MPN_COPY_INCR (struct speed_params *s)
                    446: {
                           /* Speed routine for MPN_COPY_INCR, copying s->size limbs from
                              s->xp to "wp".  "wp" and the return statement are expected to
                              come from SPEED_ROUTINE_MPN_COPY_CALL (defined elsewhere).  */
                    447:   SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY_INCR (wp, s->xp, s->size));
                    448: }
                    449: double
                    450: speed_MPN_COPY_DECR (struct speed_params *s)
                    451: {
                           /* Speed routine for MPN_COPY_DECR, copying s->size limbs from
                              s->xp to "wp".  "wp" and the return statement are expected to
                              come from SPEED_ROUTINE_MPN_COPY_CALL (defined elsewhere).  */
                    452:   SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY_DECR (wp, s->xp, s->size));
                    453: }
                    454: double
                    455: speed_memcpy (struct speed_params *s)
                    456: {
                           /* Speed routine for the C library memcpy(), given the same
                              operands as the MPN_COPY routines: the byte count is s->size
                              limbs times BYTES_PER_MP_LIMB.  "wp" and the return statement
                              are expected to come from SPEED_ROUTINE_MPN_COPY_CALL.  */
                    457:   SPEED_ROUTINE_MPN_COPY_CALL
                    458:     (memcpy (wp, s->xp, s->size * BYTES_PER_MP_LIMB));
                    459: }
                    460:
                    461:
                    462: double
                    463: speed_mpn_addmul_1 (struct speed_params *s)
                    464: {
                           /* Speed routine for mpn_addmul_1, passed by name to
                              SPEED_ROUTINE_MPN_UNARY_1.  That macro is defined outside this
                              file and is expected to supply the whole body, including the
                              return statement.  */
                    465:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
                    466: }
                    467: double
                    468: speed_mpn_submul_1 (struct speed_params *s)
                    469: {
                           /* Speed routine for mpn_submul_1, passed by name to
                              SPEED_ROUTINE_MPN_UNARY_1, which is expected to supply the
                              whole body, including the return statement.  */
                    470:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
                    471: }
                    472:
                    473:
                    474: double
                    475: speed_mpn_mul_1 (struct speed_params *s)
                    476: {
                           /* Speed routine for mpn_mul_1.  Per the notes earlier in this
                              file, s->r is used as the multiplier.  The body, including the
                              return statement, is expected to come from
                              SPEED_ROUTINE_MPN_UNARY_1 (defined elsewhere).  */
                    477:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
                    478: }
                    479:
                    480:
                    481: double
                    482: speed_mpn_lshift (struct speed_params *s)
                    483: {
                           /* Speed routine for mpn_lshift.  Per the notes earlier in this
                              file, s->r is used as the shift.  The body, including the
                              return statement, is expected to come from
                              SPEED_ROUTINE_MPN_UNARY_1 (defined elsewhere).  */
                    484:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
                    485: }
                    486: double
                    487: speed_mpn_rshift (struct speed_params *s)
                    488: {
                           /* Speed routine for mpn_rshift, passed by name to
                              SPEED_ROUTINE_MPN_UNARY_1, which is expected to supply the
                              whole body, including the return statement.  */
                    489:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
                    490: }
                    491:
                    492:
                    493: /* The carry-in variants (if available) are good for measuring because they
                    494:    won't skip a division if high<divisor.  Alternately, use -1 as a divisor
                    495:    with the plain _1 forms. */
                    496: double
                    497: speed_mpn_divrem_1 (struct speed_params *s)
                    498: {
                           /* Speed routine for mpn_divrem_1, passed by name to
                              SPEED_ROUTINE_MPN_DIVREM_1.  That macro is defined outside this
                              file and is expected to supply the whole body, including the
                              return statement.  */
                    499:   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
                    500: }
                    501: double
                    502: speed_mpn_divrem_1f (struct speed_params *s)
                    503: {
                           /* Measures the same function as speed_mpn_divrem_1(),
                              mpn_divrem_1, but through SPEED_ROUTINE_MPN_DIVREM_1F.
                              NOTE(review): the "F" form presumably exercises the fraction
                              limbs of the quotient -- confirm in speed.h.  */
                    504:   SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
                    505: }
                    506: #if HAVE_NATIVE_mpn_divrem_1c
                         /* The two carry-in routines below are only compiled when
                            HAVE_NATIVE_mpn_divrem_1c is non-zero.  Both measure
                            mpn_divrem_1c; they differ only in the speed.h macro used
                            (..._1C versus ..._1CF), each of which is expected to supply the
                            whole function body, including the return statement.  */
                    507: double
                    508: speed_mpn_divrem_1c (struct speed_params *s)
                    509: {
                    510:   SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
                    511: }
                    512: double
                    513: speed_mpn_divrem_1cf (struct speed_params *s)
                    514: {
                    515:   SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
                    516: }
                    517: #endif
                    518:
                    519: double
                    520: speed_mpn_divrem_2 (struct speed_params *s)
                    521: {
                           /* Speed routine for mpn_divrem_2, passed by name to
                              SPEED_ROUTINE_MPN_DIVREM_2, which is expected to supply the
                              whole body, including the return statement.  */
                    522:   SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
                    523: }
                    524:
                    525: double
                    526: speed_mpn_mod_1 (struct speed_params *s)
                    527: {
                           /* Speed routine for mpn_mod_1, passed by name to
                              SPEED_ROUTINE_MPN_MOD_1, which is expected to supply the whole
                              body, including the return statement.  */
                    528:   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
                    529: }
                    530: #if HAVE_NATIVE_mpn_mod_1c
                         /* Carry-in variant, only compiled when HAVE_NATIVE_mpn_mod_1c is
                            non-zero.  SPEED_ROUTINE_MPN_MOD_1C is expected to supply the
                            whole body, including the return statement.  */
                    531: double
                    532: speed_mpn_mod_1c (struct speed_params *s)
                    533: {
                    534:   SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
                    535: }
                    536: #endif
                    537:
                    538: double
                    539: speed_mpn_divexact_by3 (struct speed_params *s)
                    540: {
                           /* Speed routine for mpn_divexact_by3 on s->xp,s->size.  "wp" and
                              the return statement are expected to come from
                              SPEED_ROUTINE_MPN_COPY_CALL (defined elsewhere).  */
                    541:   /* mpn_divexact_by3 is a macro, so the _CALL form is necessary */
                    542:   SPEED_ROUTINE_MPN_COPY_CALL(mpn_divexact_by3 (wp, s->xp, s->size));
                    543: }
                    544:
                    545:
                    546: double
                    547: speed_mpn_bz_divrem_n (struct speed_params *s)
                    548: {
                           /* Speed routine for mpn_bz_divrem_n, passed by name to
                              SPEED_ROUTINE_MPN_BZ_DIVREM_N, which is expected to supply the
                              whole body, including the return statement.  */
                    549:   SPEED_ROUTINE_MPN_BZ_DIVREM_N (mpn_bz_divrem_n);
                    550: }
                    551: double
                    552: speed_mpn_bz_divrem_sb (struct speed_params *s)
                    553: {
                           /* Note the function measured is mpn_sb_divrem_mn, despite this
                              routine's "bz" name; it is run through
                              SPEED_ROUTINE_MPN_BZ_DIVREM_SB, which is expected to supply
                              the whole body, including the return statement.  */
                    554:   SPEED_ROUTINE_MPN_BZ_DIVREM_SB (mpn_sb_divrem_mn);
                    555: }
                    556: double
                    557: speed_mpn_bz_tdiv_qr (struct speed_params *s)
                    558: {
                           /* Note the function measured is mpn_tdiv_qr; it is run through
                              SPEED_ROUTINE_MPN_BZ_TDIV_QR, which is expected to supply the
                              whole body, including the return statement.  */
                    559:   SPEED_ROUTINE_MPN_BZ_TDIV_QR (mpn_tdiv_qr);
                    560: }
                    561:
                    562:
                    563: double
                    564: speed_mpn_popcount (struct speed_params *s)
                    565: {
                           /* Speed routine for mpn_popcount, passed by name to
                              SPEED_ROUTINE_MPN_POPCOUNT, which is expected to supply the
                              whole body, including the return statement.  */
                    566:   SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
                    567: }
                    568: double
                    569: speed_mpn_hamdist (struct speed_params *s)
                    570: {
                           /* Speed routine for mpn_hamdist, passed by name to
                              SPEED_ROUTINE_MPN_HAMDIST, which is expected to supply the
                              whole body, including the return statement.  */
                    571:   SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
                    572: }
                    573:
                    574:
                    575: double
                    576: speed_mpn_add_n (struct speed_params *s)
                    577: {
                           /* Speed routine for mpn_add_n, passed by name to
                              SPEED_ROUTINE_MPN_BINARY_N.  That macro is defined outside this
                              file and is expected to supply the whole body, including the
                              return statement.  */
                    578:   SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
                    579: }
                    580: double
                    581: speed_mpn_sub_n (struct speed_params *s)
                    582: {
                           /* Speed routine for mpn_sub_n, passed by name to
                              SPEED_ROUTINE_MPN_BINARY_N, which is expected to supply the
                              whole body, including the return statement.  */
                    583: SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
                    584: }
                    585: double
                    586: speed_mpn_add_n_self (struct speed_params *s)
                    587: {
                           /* Measures mpn_add_n again, this time through
                              SPEED_ROUTINE_MPN_BINARY_N_SELF.
                              NOTE(review): presumably the "SELF" form uses the same operand
                              for both sources -- confirm in speed.h.  */
                    588:   SPEED_ROUTINE_MPN_BINARY_N_SELF (mpn_add_n);
                    589: }
                    590: double
                    591: speed_mpn_add_n_inplace (struct speed_params *s)
                    592: {
                           /* Measures mpn_add_n again, this time through
                              SPEED_ROUTINE_MPN_BINARY_N_INPLACE.
                              NOTE(review): presumably the "INPLACE" form makes the
                              destination overlap a source -- confirm in speed.h.  */
                    593:   SPEED_ROUTINE_MPN_BINARY_N_INPLACE (mpn_add_n);
                    594: }
                    595:
                    596:
                    597: /* mpn_and_n etc can be macros and so have to be handled with
                    598:    SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
                    599: double
                    600: speed_mpn_and_n (struct speed_params *s)
                    601: {
                           /* Speed routine for mpn_and_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    602:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size));
                    603: }
                    604: double
                    605: speed_mpn_andn_n (struct speed_params *s)
                    606: {
                           /* Speed routine for mpn_andn_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    607: SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size));
                    608: }
                    609: double
                    610: speed_mpn_nand_n (struct speed_params *s)
                    611: {
                           /* Speed routine for mpn_nand_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    612:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size));
                    613: }
                    614: double
                    615: speed_mpn_ior_n (struct speed_params *s)
                    616: {
                           /* Speed routine for mpn_ior_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    617: SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size));
                    618: }
                    619: double
                    620: speed_mpn_iorn_n (struct speed_params *s)
                    621: {
                           /* Speed routine for mpn_iorn_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    622:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size));
                    623: }
                    624: double
                    625: speed_mpn_nior_n (struct speed_params *s)
                    626: {
                           /* Speed routine for mpn_nior_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    627:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size));
                    628: }
                    629: double
                    630: speed_mpn_xor_n (struct speed_params *s)
                    631: {
                           /* Speed routine for mpn_xor_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    632:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size));
                    633: }
                    634: double
                    635: speed_mpn_xnor_n (struct speed_params *s)
                    636: {
                           /* Speed routine for mpn_xnor_n on s->xp and s->yp, s->size limbs.
                              "wp" and the return statement are expected to come from
                              SPEED_ROUTINE_MPN_BINARY_N_CALL (defined elsewhere).  */
                    637:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size));
                    638: }
                    639:
                    640:
                    641: double
                    642: speed_mpn_mul_n (struct speed_params *s)
                    643: {
                           /* Speed routine for mpn_mul_n, passed by name to
                              SPEED_ROUTINE_MPN_MUL_N, which is defined outside this file
                              and is expected to supply the whole body, including the
                              return statement.  */
                    644:   SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
                    645: }
                    646: double
                    647: speed_mpn_sqr_n (struct speed_params *s)
                    648: {
                           /* Speed routine for mpn_sqr_n.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPN_SQR, which is defined elsewhere;
                              presumably it times the function on the operand described by s.  */
                    649:   SPEED_ROUTINE_MPN_SQR (mpn_sqr_n);
                    650: }
                    651: double
                    652: speed_mpn_mul_n_sqr (struct speed_params *s)
                    653: {
                           /* Squaring done through the general multiply: mpn_mul_n is given
                              s->xp as both source operands.  wp is not declared here; presumably
                              SPEED_ROUTINE_MPN_SQR_CALL (defined elsewhere) supplies it along
                              with the timing loop and the return -- TODO confirm.  */
                    654:   SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
                    655: }
                    656:
                    657: double
                    658: speed_mpn_mul_basecase (struct speed_params *s)
                    659: {
                           /* Speed routine for mpn_mul_basecase.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MPN_MUL_BASECASE, which is defined
                              elsewhere.  */
                    660:   SPEED_ROUTINE_MPN_MUL_BASECASE(mpn_mul_basecase);
                    661: }
                    662: double
                    663: speed_mpn_sqr_basecase (struct speed_params *s)
                    664: {
                           /* Speed routine for mpn_sqr_basecase, using the same
                              SPEED_ROUTINE_MPN_SQR macro (defined elsewhere) as speed_mpn_sqr_n.
                              Nothing here restricts s->size, which is what the FIXME below is
                              about.  */
                    665:   /* FIXME: size restrictions on some versions of sqr_basecase */
                    666:   SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
                    667: }
                    668:
                    669: double
                    670: speed_mpn_kara_mul_n (struct speed_params *s)
                    671: {
                           /* Speed routine for mpn_kara_mul_n.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MPN_KARA_MUL_N, which is defined
                              elsewhere.  */
                    672:   SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n);
                    673: }
                    674: double
                    675: speed_mpn_kara_sqr_n (struct speed_params *s)
                    676: {
                           /* Speed routine for mpn_kara_sqr_n.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MPN_KARA_SQR_N, which is defined
                              elsewhere.  */
                    677:   SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n);
                    678: }
                    679:
                    680: double
                    681: speed_mpn_toom3_mul_n (struct speed_params *s)
                    682: {
                           /* Speed routine for mpn_toom3_mul_n.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MPN_TOOM3_MUL_N, which is defined
                              elsewhere.  */
                    683:   SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n);
                    684: }
                    685: double
                    686: speed_mpn_toom3_sqr_n (struct speed_params *s)
                    687: {
                           /* Speed routine for mpn_toom3_sqr_n.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MPN_TOOM3_SQR_N, which is defined
                              elsewhere.  */
                    688:   SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n);
                    689: }
                    690:
                    691: double
                    692: speed_mpn_mul_fft_full (struct speed_params *s)
                    693: {
                           /* Speed routine for mpn_mul_fft_full multiplying s->xp by s->yp, both
                              given size s->size.  wp is not declared here; presumably
                              SPEED_ROUTINE_MPN_MUL_N_CALL (defined elsewhere) supplies it along
                              with the timing loop and the return -- TODO confirm.  */
                    694:   SPEED_ROUTINE_MPN_MUL_N_CALL
                    695:     (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
                    696: }
                    697: double
                    698: speed_mpn_mul_fft_full_sqr (struct speed_params *s)
                    699: {
                           /* Squaring through mpn_mul_fft_full: s->xp is passed as both source
                              operands, each with size s->size.  wp is not declared here;
                              presumably SPEED_ROUTINE_MPN_SQR_CALL (defined elsewhere) supplies
                              it along with the timing loop and the return -- TODO confirm.  */
                    700:   SPEED_ROUTINE_MPN_SQR_CALL
                    701:     (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
                    702: }
                    703:
                    704:
                    705: /* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
                    706:    used as k, otherwise the best k for the size is used.  If s->size isn't a
                    707:    multiple of 2^k it's rounded up to make the effective operation size.  */
                    708:
                         /* SPEED_ROUTINE_MPN_MUL_FFT_CALL (call, sqr) expands to a complete
                            function body for a routine whose parameter is named s.

                            call  is the statement to time; it may use the locals wp (the
                                  destination, pl+1 limbs), pl (the result of
                                  mpn_fft_next_size (s->size, k)) and k.
                            sqr   is passed on to mpn_fft_best_k, and when it is nonzero s->yp
                                  is not registered as a source operand.

                            call is executed s->reps times between speed_starttime and
                            speed_endtime.  The loop is a do/while counting an unsigned i down
                            to zero, so s->reps is assumed to be at least 1.  The body returns
                            the value obtained from speed_endtime.  */
                    709: #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
                    710:   {                                                     \
                    711:     mp_ptr     wp;                                      \
                    712:     mp_size_t  pl;                                      \
                    713:     int        k;                                       \
                    714:     unsigned   i;                                       \
                    715:     double     t;                                       \
                    716:     TMP_DECL (marker);                                  \
                    717:                                                         \
                    718:     SPEED_RESTRICT_COND (s->size >= 1);                 \
                    719:                                                         \
                    720:     if (s->r != 0)                                      \
                    721:       k = s->r;                                         \
                    722:     else                                                \
                    723:       k = mpn_fft_best_k (s->size, sqr);                \
                    724:                                                         \
                    725:     TMP_MARK (marker);                                  \
                    726:     pl = mpn_fft_next_size (s->size, k);                \
                    727:     wp = SPEED_TMP_ALLOC_LIMBS (pl+1, s->align_wp);     \
                    728:                                                         \
                    729:     speed_operand_src (s, s->xp, s->size);              \
                    730:     if (!sqr)                                           \
                    731:       speed_operand_src (s, s->yp, s->size);            \
                    732:     speed_operand_dst (s, wp, pl+1);                    \
                    733:     speed_cache_fill (s);                               \
                    734:                                                         \
                    735:     speed_starttime ();                                 \
                    736:     i = s->reps;                                        \
                    737:     do                                                  \
                    738:       call;                                             \
                    739:     while (--i != 0);                                   \
                    740:     t = speed_endtime ();                               \
                    741:                                                         \
                    742:     TMP_FREE (marker);                                  \
                    743:     return t;                                           \
                    744:   }
                    745:
                    746: double
                    747: speed_mpn_mul_fft (struct speed_params *s)
                    748: {
                           /* Multiply case of SPEED_ROUTINE_MPN_MUL_FFT_CALL (sqr argument 0):
                              mpn_mul_fft is given s->xp and s->yp, each of size s->size.  wp, pl
                              and k are locals declared inside that macro.  */
                    749:   SPEED_ROUTINE_MPN_MUL_FFT_CALL
                    750:     (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
                    751: }
                    752:
                    753: double
                    754: speed_mpn_mul_fft_sqr (struct speed_params *s)
                    755: {
                           /* Squaring case of SPEED_ROUTINE_MPN_MUL_FFT_CALL (sqr argument 1):
                              mpn_mul_fft is given s->xp as both source operands.  wp, pl and k
                              are locals declared inside that macro.  */
                    756:   SPEED_ROUTINE_MPN_MUL_FFT_CALL
                    757:     (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
                    758: }
                    759:
                    760:
                    761: double
                    762: speed_mpn_gcd (struct speed_params *s)
                    763: {
                           /* Speed routine for mpn_gcd.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPN_GCD, which is defined elsewhere.  */
                    764:   SPEED_ROUTINE_MPN_GCD (mpn_gcd);
                    765: }
                    766: double
                    767: speed_mpn_gcdext (struct speed_params *s)
                    768: {
                           /* Speed routine for mpn_gcdext.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPN_GCDEXT, which is defined elsewhere.  */
                    769:   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
                    770: }
                    771: double
                    772: speed_mpn_gcd_1 (struct speed_params *s)
                    773: {
                           /* Speed routine for mpn_gcd_1.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPN_GCD_1, which is defined elsewhere.  */
                    774:   SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
                    775: }
                    776:
                    777:
                    778: double
                    779: speed_mpn_jacobi_base (struct speed_params *s)
                    780: {
                           /* Speed routine for mpn_jacobi_base.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MPN_JACBASE, which is defined
                              elsewhere.  */
                    781:   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
                    782: }
                    783:
                    784:
                    785: double
                    786: speed_mpz_fac_ui (struct speed_params *s)
                    787: {
                           /* Speed routine for mpz_fac_ui.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPZ_UI, which is defined elsewhere.  */
                    788:   SPEED_ROUTINE_MPZ_UI (mpz_fac_ui);
                    789: }
                    790: double
                    791: speed_mpz_fib_ui (struct speed_params *s)
                    792: {
                           /* Speed routine for mpz_fib_ui.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPZ_UI, which is defined elsewhere.  */
                    793:   SPEED_ROUTINE_MPZ_UI (mpz_fib_ui);
                    794: }
                    795:
                    796:
                    797: double
                    798: speed_mpz_powm (struct speed_params *s)
                    799: {
                           /* Speed routine for mpz_powm.  The whole body, including the return,
                              comes from SPEED_ROUTINE_MPZ_POWM, which is defined elsewhere.  */
                    800:   SPEED_ROUTINE_MPZ_POWM (mpz_powm);
                    801: }
                    802:
                    803:
                    804: double
                    805: speed_modlimb_invert (struct speed_params *s)
                    806: {
                           /* Speed routine for modlimb_invert.  The whole body, including the
                              return, comes from SPEED_ROUTINE_MODLIMB_INVERT, which is defined
                              elsewhere.  */
                    807:   SPEED_ROUTINE_MODLIMB_INVERT (modlimb_invert);
                    808: }
                    809:
                    810:
                    811: double
                    812: speed_noop (struct speed_params *s)
                    813: {
                           /* Times s->reps calls of noop () and returns what speed_endtime
                              gives.  Presumably this serves as a baseline for the cost of the
                              measuring loop plus a call.  The do/while counts an unsigned i down
                              to zero, so s->reps is assumed to be at least 1.  */
                    814:   unsigned  i;
                    815:
                    816:   speed_starttime ();
                    817:   i = s->reps;
                    818:   do
                    819:     noop ();
                    820:   while (--i != 0);
                    821:   return speed_endtime ();
                    822: }
                    823:
                    824: double
                    825: speed_noop_wxs (struct speed_params *s)
                    826: {
                           /* Times s->reps calls of noop_wxs (wp, s->xp, s->size), i.e. a call
                              taking a destination pointer, one source pointer and a size.  wp is
                              a one-limb temporary, released before returning.  s->reps is
                              assumed to be at least 1 (do/while with an unsigned countdown).  */
                    827:   mp_ptr   wp;
                    828:   unsigned i;
                    829:   double   t;
                    830:   TMP_DECL (marker);
                    831:
                    832:   TMP_MARK (marker);
                    833:   wp = TMP_ALLOC_LIMBS (1);
                    834:
                    835:   speed_starttime ();
                    836:   i = s->reps;
                    837:   do
                    838:     noop_wxs (wp, s->xp, s->size);
                    839:   while (--i != 0);
                    840:   t = speed_endtime ();
                    841:
                    842:   TMP_FREE (marker);
                    843:   return t;
                    844: }
                    845:
                    846: double
                    847: speed_noop_wxys (struct speed_params *s)
                    848: {
                           /* Times s->reps calls of noop_wxys (wp, s->xp, s->yp, s->size), i.e.
                              a call taking a destination pointer, two source pointers and a
                              size.  wp is a one-limb temporary, released before returning.
                              s->reps is assumed to be at least 1 (do/while with an unsigned
                              countdown).  */
                    849:   mp_ptr   wp;
                    850:   unsigned i;
                    851:   double   t;
                    852:   TMP_DECL (marker);
                    853:
                    854:   TMP_MARK (marker);
                    855:   wp = TMP_ALLOC_LIMBS (1);
                    856:
                    857:   speed_starttime ();
                    858:   i = s->reps;
                    859:   do
                    860:     noop_wxys (wp, s->xp, s->yp, s->size);
                    861:   while (--i != 0);
                    862:   t = speed_endtime ();
                    863:
                    864:   TMP_FREE (marker);
                    865:   return t;
                    866: }
                    867:
                    868:
                    869: #define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
                    870:   {                                                     \
                             /* "variables" is a declaration placed ahead of the loop,   */ \
                             /* "calls" the statements executed s->reps times between    */ \
                             /* speed_starttime and speed_endtime.  The expansion ends   */ \
                             /* by returning the speed_endtime value, and expects the    */ \
                             /* enclosing function's parameter to be named s.  s->reps   */ \
                             /* is assumed to be at least 1 (unsigned countdown).        */ \
                    871:     unsigned  i;                                        \
                    872:     variables;                                          \
                    873:                                                         \
                    874:     speed_starttime ();                                 \
                    875:     i = s->reps;                                        \
                    876:     do                                                  \
                    877:       {                                                 \
                    878:         calls;                                          \
                    879:       }                                                 \
                    880:     while (--i != 0);                                   \
                    881:     return speed_endtime ();                            \
                    882:   }
                    883:
                    884:
                    885: /* Compare these to see how much malloc/free costs and then how much
                    886:    _mp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
                    887:    mpq_init/clear will be doing a 1 limb allocate, so use that as the size
                    888:    when including them in comparisons.  */
                    889:
                    890: double
                    891: speed_malloc_free (struct speed_params *s)
                    892: {
                           /* Times a malloc of s->size limbs' worth of bytes followed directly
                              by free.  The malloc result is not checked; it is only handed to
                              free.  */
                    893:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
                    894:   SPEED_ROUTINE_ALLOC_FREE (void *p,
                    895:                             p = malloc (bytes);
                    896:                             free (p));
                    897: }
                    898:
                    899: double
                    900: speed_malloc_realloc_free (struct speed_params *s)
                    901: {
                           /* Times malloc of a single limb's bytes, realloc to s->size limbs'
                              worth of bytes, then free.  Neither the malloc nor the realloc
                              result is checked.  */
                    902:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
                    903:   SPEED_ROUTINE_ALLOC_FREE (void *p,
                    904:                             p = malloc (BYTES_PER_MP_LIMB);
                    905:                             p = realloc (p, bytes);
                    906:                             free (p));
                    907: }
                    908:
                    909: double
                    910: speed_mp_allocate_free (struct speed_params *s)
                    911: {
                           /* Same shape as speed_malloc_free, but going through the
                              _mp_allocate_func and _mp_free_func pointers; the free hook is
                              passed the same byte count that was allocated.  */
                    912:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
                    913:   SPEED_ROUTINE_ALLOC_FREE (void *p,
                    914:                             p = (*_mp_allocate_func) (bytes);
                    915:                             (*_mp_free_func) (p, bytes));
                    916: }
                    917:
                    918: double
                    919: speed_mp_allocate_reallocate_free (struct speed_params *s)
                    920: {
                           /* Times allocate of a single limb's bytes, reallocate up to s->size
                              limbs' worth of bytes, then free, all through the _mp_*_func
                              pointers.  This mirrors speed_malloc_realloc_free.

                              The reallocate hook takes (ptr, old_size, new_size), so the block's
                              current size BYTES_PER_MP_LIMB goes first and the target size
                              second.  The free hook is then told the block is "bytes" long,
                              which is its size after the reallocate.  */
                    921:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
                    922:   SPEED_ROUTINE_ALLOC_FREE
                    923:     (void *p,
                    924:      p = (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
                    925:      p = (*_mp_reallocate_func) (p, BYTES_PER_MP_LIMB, bytes);
                    926:      (*_mp_free_func) (p, bytes));
                    927: }
                    928:
                    929: double
                    930: speed_mpz_init_clear (struct speed_params *s)
                    931: {
                           /* Times an mpz_init immediately followed by mpz_clear on a local
                              mpz_t.  s->size is not used; only s->reps matters (through
                              SPEED_ROUTINE_ALLOC_FREE).  */
                    932:   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
                    933:                             mpz_init (z);
                    934:                             mpz_clear (z));
                    935: }
                    936:
                    937: double
                    938: speed_mpz_init_realloc_clear (struct speed_params *s)
                    939: {
                           /* Times mpz_init, a _mpz_realloc of the variable to s->size, then
                              mpz_clear, repeated through SPEED_ROUTINE_ALLOC_FREE.  */
                    940:   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
                    941:                             mpz_init (z);
                    942:                             _mpz_realloc (z, s->size);
                    943:                             mpz_clear (z));
                    944: }
                    945:
                    946: double
                    947: speed_mpq_init_clear (struct speed_params *s)
                    948: {
                           /* Times an mpq_init immediately followed by mpq_clear on a local
                              mpq_t.  s->size is not used; only s->reps matters (through
                              SPEED_ROUTINE_ALLOC_FREE).  */
                    949:   SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
                    950:                             mpq_init (q);
                    951:                             mpq_clear (q));
                    952: }
                    953:
                    954: double
                    955: speed_mpf_init_clear (struct speed_params *s)
                    956: {
                           /* Times an mpf_init immediately followed by mpf_clear on a local
                              mpf_t.  s->size is not used; only s->reps matters (through
                              SPEED_ROUTINE_ALLOC_FREE).  */
                    957:   SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
                    958:                             mpf_init (f);
                    959:                             mpf_clear (f));
                    960: }
                    961:
                    962:
                    963: /* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
                    964:    that repeatedly calling mpz_add with the same data gives branch prediction
                    965:    in it an advantage.  */
                    966:
                    967: double
                    968: speed_mpz_add (struct speed_params *s)
                    969: {
                           /* x and y are set from s->xp and s->yp (s->size limbs each) with
                              mpz_set_n, then mpz_add (w, x, y) is timed over s->reps
                              iterations.  All three mpz_t variables are cleared before
                              returning the measured time.  */
                    970:   mpz_t     w, x, y;
                    971:   unsigned  i;
                    972:   double    t;
                    973:
                    974:   mpz_init (w);
                    975:   mpz_init (x);
                    976:   mpz_init (y);
                    977:
                    978:   mpz_set_n (x, s->xp, s->size);
                    979:   mpz_set_n (y, s->yp, s->size);
                           /* One untimed add first; presumably so that any growing of w happens
                              outside the measured loop -- TODO confirm.  */
                    980:   mpz_add (w, x, y);
                    981:
                    982:   speed_starttime ();
                    983:   i = s->reps;
                    984:   do
                    985:     {
                    986:       mpz_add (w, x, y);
                    987:     }
                    988:   while (--i != 0);
                    989:   t = speed_endtime ();
                    990:
                    991:   mpz_clear (w);
                    992:   mpz_clear (x);
                    993:   mpz_clear (y);
                    994:   return t;
                    995: }
                    996:
                    997:
                    998: /* If r==0, calculate the binomial coefficient (size,size/2),
                    999:    otherwise calculate the binomial coefficient (size,r). */
                   1000:
                   1001: double
                   1002: speed_mpz_bin_uiui (struct speed_params *s)
                   1003: {
                           /* Times mpz_bin_uiui (w, s->size, k) over s->reps iterations, where
                              k is s->r when that is nonzero and s->size/2 otherwise.  w is
                              cleared before the measured time is returned.  */
                   1004:   mpz_t          w;
                   1005:   unsigned long  k;
                   1006:   unsigned  i;
                   1007:   double    t;
                   1008:
                   1009:   mpz_init (w);
                   1010:   if (s->r != 0)
                   1011:     k = s->r;
                   1012:   else
                   1013:     k = s->size/2;
                   1014:
                   1015:   speed_starttime ();
                   1016:   i = s->reps;
                   1017:   do
                   1018:     {
                   1019:       mpz_bin_uiui (w, s->size, k);
                   1020:     }
                   1021:   while (--i != 0);
                   1022:   t = speed_endtime ();
                   1023:
                   1024:   mpz_clear (w);
                   1025:   return t;
                   1026: }
                   1027:
                   1028:
                   1029: /* The multiplies are successively dependent so the latency is measured, not
                   1030:    the issue rate.  There's only 10 per loop so the code doesn't get too big
                   1031:    since umul_ppmm is several instructions on some cpus.
                   1032:
                   1033:    Putting the arguments as "h,l,l,h" gets slightly better code from gcc
                   1034:    2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
                   1035:    will probably show up as a bogus extra cycle though.
                   1036:
                   1037:    The measuring function macros are in three parts to avoid overflowing
                   1038:    preprocessor expansion space if umul_ppmm is big.
                   1039:
                   1040:    Limitations:
                   1041:
                   1042:    Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
                   1043:    generated first, especially on CPUs with low latency multipliers.
                   1044:
                   1045:    The default umul_ppmm doing h*l will be getting increasing numbers of
                   1046:    high zero bits in the calculation.  CPUs with data-dependent multipliers
                   1047:    will want to use umul_ppmm.1 to get some randomization into the
                   1048:    calculation.  The extra xors and fetches will be a slowdown of course.  */
                   1049:
                   1050: #define SPEED_MACRO_UMUL_PPMM_A \
                   1051:   {                             \
                             /* Part A: opens the function body, declares h, l, i and t, */ \
                             /* seeds h and l from s->xp[0] and s->yp[0], and opens the  */ \
                             /* timed loop for the s->r == 1 case.  time_divisor is 10,  */ \
                             /* matching the ten multiplies the user places per loop.    */ \
                   1052:     mp_limb_t  h, l;            \
                   1053:     unsigned   i;               \
                   1054:     double     t;               \
                   1055:                                 \
                   1056:     s->time_divisor = 10;       \
                   1057:                                 \
                   1058:     h = s->xp[0];               \
                   1059:     l = s->yp[0];               \
                   1060:                                 \
                   1061:     switch (s->r) {             \
                   1062:     case 1:                     \
                   1063:       speed_starttime ();       \
                   1064:       i = s->reps;              \
                   1065:       do                        \
                   1066:         {
                   1067:
                         /* Part B: closes the "case 1" loop and opens the timed loop of the
                            default case.  */
                   1068: #define SPEED_MACRO_UMUL_PPMM_B \
                   1069:         }                       \
                   1070:       while (--i != 0);         \
                   1071:       t = speed_endtime ();     \
                   1072:       break;                    \
                   1073:                                 \
                   1074:     default:                    \
                   1075:       speed_starttime ();       \
                   1076:       i = s->reps;              \
                   1077:       do                        \
                   1078:         {
                   1079:
                         /* Part C: closes the default loop and the switch, passes h and l to
                            noop_1 so the calculation counts as used, and returns the time.  */
                   1080: #define SPEED_MACRO_UMUL_PPMM_C                                         \
                   1081:         }                                                               \
                   1082:       while (--i != 0);                                                 \
                   1083:       t = speed_endtime ();                                             \
                   1084:       break;                                                            \
                   1085:     }                                                                   \
                   1086:                                                                         \
                   1087:     /* stop the compiler optimizing away the whole calculation! */      \
                   1088:     noop_1 (h);                                                         \
                   1089:     noop_1 (l);                                                         \
                   1090:                                                                         \
                   1091:     return t;                                                           \
                   1092:   }
                   1093:
                   1094:
                   1095: double
                   1096: speed_umul_ppmm (struct speed_params *s)
                   1097: {
                           /* The body is assembled from the three SPEED_MACRO_UMUL_PPMM parts.
                              The first braced group is the loop body used when s->r == 1: each
                              umul_ppmm is followed by xors of h and l with entries 0..9 of
                              s->xp_block and s->yp_block.  The second group is the default loop
                              body: ten bare umul_ppmm, each consuming the h and l produced by
                              the previous one.  */
                   1098:   SPEED_MACRO_UMUL_PPMM_A;
                   1099:   {
                   1100:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
                   1101:      umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
                   1102:      umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
                   1103:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
                   1104:      umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
                   1105:      umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
                   1106:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
                   1107:      umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
                   1108:      umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
                   1109:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
                   1110:   }
                   1111:   SPEED_MACRO_UMUL_PPMM_B;
                   1112:   {
                   1113:     umul_ppmm (h, l, l, h);
                   1114:      umul_ppmm (h, l, l, h);
                   1115:      umul_ppmm (h, l, l, h);
                   1116:     umul_ppmm (h, l, l, h);
                   1117:      umul_ppmm (h, l, l, h);
                   1118:      umul_ppmm (h, l, l, h);
                   1119:     umul_ppmm (h, l, l, h);
                   1120:      umul_ppmm (h, l, l, h);
                   1121:      umul_ppmm (h, l, l, h);
                   1122:     umul_ppmm (h, l, l, h);
                   1123:   }
                   1124:   SPEED_MACRO_UMUL_PPMM_C;
                   1125: }
                   1126:
                   1127:
                   1128: #if HAVE_NATIVE_mpn_umul_ppmm
                   1129:
                         /* This section is compiled only when HAVE_NATIVE_mpn_umul_ppmm is set.

                            CALL_MPN_UMUL_PPMM hides the two argument orders used for
                            __MPN (umul_ppmm): (h, l, &l) on hppa with W_TYPE_SIZE == 64, and
                            (&l, h, l) everywhere else.  In both forms the return value is
                            assigned to h and the address of l is passed; presumably the other
                            half of the product is stored through that pointer -- TODO confirm.  */
                   1130: #if defined (__hppa) && W_TYPE_SIZE == 64
                   1131: #define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (h, l, &l))
                   1132: #else
                   1133: #define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (&l, h, l))
                   1134: #endif
                   1135:
                         /* Same layout as speed_umul_ppmm, with CALL_MPN_UMUL_PPMM in place of
                            the umul_ppmm macro: the first braced group (s->r == 1) xors h and l
                            with s->xp_block / s->yp_block entries after every call, the second
                            group (default) is ten bare dependent calls.  */
                   1136: double
                   1137: speed_mpn_umul_ppmm (struct speed_params *s)
                   1138: {
                   1139:   SPEED_MACRO_UMUL_PPMM_A;
                   1140:   {
                   1141:     CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[0]; l ^= s->yp_block[0];
                   1142:      CALL_MPN_UMUL_PPMM; h ^= s->xp_block[1]; l ^= s->yp_block[1];
                   1143:      CALL_MPN_UMUL_PPMM; h ^= s->xp_block[2]; l ^= s->yp_block[2];
                   1144:     CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[3]; l ^= s->yp_block[3];
                   1145:      CALL_MPN_UMUL_PPMM; h ^= s->xp_block[4]; l ^= s->yp_block[4];
                   1146:      CALL_MPN_UMUL_PPMM; h ^= s->xp_block[5]; l ^= s->yp_block[5];
                   1147:     CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[6]; l ^= s->yp_block[6];
                   1148:      CALL_MPN_UMUL_PPMM; h ^= s->xp_block[7]; l ^= s->yp_block[7];
                   1149:      CALL_MPN_UMUL_PPMM; h ^= s->xp_block[8]; l ^= s->yp_block[8];
                   1150:     CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[9]; l ^= s->yp_block[9];
                   1151:   }
                   1152:   SPEED_MACRO_UMUL_PPMM_B;
                   1153:   {
                   1154:     CALL_MPN_UMUL_PPMM;
                   1155:      CALL_MPN_UMUL_PPMM;
                   1156:      CALL_MPN_UMUL_PPMM;
                   1157:     CALL_MPN_UMUL_PPMM;
                   1158:      CALL_MPN_UMUL_PPMM;
                   1159:      CALL_MPN_UMUL_PPMM;
                   1160:     CALL_MPN_UMUL_PPMM;
                   1161:      CALL_MPN_UMUL_PPMM;
                   1162:      CALL_MPN_UMUL_PPMM;
                   1163:     CALL_MPN_UMUL_PPMM;
                   1164:   }
                   1165:   SPEED_MACRO_UMUL_PPMM_C;
                   1166: }
                   1167: #endif
                   1168:
                   1169:
                   1170: /* The divisions are successively dependent so latency is measured, not
                   1171:    issue rate.  There's only 10 per loop so the code doesn't get too big,
                   1172:    especially for udiv_qrnnd_preinv and preinv2norm, which are several
                   1173:    instructions each.
                   1174:
                   1175:    Note that it's only the division which is measured here, there's no data
                   1176:    fetching and no shifting if the divisor gets normalized.
                   1177:
                   1178:    In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
                   1179:    generate x86 div instructions with nothing in between.
                   1180:
                   1181:    The measuring function macros are in two parts to avoid overflowing
                   1182:    preprocessor expansion space if udiv_qrnnd etc are big.
                   1183:
                   1184:    Limitations:
                   1185:
                   1186:    Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
                   1187:    generated first.
                   1188:
                   1189:    CPUs with data-dependent divisions may want more attention paid to the
                   1190:    randomness of the data used.  Probably the measurement wanted is over
                   1191:    uniformly distributed numbers, but what's here might not be giving that.  */
                   1192:
                   1193: #define SPEED_ROUTINE_UDIV_QRNND_A(normalize)           \
                   1194:   {                                                     \
                             /* Part A: opens the function body and the timed loop.  The */ \
                             /* divisor d is s->r, or 0x12345678 when s->r is zero.      */ \
                             /* When "normalize" is nonzero d is shifted left by its     */ \
                             /* leading zero count and dinv is set with invert_limb;     */ \
                             /* otherwise dinv is left unset and must not be used.       */ \
                             /* q starts as s->xp[0] and r as s->yp[0] % d, so r < d.    */ \
                   1195:     double     t;                                       \
                   1196:     unsigned   i;                                       \
                   1197:     mp_limb_t  q, r, d;                                 \
                   1198:     mp_limb_t  dinv;                                    \
                   1199:                                                         \
                   1200:     s->time_divisor = 10;                               \
                   1201:                                                         \
                   1202:     /* divisor from "r" parameter, or a default */      \
                   1203:     d = s->r;                                           \
                   1204:     if (d == 0)                                         \
                   1205:       d = 0x12345678;                                   \
                   1206:                                                         \
                   1207:     if (normalize)                                      \
                   1208:       {                                                 \
                   1209:         unsigned  norm;                                 \
                   1210:         count_leading_zeros (norm, d);                  \
                   1211:         d <<= norm;                                     \
                   1212:         invert_limb (dinv, d);                          \
                   1213:       }                                                 \
                   1214:                                                         \
                   1215:     q = s->xp[0];                                       \
                   1216:     r = s->yp[0] % d;                                   \
                   1217:                                                         \
                   1218:     speed_starttime ();                                 \
                   1219:     i = s->reps;                                        \
                   1220:     do                                                  \
                   1221:       {
                   1222:
                         /* Part B: closes the timed loop, passes q and r to noop_1 so the
                            calculation counts as used, and returns the measured time.  */
                   1223: #define SPEED_ROUTINE_UDIV_QRNND_B                                      \
                   1224:       }                                                                 \
                   1225:     while (--i != 0);                                                   \
                   1226:     t = speed_endtime ();                                               \
                   1227:                                                                         \
                   1228:     /* stop the compiler optimizing away the whole calculation! */      \
                   1229:     noop_1 (q);                                                         \
                   1230:     noop_1 (r);                                                         \
                   1231:                                                                         \
                   1232:     return t;                                                           \
                   1233:   }
                   1234:
                   1235: double
                   1236: speed_udiv_qrnnd (struct speed_params *s)
                   1237: {
                           /* Time the udiv_qrnnd macro.  SPEED_ROUTINE_UDIV_QRNND_A does the
                              setup (divisor d taken from s->r or a default, initial q and r
                              from s->xp[0] and s->yp[0]) and opens a loop of s->reps
                              iterations; SPEED_ROUTINE_UDIV_QRNND_B closes the loop and
                              returns the elapsed time.  The braced block in between is the
                              loop body.  The divisor is normalized only when
                              UDIV_NEEDS_NORMALIZATION is non-zero.  */
                   1238:   SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
                   1239:   {
                             /* Ten calls per loop iteration.  q and r are passed back in on
                                every call, so (presumably, with the first two arguments being
                                the outputs -- confirm against longlong.h) each call depends
                                on the result of the previous one.  */
                   1240:     udiv_qrnnd (q, r, r, q, d);
                   1241:      udiv_qrnnd (q, r, r, q, d);
                   1242:      udiv_qrnnd (q, r, r, q, d);
                   1243:     udiv_qrnnd (q, r, r, q, d);
                   1244:      udiv_qrnnd (q, r, r, q, d);
                   1245:      udiv_qrnnd (q, r, r, q, d);
                   1246:     udiv_qrnnd (q, r, r, q, d);
                   1247:      udiv_qrnnd (q, r, r, q, d);
                   1248:      udiv_qrnnd (q, r, r, q, d);
                   1249:     udiv_qrnnd (q, r, r, q, d);
                   1250:   }
                   1251:   SPEED_ROUTINE_UDIV_QRNND_B;
                   1252: }
                   1253:
                   1254: double
                   1255: speed_udiv_qrnnd_preinv (struct speed_params *s)
                   1256: {
                           /* Time udiv_qrnnd_preinv.  Passing 1 to
                              SPEED_ROUTINE_UDIV_QRNND_A always takes its normalize path, so
                              d is shifted left by its leading-zero count and dinv is
                              computed from it with invert_limb before the timed loop
                              starts.  SPEED_ROUTINE_UDIV_QRNND_B closes the loop and returns
                              the elapsed time for s->reps iterations.  */
                   1257:   SPEED_ROUTINE_UDIV_QRNND_A (1);
                   1258:   {
                             /* Ten calls per loop iteration, each taking the q and r left by
                                the previous call together with the fixed d and dinv.  */
                   1259:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1260:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1261:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1262:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1263:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1264:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1265:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1266:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1267:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1268:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
                   1269:   }
                   1270:   SPEED_ROUTINE_UDIV_QRNND_B;
                   1271: }
                   1272:
                   1273: double
                   1274: speed_udiv_qrnnd_preinv2norm (struct speed_params *s)
                   1275: {
                           /* Time udiv_qrnnd_preinv2norm.  The setup is the same as for
                              speed_udiv_qrnnd_preinv: passing 1 to
                              SPEED_ROUTINE_UDIV_QRNND_A normalizes d and computes dinv with
                              invert_limb before the timed loop of s->reps iterations.  */
                   1276:   SPEED_ROUTINE_UDIV_QRNND_A (1);
                   1277:   {
                             /* Ten calls per loop iteration, each taking the q and r left by
                                the previous call together with the fixed d and dinv.  */
                   1278:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1279:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1280:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1281:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1282:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1283:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1284:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1285:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1286:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1287:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
                   1288:   }
                   1289:   SPEED_ROUTINE_UDIV_QRNND_B;
                   1290: }
                   1291:
                   1292: #if HAVE_NATIVE_mpn_udiv_qrnnd
                         /* This section is compiled only when HAVE_NATIVE_mpn_udiv_qrnnd is
                            non-zero.  */
                   1293:
                         /* CALL_MPN_UDIV_QRNND makes one call to the mpn udiv_qrnnd function,
                            storing the return value in q and passing the address of r.  The
                            argument order depends on the platform: on __hppa with
                            W_TYPE_SIZE == 64 the pointer to r is the last argument,
                            otherwise it is the first.  */
                   1294: #if defined (__hppa) && W_TYPE_SIZE == 64
                   1295: #define CALL_MPN_UDIV_QRNND  (q = __MPN (udiv_qrnnd) (r, q, d, &r))
                   1296: #else
                   1297: #define CALL_MPN_UDIV_QRNND  (q = __MPN (udiv_qrnnd) (&r, r, q, d))
                   1298: #endif
                   1299:
                   1300: double
                   1301: speed_mpn_udiv_qrnnd (struct speed_params *s)
                   1302: {
                   1303:
                           /* Time the mpn udiv_qrnnd function.  Passing 1 to
                              SPEED_ROUTINE_UDIV_QRNND_A normalizes the divisor d before the
                              timed loop of s->reps iterations (it also computes dinv, which
                              is not used here).  SPEED_ROUTINE_UDIV_QRNND_B closes the loop
                              and returns the elapsed time.  */
                   1304:   SPEED_ROUTINE_UDIV_QRNND_A (1);
                   1305:   {
                             /* Ten calls per loop iteration; each call reads the q and r
                                written by the previous one.  */
                   1306:     CALL_MPN_UDIV_QRNND;
                   1307:      CALL_MPN_UDIV_QRNND;
                   1308:      CALL_MPN_UDIV_QRNND;
                   1309:     CALL_MPN_UDIV_QRNND;
                   1310:      CALL_MPN_UDIV_QRNND;
                   1311:      CALL_MPN_UDIV_QRNND;
                   1312:     CALL_MPN_UDIV_QRNND;
                   1313:      CALL_MPN_UDIV_QRNND;
                   1314:      CALL_MPN_UDIV_QRNND;
                   1315:     CALL_MPN_UDIV_QRNND;
                   1316:   }
                   1317:   SPEED_ROUTINE_UDIV_QRNND_B;
                   1318: }
                   1319: #endif

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>