[BACK]Return to common.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / tune

Annotation of OpenXM_contrib/gmp/tune/common.c, Revision 1.1

1.1     ! maekawa     1: /* Shared speed subroutines.  */
        !             2:
        !             3: /*
        !             4: Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             5:
        !             6: This file is part of the GNU MP Library.
        !             7:
        !             8: The GNU MP Library is free software; you can redistribute it and/or modify
        !             9: it under the terms of the GNU Lesser General Public License as published by
        !            10: the Free Software Foundation; either version 2.1 of the License, or (at your
        !            11: option) any later version.
        !            12:
        !            13: The GNU MP Library is distributed in the hope that it will be useful, but
        !            14: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
        !            15: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
        !            16: License for more details.
        !            17:
        !            18: You should have received a copy of the GNU Lesser General Public License
        !            19: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
        !            20: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
        !            21: MA 02111-1307, USA.
        !            22: */
        !            23:
        !            24: #include <errno.h>
        !            25: #include <fcntl.h>
        !            26: #include <math.h>
        !            27: #include <stdio.h>
        !            28: #include <stdlib.h> /* for qsort */
        !            29: #include <string.h>
        !            30: #include <unistd.h>
        !            31: #if 0
        !            32: #include <sys/ioctl.h>
        !            33: #endif
        !            34:
        !            35: #include "gmp.h"
        !            36: #include "gmp-impl.h"
        !            37: #include "longlong.h"
        !            38:
        !            39: #include "speed.h"
        !            40:
        !            41: /* Change this to "#define TRACE(x) x" to get traces. */
        !            42: #define TRACE(x)
        !            43:
        !            44:
        !            45: typedef int (*qsort_function_t) _PROTO ((const void *, const void *));
        !            46:
        !            47:
        !            48: int   speed_option_addrs = 0;
        !            49:
        !            50:
        !            51: void
        !            52: pentium_wbinvd(void)
        !            53: {
        !            54: #if 0
        !            55:   {
        !            56:     static int  fd = -2;
        !            57:
        !            58:     if (fd == -2)
        !            59:       {
        !            60:         fd = open ("/dev/wbinvd", O_RDWR);
        !            61:         if (fd == -1)
        !            62:           perror ("open /dev/wbinvd");
        !            63:       }
        !            64:
        !            65:     if (fd != -1)
        !            66:       ioctl (fd, 0, 0);
        !            67:   }
        !            68: #endif
        !            69:
        !            70: #if 0
        !            71: #define WBINVDSIZE  1024*1024*2
        !            72:   {
        !            73:     static char  *p = NULL;
        !            74:     int   i, sum;
        !            75:
        !            76:     if (p == NULL)
        !            77:       p = malloc (WBINVDSIZE);
        !            78:
        !            79: #if 0
        !            80:     for (i = 0; i < WBINVDSIZE; i++)
        !            81:       p[i] = i & 0xFF;
        !            82: #endif
        !            83:
        !            84:     sum = 0;
        !            85:     for (i = 0; i < WBINVDSIZE; i++)
        !            86:       sum += p[i];
        !            87:
        !            88:     mpn_cache_fill_dummy (sum);
        !            89:   }
        !            90: #endif
        !            91: }
        !            92:
        !            93: static int
        !            94: double_cmp_ptr (const double *p, const double *q)
        !            95: {
        !            96:   if (*p > *q)  return 1;
        !            97:   if (*p < *q)  return -1;
        !            98:   return 0;
        !            99: }
        !           100:
        !           101:
        !           102: /* Measure the speed of a given routine.
        !           103:
        !           104:    The routine is run with enough repetitions to make it take at least
        !           105:    speed_precision * speed_unittime.  This aims to minimize the effects of a
        !           106:    limited accuracy time base and the overhead of the measuring itself.
        !           107:
        !           108:    Measurements are made looking for 4 results within TOLERANCE of each
        !           109:    other (or 3 for routines taking longer than 2 seconds).  This aims to get
        !           110:    an accurate reading even if some runs are bloated by interrupts or task
        !           111:    switches or whatever.
        !           112:
        !           113:    The given (*fun)() is expected to run its function "s->reps" many times
        !           114:    and return the total elapsed time measured using speed_starttime() and
        !           115:    speed_endtime().  If the function doesn't support the given s->size or
        !           116:    s->r, -1.0 should be returned.  See the various base routines below.  */
        !           117:
        !           118: double
        !           119: speed_measure (double (*fun) _PROTO ((struct speed_params *s)),
        !           120:                struct speed_params *s)
        !           121: {
        !           122: #define TOLERANCE    1.005  /* 0.5% */
        !           123:
        !           124:   struct speed_params  s_dummy;
        !           125:   int     i, j, e;
        !           126:   double  t[30];
        !           127:   double  t_unsorted[30];
        !           128:
        !           129:   /* Use dummy parameters if caller doesn't provide any.  Only a few special
        !           130:      "fun"s will cope with this, speed_noop() is one.  */
        !           131:   if (s == NULL)
        !           132:     {
        !           133:       memset (&s_dummy, '\0', sizeof (s_dummy));
        !           134:       s = &s_dummy;
        !           135:     }
        !           136:
        !           137:   s->reps = 1;
        !           138:   s->time_divisor = 1.0;
        !           139:   for (i = 0; i < numberof (t); i++)
        !           140:     {
        !           141:       for (;;)
        !           142:         {
        !           143:           s->src_num = 0;
        !           144:           s->dst_num = 0;
        !           145:
        !           146:           t[i] = (*fun) (s);
        !           147:           t_unsorted[i] = t[i];
        !           148:
        !           149:           TRACE (printf("size=%ld reps=%u r=%d attempt=%d  %.9f\n",
        !           150:                         s->size, s->reps, s->r, i, t[i]));
        !           151:
        !           152:           if (t[i] == -1.0)
        !           153:             return -1.0;
        !           154:
        !           155:           if (t[i] >= speed_unittime * speed_precision)
        !           156:             break;
        !           157:
        !           158:           /* go to a value of reps to make t[i] >= precision */
        !           159:           s->reps = (unsigned) ceil (1.1 * s->reps
        !           160:                                      * speed_unittime * speed_precision
        !           161:                                      / MAX (t[i], speed_unittime));
        !           162:         }
        !           163:       t[i] /= s->reps;
        !           164:
        !           165:       if (speed_precision == 0)
        !           166:         return t[i];
        !           167:
        !           168:       /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
        !           169:       if (t[0] >= 2.0)
        !           170:         e = 3;
        !           171:       else
        !           172:         e = 4;
        !           173:
        !           174:       /* Look for e many t[]'s within TOLERANCE of each other to consider a
        !           175:          valid measurement.  Return smallest among them.  */
        !           176:       if (i >= e)
        !           177:         {
        !           178:           qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
        !           179:           for (j = e-1; j < i; j++)
        !           180:             if (t[j] <= t[j-e+1] * TOLERANCE)
        !           181:               return t[j-e+1] / s->time_divisor;
        !           182:         }
        !           183:     }
        !           184:
        !           185:   fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
        !           186:            e, (TOLERANCE-1.0)*100.0);
        !           187:   fprintf (stderr, "  %.12f is about 0.5%%\n", t[0]*(TOLERANCE-1.0));
        !           188:   for (i = 0; i < numberof (t); i++)
        !           189:     fprintf (stderr, "  %.09f\n", t_unsorted[i]);
        !           190:
        !           191:   return -1.0;
        !           192: }
        !           193:
        !           194:
        !           195: /* Read all of ptr,size to get it into the CPU memory cache.
        !           196:
        !           197:    A call to mpn_cache_fill_dummy() is used to make sure the compiler
        !           198:    doesn't optimize away the whole loop.  Using "volatile mp_limb_t sum"
        !           199:    would work too, but the function call means we don't rely on every
        !           200:    compiler actually implementing volatile properly.
        !           201:
        !           202:    mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
        !           203:    it can inline it.  */
        !           204:
        !           205: void
        !           206: mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
        !           207: {
        !           208:   mp_limb_t  sum = 0;
        !           209:   mp_size_t  i;
        !           210:
        !           211:   for (i = 0; i < size; i++)
        !           212:     sum += ptr[i];
        !           213:
        !           214:   mpn_cache_fill_dummy(sum);
        !           215: }
        !           216:
        !           217:
        !           218: void
        !           219: mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
        !           220: {
        !           221:   mpn_cache_fill (ptr, size);
        !           222:
        !           223: #if 0
        !           224:   mpn_random (ptr, size);
        !           225: #endif
        !           226:
        !           227: #if 0
        !           228:   mp_size_t  i;
        !           229:
        !           230:   for (i = 0; i < size; i++)
        !           231:     ptr[i] = i;
        !           232: #endif
        !           233: }
        !           234:
        !           235:
        !           236: void
        !           237: speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
        !           238: {
        !           239:   if (s->src_num >= numberof (s->src))
        !           240:     {
        !           241:       fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
        !           242:       abort ();
        !           243:     }
        !           244:   s->src[s->src_num].ptr = ptr;
        !           245:   s->src[s->src_num].size = size;
        !           246:   s->src_num++;
        !           247: }
        !           248:
        !           249:
        !           250: void
        !           251: speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
        !           252: {
        !           253:   if (s->dst_num >= numberof (s->dst))
        !           254:     {
        !           255:       fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
        !           256:       abort ();
        !           257:     }
        !           258:   s->dst[s->dst_num].ptr = ptr;
        !           259:   s->dst[s->dst_num].size = size;
        !           260:   s->dst_num++;
        !           261: }
        !           262:
        !           263:
        !           264: void
        !           265: speed_cache_fill (struct speed_params *s)
        !           266: {
        !           267:   static struct speed_params  prev;
        !           268:   int  i;
        !           269:
        !           270:   /* FIXME: need a better way to get the format string for a pointer */
        !           271:
        !           272:   if (speed_option_addrs)
        !           273:     {
        !           274:       int  different;
        !           275:
        !           276:       different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
        !           277:       for (i = 0; i < s->dst_num; i++)
        !           278:         different |= (s->dst[i].ptr != prev.dst[i].ptr);
        !           279:       for (i = 0; i < s->src_num; i++)
        !           280:         different |= (s->src[i].ptr != prev.src[i].ptr);
        !           281:
        !           282:       if (different)
        !           283:         {
        !           284:           if (s->dst_num != 0)
        !           285:             {
        !           286:               printf ("dst");
        !           287:               for (i = 0; i < s->dst_num; i++)
        !           288:                 printf (" %08lX", (unsigned long) s->dst[i].ptr);
        !           289:               printf (" ");
        !           290:             }
        !           291:
        !           292:           if (s->src_num != 0)
        !           293:             {
        !           294:               printf ("src");
        !           295:               for (i = 0; i < s->src_num; i++)
        !           296:                 printf (" %08lX", (unsigned long) s->src[i].ptr);
        !           297:               printf (" ");
        !           298:             }
        !           299:           printf ("  (cf sp approx %08lX)\n", (unsigned long) &different);
        !           300:
        !           301:         }
        !           302:
        !           303:       memcpy (&prev, s, sizeof(prev));
        !           304:     }
        !           305:
        !           306:   switch (s->cache) {
        !           307:   case 0:
        !           308:     for (i = 0; i < s->dst_num; i++)
        !           309:       mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
        !           310:     for (i = 0; i < s->src_num; i++)
        !           311:       mpn_cache_fill (s->src[i].ptr, s->src[i].size);
        !           312:     break;
        !           313:   case 1:
        !           314:     pentium_wbinvd();
        !           315:     break;
        !           316:   }
        !           317: }
        !           318:
        !           319:
        !           320: /* Return p advanced to the next multiple of "align" bytes.  "align" must be
        !           321:    a power of 2.  Care is taken not to assume sizeof(int)==sizeof(pointer).
        !           322:    Using "unsigned long" avoids a warning on hpux.  */
        !           323: void *
        !           324: align_pointer (void *p, size_t align)
        !           325: {
        !           326:   unsigned long  d;
        !           327:   d = ((unsigned long) p) & (align-1);
        !           328:   d = (d != 0 ? align-d : 0);
        !           329:   return (void *) (((char *) p) + d);
        !           330: }
        !           331:
        !           332: /* Note that memory allocated with this function can never be freed, because
        !           333:    the start address of the block allocated is discarded. */
        !           334: void *
        !           335: _mp_allocate_func_aligned (size_t bytes, size_t align)
        !           336: {
        !           337:   return align_pointer ((*_mp_allocate_func) (bytes + align-1), align);
        !           338: }
        !           339:
        !           340:
        !           341: void *
        !           342: _mp_allocate_or_reallocate (void *ptr, size_t oldsize, size_t newsize)
        !           343: {
        !           344:   if (ptr == NULL)
        !           345:     return (*_mp_allocate_func) (newsize);
        !           346:   else
        !           347:     return (*_mp_reallocate_func) (ptr, oldsize, newsize);
        !           348: }
        !           349:
        !           350:
        !           351: /* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs.  ptr
        !           352:    needs to have room for up to CACHE_LINE_SIZE-4 extra bytes.  */
        !           353:
        !           354: mp_ptr
        !           355: speed_tmp_alloc_adjust (void *ptr, mp_size_t align)
        !           356: {
        !           357:   /*
        !           358:   printf("%p %ld -> %p %X %X\n", ptr, align,
        !           359:          (mp_ptr) ptr
        !           360:          + ((align - ((mp_size_t) ptr >> 2)) &
        !           361:             SPEED_TMP_ALLOC_ADJUST_MASK),
        !           362:          ((mp_size_t) ptr >> 2) & SPEED_TMP_ALLOC_ADJUST_MASK,
        !           363:          SPEED_TMP_ALLOC_ADJUST_MASK);
        !           364:   */
        !           365:
        !           366:   return (mp_ptr) ptr
        !           367:     + ((align - ((mp_size_t) ptr >> 2)) & SPEED_TMP_ALLOC_ADJUST_MASK);
        !           368: }
        !           369:
        !           370:
        !           371: void
        !           372: mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size)
        !           373: {
        !           374:   ASSERT (size >= 0);
        !           375:   MPN_NORMALIZE (p, size);
        !           376:   MPZ_REALLOC (z, size);
        !           377:   MPN_COPY (PTR(z), p, size);
        !           378:   SIZ(z) = size;
        !           379: }
        !           380:
        !           381:
        !           382: /* Miscellanous options accepted by tune and speed programs under -o. */
        !           383:
        !           384: void
        !           385: speed_option_set (const char *s)
        !           386: {
        !           387:   if (strcmp (s, "addrs") == 0)  speed_option_addrs = 1;
        !           388:   else
        !           389:     {
        !           390:       printf ("Unrecognised -o option: %s\n", s);
        !           391:       exit (1);
        !           392:     }
        !           393: }
        !           394:
        !           395:
        !           396: /* The following are basic speed running routines for various gmp functions.
        !           397:    Many are very similar and use speed.h macros.
        !           398:
        !           399:    Each routine allocates its own destination space for the result of the
        !           400:    function, because only it can know what the function needs.
        !           401:
        !           402:    speed_starttime() and speed_endtime() are put tight around the code to be
        !           403:    measured.  Any setups are done outside the timed portion.
        !           404:
        !           405:    Each routine is responsible for its own cache priming.
        !           406:    speed_cache_fill() is a good way to do this, see examples in speed.h.
        !           407:    One cache priming possibility, for CPUs with write-allocate cache, and
        !           408:    functions that don't take too long, is to do one dummy call before timing
        !           409:    so as to cache everything that gets used.  But speed_measure() runs a
        !           410:    routine at least twice and will take the smaller time, so this might not
        !           411:    be necessary.
        !           412:
        !           413:    Data alignment will be important, for source, destination and temporary
        !           414:    workspace.  A routine can align its destination and workspace.  Programs
        !           415:    using the routines will ensure s->xp and s->yp are aligned.  Aligning
        !           416:    onto a CACHE_LINE_SIZE boundary is suggested.  s->align_wp and
        !           417:    s->align_wp2 should be respected where it makes sense to do so.
        !           418:    SPEED_TMP_ALLOC_LIMBS is a good way to do this.
        !           419:
        !           420:    A loop of the following form can be expected to turn into good assembler
        !           421:    code on most CPUs, thereby minimizing overhead in the measurement.  It
        !           422:    can always be assumed s->reps >= 1.
        !           423:
        !           424:           i = s->reps
        !           425:           do
        !           426:             foo();
        !           427:           while (--i != 0);
        !           428:
        !           429:    Additional parameters might be added to "struct speed_params" in the
        !           430:    future.  Routines should ignore anything they don't use.
        !           431:
        !           432:    s->size can be used creatively, and s->xp and s->yp can be ignored.  For
        !           433:    example, speed_mpz_fac_ui() uses s->size as n for the factorial.  s->r is
        !           434:    just a user-supplied parameter.  speed_mpn_lshift() uses it as a shift,
        !           435:    speed_mpn_mul_1() uses it as a multiplier.  */
        !           436:
        !           437:
        !           438: /* MPN_COPY etc can be macros, so the _CALL forms are necessary */
        !           439: double
        !           440: speed_MPN_COPY (struct speed_params *s)
        !           441: {
        !           442:   SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY (wp, s->xp, s->size));
        !           443: }
        !           444: double
        !           445: speed_MPN_COPY_INCR (struct speed_params *s)
        !           446: {
        !           447:   SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY_INCR (wp, s->xp, s->size));
        !           448: }
        !           449: double
        !           450: speed_MPN_COPY_DECR (struct speed_params *s)
        !           451: {
        !           452:   SPEED_ROUTINE_MPN_COPY_CALL (MPN_COPY_DECR (wp, s->xp, s->size));
        !           453: }
        !           454: double
        !           455: speed_memcpy (struct speed_params *s)
        !           456: {
        !           457:   SPEED_ROUTINE_MPN_COPY_CALL
        !           458:     (memcpy (wp, s->xp, s->size * BYTES_PER_MP_LIMB));
        !           459: }
        !           460:
        !           461:
        !           462: double
        !           463: speed_mpn_addmul_1 (struct speed_params *s)
        !           464: {
        !           465:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
        !           466: }
        !           467: double
        !           468: speed_mpn_submul_1 (struct speed_params *s)
        !           469: {
        !           470:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
        !           471: }
        !           472:
        !           473:
        !           474: double
        !           475: speed_mpn_mul_1 (struct speed_params *s)
        !           476: {
        !           477:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
        !           478: }
        !           479:
        !           480:
        !           481: double
        !           482: speed_mpn_lshift (struct speed_params *s)
        !           483: {
        !           484:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
        !           485: }
        !           486: double
        !           487: speed_mpn_rshift (struct speed_params *s)
        !           488: {
        !           489:   SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
        !           490: }
        !           491:
        !           492:
        !           493: /* The carry-in variants (if available) are good for measuring because they
        !           494:    won't skip a division if high<divisor.  Alternately, use -1 as a divisor
        !           495:    with the plain _1 forms. */
        !           496: double
        !           497: speed_mpn_divrem_1 (struct speed_params *s)
        !           498: {
        !           499:   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
        !           500: }
        !           501: double
        !           502: speed_mpn_divrem_1f (struct speed_params *s)
        !           503: {
        !           504:   SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
        !           505: }
        !           506: #if HAVE_NATIVE_mpn_divrem_1c
        !           507: double
        !           508: speed_mpn_divrem_1c (struct speed_params *s)
        !           509: {
        !           510:   SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
        !           511: }
        !           512: double
        !           513: speed_mpn_divrem_1cf (struct speed_params *s)
        !           514: {
        !           515:   SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
        !           516: }
        !           517: #endif
        !           518:
        !           519: double
        !           520: speed_mpn_divrem_2 (struct speed_params *s)
        !           521: {
        !           522:   SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
        !           523: }
        !           524:
        !           525: double
        !           526: speed_mpn_mod_1 (struct speed_params *s)
        !           527: {
        !           528:   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
        !           529: }
        !           530: #if HAVE_NATIVE_mpn_mod_1c
        !           531: double
        !           532: speed_mpn_mod_1c (struct speed_params *s)
        !           533: {
        !           534:   SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
        !           535: }
        !           536: #endif
        !           537:
        !           538: double
        !           539: speed_mpn_divexact_by3 (struct speed_params *s)
        !           540: {
        !           541:   /* mpn_divexact_by3 is a macro, so the _CALL form is necessary */
        !           542:   SPEED_ROUTINE_MPN_COPY_CALL(mpn_divexact_by3 (wp, s->xp, s->size));
        !           543: }
        !           544:
        !           545:
        !           546: double
        !           547: speed_mpn_bz_divrem_n (struct speed_params *s)
        !           548: {
        !           549:   SPEED_ROUTINE_MPN_BZ_DIVREM_N (mpn_bz_divrem_n);
        !           550: }
        !           551: double
        !           552: speed_mpn_bz_divrem_sb (struct speed_params *s)
        !           553: {
        !           554:   SPEED_ROUTINE_MPN_BZ_DIVREM_SB (mpn_sb_divrem_mn);
        !           555: }
        !           556: double
        !           557: speed_mpn_bz_tdiv_qr (struct speed_params *s)
        !           558: {
        !           559:   SPEED_ROUTINE_MPN_BZ_TDIV_QR (mpn_tdiv_qr);
        !           560: }
        !           561:
        !           562:
        !           563: double
        !           564: speed_mpn_popcount (struct speed_params *s)
        !           565: {
        !           566:   SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
        !           567: }
        !           568: double
        !           569: speed_mpn_hamdist (struct speed_params *s)
        !           570: {
        !           571:   SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
        !           572: }
        !           573:
        !           574:
        !           575: double
        !           576: speed_mpn_add_n (struct speed_params *s)
        !           577: {
        !           578:   SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
        !           579: }
        !           580: double
        !           581: speed_mpn_sub_n (struct speed_params *s)
        !           582: {
        !           583: SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
        !           584: }
        !           585: double
        !           586: speed_mpn_add_n_self (struct speed_params *s)
        !           587: {
        !           588:   SPEED_ROUTINE_MPN_BINARY_N_SELF (mpn_add_n);
        !           589: }
        !           590: double
        !           591: speed_mpn_add_n_inplace (struct speed_params *s)
        !           592: {
        !           593:   SPEED_ROUTINE_MPN_BINARY_N_INPLACE (mpn_add_n);
        !           594: }
        !           595:
        !           596:
        !           597: /* mpn_and_n etc can be macros and so have to be handled with
        !           598:    SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
        !           599: double
        !           600: speed_mpn_and_n (struct speed_params *s)
        !           601: {
        !           602:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size));
        !           603: }
        !           604: double
        !           605: speed_mpn_andn_n (struct speed_params *s)
        !           606: {
        !           607: SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size));
        !           608: }
        !           609: double
        !           610: speed_mpn_nand_n (struct speed_params *s)
        !           611: {
        !           612:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size));
        !           613: }
        !           614: double
        !           615: speed_mpn_ior_n (struct speed_params *s)
        !           616: {
        !           617: SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size));
        !           618: }
        !           619: double
        !           620: speed_mpn_iorn_n (struct speed_params *s)
        !           621: {
        !           622:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size));
        !           623: }
        !           624: double
        !           625: speed_mpn_nior_n (struct speed_params *s)
        !           626: {
        !           627:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size));
        !           628: }
        !           629: double
        !           630: speed_mpn_xor_n (struct speed_params *s)
        !           631: {
        !           632:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size));
        !           633: }
        !           634: double
        !           635: speed_mpn_xnor_n (struct speed_params *s)
        !           636: {
        !           637:   SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size));
        !           638: }
        !           639:
        !           640:
        !           641: double
        !           642: speed_mpn_mul_n (struct speed_params *s)
        !           643: {
        !           644:   SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
        !           645: }
        !           646: double
        !           647: speed_mpn_sqr_n (struct speed_params *s)
        !           648: {
        !           649:   SPEED_ROUTINE_MPN_SQR (mpn_sqr_n);
        !           650: }
        !           651: double
        !           652: speed_mpn_mul_n_sqr (struct speed_params *s)
        !           653: {
        !           654:   SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
        !           655: }
        !           656:
        !           657: double
        !           658: speed_mpn_mul_basecase (struct speed_params *s)
        !           659: {
        !           660:   SPEED_ROUTINE_MPN_MUL_BASECASE(mpn_mul_basecase);
        !           661: }
        !           662: double
        !           663: speed_mpn_sqr_basecase (struct speed_params *s)
        !           664: {
        !           665:   /* FIXME: size restrictions on some versions of sqr_basecase */
        !           666:   SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
        !           667: }
        !           668:
        !           669: double
        !           670: speed_mpn_kara_mul_n (struct speed_params *s)
        !           671: {
        !           672:   SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n);
        !           673: }
        !           674: double
        !           675: speed_mpn_kara_sqr_n (struct speed_params *s)
        !           676: {
        !           677:   SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n);
        !           678: }
        !           679:
        !           680: double
        !           681: speed_mpn_toom3_mul_n (struct speed_params *s)
        !           682: {
        !           683:   SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n);
        !           684: }
        !           685: double
        !           686: speed_mpn_toom3_sqr_n (struct speed_params *s)
        !           687: {
        !           688:   SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n);
        !           689: }
        !           690:
        !           691: double
        !           692: speed_mpn_mul_fft_full (struct speed_params *s)
        !           693: {
        !           694:   SPEED_ROUTINE_MPN_MUL_N_CALL
        !           695:     (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
        !           696: }
        !           697: double
        !           698: speed_mpn_mul_fft_full_sqr (struct speed_params *s)
        !           699: {
        !           700:   SPEED_ROUTINE_MPN_SQR_CALL
        !           701:     (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
        !           702: }
        !           703:
        !           704:
        !           705: /* These are mod 2^N+1 multiplies and squares.  If s->r is supplied it's
        !           706:    used as k, otherwise the best k for the size is used.  If s->size isn't a
        !           707:    multiple of 2^k it's rounded up to make the effective operation size.  */
        !           708:
        /* Function body for the mod 2^N+1 FFT multiply and square measuring
           routines.

           call  the statement to be timed.  It can refer to wp (the
                 destination), pl (the size given by mpn_fft_next_size) and k.
           sqr   non-zero for a squaring.  It is passed through to
                 mpn_fft_best_k, and when non-zero s->yp is not registered as
                 a source operand.

           k is s->r when that is non-zero, otherwise mpn_fft_best_k chooses
           it.  The destination is allocated with pl+1 limbs.  The size
           restriction is checked before TMP_MARK so nothing needs freeing if
           it rejects the size.

           sqr is parenthesized at each use so that an expression argument
           (for instance "a == b") cannot be mis-parsed next to the "!"
           operator.  */
        #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr)       \
          {                                                     \
            mp_ptr     wp;                                      \
            mp_size_t  pl;                                      \
            int        k;                                       \
            unsigned   i;                                       \
            double     t;                                       \
            TMP_DECL (marker);                                  \
                                                                \
            SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                                \
            if (s->r != 0)                                      \
              k = s->r;                                         \
            else                                                \
              k = mpn_fft_best_k (s->size, (sqr));              \
                                                                \
            TMP_MARK (marker);                                  \
            pl = mpn_fft_next_size (s->size, k);                \
            wp = SPEED_TMP_ALLOC_LIMBS (pl+1, s->align_wp);     \
                                                                \
            speed_operand_src (s, s->xp, s->size);              \
            if (!(sqr))                                         \
              speed_operand_src (s, s->yp, s->size);            \
            speed_operand_dst (s, wp, pl+1);                    \
            speed_cache_fill (s);                               \
                                                                \
            speed_starttime ();                                 \
            i = s->reps;                                        \
            do                                                  \
              call;                                             \
            while (--i != 0);                                   \
            t = speed_endtime ();                               \
                                                                \
            TMP_FREE (marker);                                  \
            return t;                                           \
          }
        !           745:
        !           746: double
        !           747: speed_mpn_mul_fft (struct speed_params *s)
        !           748: {
        !           749:   SPEED_ROUTINE_MPN_MUL_FFT_CALL
        !           750:     (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
        !           751: }
        !           752:
        !           753: double
        !           754: speed_mpn_mul_fft_sqr (struct speed_params *s)
        !           755: {
        !           756:   SPEED_ROUTINE_MPN_MUL_FFT_CALL
        !           757:     (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
        !           758: }
        !           759:
        !           760:
        !           761: double
        !           762: speed_mpn_gcd (struct speed_params *s)
        !           763: {
        !           764:   SPEED_ROUTINE_MPN_GCD (mpn_gcd);
        !           765: }
        !           766: double
        !           767: speed_mpn_gcdext (struct speed_params *s)
        !           768: {
        !           769:   SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
        !           770: }
        !           771: double
        !           772: speed_mpn_gcd_1 (struct speed_params *s)
        !           773: {
        !           774:   SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
        !           775: }
        !           776:
        !           777:
        !           778: double
        !           779: speed_mpn_jacobi_base (struct speed_params *s)
        !           780: {
        !           781:   SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
        !           782: }
        !           783:
        !           784:
        !           785: double
        !           786: speed_mpz_fac_ui (struct speed_params *s)
        !           787: {
        !           788:   SPEED_ROUTINE_MPZ_UI (mpz_fac_ui);
        !           789: }
        !           790: double
        !           791: speed_mpz_fib_ui (struct speed_params *s)
        !           792: {
        !           793:   SPEED_ROUTINE_MPZ_UI (mpz_fib_ui);
        !           794: }
        !           795:
        !           796:
        !           797: double
        !           798: speed_mpz_powm (struct speed_params *s)
        !           799: {
        !           800:   SPEED_ROUTINE_MPZ_POWM (mpz_powm);
        !           801: }
        !           802:
        !           803:
        !           804: double
        !           805: speed_modlimb_invert (struct speed_params *s)
        !           806: {
        !           807:   SPEED_ROUTINE_MODLIMB_INVERT (modlimb_invert);
        !           808: }
        !           809:
        !           810:
        !           811: double
        !           812: speed_noop (struct speed_params *s)
        !           813: {
        !           814:   unsigned  i;
        !           815:
        !           816:   speed_starttime ();
        !           817:   i = s->reps;
        !           818:   do
        !           819:     noop ();
        !           820:   while (--i != 0);
        !           821:   return speed_endtime ();
        !           822: }
        !           823:
        !           824: double
        !           825: speed_noop_wxs (struct speed_params *s)
        !           826: {
        !           827:   mp_ptr   wp;
        !           828:   unsigned i;
        !           829:   double   t;
        !           830:   TMP_DECL (marker);
        !           831:
        !           832:   TMP_MARK (marker);
        !           833:   wp = TMP_ALLOC_LIMBS (1);
        !           834:
        !           835:   speed_starttime ();
        !           836:   i = s->reps;
        !           837:   do
        !           838:     noop_wxs (wp, s->xp, s->size);
        !           839:   while (--i != 0);
        !           840:   t = speed_endtime ();
        !           841:
        !           842:   TMP_FREE (marker);
        !           843:   return t;
        !           844: }
        !           845:
        !           846: double
        !           847: speed_noop_wxys (struct speed_params *s)
        !           848: {
        !           849:   mp_ptr   wp;
        !           850:   unsigned i;
        !           851:   double   t;
        !           852:   TMP_DECL (marker);
        !           853:
        !           854:   TMP_MARK (marker);
        !           855:   wp = TMP_ALLOC_LIMBS (1);
        !           856:
        !           857:   speed_starttime ();
        !           858:   i = s->reps;
        !           859:   do
        !           860:     noop_wxys (wp, s->xp, s->yp, s->size);
        !           861:   while (--i != 0);
        !           862:   t = speed_endtime ();
        !           863:
        !           864:   TMP_FREE (marker);
        !           865:   return t;
        !           866: }
        !           867:
        !           868:
        /* Function body for the allocate/free measuring routines.

           variables  a declaration pasted at the top of the block, before
                      the timer starts.
           calls      the statements to time; they are run s->reps times
                      between speed_starttime() and speed_endtime().

           The expansion ends with a return of the speed_endtime() value, so
           it must be the last thing in the calling function.  The loop
           counter is called i; "variables" must not declare that name.  */
        #define SPEED_ROUTINE_ALLOC_FREE(variables, calls)      \
          {                                                     \
            unsigned  i;                                        \
            variables;                                          \
                                                                \
            speed_starttime ();                                 \
            i = s->reps;                                        \
            do {                                                \
              calls;                                            \
            } while (--i != 0);                                 \
            return speed_endtime ();                            \
          }
        !           883:
        !           884:
        !           885: /* Compare these to see how much malloc/free costs and then how much
        !           886:    _mp_default_allocate/free and mpz_init/clear add.  mpz_init/clear or
        !           887:    mpq_init/clear will be doing a 1 limb allocate, so use that as the size
        !           888:    when including them in comparisons.  */
        !           889:
        !           890: double
        !           891: speed_malloc_free (struct speed_params *s)
        !           892: {
        !           893:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
        !           894:   SPEED_ROUTINE_ALLOC_FREE (void *p,
        !           895:                             p = malloc (bytes);
        !           896:                             free (p));
        !           897: }
        !           898:
        !           899: double
        !           900: speed_malloc_realloc_free (struct speed_params *s)
        !           901: {
        !           902:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
        !           903:   SPEED_ROUTINE_ALLOC_FREE (void *p,
        !           904:                             p = malloc (BYTES_PER_MP_LIMB);
        !           905:                             p = realloc (p, bytes);
        !           906:                             free (p));
        !           907: }
        !           908:
        !           909: double
        !           910: speed_mp_allocate_free (struct speed_params *s)
        !           911: {
        !           912:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
        !           913:   SPEED_ROUTINE_ALLOC_FREE (void *p,
        !           914:                             p = (*_mp_allocate_func) (bytes);
        !           915:                             (*_mp_free_func) (p, bytes));
        !           916: }
        !           917:
        !           918: double
        !           919: speed_mp_allocate_reallocate_free (struct speed_params *s)
        !           920: {
        !           921:   size_t  bytes = s->size * BYTES_PER_MP_LIMB;
        !           922:   SPEED_ROUTINE_ALLOC_FREE
        !           923:     (void *p,
        !           924:      p = (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
        !           925:      p = (*_mp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
        !           926:      (*_mp_free_func) (p, bytes));
        !           927: }
        !           928:
        !           929: double
        !           930: speed_mpz_init_clear (struct speed_params *s)
        !           931: {
        !           932:   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
        !           933:                             mpz_init (z);
        !           934:                             mpz_clear (z));
        !           935: }
        !           936:
        !           937: double
        !           938: speed_mpz_init_realloc_clear (struct speed_params *s)
        !           939: {
        !           940:   SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
        !           941:                             mpz_init (z);
        !           942:                             _mpz_realloc (z, s->size);
        !           943:                             mpz_clear (z));
        !           944: }
        !           945:
        !           946: double
        !           947: speed_mpq_init_clear (struct speed_params *s)
        !           948: {
        !           949:   SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
        !           950:                             mpq_init (q);
        !           951:                             mpq_clear (q));
        !           952: }
        !           953:
        !           954: double
        !           955: speed_mpf_init_clear (struct speed_params *s)
        !           956: {
        !           957:   SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
        !           958:                             mpf_init (f);
        !           959:                             mpf_clear (f));
        !           960: }
        !           961:
        !           962:
        !           963: /* Compare this to mpn_add_n to see how much overhead mpz_add adds.  Note
        !           964:    that repeatedly calling mpz_add with the same data gives branch prediction
        !           965:    in it an advantage.  */
        !           966:
        !           967: double
        !           968: speed_mpz_add (struct speed_params *s)
        !           969: {
        !           970:   mpz_t     w, x, y;
        !           971:   unsigned  i;
        !           972:   double    t;
        !           973:
        !           974:   mpz_init (w);
        !           975:   mpz_init (x);
        !           976:   mpz_init (y);
        !           977:
        !           978:   mpz_set_n (x, s->xp, s->size);
        !           979:   mpz_set_n (y, s->yp, s->size);
        !           980:   mpz_add (w, x, y);
        !           981:
        !           982:   speed_starttime ();
        !           983:   i = s->reps;
        !           984:   do
        !           985:     {
        !           986:       mpz_add (w, x, y);
        !           987:     }
        !           988:   while (--i != 0);
        !           989:   t = speed_endtime ();
        !           990:
        !           991:   mpz_clear (w);
        !           992:   mpz_clear (x);
        !           993:   mpz_clear (y);
        !           994:   return t;
        !           995: }
        !           996:
        !           997:
        !           998: /* If r==0, calculate (size,size/2),
        !           999:    otherwise calculate (size,r). */
        !          1000:
        !          1001: double
        !          1002: speed_mpz_bin_uiui (struct speed_params *s)
        !          1003: {
        !          1004:   mpz_t          w;
        !          1005:   unsigned long  k;
        !          1006:   unsigned  i;
        !          1007:   double    t;
        !          1008:
        !          1009:   mpz_init (w);
        !          1010:   if (s->r != 0)
        !          1011:     k = s->r;
        !          1012:   else
        !          1013:     k = s->size/2;
        !          1014:
        !          1015:   speed_starttime ();
        !          1016:   i = s->reps;
        !          1017:   do
        !          1018:     {
        !          1019:       mpz_bin_uiui (w, s->size, k);
        !          1020:     }
        !          1021:   while (--i != 0);
        !          1022:   t = speed_endtime ();
        !          1023:
        !          1024:   mpz_clear (w);
        !          1025:   return t;
        !          1026: }
        !          1027:
        !          1028:
        !          1029: /* The multiplies are successively dependent so the latency is measured, not
        !          1030:    the issue rate.  There's only 10 per loop so the code doesn't get too big
        !          1031:    since umul_ppmm is several instructions on some cpus.
        !          1032:
        !          1033:    Putting the arguments as "h,l,l,h" gets slightly better code from gcc
        !          1034:    2.95.2 on x86, it puts only one mov between each mul, not two.  That mov
        !          1035:    will probably show up as a bogus extra cycle though.
        !          1036:
        !          1037:    The measuring function macros are split into three parts to avoid overflowing
        !          1038:    preprocessor expansion space if umul_ppmm is big.
        !          1039:
        !          1040:    Limitations:
        !          1041:
        !          1042:    Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
        !          1043:    generated first, especially on CPUs with low latency multipliers.
        !          1044:
        !          1045:    The default umul_ppmm doing h*l will be getting increasing numbers of
        !          1046:    high zero bits in the calculation.  CPUs with data-dependent multipliers
        !          1047:    will want to use umul_ppmm.1 to get some randomization into the
        !          1048:    calculation.  The extra xors and fetches will be a slowdown of course.  */
        !          1049:
        /* Three-part function body for the umul_ppmm style measurements.  The
           parts must be used in the order A, B, C, with one braced group of
           10 multiplies after A and another after B:

             A  declares h, l, i and t, sets s->time_divisor to 10 (the number
                of multiplies per loop iteration), loads h and l from s->xp[0]
                and s->yp[0], and opens the "case 1" timing loop of a switch
                on s->r.
             B  closes that loop and opens the "default" timing loop.
             C  closes the default loop and the switch, passes h and l to
                noop_1 so the calculation can't be optimized away, and
                returns t.

           None of the parts is a balanced block by itself.  */
        #define SPEED_MACRO_UMUL_PPMM_A                                         \
          {                                                                     \
            mp_limb_t  h, l;                                                    \
            unsigned   i;                                                       \
            double     t;                                                       \
                                                                                \
            s->time_divisor = 10;                                               \
                                                                                \
            h = s->xp[0];                                                       \
            l = s->yp[0];                                                       \
                                                                                \
            switch (s->r) {                                                     \
            case 1:                                                             \
              speed_starttime ();                                               \
              i = s->reps;                                                      \
              do                                                                \
                {

        #define SPEED_MACRO_UMUL_PPMM_B                                         \
                }                                                               \
              while (--i != 0);                                                 \
              t = speed_endtime ();                                             \
              break;                                                            \
                                                                                \
            default:                                                            \
              speed_starttime ();                                               \
              i = s->reps;                                                      \
              do                                                                \
                {

        #define SPEED_MACRO_UMUL_PPMM_C                                         \
                }                                                               \
              while (--i != 0);                                                 \
              t = speed_endtime ();                                             \
              break;                                                            \
            }                                                                   \
                                                                                \
            /* stop the compiler optimizing away the whole calculation! */      \
            noop_1 (h);                                                         \
            noop_1 (l);                                                         \
                                                                                \
            return t;                                                           \
          }
        !          1093:
        !          1094:
        !          1095: double
        !          1096: speed_umul_ppmm (struct speed_params *s)
        !          1097: {
        !          1098:   SPEED_MACRO_UMUL_PPMM_A;
        !          1099:   {
        !          1100:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[0]; l ^= s->yp_block[0];
        !          1101:      umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
        !          1102:      umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
        !          1103:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[3]; l ^= s->yp_block[3];
        !          1104:      umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
        !          1105:      umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
        !          1106:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[6]; l ^= s->yp_block[6];
        !          1107:      umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
        !          1108:      umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
        !          1109:     umul_ppmm (h, l, l, h);  h ^= s->xp_block[9]; l ^= s->yp_block[9];
        !          1110:   }
        !          1111:   SPEED_MACRO_UMUL_PPMM_B;
        !          1112:   {
        !          1113:     umul_ppmm (h, l, l, h);
        !          1114:      umul_ppmm (h, l, l, h);
        !          1115:      umul_ppmm (h, l, l, h);
        !          1116:     umul_ppmm (h, l, l, h);
        !          1117:      umul_ppmm (h, l, l, h);
        !          1118:      umul_ppmm (h, l, l, h);
        !          1119:     umul_ppmm (h, l, l, h);
        !          1120:      umul_ppmm (h, l, l, h);
        !          1121:      umul_ppmm (h, l, l, h);
        !          1122:     umul_ppmm (h, l, l, h);
        !          1123:   }
        !          1124:   SPEED_MACRO_UMUL_PPMM_C;
        !          1125: }
        !          1126:
        !          1127:
        #if HAVE_NATIVE_mpn_umul_ppmm

        /* One call to the native mpn umul_ppmm function, putting the returned
           limb in h and the pointed-to limb in l.  The 64-bit hppa version
           takes the pointer as its last argument, other versions take it
           first.  */
        #if defined (__hppa) && W_TYPE_SIZE == 64
        #define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (h, l, &l))
        #else
        #define CALL_MPN_UMUL_PPMM  (h = __MPN (umul_ppmm) (&l, h, l))
        #endif

        /* Measure a chain of dependent calls to the native mpn umul_ppmm
           function, 10 per loop iteration.  With s->r == 1 the first group
           is timed, where each result is xored with entries of s->xp_block
           and s->yp_block; for any other s->r the second group, the bare
           chain of calls, is timed.  Only compiled when
           HAVE_NATIVE_mpn_umul_ppmm is set.  */
        double
        speed_mpn_umul_ppmm (struct speed_params *s)
        {
          SPEED_MACRO_UMUL_PPMM_A;
          {
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[0];  l ^= s->yp_block[0];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[1];  l ^= s->yp_block[1];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[2];  l ^= s->yp_block[2];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[3];  l ^= s->yp_block[3];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[4];  l ^= s->yp_block[4];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[5];  l ^= s->yp_block[5];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[6];  l ^= s->yp_block[6];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[7];  l ^= s->yp_block[7];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[8];  l ^= s->yp_block[8];
            CALL_MPN_UMUL_PPMM;  h ^= s->xp_block[9];  l ^= s->yp_block[9];
          }
          SPEED_MACRO_UMUL_PPMM_B;
          {
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
            CALL_MPN_UMUL_PPMM;
          }
          SPEED_MACRO_UMUL_PPMM_C;
        }
        #endif
        !          1168:
        !          1169:
        !          1170: /* The divisions are successively dependent so latency is measured, not
        !          1171:    issue rate.  There's only 10 per loop so the code doesn't get too big,
        !          1172:    especially for udiv_qrnnd_preinv and preinv2norm, which are several
        !          1173:    instructions each.
        !          1174:
        !          1175:    Note that it's only the division which is measured here, there's no data
        !          1176:    fetching and no shifting if the divisor gets normalized.
        !          1177:
        !          1178:    In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
        !          1179:    generate x86 div instructions with nothing in between.
        !          1180:
        !          1181:    The measuring function macros are in two parts to avoid overflowing
        !          1182:    preprocessor expansion space if udiv_qrnnd etc are big.
        !          1183:
        !          1184:    Limitations:
        !          1185:
        !          1186:    Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
        !          1187:    generated first.
        !          1188:
        !          1189:    CPUs with data-dependent divisions may want more attention paid to the
        !          1190:    randomness of the data used.  Probably the measurement wanted is over
        !          1191:    uniformly distributed numbers, but what's here might not be giving that.  */
        !          1192:
        /* Two-part function body for the udiv_qrnnd style measurements.  Use
           part A, then a braced group of 10 divisions, then part B.

             A  declares t, i, q, r, d and dinv, and sets s->time_divisor to
                10 (the number of divisions per loop iteration).  The divisor
                d is s->r, or 0x12345678 when s->r is zero.  If "normalize"
                is non-zero, d is shifted left by its leading zero count and
                dinv is set from invert_limb; otherwise dinv is left unset.
                q starts as s->xp[0] and r as s->yp[0] % d, so r < d.  The
                timing loop is then opened.
             B  closes the loop, passes q and r to noop_1 so the calculation
                can't be optimized away, and returns t.

           Neither part is a balanced block by itself.  */
        #define SPEED_ROUTINE_UDIV_QRNND_A(normalize)                           \
          {                                                                     \
            double     t;                                                       \
            unsigned   i;                                                       \
            mp_limb_t  q, r, d;                                                 \
            mp_limb_t  dinv;                                                    \
                                                                                \
            s->time_divisor = 10;                                               \
                                                                                \
            /* divisor from "r" parameter, or a default */                      \
            d = s->r;                                                           \
            if (d == 0)                                                         \
              d = 0x12345678;                                                   \
                                                                                \
            if (normalize)                                                      \
              {                                                                 \
                unsigned  norm;                                                 \
                count_leading_zeros (norm, d);                                  \
                d <<= norm;                                                     \
                invert_limb (dinv, d);                                          \
              }                                                                 \
                                                                                \
            q = s->xp[0];                                                       \
            r = s->yp[0] % d;                                                   \
                                                                                \
            speed_starttime ();                                                 \
            i = s->reps;                                                        \
            do                                                                  \
              {

        #define SPEED_ROUTINE_UDIV_QRNND_B                                      \
              }                                                                 \
            while (--i != 0);                                                   \
            t = speed_endtime ();                                               \
                                                                                \
            /* stop the compiler optimizing away the whole calculation! */      \
            noop_1 (q);                                                         \
            noop_1 (r);                                                         \
                                                                                \
            return t;                                                           \
          }
        !          1234: /* Time udiv_qrnnd: ten chained calls per loop pass, s->reps passes; returns the speed_endtime () value from ..._B. */
        !          1235: double
        !          1236: speed_udiv_qrnnd (struct speed_params *s)  /* s->r is the divisor (0 selects 0x12345678); s->xp[0], s->yp[0] seed q and r */
        !          1237: {
        !          1238:   SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);  /* d is shifted up (normalized) only when this flag is nonzero */
        !          1239:   {  /* every call reuses q, r and d, so each one works on what the previous call left in q and r */
        !          1240:     udiv_qrnnd (q, r, r, q, d);  /* presumably (quotient, remainder, high, low, divisor) as in longlong.h -- confirm */
        !          1241:      udiv_qrnnd (q, r, r, q, d);
        !          1242:      udiv_qrnnd (q, r, r, q, d);
        !          1243:     udiv_qrnnd (q, r, r, q, d);
        !          1244:      udiv_qrnnd (q, r, r, q, d);
        !          1245:      udiv_qrnnd (q, r, r, q, d);
        !          1246:     udiv_qrnnd (q, r, r, q, d);
        !          1247:      udiv_qrnnd (q, r, r, q, d);
        !          1248:      udiv_qrnnd (q, r, r, q, d);
        !          1249:     udiv_qrnnd (q, r, r, q, d);
        !          1250:   }  /* NOTE(review): the count of ten presumably has to agree with scaling set up in ..._A -- confirm before changing */
        !          1251:   SPEED_ROUTINE_UDIV_QRNND_B;  /* closes the loop, passes q and r to noop_1, returns t */
        !          1252: }
        !          1253: /* Time udiv_qrnnd_preinv: ten chained calls per loop pass, s->reps passes; returns the speed_endtime () value from ..._B. */
        !          1254: double
        !          1255: speed_udiv_qrnnd_preinv (struct speed_params *s)  /* s->r is the divisor (0 selects 0x12345678); s->xp[0], s->yp[0] seed q and r */
        !          1256: {
        !          1257:   SPEED_ROUTINE_UDIV_QRNND_A (1);  /* 1: always normalize d and compute dinv from it with invert_limb */
        !          1258:   {  /* every call reuses q, r, d and dinv, so each one works on what the previous call left in q and r */
        !          1259:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1260:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1261:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1262:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1263:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1264:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1265:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1266:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1267:      udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1268:     udiv_qrnnd_preinv (q, r, r, q, d, dinv);
        !          1269:   }  /* NOTE(review): the count of ten presumably has to agree with scaling set up in ..._A -- confirm before changing */
        !          1270:   SPEED_ROUTINE_UDIV_QRNND_B;  /* closes the loop, passes q and r to noop_1, returns t */
        !          1271: }
        !          1272: /* Time udiv_qrnnd_preinv2norm: ten chained calls per loop pass, s->reps passes; returns the speed_endtime () value from ..._B. */
        !          1273: double
        !          1274: speed_udiv_qrnnd_preinv2norm (struct speed_params *s)  /* s->r is the divisor (0 selects 0x12345678); s->xp[0], s->yp[0] seed q and r */
        !          1275: {
        !          1276:   SPEED_ROUTINE_UDIV_QRNND_A (1);  /* 1: always normalize d and compute dinv from it with invert_limb */
        !          1277:   {  /* every call reuses q, r, d and dinv, so each one works on what the previous call left in q and r */
        !          1278:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1279:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1280:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1281:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1282:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1283:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1284:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1285:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1286:      udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1287:     udiv_qrnnd_preinv2norm (q, r, r, q, d, dinv);
        !          1288:   }  /* NOTE(review): the count of ten presumably has to agree with scaling set up in ..._A -- confirm before changing */
        !          1289:   SPEED_ROUTINE_UDIV_QRNND_B;  /* closes the loop, passes q and r to noop_1, returns t */
        !          1290: }
        !          1291:
        !          1292: #if HAVE_NATIVE_mpn_udiv_qrnnd
        !          1293: /* CALL_MPN_UDIV_QRNND hides the two argument orders of __MPN (udiv_qrnnd); either way q takes the return value and &r is passed (presumably to receive the remainder). */
        !          1294: #if defined (__hppa) && W_TYPE_SIZE == 64  /* this variant passes &r last */
        !          1295: #define CALL_MPN_UDIV_QRNND  (q = __MPN (udiv_qrnnd) (r, q, d, &r))
        !          1296: #else  /* every other configuration passes &r first */
        !          1297: #define CALL_MPN_UDIV_QRNND  (q = __MPN (udiv_qrnnd) (&r, r, q, d))
        !          1298: #endif
        !          1299: /* Time __MPN (udiv_qrnnd): ten chained calls per loop pass, s->reps passes; returns the speed_endtime () value from ..._B. */
        !          1300: double
        !          1301: speed_mpn_udiv_qrnnd (struct speed_params *s)  /* s->r is the divisor (0 selects 0x12345678); s->xp[0], s->yp[0] seed q and r */
        !          1302: {
        !          1303:   /* argument 1 means d is always normalized; ..._A also computes dinv, which no call below uses */
        !          1304:   SPEED_ROUTINE_UDIV_QRNND_A (1);
        !          1305:   {  /* every call reuses q, r and d, so each one works on what the previous call left in q and r */
        !          1306:     CALL_MPN_UDIV_QRNND;
        !          1307:      CALL_MPN_UDIV_QRNND;
        !          1308:      CALL_MPN_UDIV_QRNND;
        !          1309:     CALL_MPN_UDIV_QRNND;
        !          1310:      CALL_MPN_UDIV_QRNND;
        !          1311:      CALL_MPN_UDIV_QRNND;
        !          1312:     CALL_MPN_UDIV_QRNND;
        !          1313:      CALL_MPN_UDIV_QRNND;
        !          1314:      CALL_MPN_UDIV_QRNND;
        !          1315:     CALL_MPN_UDIV_QRNND;
        !          1316:   }  /* NOTE(review): the count of ten presumably has to agree with scaling set up in ..._A -- confirm before changing */
        !          1317:   SPEED_ROUTINE_UDIV_QRNND_B;  /* closes the loop, passes q and r to noop_1, returns t */
        !          1318: }
        !          1319: #endif  /* HAVE_NATIVE_mpn_udiv_qrnnd */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>