Annotation of OpenXM_contrib/gmp/tune/speed.h, Revision 1.1
1.1 ! maekawa 1: /* Header for speed and threshold things. */
! 2:
! 3: /*
! 4: Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 5:
! 6: This file is part of the GNU MP Library.
! 7:
! 8: The GNU MP Library is free software; you can redistribute it and/or modify
! 9: it under the terms of the GNU Lesser General Public License as published by
! 10: the Free Software Foundation; either version 2.1 of the License, or (at your
! 11: option) any later version.
! 12:
! 13: The GNU MP Library is distributed in the hope that it will be useful, but
! 14: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: License for more details.
! 17:
! 18: You should have received a copy of the GNU Lesser General Public License
! 19: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: MA 02111-1307, USA.
! 22: */
! 23:
! 24: #ifndef __SPEED_H__
! 25: #define __SPEED_H__
! 26:
! 27:
/* Limbs of temporary space handed to each multiply/square routine when it
   is measured on operands of n limbs. */
#define MPN_KARA_MUL_N_TSIZE(n)   (2 * ((n) + BITS_PER_MP_LIMB))
#define MPN_KARA_SQR_N_TSIZE(n)   (2 * ((n) + BITS_PER_MP_LIMB))
#define MPN_TOOM3_MUL_N_TSIZE(n)  (2 * (n) + 3 * BITS_PER_MP_LIMB)
#define MPN_TOOM3_SQR_N_TSIZE(n)  (2 * ((n) + BITS_PER_MP_LIMB))
! 33:
! 34:
/* Extend the operand ptr,oldsize to newsize limbs by storing zero limbs at
   its most significant end.  newsize must be at least oldsize (this is
   ASSERTed). */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \
  do \
    { \
      ASSERT ((newsize) >= (oldsize)); \
      MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \
    } \
  while (0)
! 42:
/* A limb with its n least significant bits set.  A full-width mask is
   special-cased because 1<<32 doesn't give zero on x86 family CPUs, so the
   shift cannot be used when n is BITS_PER_MP_LIMB. */
#define MP_LIMB_T_LOWBITMASK(n) \
  ((n) != BITS_PER_MP_LIMB \
   ? ((mp_limb_t) 1 << (n)) - 1 \
   : MP_LIMB_T_MAX)
! 47:
! 48:
/* Aligned variants of TMP_ALLOC.  align-1 extra bytes are requested and the
   result is passed through align_pointer.  align must be a power of 2;
   CACHE_LINE_SIZE is usually a good choice. */

#define TMP_ALLOC_ALIGNED(bytes, align) \
  align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))

/* The same, but sized in limbs and yielding an mp_ptr. */
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
! 55:
/* Cache line size in bytes: 32 for pentium, 64 for athlon; other CPUs might
   want this configured.  Nothing has yet shown up that really cares about
   cache line boundaries, so the only practical effect is to restrict the
   range s->align_xp can take.  Perhaps this could be a variable instead. */
#define CACHE_LINE_SIZE  64  /* bytes */

/* Number of limbs in a cache line, less one. */
#define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)

/* Temporary allocation of "limbs" limbs, over-allocated by
   SPEED_TMP_ALLOC_ADJUST_MASK limbs and then handed to
   speed_tmp_alloc_adjust together with the wanted alignment. */
#define SPEED_TMP_ALLOC_LIMBS(limbs, align) \
  (speed_tmp_alloc_adjust \
     (TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), \
      (align)))
! 68:
! 69:
/* Size, in limbs, of s->xp_block and s->yp_block.  These blocks serve
   routines which want to run over many different data values and which use
   s->size for some other purpose, eg. SPEED_ROUTINE_MPN_GCD_1.

   At 512 limbs each block is 2kbytes, 4k for the pair, which should fit
   easily in any L1 data cache. */

#define SPEED_BLOCK_SIZE  512  /* limbs */
! 78:
! 79:
! 80: extern double speed_unittime;
! 81: extern double speed_cycletime;
! 82: extern int speed_precision;
! 83: extern const char *speed_time_string;
! 84: void speed_time_init _PROTO ((void));
! 85: void speed_starttime _PROTO ((void));
! 86: double speed_endtime _PROTO ((void));
! 87:
! 88: struct speed_params {
! 89: unsigned reps; /* how many times to run the routine */
! 90: mp_ptr xp; /* first argument */
! 91: mp_ptr yp; /* second argument */
! 92: mp_size_t size; /* size of both arguments */
! 93: long r; /* user supplied parameter */
! 94: mp_size_t align_xp; /* alignment of xp */
! 95: mp_size_t align_yp; /* alignment of yp */
! 96: mp_size_t align_wp; /* intended alignment of wp */
! 97: mp_size_t align_wp2; /* intended alignment of wp2 */
! 98: mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
! 99: mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
! 100:
! 101: double time_divisor; /* optionally set by the speed routine */
! 102:
! 103: /* used by the cache priming things */
! 104: int cache;
! 105: unsigned src_num, dst_num;
! 106: struct {
! 107: mp_ptr ptr;
! 108: mp_size_t size;
! 109: } src[2], dst[3];
! 110: };
! 111:
! 112: typedef double (*speed_function_t) _PROTO ((struct speed_params *s));
! 113:
! 114: double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s));
! 115:
! 116: /* Prototypes for speed measuring routines */
! 117:
! 118: double speed_malloc_free _PROTO ((struct speed_params *s));
! 119: double speed_malloc_realloc_free _PROTO ((struct speed_params *s));
! 120: double speed_memcpy _PROTO ((struct speed_params *s));
! 121: double speed_modlimb_invert _PROTO ((struct speed_params *s));
! 122: double speed_mp_allocate_free _PROTO ((struct speed_params *s));
! 123: double speed_mp_allocate_reallocate_free _PROTO ((struct speed_params *s));
! 124:
! 125: double speed_mpf_init_clear _PROTO ((struct speed_params *s));
! 126:
! 127: double speed_mpn_add_n _PROTO ((struct speed_params *s));
! 128: double speed_mpn_add_n_self _PROTO ((struct speed_params *s));
! 129: double speed_mpn_add_n_inplace _PROTO ((struct speed_params *s));
! 130: double speed_mpn_and_n _PROTO ((struct speed_params *s));
! 131: double speed_mpn_andn_n _PROTO ((struct speed_params *s));
! 132: double speed_mpn_addmul_1 _PROTO ((struct speed_params *s));
! 133: double speed_mpn_bz_divrem_n _PROTO ((struct speed_params *s));
! 134: double speed_mpn_bz_divrem_sb _PROTO ((struct speed_params *s));
! 135: double speed_mpn_bz_tdiv_qr _PROTO ((struct speed_params *s));
! 136: double speed_MPN_COPY _PROTO ((struct speed_params *s));
! 137: double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
! 138: double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
! 139: double speed_mpn_divexact_by3 _PROTO ((struct speed_params *s));
! 140: double speed_mpn_divmod_1 _PROTO ((struct speed_params *s));
! 141: double speed_mpn_divrem_1 _PROTO ((struct speed_params *s));
! 142: double speed_mpn_divrem_1f _PROTO ((struct speed_params *s));
! 143: double speed_mpn_divrem_1c _PROTO ((struct speed_params *s));
! 144: double speed_mpn_divrem_1cf _PROTO ((struct speed_params *s));
! 145: double speed_mpn_divrem_2 _PROTO ((struct speed_params *s));
! 146: double speed_mpn_gcd _PROTO ((struct speed_params *s));
! 147: double speed_mpn_gcd_1 _PROTO ((struct speed_params *s));
! 148: double speed_mpn_gcdext _PROTO ((struct speed_params *s));
! 149: double speed_mpn_hamdist _PROTO ((struct speed_params *s));
! 150: double speed_mpn_ior_n _PROTO ((struct speed_params *s));
! 151: double speed_mpn_iorn_n _PROTO ((struct speed_params *s));
! 152: double speed_mpn_jacobi_base _PROTO ((struct speed_params *s));
! 153: double speed_mpn_kara_mul_n _PROTO ((struct speed_params *s));
! 154: double speed_mpn_kara_sqr_n _PROTO ((struct speed_params *s));
! 155: double speed_mpn_lshift _PROTO ((struct speed_params *s));
! 156: double speed_mpn_mod_1 _PROTO ((struct speed_params *s));
! 157: double speed_mpn_mod_1c _PROTO ((struct speed_params *s));
! 158: double speed_mpn_mul_1 _PROTO ((struct speed_params *s));
! 159: double speed_mpn_mul_basecase _PROTO ((struct speed_params *s));
! 160: double speed_mpn_mul_fft _PROTO ((struct speed_params *s));
! 161: double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s));
! 162: double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s));
! 163: double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s));
! 164: double speed_mpn_mul_n _PROTO ((struct speed_params *s));
! 165: double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s));
! 166: double speed_mpn_mul_n_toom3 _PROTO ((struct speed_params *s));
! 167: double speed_mpn_nand_n _PROTO ((struct speed_params *s));
! 168: double speed_mpn_nior_n _PROTO ((struct speed_params *s));
! 169: double speed_mpn_popcount _PROTO ((struct speed_params *s));
! 170: double speed_mpn_rshift _PROTO ((struct speed_params *s));
! 171: double speed_mpn_sqr_basecase _PROTO ((struct speed_params *s));
! 172: double speed_mpn_sqr_n _PROTO ((struct speed_params *s));
! 173: double speed_mpn_sqr_toom3 _PROTO ((struct speed_params *s));
! 174: double speed_mpn_sub_n _PROTO ((struct speed_params *s));
! 175: double speed_mpn_submul_1 _PROTO ((struct speed_params *s));
! 176: double speed_mpn_toom3_mul_n _PROTO ((struct speed_params *s));
! 177: double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s));
! 178: double speed_mpn_udiv_qrnnd _PROTO ((struct speed_params *s));
! 179: double speed_mpn_umul_ppmm _PROTO ((struct speed_params *s));
! 180: double speed_mpn_xnor_n _PROTO ((struct speed_params *s));
! 181: double speed_mpn_xor_n _PROTO ((struct speed_params *s));
! 182:
! 183: double speed_mpq_init_clear _PROTO ((struct speed_params *s));
! 184:
! 185: double speed_mpz_add _PROTO ((struct speed_params *s));
! 186: double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
! 187: double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
! 188: double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
! 189: double speed_mpz_init_clear _PROTO ((struct speed_params *s));
! 190: double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s));
! 191: double speed_mpz_powm _PROTO ((struct speed_params *s));
! 192:
! 193: double speed_noop _PROTO ((struct speed_params *s));
! 194: double speed_noop_wxs _PROTO ((struct speed_params *s));
! 195: double speed_noop_wxys _PROTO ((struct speed_params *s));
! 196:
! 197: double speed_udiv_qrnnd _PROTO ((struct speed_params *s));
! 198: double speed_udiv_qrnnd_preinv _PROTO ((struct speed_params *s));
! 199: double speed_udiv_qrnnd_preinv2norm _PROTO ((struct speed_params *s));
! 200: double speed_umul_ppmm _PROTO ((struct speed_params *s));
! 201:
! 202:
! 203: /* Prototypes for other routines */
! 204:
! 205: /* low 32-bits in p[0], high 32-bits in p[1] */
! 206: void speed_cyclecounter _PROTO ((unsigned p[2]));
! 207:
! 208: void pentium_wbinvd _PROTO ((void));
! 209:
! 210: void noop _PROTO ((void));
! 211: void noop_1 _PROTO ((mp_limb_t n));
! 212: void noop_wxs _PROTO ((mp_ptr wp, mp_srcptr xp, mp_size_t size));
! 213: void noop_wxys _PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
! 214: mp_size_t size));
! 215: void mpn_cache_fill _PROTO ((mp_srcptr ptr, mp_size_t size));
! 216: void mpn_cache_fill_dummy _PROTO ((mp_limb_t n));
! 217: mp_ptr speed_tmp_alloc_adjust _PROTO ((void *ptr, mp_size_t align));
! 218: void *_mp_allocate_or_reallocate _PROTO ((void *ptr,
! 219: size_t oldsize, size_t newsize));
! 220: void *align_pointer _PROTO ((void *p, size_t align));
! 221: void *_mp_allocate_func_aligned _PROTO ((size_t bytes, size_t align));
! 222: void speed_cache_fill _PROTO ((struct speed_params *s));
! 223: void speed_operand_src _PROTO ((struct speed_params *s,
! 224: mp_ptr ptr, mp_size_t size));
! 225: void speed_operand_dst _PROTO ((struct speed_params *s,
! 226: mp_ptr ptr, mp_size_t size));
! 227: void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
! 228:
! 229: extern int speed_option_addrs;
! 230: void speed_option_set _PROTO((const char *s));
! 231:
! 232:
/* Make the enclosing speed routine return -1.0 unless cond holds.  Note the
   expansion already ends with a semicolon. */
#define SPEED_RESTRICT_COND(cond) \
  if (!(cond)) return -1.0;
! 234:
/* Function body for measuring mpn_copy or similar.  "call" is the statement
   timed; it may use wp, a temporary destination of s->size limbs.  Returns
   -1.0 for a negative s->size, otherwise the speed_endtime result after
   s->reps executions of "call" (s->reps must be non-zero). */
#define SPEED_ROUTINE_MPN_COPY_CALL(call) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    \
    /* tell the cache priming about the operands */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* function (wp, xp, size) */
#define SPEED_ROUTINE_MPN_COPY(function) \
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))

/* function (wp, xp, size, 0), ie. with an extra zero argument */
#define SPEED_ROUTINE_MPN_COPYC(function) \
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 0))
! 267:
! 268:
/* Function body for measuring mpn_add_n, mpn_sub_n, or similar.  Sources
   are s->xp and s->yp, each s->size limbs, and "call" may use wp, a
   temporary destination of s->size limbs.  Returns -1.0 unless
   s->size >= 1, otherwise the speed_endtime result after s->reps
   executions of "call". */
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    \
    /* tell the cache priming about the operands */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* wp = xp op yp */
#define SPEED_ROUTINE_MPN_BINARY_N(function) \
  SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size))

/* wp = xp op yp, with an extra zero argument */
#define SPEED_ROUTINE_MPN_BINARY_NC(function) \
  SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size, 0))

/* wp = xp op xp, both sources the same */
#define SPEED_ROUTINE_MPN_BINARY_N_SELF(function) \
  SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->xp, s->size))

/* wp = wp op xp, destination doubling as first source */
#define SPEED_ROUTINE_MPN_BINARY_N_INPLACE(function) \
  SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, wp, s->xp, s->size))
! 309:
! 310:
/* Function body for measuring mpn_lshift, mpn_rshift, mpn_mul_1, or similar
   routines taking one limb vector plus the user parameter s->r.  "call" may
   use wp, a temporary destination of s->size limbs.  Returns -1.0 unless
   s->size >= 1, otherwise the speed_endtime result after s->reps
   executions of "call". */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    \
    /* tell the cache priming about the operands */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* function (wp, xp, size, r) */
#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

/* the same with an extra zero argument */
#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

/* function (wp, 0, xp, size, r) */
#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))

/* the same with an extra zero argument */
#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))

/* function (wp, size, xp, 0, r), ie. the two size arguments swapped over */
#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))

/* the same with an extra zero argument */
#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
! 356:
! 357:
/* Function body for measuring mpn_mul_basecase.  The first operand is
   s->xp of size1 limbs, where size1 is s->r, or s->size when s->r is zero;
   the second operand is s->yp of s->size limbs.  Returns -1.0 unless
   s->size >= 1 and size1 >= s->size. */
#define SPEED_ROUTINE_MPN_MUL_BASECASE(function) \
  { \
    mp_ptr     wp; \
    mp_size_t  size1; \
    unsigned   i; \
    double     t; \
    TMP_DECL (marker); \
    \
    size1 = (s->r == 0 ? s->size : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    /* product is size1+s->size limbs */ \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (size1 + s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, size1); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, size1 + s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, s->xp, size1, s->yp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 390:
! 391:
/* Function body for measuring an NxN limb multiplication of s->xp by s->yp,
   each s->size limbs.  "call" may use wp, a temporary destination of
   2*s->size limbs.  Returns -1.0 unless s->size >= 1, otherwise the
   speed_endtime result after s->reps executions of "call". */
#define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* function (wp, xp, yp, size).  The expansion is a brace block and, like
   the other SPEED_ROUTINE wrappers, carries no trailing semicolon of its
   own; a semicolon written by the user after it is still accepted. */
#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
! 422:
! 423:
/* Function body for measuring an NxN limb multiplication which needs
   scratch space.  Sources are s->xp and s->yp, each s->size limbs.  "call"
   may use wp, a temporary destination of 2*s->size limbs, and tspace, a
   temporary area of tsize limbs.  Returns -1.0 unless s->size >= 1,
   otherwise the speed_endtime result after s->reps executions of "call". */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize) \
  { \
    mp_ptr    wp, tspace; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* Multiplies s->xp by s->yp, the two operands the body registers as
   sources, the same as SPEED_ROUTINE_MPN_TOOM3_MUL_N does. */
/* FIXME: size restrictions */
#define SPEED_ROUTINE_MPN_KARA_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->yp, s->size, tspace), \
     MPN_KARA_MUL_N_TSIZE (s->size))

/* FIXME: size restrictions */
#define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->yp, s->size, tspace), \
     MPN_TOOM3_MUL_N_TSIZE (s->size))
! 465:
! 466:
/* Function body for measuring a squaring of s->xp, s->size limbs.  "call"
   may use wp, a temporary destination of 2*s->size limbs.  Returns -1.0
   unless s->size >= 1, otherwise the speed_endtime result after s->reps
   executions of "call". */
#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    \
    /* tell the cache priming about the operands */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* function (wp, xp, size) */
#define SPEED_ROUTINE_MPN_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
! 496:
! 497:
/* Function body for measuring a squaring which needs scratch space.  The
   source is s->xp, s->size limbs.  "call" may use wp, a temporary
   destination of 2*s->size limbs, and tspace, a temporary area of tsize
   limbs.  Returns -1.0 unless s->size >= 1, otherwise the speed_endtime
   result after s->reps executions of "call". */
#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize) \
  { \
    mp_ptr    wp, tspace; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
    \
    /* tell the cache priming about the operands */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* FIXME: size restrictions */
#define SPEED_ROUTINE_MPN_KARA_SQR_N(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE \
    (function (wp, s->xp, s->size, tspace), \
     MPN_KARA_SQR_N_TSIZE (s->size))

/* FIXME: size restrictions */
#define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE \
    (function (wp, s->xp, s->size, tspace), \
     MPN_TOOM3_SQR_N_TSIZE (s->size))
! 536:
! 537:
/* Function body for measuring a routine which only reads s->xp, s->size
   limbs, and needs no destination.  Returns -1.0 for a negative s->size,
   otherwise the speed_endtime result after s->reps executions of "call". */
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
  { \
    unsigned  i; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    /* tell the cache priming about the operand */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    return speed_endtime (); \
  }

/* function (xp, size, r) */
#define SPEED_ROUTINE_MPN_MOD_1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))

/* the same with an extra zero argument */
#define SPEED_ROUTINE_MPN_MOD_1C(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r, 0))
! 560:
! 561:
/* Function body for measuring a division of 2*s->size limbs by s->size
   limbs.  "call" may use the temporaries
       a   dividend, 2*s->size limbs (s->xp copied in twice)
       d   divisor, s->size limbs (copy of s->yp)
       q   s->size+1 limbs
       r   s->size limbs
   The high bit of d is set, and the high limb of a is made one less than
   the high limb of d.  Returns -1.0 unless s->size >= 1. */

#define SPEED_ROUTINE_MPN_BZ_DIVREM_CALL(call) \
  { \
    unsigned  i; \
    mp_ptr    a, d, q, r; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \
    d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
    q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
    r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    \
    /* low and high halves of the dividend are both s->xp */ \
    MPN_COPY (a, s->xp, s->size); \
    MPN_COPY (a+s->size, s->xp, s->size); \
    \
    MPN_COPY (d, s->yp, s->size); \
    \
    /* normalize the data */ \
    d[s->size-1] |= MP_LIMB_T_HIGHBIT; \
    a[2*s->size-1] = d[s->size-1] - 1; \
    \
    speed_operand_src (s, a, 2*s->size); \
    speed_operand_src (s, d, s->size); \
    speed_operand_dst (s, q, s->size+1); \
    speed_operand_dst (s, r, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* function (q, a, d, size) */
#define SPEED_ROUTINE_MPN_BZ_DIVREM_N(function) \
  SPEED_ROUTINE_MPN_BZ_DIVREM_CALL ((*function) (q, a, d, s->size))

/* function (q, a, 2*size, d, size) */
#define SPEED_ROUTINE_MPN_BZ_DIVREM_SB(function) \
  SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
    ((*function) (q, a, 2*s->size, d, s->size))

/* function (q, r, 0, a, 2*size, d, size) */
#define SPEED_ROUTINE_MPN_BZ_TDIV_QR(function) \
  SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
    ((*function) (q, r, 0, a, 2*s->size, d, s->size))
! 615:
! 616:
/* Function body for measuring function (xp, size), which only reads s->xp.
   Returns -1.0 unless s->size >= 1, otherwise the speed_endtime result
   after s->reps calls. */
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
  { \
    unsigned  i; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (s->xp, s->size); \
      } \
    while (--i != 0); \
    return speed_endtime (); \
  }

/* Function body for measuring function (xp, yp, size), which only reads
   s->xp and s->yp.  Returns -1.0 unless s->size >= 1, otherwise the
   speed_endtime result after s->reps calls. */
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
  { \
    unsigned  i; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (s->xp, s->yp, s->size); \
      } \
    while (--i != 0); \
    return speed_endtime (); \
  }
! 651:
! 652:
/* Function body for measuring mpz_fib_ui, mpz_fac_ui, etc, called as
   function (z, s->size) with z a local mpz_t.  Here s->size is the integer
   argument, not a limb count.  Returns -1.0 for a negative s->size. */

#define SPEED_ROUTINE_MPZ_UI(function) \
  { \
    mpz_t     z; \
    unsigned  i; \
    double    t; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (z, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (z); \
    return t; \
  }
! 675:
! 676:
/* Function body for measuring a powm: 2^(m-1) mod m is calculated, with m
   taken from the s->size limbs at s->xp and forced odd so that redc can be
   used.  The exponent m-1 is actually cut down to at most 6 limbs so the
   calculation doesn't take too long.  Returns -1.0 unless s->size >= 1. */
#define SPEED_ROUTINE_MPZ_POWM(function) \
  { \
    mpz_t     r, b, e, m; \
    unsigned  i; \
    double    t; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init (r); \
    \
    /* base is 2 */ \
    mpz_init_set_ui (b, 2); \
    \
    /* force m to odd */ \
    mpz_init (m); \
    mpz_set_n (m, s->xp, s->size); \
    PTR(m)[0] |= 1; \
    \
    /* e = m-1, keeping no more than 6 limbs */ \
    mpz_init_set (e, m); \
    mpz_sub_ui (e, e, 1); \
    SIZ(e) = MIN (SIZ(e), 6); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (r, b, e, m); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (e); \
    mpz_clear (m); \
    return t; \
  }
! 713:
! 714:
/* Function body for measuring a routine with two sources and two
   destinations, each s->size limbs.  "call" may use wp and wp2 (temporary
   destinations) and xp and yp (the sources).  s->r selects how sources and
   destinations overlap:
       0   xp = s->xp, yp = s->yp   (no overlap)
       1   xp = wp
       2   yp = wp2
       3   xp = wp,  yp = wp2
       4   xp = wp2, yp = wp
   and any other value prints a message and aborts.  A source redirected to
   a destination gets the s->xp or s->yp data copied into it first.
   Returns -1.0 for a negative s->size. */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
  { \
    mp_ptr    wp, wp2, xp, yp; \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    xp = s->xp; \
    yp = s->yp; \
    \
    switch (s->r) { \
    case 0: break; \
    case 1: xp = wp; break; \
    case 2: yp = wp2; break; \
    case 3: xp = wp; yp = wp2; break; \
    case 4: xp = wp2; yp = wp; break; \
    default: \
      fprintf (stderr, "Unrecognised r=%ld in addsub measuring\n", s->r); \
      abort (); \
    } \
    if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \
    if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }

/* function (wp, wp2, xp, yp, size).  The expansion is a brace block and,
   like the other SPEED_ROUTINE wrappers, carries no trailing semicolon. */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL \
    (function (wp, wp2, xp, yp, s->size))

/* The same with an extra zero argument. */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL \
    (function (wp, wp2, xp, yp, s->size, 0))
! 767:
! 768:
/* Function body for measuring function (xp, size, r), ie. s->size limbs
   at s->xp together with the single value s->r.  Returns -1.0 unless
   s->size >= 1 and s->r is non-zero. */
#define SPEED_ROUTINE_MPN_GCD_1xN(function) \
  { \
    unsigned  i; \
    double    t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    TMP_MARK (marker); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (s->xp, s->size, s->r); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 793:
! 794:
/* SPEED_BLOCK_SIZE many one-limb GCDs, on operands of s->size bits each.

   px and py are temporary copies of s->xp_block and s->yp_block.  Each
   px[i] is masked to s->size bits, each py[i] to s->r bits (or s->size bits
   when s->r is zero), and a limb which ends up zero is changed to 1.
   "setup" is then run for each i and can adjust px[i],py[i] further.
   "call" is the operation timed, run for j = SPEED_BLOCK_SIZE down to 1,
   and should use px[j-1] and py[j-1].

   Returns -1.0 unless 1 <= s->size <= mp_bits_per_limb and
   0 <= s->r <= mp_bits_per_limb.  The s->r range is checked because s->r
   is used as a shift count by MP_LIMB_T_LOWBITMASK.  s->time_divisor is
   set to SPEED_BLOCK_SIZE, the number of calls made per rep. */

#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
  { \
    unsigned   i, j; \
    mp_ptr     px, py; \
    mp_limb_t  x_mask, y_mask; \
    double     t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
    SPEED_RESTRICT_COND (s->r >= 0 && s->r <= mp_bits_per_limb); \
    \
    TMP_MARK (marker); \
    px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
    py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
    \
    x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
      { \
        /* cut down to size, but don't allow zero */ \
        px[i] &= x_mask; px[i] += (px[i] == 0); \
        py[i] &= y_mask; py[i] += (py[i] == 0); \
        setup; \
      } \
    \
    speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }

/* function (&x, 1, y), with no extra setup */
#define SPEED_ROUTINE_MPN_GCD_1(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))

/* function (x, y, 0).  The setup reduces x modulo y, then makes both odd
   and changes a y of 1 into 3. */
#define SPEED_ROUTINE_MPN_JACBASE(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL \
    ({ \
       px[i] %= py[i]; \
       px[i] |= 1; \
       py[i] |= 1; \
       if (py[i]==1) py[i]=3; \
     }, \
     function (px[j-1], py[j-1], 0))
! 859:
! 860:
/* Run some GCDs of s->size limbs each.  The number of different data values
   is decreased as s->size**2, since GCD is a quadratic algorithm.
   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
   though, because the plain gcd is about twice as fast as gcdext.

   "pieces" is the number of operand pairs used: datafactor scaled down by
   s->size**2, limited to what fits in a SPEED_BLOCK_SIZE block, and then
   forced to at least 1.  The lower limit is applied last because
   SPEED_BLOCK_SIZE/s->size is zero once s->size exceeds SPEED_BLOCK_SIZE,
   and the measuring loop must run at least once.  With a single piece the
   data is s->xp,s->yp; otherwise it comes from s->xp_block,s->yp_block.

   "call" may use xtmp and ytmp (fresh copies of the current pair, s->size
   limbs of data in s->size+1 limbs of space), and wp and wp2 (s->size limbs
   each).  Returns -1.0 unless s->size >= 1.  s->time_divisor is set to
   pieces, the number of calls made per rep. */

#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
  { \
    unsigned   i; \
    mp_size_t  j, pieces, psize; \
    mp_ptr     wp, wp2, xtmp, ytmp, px, py; \
    double     t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
    ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    \
    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
    pieces = MAX (pieces, 1); \
    \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
    \
    /* y must be odd, x must have at least as many bits as y */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr x = px+j*s->size; \
        mp_ptr y = py+j*s->size; \
        y[0] |= 1; \
        if (x[s->size-1] == 0) x[s->size-1] = 1; \
        if (y[s->size-1] == 0) y[s->size-1] = 1; \
        x[s->size-1] = MAX (x[s->size-1], y[s->size-1]); \
      } \
    \
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            /* fresh copies for each call */ \
            MPN_COPY (xtmp, px+(j-1)*s->size, s->size); \
            MPN_COPY (ytmp, py+(j-1)*s->size, s->size); \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    \
    s->time_divisor = pieces; \
    return t; \
  }

/* function (wp, x, size, y, size) */
#define SPEED_ROUTINE_MPN_GCD(function) \
  SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))

/* function (wp, wp2, &wp2size, x, size, y, size) */
#define SPEED_ROUTINE_MPN_GCDEXT(function) \
  SPEED_ROUTINE_MPN_GCD_CALL \
    (4, { mp_size_t wp2size; \
          function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
! 939:
! 940:
/* Function body for measuring function (wp, 0, xp, size, yp), a division of
   s->size limbs by a 2 limb divisor.  xp is a temporary copy of s->xp, since
   the source is destroyed.  yp is the first two limbs of s->yp_block with
   the high bit of yp[1] set, since the divisor must be normalized.

   Returns -1.0 unless s->size >= 2: the dividend must be at least as long
   as the 2 limb divisor. */
#define SPEED_ROUTINE_MPN_DIVREM_2(function) \
  { \
    mp_ptr     wp, xp; \
    mp_limb_t  yp[2]; \
    unsigned   i; \
    double     t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    TMP_MARK (marker); \
    xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    \
    /* source is destroyed */ \
    MPN_COPY (xp, s->xp, s->size); \
    \
    /* divisor must be normalized */ \
    MPN_COPY (yp, s->yp_block, 2); \
    yp[1] |= MP_LIMB_T_HIGHBIT; \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, 2); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, 0, xp, s->size, yp); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 977:
! 978:
! 979: #define SPEED_ROUTINE_MODLIMB_INVERT(function) \
! 980: { \
! 981: unsigned i, j; \
! 982: mp_ptr xp; \
! 983: mp_limb_t n = 1; \
! 984: double t; \
! 985: \
! 986: xp = s->xp_block-1; \
! 987: \
! 988: speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
! 989: speed_cache_fill (s); \
! 990: \
! 991: speed_starttime (); \
! 992: i = s->reps; \
! 993: do \
! 994: { \
! 995: j = SPEED_BLOCK_SIZE; \
! 996: do \
! 997: { \
! 998: /* randomized but successively dependent */ \
! 999: n += (xp[j] << 1); \
! 1000: \
! 1001: function (n, n); \
! 1002: } \
! 1003: while (--j != 0); \
! 1004: } \
! 1005: while (--i != 0); \
! 1006: t = speed_endtime (); \
! 1007: \
! 1008: /* make sure the compiler won't optimize away n */ \
! 1009: noop_1 (n); \
! 1010: \
! 1011: s->time_divisor = SPEED_BLOCK_SIZE; \
! 1012: return t; \
! 1013: }
! 1014:
! 1015: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>