Annotation of OpenXM_contrib/gmp/tune/speed.h, Revision 1.1.1.3
1.1.1.3 ! ohara 1: /* Header for speed and threshold things.
1.1 maekawa 2:
1.1.1.3 ! ohara 3: Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4:
5: This file is part of the GNU MP Library.
6:
7: The GNU MP Library is free software; you can redistribute it and/or modify
8: it under the terms of the GNU Lesser General Public License as published by
9: the Free Software Foundation; either version 2.1 of the License, or (at your
10: option) any later version.
11:
12: The GNU MP Library is distributed in the hope that it will be useful, but
13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: License for more details.
16:
17: You should have received a copy of the GNU Lesser General Public License
18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
1.1.1.3 ! ohara 20: MA 02111-1307, USA. */
1.1 maekawa 21:
22: #ifndef __SPEED_H__
23: #define __SPEED_H__
24:
25:
26: /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
27: newsize long. */
28: #define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \
29: do { \
30: ASSERT ((newsize) >= (oldsize)); \
31: MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \
32: } while (0)
33:
34: /* A mask of the least significant n bits. Note 1<<32 doesn't give zero on
35: x86 family CPUs, hence the separate case for BITS_PER_MP_LIMB. */
36: #define MP_LIMB_T_LOWBITMASK(n) \
37: ((n) == BITS_PER_MP_LIMB ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
38:
39:
40: /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
41:
42: #define TMP_ALLOC_ALIGNED(bytes, align) \
43: align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
44: #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
45: ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
46:
47: /* 32 for pentium, 64 for athlon, might want to configure this for other
48: CPUs. In truth though nothing has yet shown up that cares about cache
49: line boundaries. The only practical effect of this is to restrict the
50: range that s->align_xp can take. Perhaps this could be a variable
51: instead. */
52: #define CACHE_LINE_SIZE 64 /* bytes */
53:
54: #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
55:
56: #define SPEED_TMP_ALLOC_LIMBS(limbs, align) \
57: (speed_tmp_alloc_adjust \
58: (TMP_ALLOC_LIMBS((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), (align)))
59:
60:
61: /* This is the size for s->xp_block and s->yp_block, used in certain
62: routines that want to run across many different data values and use
63: s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
64:
65: 512 means 2kbytes of data for each of xp_block and yp_block (assuming
66: 4-byte limbs), making 4k total, which should fit easily in any L1 data cache. */
67:
68: #define SPEED_BLOCK_SIZE 512 /* limbs */
69:
70:
71: extern double speed_unittime;
72: extern double speed_cycletime;
73: extern int speed_precision;
1.1.1.3 ! ohara 74: extern char speed_time_string[];
1.1 maekawa 75: void speed_time_init _PROTO ((void));
1.1.1.3 ! ohara 76: void speed_cycletime_fail _PROTO ((const char *str));
! 77: void speed_cycletime_init _PROTO ((void));
! 78: void speed_cycletime_need_cycles _PROTO ((void));
! 79: void speed_cycletime_need_seconds _PROTO ((void));
1.1 maekawa 80: void speed_starttime _PROTO ((void));
81: double speed_endtime _PROTO ((void));
82:
1.1.1.3 ! ohara 83:
/* Parameters handed to a measuring routine (see speed_function_t).  The
   SPEED_ROUTINE_ macros in this header read reps, xp, yp, size, r, align_wp
   and align_wp2 through a pointer named "s". */
1.1 maekawa 84: struct speed_params {
85: unsigned reps; /* how many times to run the routine */
86: mp_ptr xp; /* first argument */
87: mp_ptr yp; /* second argument */
88: mp_size_t size; /* size of both arguments */
1.1.1.3 ! ohara 89: mp_limb_t r; /* user supplied parameter */
1.1 maekawa 90: mp_size_t align_xp; /* alignment of xp */
91: mp_size_t align_yp; /* alignment of yp */
92: mp_size_t align_wp; /* intended alignment of wp */
93: mp_size_t align_wp2; /* intended alignment of wp2 */
94: mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
95: mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
96:
97: double time_divisor; /* optionally set by the speed routine */
98:
99: /* used by the cache priming things */
100: int cache; /* NOTE(review): presumably selects the cache priming mode - confirm */
101: unsigned src_num, dst_num; /* NOTE(review): presumably counts of src[] and dst[] entries in use - confirm */
102: struct {
103: mp_ptr ptr;
104: mp_size_t size;
105: } src[2], dst[3]; /* NOTE(review): presumably filled by speed_operand_src and
                        speed_operand_dst; the macros visible in this header
                        register at most 2 sources and 2 destinations */
106: };
107:
108: typedef double (*speed_function_t) _PROTO ((struct speed_params *s));
109:
110: double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s));
111:
112: /* Prototypes for speed measuring routines */
113:
1.1.1.3 ! ohara 114: double speed_back_to_back (struct speed_params *s);
! 115: double speed_count_leading_zeros _PROTO ((struct speed_params *s));
! 116: double speed_count_trailing_zeros _PROTO ((struct speed_params *s));
! 117: double speed_find_a _PROTO ((struct speed_params *s));
! 118: double speed_gmp_allocate_free _PROTO ((struct speed_params *s));
! 119: double speed_gmp_allocate_reallocate_free _PROTO ((struct speed_params *s));
! 120: double speed_invert_limb _PROTO ((struct speed_params *s));
1.1 maekawa 121: double speed_malloc_free _PROTO ((struct speed_params *s));
122: double speed_malloc_realloc_free _PROTO ((struct speed_params *s));
123: double speed_memcpy _PROTO ((struct speed_params *s));
124: double speed_modlimb_invert _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 125: double speed_modlimb_invert_mul1 _PROTO ((struct speed_params *s));
! 126: double speed_modlimb_invert_loop _PROTO ((struct speed_params *s));
! 127: double speed_modlimb_invert_cond _PROTO ((struct speed_params *s));
! 128: double speed_modlimb_invert_arith _PROTO ((struct speed_params *s));
1.1 maekawa 129:
130: double speed_mpf_init_clear _PROTO ((struct speed_params *s));
131:
132: double speed_mpn_add_n _PROTO ((struct speed_params *s));
133: double speed_mpn_and_n _PROTO ((struct speed_params *s));
134: double speed_mpn_andn_n _PROTO ((struct speed_params *s));
135: double speed_mpn_addmul_1 _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 136: double speed_mpn_com_n _PROTO ((struct speed_params *s));
! 137: double speed_mpn_copyd _PROTO ((struct speed_params *s));
! 138: double speed_mpn_copyi _PROTO ((struct speed_params *s));
! 139: double speed_mpn_dc_divrem_n _PROTO ((struct speed_params *s));
! 140: double speed_mpn_dc_divrem_sb _PROTO ((struct speed_params *s));
! 141: double speed_mpn_dc_divrem_sb_div _PROTO ((struct speed_params *s));
! 142: double speed_mpn_dc_divrem_sb_inv _PROTO ((struct speed_params *s));
! 143: double speed_mpn_dc_tdiv_qr _PROTO ((struct speed_params *s));
1.1 maekawa 144: double speed_MPN_COPY _PROTO ((struct speed_params *s));
145: double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
146: double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 147: double speed_mpn_divexact_1 _PROTO ((struct speed_params *s));
1.1 maekawa 148: double speed_mpn_divexact_by3 _PROTO ((struct speed_params *s));
149: double speed_mpn_divrem_1 _PROTO ((struct speed_params *s));
150: double speed_mpn_divrem_1f _PROTO ((struct speed_params *s));
151: double speed_mpn_divrem_1c _PROTO ((struct speed_params *s));
152: double speed_mpn_divrem_1cf _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 153: double speed_mpn_divrem_1_div _PROTO ((struct speed_params *s));
! 154: double speed_mpn_divrem_1f_div _PROTO ((struct speed_params *s));
! 155: double speed_mpn_divrem_1_inv _PROTO ((struct speed_params *s));
! 156: double speed_mpn_divrem_1f_inv _PROTO ((struct speed_params *s));
1.1 maekawa 157: double speed_mpn_divrem_2 _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 158: double speed_mpn_divrem_2_div _PROTO ((struct speed_params *s));
! 159: double speed_mpn_divrem_2_inv _PROTO ((struct speed_params *s));
! 160: double speed_mpn_fib2_ui _PROTO ((struct speed_params *s));
1.1 maekawa 161: double speed_mpn_gcd _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 162: double speed_mpn_gcd_finda _PROTO ((struct speed_params *s));
1.1 maekawa 163: double speed_mpn_gcd_1 _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 164: double speed_mpn_gcd_1N _PROTO ((struct speed_params *s));
! 165: double speed_mpn_gcd_binary _PROTO ((struct speed_params *s));
! 166: double speed_mpn_gcd_finda _PROTO ((struct speed_params *s));
1.1 maekawa 167: double speed_mpn_gcdext _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 168: double speed_mpn_gcdext_double _PROTO ((struct speed_params *s));
! 169: double speed_mpn_gcdext_one_double _PROTO ((struct speed_params *s));
! 170: double speed_mpn_gcdext_one_single _PROTO ((struct speed_params *s));
! 171: double speed_mpn_gcdext_single _PROTO ((struct speed_params *s));
! 172: double speed_mpn_get_str _PROTO ((struct speed_params *s));
1.1 maekawa 173: double speed_mpn_hamdist _PROTO ((struct speed_params *s));
174: double speed_mpn_ior_n _PROTO ((struct speed_params *s));
175: double speed_mpn_iorn_n _PROTO ((struct speed_params *s));
176: double speed_mpn_jacobi_base _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 177: double speed_mpn_jacobi_base_1 _PROTO ((struct speed_params *s));
! 178: double speed_mpn_jacobi_base_2 _PROTO ((struct speed_params *s));
! 179: double speed_mpn_jacobi_base_3 _PROTO ((struct speed_params *s));
1.1 maekawa 180: double speed_mpn_kara_mul_n _PROTO ((struct speed_params *s));
181: double speed_mpn_kara_sqr_n _PROTO ((struct speed_params *s));
182: double speed_mpn_lshift _PROTO ((struct speed_params *s));
183: double speed_mpn_mod_1 _PROTO ((struct speed_params *s));
184: double speed_mpn_mod_1c _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 185: double speed_mpn_mod_1_div _PROTO ((struct speed_params *s));
! 186: double speed_mpn_mod_1_inv _PROTO ((struct speed_params *s));
! 187: double speed_mpn_mod_34lsub1 _PROTO ((struct speed_params *s));
! 188: double speed_mpn_modexact_1_odd _PROTO ((struct speed_params *s));
! 189: double speed_mpn_modexact_1c_odd _PROTO ((struct speed_params *s));
1.1 maekawa 190: double speed_mpn_mul_1 _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 191: double speed_mpn_mul_1_inplace _PROTO ((struct speed_params *s));
! 192: double speed_mpn_mul_2 _PROTO ((struct speed_params *s));
1.1 maekawa 193: double speed_mpn_mul_basecase _PROTO ((struct speed_params *s));
194: double speed_mpn_mul_fft _PROTO ((struct speed_params *s));
195: double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s));
196: double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s));
197: double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s));
198: double speed_mpn_mul_n _PROTO ((struct speed_params *s));
199: double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s));
200: double speed_mpn_nand_n _PROTO ((struct speed_params *s));
201: double speed_mpn_nior_n _PROTO ((struct speed_params *s));
202: double speed_mpn_popcount _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 203: double speed_mpn_preinv_divrem_1 _PROTO ((struct speed_params *s));
! 204: double speed_mpn_preinv_divrem_1f _PROTO ((struct speed_params *s));
! 205: double speed_mpn_preinv_mod_1 _PROTO ((struct speed_params *s));
! 206: double speed_redc _PROTO ((struct speed_params *s));
1.1 maekawa 207: double speed_mpn_rshift _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 208: double speed_mpn_sb_divrem_m3 _PROTO ((struct speed_params *s));
! 209: double speed_mpn_sb_divrem_m3_div _PROTO ((struct speed_params *s));
! 210: double speed_mpn_sb_divrem_m3_inv _PROTO ((struct speed_params *s));
! 211: double speed_mpn_set_str _PROTO ((struct speed_params *s));
! 212: double speed_mpn_set_str_basecase _PROTO ((struct speed_params *s));
! 213: double speed_mpn_set_str_subquad _PROTO ((struct speed_params *s));
1.1 maekawa 214: double speed_mpn_sqr_basecase _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 215: double speed_mpn_sqr_diagonal _PROTO ((struct speed_params *s));
1.1 maekawa 216: double speed_mpn_sqr_n _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 217: double speed_mpn_sqrtrem _PROTO ((struct speed_params *s));
1.1 maekawa 218: double speed_mpn_sub_n _PROTO ((struct speed_params *s));
219: double speed_mpn_submul_1 _PROTO ((struct speed_params *s));
220: double speed_mpn_toom3_mul_n _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 221: double speed_mpn_toom3_mul_n_mpn _PROTO ((struct speed_params *s));
! 222: double speed_mpn_toom3_mul_n_open _PROTO ((struct speed_params *s));
1.1 maekawa 223: double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 224: double speed_mpn_toom3_sqr_n_mpn _PROTO ((struct speed_params *s));
! 225: double speed_mpn_toom3_sqr_n_open _PROTO ((struct speed_params *s));
1.1 maekawa 226: double speed_mpn_udiv_qrnnd _PROTO ((struct speed_params *s));
227: double speed_mpn_umul_ppmm _PROTO ((struct speed_params *s));
228: double speed_mpn_xnor_n _PROTO ((struct speed_params *s));
229: double speed_mpn_xor_n _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 230: double speed_MPN_ZERO _PROTO ((struct speed_params *s));
1.1 maekawa 231:
232: double speed_mpq_init_clear _PROTO ((struct speed_params *s));
233:
234: double speed_mpz_add _PROTO ((struct speed_params *s));
235: double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
236: double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
237: double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 238: double speed_mpz_fib2_ui _PROTO ((struct speed_params *s));
1.1 maekawa 239: double speed_mpz_init_clear _PROTO ((struct speed_params *s));
240: double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 241: double speed_mpz_jacobi _PROTO ((struct speed_params *s));
! 242: double speed_mpz_lucnum_ui _PROTO ((struct speed_params *s));
! 243: double speed_mpz_lucnum2_ui _PROTO ((struct speed_params *s));
! 244: double speed_mpz_mod _PROTO ((struct speed_params *s));
1.1 maekawa 245: double speed_mpz_powm _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 246: double speed_mpz_powm_mod _PROTO ((struct speed_params *s));
! 247: double speed_mpz_powm_redc _PROTO ((struct speed_params *s));
! 248: double speed_mpz_powm_ui _PROTO ((struct speed_params *s));
1.1 maekawa 249:
250: double speed_noop _PROTO ((struct speed_params *s));
251: double speed_noop_wxs _PROTO ((struct speed_params *s));
252: double speed_noop_wxys _PROTO ((struct speed_params *s));
253:
1.1.1.3 ! ohara 254: double speed_operator_div (struct speed_params *s);
! 255: double speed_operator_mod (struct speed_params *s);
! 256:
1.1 maekawa 257: double speed_udiv_qrnnd _PROTO ((struct speed_params *s));
258: double speed_udiv_qrnnd_preinv _PROTO ((struct speed_params *s));
259: double speed_udiv_qrnnd_preinv2norm _PROTO ((struct speed_params *s));
1.1.1.3 ! ohara 260: double speed_udiv_qrnnd_c _PROTO ((struct speed_params *s));
1.1 maekawa 261: double speed_umul_ppmm _PROTO ((struct speed_params *s));
262:
263:
264: /* Prototypes for other routines */
265:
266: /* low 32-bits in p[0], high 32-bits in p[1] */
267: void speed_cyclecounter _PROTO ((unsigned p[2]));
268:
1.1.1.3 ! ohara 269: /* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
! 270: output or a clobber for the cpuid, hence an explicit save and restore. A
! 271: clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
! 272: the dummy output style in non-PIC, so there's an error if somehow -fPIC
! 273: is used without a -DPIC to tell us about it. */
! 274: #if defined(__GNUC__) && ! defined (NO_ASM) \
! 275: && (defined (__i386__) || defined (__i486__))
! 276: #ifdef PIC
! 277: #define speed_cyclecounter(p) \
! 278: do { \
! 279: int __speed_cyclecounter__save_ebx; \
! 280: int __speed_cyclecounter__dummy; \
! 281: __asm__ __volatile__ ("movl %%ebx, %1\n" \
! 282: "cpuid\n" \
! 283: "movl %1, %%ebx\n" \
! 284: "rdtsc" \
! 285: : "=a" ((p)[0]), \
! 286: "=&rm" (__speed_cyclecounter__save_ebx), \
! 287: "=c" (__speed_cyclecounter__dummy), \
! 288: "=d" ((p)[1])); \
! 289: } while (0)
! 290: #else
! 291: #define speed_cyclecounter(p) \
! 292: do { \
! 293: int __speed_cyclecounter__dummy1; \
! 294: int __speed_cyclecounter__dummy2; \
! 295: __asm__ __volatile__ ("cpuid\n" \
! 296: "rdtsc" \
! 297: : "=a" ((p)[0]), \
! 298: "=b" (__speed_cyclecounter__dummy1), \
! 299: "=c" (__speed_cyclecounter__dummy2), \
! 300: "=d" ((p)[1])); \
! 301: } while (0)
! 302: #endif
! 303: #endif
! 304:
! 305: double speed_cyclecounter_diff _PROTO ((const unsigned end[2],
! 306: const unsigned start[2]));
! 307: int gettimeofday_microseconds_p _PROTO ((void));
! 308: int getrusage_microseconds_p _PROTO ((void));
! 309: int cycles_works_p _PROTO ((void));
! 310: long clk_tck _PROTO ((void));
! 311:
! 312: int double_cmp_ptr _PROTO ((const double *p, const double *q));
1.1 maekawa 313: void pentium_wbinvd _PROTO ((void));
1.1.1.3 ! ohara 314: typedef int (*qsort_function_t) _PROTO ((const void *, const void *));
1.1 maekawa 315:
316: void noop _PROTO ((void));
317: void noop_1 _PROTO ((mp_limb_t n));
318: void noop_wxs _PROTO ((mp_ptr wp, mp_srcptr xp, mp_size_t size));
319: void noop_wxys _PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
320: mp_size_t size));
321: void mpn_cache_fill _PROTO ((mp_srcptr ptr, mp_size_t size));
322: void mpn_cache_fill_dummy _PROTO ((mp_limb_t n));
323: mp_ptr speed_tmp_alloc_adjust _PROTO ((void *ptr, mp_size_t align));
324: void speed_cache_fill _PROTO ((struct speed_params *s));
325: void speed_operand_src _PROTO ((struct speed_params *s,
326: mp_ptr ptr, mp_size_t size));
327: void speed_operand_dst _PROTO ((struct speed_params *s,
328: mp_ptr ptr, mp_size_t size));
329:
330: extern int speed_option_addrs;
1.1.1.3 ! ohara 331: extern int speed_option_verbose;
1.1 maekawa 332: void speed_option_set _PROTO((const char *s));
333:
1.1.1.3 ! ohara 334: mp_limb_t mpn_divrem_1_div _PROTO ((mp_ptr qp, mp_size_t xsize,
! 335: mp_srcptr ap, mp_size_t size,
! 336: mp_limb_t d));
! 337: mp_limb_t mpn_divrem_1_inv _PROTO ((mp_ptr qp, mp_size_t xsize,
! 338: mp_srcptr ap, mp_size_t size,
! 339: mp_limb_t d));
! 340: mp_limb_t mpn_divrem_2_div _PROTO ((mp_ptr qp, mp_size_t qxn,
! 341: mp_ptr np, mp_size_t nsize,
! 342: mp_srcptr dp));
! 343: mp_limb_t mpn_divrem_2_inv _PROTO ((mp_ptr qp, mp_size_t qxn,
! 344: mp_ptr np, mp_size_t nsize,
! 345: mp_srcptr dp));
! 346:
! 347: int mpn_jacobi_base_1 _PROTO ((mp_limb_t a, mp_limb_t b, int result_bit1));
! 348: int mpn_jacobi_base_2 _PROTO ((mp_limb_t a, mp_limb_t b, int result_bit1));
! 349: int mpn_jacobi_base_3 _PROTO ((mp_limb_t a, mp_limb_t b, int result_bit1));
! 350:
! 351: mp_limb_t mpn_mod_1_div _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)) __GMP_ATTRIBUTE_PURE;
! 352: mp_limb_t mpn_mod_1_inv _PROTO ((mp_srcptr ap, mp_size_t size, mp_limb_t d)) __GMP_ATTRIBUTE_PURE;
! 353:
! 354: mp_size_t mpn_gcd_binary
! 355: _PROTO ((mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize));
! 356: mp_size_t mpn_gcdext_one_double
! 357: _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
! 358: mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
! 359: mp_size_t mpn_gcdext_one_single
! 360: _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
! 361: mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
! 362: mp_size_t mpn_gcdext_single
! 363: _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
! 364: mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
! 365: mp_size_t mpn_gcdext_double
! 366: _PROTO ((mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
! 367: mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize));
! 368:
! 369: mp_limb_t mpn_sb_divrem_mn_div _PROTO ((mp_ptr qp,
! 370: mp_ptr np, mp_size_t nsize,
! 371: mp_srcptr dp, mp_size_t dsize));
! 372: mp_limb_t mpn_sb_divrem_mn_inv _PROTO ((mp_ptr qp,
! 373: mp_ptr np, mp_size_t nsize,
! 374: mp_srcptr dp, mp_size_t dsize));
! 375:
! 376: mp_size_t mpn_set_str_basecase _PROTO ((mp_ptr, const unsigned char *, size_t, int));
! 377: mp_size_t mpn_set_str_subquad _PROTO ((mp_ptr, const unsigned char *, size_t, int));
! 378:
! 379: void mpn_toom3_mul_n_open _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,
! 380: mp_ptr));
! 381: void mpn_toom3_sqr_n_open _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
! 382: void mpn_toom3_mul_n_mpn _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,
! 383: mp_ptr));
! 384: void mpn_toom3_sqr_n_mpn _PROTO((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
! 385:
! 386: void mpz_powm_mod _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
! 387: mpz_srcptr mod));
! 388: void mpz_powm_redc _PROTO ((mpz_ptr res, mpz_srcptr base, mpz_srcptr e,
! 389: mpz_srcptr mod));
! 390: void redc _PROTO ((mp_ptr cp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim,
! 391: mp_ptr tp));
! 392:
! 393: int speed_routine_count_zeros_setup _PROTO ((struct speed_params *s,
! 394: mp_ptr xp, int leading,
! 395: int zero));
! 396:
! 397: /* The measuring routines use these big macros to save duplication for
! 398: similar forms. They also get used for some automatically generated
! 399: measuring of new implementations of functions.
! 400:
! 401: Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
! 402: function pointer is considered undesirable since it's not the way a
! 403: normal application will be calling, and some processors might do
! 404: different things with an indirect call, like not branch predicting, or
! 405: doing a full pipe flush. At least some of the "functions" measured are
! 406: actually macros too.
! 407:
! 408: The net effect is to bloat the object code, possibly in a big way, but
! 409: only what's being measured is being run, so that doesn't matter.
! 410:
! 411: Note that if a called function is __GMP_ATTRIBUTE_PURE or
! 412: ATTRIBUTE_CONST, as for example mpn_mod_1, then the return value should
! 413: be used in some way, to stop gcc 3 and up from discarding the calls. See
! 414: SPEED_ROUTINE_MPN_MOD_CALL for instance. */
! 415:
1.1 maekawa 416:
417: #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
418:
419: /* For mpn_copy or similar.  Body of a measuring routine with a
   "struct speed_params *s" in scope: times s->reps calls of
   function (wp, s->xp, s->size), where wp is s->size limbs obtained from
   SPEED_TMP_ALLOC_LIMBS with s->align_wp.  Returns -1.0 if s->size is
   negative, otherwise the result of speed_endtime (). */
1.1.1.3 ! ohara 420: #define SPEED_ROUTINE_MPN_COPY(function) \
1.1 maekawa 421: { \
422: mp_ptr wp; \
423: unsigned i; \
424: double t; \
425: TMP_DECL (marker); \
426: \
427: SPEED_RESTRICT_COND (s->size >= 0); \
428: \
429: TMP_MARK (marker); \
430: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
431: \
432: speed_operand_src (s, s->xp, s->size); \
433: speed_operand_dst (s, wp, s->size); \
434: speed_cache_fill (s); \
435: \
436: speed_starttime (); \
437: i = s->reps; \
438: do \
1.1.1.3 ! ohara 439: function (wp, s->xp, s->size); \
! 440: while (--i != 0); \
! 441: t = speed_endtime (); \
! 442: \
! 443: TMP_FREE (marker); \
! 444: return t; \
! 445: }
! 446:
/* Same shape as SPEED_ROUTINE_MPN_COPY, but for a function taking one extra
   trailing argument, which is always passed as 0:
   function (wp, s->xp, s->size, 0).  Returns -1.0 if s->size is negative,
   otherwise the result of speed_endtime (). */
! 447: #define SPEED_ROUTINE_MPN_COPYC(function) \
! 448: { \
! 449: mp_ptr wp; \
! 450: unsigned i; \
! 451: double t; \
! 452: TMP_DECL (marker); \
! 453: \
! 454: SPEED_RESTRICT_COND (s->size >= 0); \
! 455: \
! 456: TMP_MARK (marker); \
! 457: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
! 458: \
! 459: speed_operand_src (s, s->xp, s->size); \
! 460: speed_operand_dst (s, wp, s->size); \
! 461: speed_cache_fill (s); \
! 462: \
! 463: speed_starttime (); \
! 464: i = s->reps; \
! 465: do \
! 466: function (wp, s->xp, s->size, 0); \
1.1 maekawa 467: while (--i != 0); \
468: t = speed_endtime (); \
469: \
470: TMP_FREE (marker); \
471: return t; \
472: }
473:
1.1.1.3 ! ohara 474: /* s->size is still in limbs, and it's limbs which are copied, but
! 475: "function" takes a size in bytes not limbs.  Hence the call is
   function (wp, s->xp, s->size * BYTES_PER_MP_LIMB), repeated s->reps
   times.  Returns -1.0 if s->size is negative, otherwise the result of
   speed_endtime (). */
! 476: #define SPEED_ROUTINE_MPN_COPY_BYTES(function) \
! 477: { \
! 478: mp_ptr wp; \
! 479: unsigned i; \
! 480: double t; \
! 481: TMP_DECL (marker); \
! 482: \
! 483: SPEED_RESTRICT_COND (s->size >= 0); \
! 484: \
! 485: TMP_MARK (marker); \
! 486: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
! 487: \
! 488: speed_operand_src (s, s->xp, s->size); \
! 489: speed_operand_dst (s, wp, s->size); \
! 490: speed_cache_fill (s); \
! 491: \
! 492: speed_starttime (); \
! 493: i = s->reps; \
! 494: do \
! 495: function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); \
! 496: while (--i != 0); \
! 497: t = speed_endtime (); \
! 498: \
! 499: TMP_FREE (marker); \
! 500: return t; \
! 501: }
1.1 maekawa 502:
503:
504: /* For mpn_add_n, mpn_sub_n, or similar.  "call" is timed s->reps times
   and may use the locals wp, xp and yp, each s->size limbs.  s->r selects
   operand overlap:
     0  xp = s->xp, yp = s->yp (no overlap)
     1  xp = wp
     2  yp = wp
     3  xp = wp and yp = wp
     4  yp = xp
   Any other s->r, or s->size < 1, gives a return of -1.0.  When wp is also
   a source it is first filled from s->xp. */
505: #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
506: { \
1.1.1.3 ! ohara 507: mp_ptr wp; \
! 508: mp_ptr xp, yp; \
! 509: unsigned i; \
! 510: double t; \
1.1 maekawa 511: TMP_DECL (marker); \
512: \
513: SPEED_RESTRICT_COND (s->size >= 1); \
514: \
515: TMP_MARK (marker); \
516: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
517: \
1.1.1.3 ! ohara 518: xp = s->xp; \
! 519: yp = s->yp; \
! 520: \
! 521: if (s->r == 0) ; \
! 522: else if (s->r == 1) { xp = wp; } \
! 523: else if (s->r == 2) { yp = wp; } \
! 524: else if (s->r == 3) { xp = wp; yp = wp; } \
! 525: else if (s->r == 4) { yp = xp; } \
! 526: else { \
! 527: TMP_FREE (marker); \
! 528: return -1.0; \
! 529: } \
! 530: \
! 531: /* initialize wp if operand overlap */ \
! 532: if (xp == wp || yp == wp) \
! 533: MPN_COPY (wp, s->xp, s->size); \
! 534: \
! 535: speed_operand_src (s, xp, s->size); \
! 536: speed_operand_src (s, yp, s->size); \
1.1 maekawa 537: speed_operand_dst (s, wp, s->size); \
538: speed_cache_fill (s); \
539: \
540: speed_starttime (); \
541: i = s->reps; \
542: do \
543: call; \
544: while (--i != 0); \
545: t = speed_endtime (); \
546: \
547: TMP_FREE (marker); \
548: return t; \
549: }
550:
551: #define SPEED_ROUTINE_MPN_BINARY_N(function) \
1.1.1.3 ! ohara 552: SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
1.1 maekawa 553:
554: #define SPEED_ROUTINE_MPN_BINARY_NC(function) \
1.1.1.3 ! ohara 555: SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
1.1 maekawa 556:
557:
558: /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.  "call" is
   timed s->reps times and may use the local wp, an s->size limb destination
   obtained from SPEED_TMP_ALLOC_LIMBS with s->align_wp; wp is not
   initialized here.  s->xp is registered as the source operand.  Returns
   -1.0 if s->size < 1, otherwise the result of speed_endtime (). */
559: #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
560: { \
561: mp_ptr wp; \
562: unsigned i; \
563: double t; \
564: TMP_DECL (marker); \
565: \
566: SPEED_RESTRICT_COND (s->size >= 1); \
567: \
568: TMP_MARK (marker); \
569: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
570: \
571: speed_operand_src (s, s->xp, s->size); \
572: speed_operand_dst (s, wp, s->size); \
573: speed_cache_fill (s); \
574: \
575: speed_starttime (); \
576: i = s->reps; \
577: do \
578: call; \
579: while (--i != 0); \
580: t = speed_endtime (); \
581: \
582: TMP_FREE (marker); \
583: return t; \
584: }
585:
586: #define SPEED_ROUTINE_MPN_UNARY_1(function) \
587: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
588:
589: #define SPEED_ROUTINE_MPN_UNARY_1C(function) \
590: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
591:
/* For an in-place operation, function (wp, wp, s->size, s->r), timed
   s->reps times.  wp is s->size limbs and is started off as a copy of s->xp,
   so the measured routine never reads uninitialized limbs.  Operand
   registration and the s->size >= 1 restriction are the same as in
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  Returns -1.0 if s->size < 1, otherwise
   the result of speed_endtime (). */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)     \
  {                                                     \
    mp_ptr    wp;                                       \
    unsigned  i;                                        \
    double    t;                                        \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    TMP_MARK (marker);                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);  \
                                                        \
    /* give the in-place operand defined contents */    \
    MPN_COPY (wp, s->xp, s->size);                      \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    speed_operand_dst (s, wp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      (*function) (wp, wp, s->size, s->r);              \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }
! 595:
! 596: #define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
! 597: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
! 598:
1.1 maekawa 599: #define SPEED_ROUTINE_MPN_DIVREM_1(function) \
600: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
601:
602: #define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
603: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
604:
605: #define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
606: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
607:
608: #define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
609: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
610:
611:
1.1.1.3 ! ohara 612: /* s->r is duplicated to form the multiplier. Not sure if that's
! 613: particularly useful, but at least it provides some control.
   The timed call is function (wp, s->xp, s->size, mult) with
   mult[0] = mult[1] = s->r and wp having s->size+1 limbs.  Returns -1.0 if
   s->size < 1, otherwise the result of speed_endtime (). */
! 614: #define SPEED_ROUTINE_MPN_MUL_2(function) \
! 615: { \
! 616: mp_ptr wp; \
! 617: unsigned i; \
! 618: double t; \
! 619: mp_limb_t mult[2]; \
! 620: TMP_DECL (marker); \
! 621: \
! 622: SPEED_RESTRICT_COND (s->size >= 1); \
! 623: \
! 624: TMP_MARK (marker); \
! 625: wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
! 626: mult[0] = s->r; \
! 627: mult[1] = s->r; \
! 628: \
! 629: speed_operand_src (s, s->xp, s->size); \
! 630: speed_operand_src (s, mult, 2); \
! 631: speed_operand_dst (s, wp, s->size+1); \
! 632: speed_cache_fill (s); \
! 633: \
! 634: speed_starttime (); \
! 635: i = s->reps; \
! 636: do \
! 637: function (wp, s->xp, s->size, mult); \
! 638: while (--i != 0); \
! 639: t = speed_endtime (); \
! 640: \
! 641: TMP_FREE (marker); \
! 642: return t; \
! 643: }
! 644:
! 645:
/* Common part of the preinv divrem_1 style measurements, with s->r as the
   divisor.  Sets shift with count_leading_zeros (shift, s->r) and dinv with
   invert_limb (dinv, s->r << shift), then times "call" through
   SPEED_ROUTINE_MPN_UNARY_1_CALL, so "call" may use wp, dinv and shift.
   Returns -1.0 if s->r is zero or s->size fails the size restrictions.

   The definition ends at the closing brace; there is deliberately no
   trailing backslash after it, so the macro cannot absorb whatever line
   follows. */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)    \
  {                                                     \
    unsigned   shift;                                   \
    mp_limb_t  dinv;                                    \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                 \
    SPEED_RESTRICT_COND (s->r != 0);                    \
                                                        \
    count_leading_zeros (shift, s->r);                  \
    invert_limb (dinv, s->r << shift);                  \
                                                        \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);              \
  }
! 659:
! 660: #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
! 661: SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
! 662: ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
! 663:
! 664: /* s->size limbs worth of fraction part */
! 665: #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
! 666: SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
! 667: ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
! 668:
! 669:
! 670: /* For routines taking two limb parameters alongside an s->size limb
   source.  "call" is timed s->reps times and may use the locals wp
   (s->size+1 limbs of destination), l (from s->yp[0]) and h (from
   s->yp[1]); s->r is not used.  Returns -1.0 if s->size < 1, otherwise the
   result of speed_endtime ().
   NOTE(review): s->yp[1] is read even when s->size is 1 - confirm yp
   always has at least two limbs. */
! 671: #define SPEED_ROUTINE_MPN_UNARY_2_CALL(call) \
! 672: { \
! 673: mp_ptr wp; \
! 674: unsigned i; \
! 675: double t; \
! 676: mp_limb_t h, l; \
! 677: TMP_DECL (marker); \
! 678: \
! 679: SPEED_RESTRICT_COND (s->size >= 1); \
! 680: \
! 681: TMP_MARK (marker); \
! 682: wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
! 683: l = s->yp[0]; \
! 684: h = s->yp[1]; \
! 685: \
! 686: speed_operand_src (s, s->xp, s->size); \
! 687: speed_operand_dst (s, wp, s->size+1); \
! 688: speed_cache_fill (s); \
! 689: \
! 690: speed_starttime (); \
! 691: i = s->reps; \
! 692: do \
! 693: call; \
! 694: while (--i != 0); \
! 695: t = speed_endtime (); \
! 696: \
! 697: TMP_FREE (marker); \
! 698: return t; \
! 699: }
! 700:
! 701: #define SPEED_ROUTINE_MPN_UNARY_2(function) \
! 702: SPEED_ROUTINE_MPN_UNARY_2_CALL ((*function) (wp, s->xp, s->size, l, h))
! 703:
! 704:
1.1 maekawa 705: /* For mpn_mul_basecase, xsize=r, ysize=s->size.  When s->r is 0
   the x size defaults to s->size.  The timed call is
   function (wp, s->xp, size1, s->yp, s->size) with wp having
   size1 + s->size limbs.  Returns -1.0 unless s->size >= 1 and
   size1 >= s->size, otherwise the result of speed_endtime (). */
706: #define SPEED_ROUTINE_MPN_MUL_BASECASE(function) \
707: { \
708: mp_ptr wp; \
709: mp_size_t size1; \
710: unsigned i; \
711: double t; \
712: TMP_DECL (marker); \
713: \
714: size1 = (s->r == 0 ? s->size : s->r); \
715: \
716: SPEED_RESTRICT_COND (s->size >= 1); \
717: SPEED_RESTRICT_COND (size1 >= s->size); \
718: \
719: TMP_MARK (marker); \
720: wp = SPEED_TMP_ALLOC_LIMBS (size1 + s->size, s->align_wp); \
721: \
722: speed_operand_src (s, s->xp, size1); \
723: speed_operand_src (s, s->yp, s->size); \
724: speed_operand_dst (s, wp, size1 + s->size); \
725: speed_cache_fill (s); \
726: \
727: speed_starttime (); \
728: i = s->reps; \
729: do \
730: function (wp, s->xp, size1, s->yp, s->size); \
731: while (--i != 0); \
732: t = speed_endtime (); \
733: \
734: TMP_FREE (marker); \
735: return t; \
736: }
737:
738:
/* For an N by N limb multiply producing 2N limbs.  "call" is timed s->reps
   times and may use the local wp, a 2*s->size limb destination obtained
   from SPEED_TMP_ALLOC_LIMBS with s->align_wp.  s->xp and s->yp, s->size
   limbs each, are registered as the source operands.  Returns -1.0 if
   s->size < 1, otherwise the result of speed_endtime (). */
739: #define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
740: { \
741: mp_ptr wp; \
742: unsigned i; \
743: double t; \
744: TMP_DECL (marker); \
745: \
746: SPEED_RESTRICT_COND (s->size >= 1); \
747: \
748: TMP_MARK (marker); \
749: wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
750: \
751: speed_operand_src (s, s->xp, s->size); \
752: speed_operand_src (s, s->yp, s->size); \
753: speed_operand_dst (s, wp, 2*s->size); \
754: speed_cache_fill (s); \
755: \
756: speed_starttime (); \
757: i = s->reps; \
758: do \
759: call; \
760: while (--i != 0); \
761: t = speed_endtime (); \
762: \
763: TMP_FREE (marker); \
764: return t; \
765: }
766:
/* Time function (wp, s->xp, s->yp, s->size).  The definition carries no
   trailing semicolon: the expansion is a complete braced body, and any
   semicolon belongs at the point of use, as with SPEED_ROUTINE_MPN_SQR. */
#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
769:
770:
/* As SPEED_ROUTINE_MPN_MUL_N_CALL, but additionally gives "call" a scratch
   area tspace of tsize limbs, and requires s->size >= minsize. */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
  { \
    mp_ptr wp; \
    mp_ptr tspace; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
    \
    /* register the operands, then prime the cache */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
800:
/* Time a Karatsuba multiply, function (wp, s->xp, s->yp, s->size, tspace).
   The second source is s->yp, matching both the operands registered by
   SPEED_ROUTINE_MPN_MUL_N_TSPACE and the TOOM3 variant; passing s->xp
   twice would measure a squaring done through the multiply routine. */
#define SPEED_ROUTINE_MPN_KARA_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->yp, s->size, tspace), \
     MPN_KARA_MUL_N_TSIZE (s->size), \
     MPN_KARA_MUL_N_MINSIZE)
1.1 maekawa 806:
/* Time a Toom-3 multiply, function (wp, s->xp, s->yp, s->size, tspace),
   with scratch space MPN_TOOM3_MUL_N_TSIZE (s->size) and minimum size
   MPN_TOOM3_MUL_N_MINSIZE. */
#define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE ( \
    function (wp, s->xp, s->yp, s->size, tspace), \
    MPN_TOOM3_MUL_N_TSIZE (s->size), \
    MPN_TOOM3_MUL_N_MINSIZE)
1.1 maekawa 812:
813:
/* Time "call" s->reps times.  "call" may use the local wp, a destination
   of 2*s->size limbs, and the source s->xp of s->size limbs. */
#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    \
    /* register the operands, then prime the cache */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
840:
/* Time function (wp, s->xp, s->size). */
#define SPEED_ROUTINE_MPN_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_CALL ( \
    function (wp, s->xp, s->size))

/* Measured exactly like SPEED_ROUTINE_MPN_SQR. */
#define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function) \
  SPEED_ROUTINE_MPN_SQR (function)
! 846:
1.1 maekawa 847:
/* As SPEED_ROUTINE_MPN_SQR_CALL, but additionally gives "call" a scratch
   area tspace of tsize limbs, and requires s->size >= minsize. */
#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \
  { \
    mp_ptr wp; \
    mp_ptr tspace; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= minsize); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
    tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
    \
    /* register the operands, then prime the cache */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, 2*s->size); \
    speed_operand_dst (s, tspace, tsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
876:
/* Time a Karatsuba squaring, function (wp, s->xp, s->size, tspace). */
#define SPEED_ROUTINE_MPN_KARA_SQR_N(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE ( \
    function (wp, s->xp, s->size, tspace), \
    MPN_KARA_SQR_N_TSIZE (s->size), \
    MPN_KARA_SQR_N_MINSIZE)

/* Time a Toom-3 squaring, function (wp, s->xp, s->size, tspace). */
#define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE ( \
    function (wp, s->xp, s->size, tspace), \
    MPN_TOOM3_SQR_N_TSIZE (s->size), \
    MPN_TOOM3_SQR_N_MINSIZE)
1.1 maekawa 886:
887:
/* Time "call" s->reps times, where "call" is an expression yielding a
   limb computed from s->xp,s->size.  The results are summed into dummy
   and handed to noop_1 so the calls cannot be discarded as unused.  The
   clock is stopped before noop_1 runs, so that call is not part of the
   measurement (the same order as SPEED_ROUTINE_MPN_GCD_1N). */
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
  { \
    unsigned i; \
    double t; \
    mp_limb_t dummy = 0; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      dummy += call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    noop_1 (dummy); \
    return t; \
  }
907:
/* Wrappers around SPEED_ROUTINE_MPN_MOD_CALL for the single-limb remainder
   style routines.  s->r is passed as the divisor argument; the "C"
   variants pass a zero carry-in.  None of the definitions ends in a
   semicolon: each expands to a complete braced body. */

#define SPEED_ROUTINE_MPN_MOD_1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MOD_1C(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
! 922:
/* Time (*function) (s->xp, s->size, s->r, inv), where inv is obtained from
   invert_limb (inv, s->r).  s->r must have its high bit set.  Results are
   summed into dummy and passed to noop_1 so the calls are not discarded;
   the clock is stopped before noop_1 so it is not part of the
   measurement. */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
  { \
    unsigned i; \
    double t; \
    mp_limb_t inv; \
    mp_limb_t dummy = 0; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
    \
    invert_limb (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      dummy += (*function) (s->xp, s->size, s->r, inv); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    noop_1 (dummy); \
    return t; \
  }
1.1 maekawa 945:
946:
947: /* A division of 2*s->size by s->size limbs */
948:
/* Time "call" s->reps times.  "call" may use these locals:
     a  dividend, 2*s->size limbs (s->xp copied in twice)
     d  divisor, s->size limbs (copy of s->yp, high bit forced on)
     q  s->size+1 limbs of destination
     r  s->size limbs of destination  */
#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
  { \
    unsigned i; \
    mp_ptr a; \
    mp_ptr d; \
    mp_ptr q; \
    mp_ptr r; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \
    d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
    q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
    r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    \
    /* low and high halves of the dividend both come from s->xp */ \
    MPN_COPY (a, s->xp, s->size); \
    MPN_COPY (a+s->size, s->xp, s->size); \
    \
    MPN_COPY (d, s->yp, s->size); \
    \
    /* set the divisor's top bit, and make the dividend's top limb */ \
    /* one less than the divisor's */ \
    d[s->size-1] |= GMP_LIMB_HIGHBIT; \
    a[2*s->size-1] = d[s->size-1] - 1; \
    \
    speed_operand_src (s, a, 2*s->size); \
    speed_operand_src (s, d, s->size); \
    speed_operand_dst (s, q, s->size+1); \
    speed_operand_dst (s, r, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
989:
/* Three calling conventions measured on the 2*s->size by s->size data set
   up by SPEED_ROUTINE_MPN_DC_DIVREM_CALL. */

/* (*function) (q, a, d, s->size) */
#define SPEED_ROUTINE_MPN_DC_DIVREM_N(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL ( \
    (*function) (q, a, d, s->size))

/* (*function) (q, a, 2*s->size, d, s->size) */
#define SPEED_ROUTINE_MPN_DC_DIVREM_SB(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL ( \
    (*function) (q, a, 2*s->size, d, s->size))

/* (*function) (q, r, 0, a, 2*s->size, d, s->size) */
#define SPEED_ROUTINE_MPN_DC_TDIV_QR(function) \
  SPEED_ROUTINE_MPN_DC_DIVREM_CALL ( \
    (*function) (q, r, 0, a, 2*s->size, d, s->size))
1000:
1001:
1.1.1.3 ! ohara 1002: /* A division of s->size by 3 limbs */
! 1003:
/* Time function (q, a, s->size, d, 3).  The dividend a is re-copied from
   s->xp before every call, and that copy is inside the timed loop.  The
   divisor d is the low 3 limbs of s->yp with the high bit forced on. */
#define SPEED_ROUTINE_MPN_SB_DIVREM_M3(function) \
  { \
    unsigned i; \
    mp_ptr a; \
    mp_ptr d; \
    mp_ptr q; \
    mp_size_t qsize; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 3); \
    \
    TMP_MARK (marker); \
    a = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
    \
    /* 3 limb divisor with its top bit set */ \
    d = SPEED_TMP_ALLOC_LIMBS (3, s->align_yp); \
    MPN_COPY (d, s->yp, 3); \
    d[2] |= GMP_LIMB_HIGHBIT; \
    \
    qsize = s->size - 3; \
    q = SPEED_TMP_ALLOC_LIMBS (qsize, s->align_wp); \
    \
    speed_operand_dst (s, a, s->size); \
    speed_operand_src (s, d, 3); \
    speed_operand_dst (s, q, qsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        /* fresh dividend for each call */ \
        MPN_COPY (a, s->xp, s->size); \
        function (q, a, s->size, d, 3); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 1042:
! 1043:
! 1044: /* A remainder 2*s->size by s->size limbs */
! 1045:
/* Time function (r, a, d), with d of s->size limbs taken from s->yp and a
   of up to 2*s->size limbs: the high part is (s->xp mod d), the low part a
   copy of s->xp.  The three mpz_t variables are cleared after the clock
   is stopped, so repeated measurements do not leak memory. */
#define SPEED_ROUTINE_MPZ_MOD(function) \
  { \
    unsigned i; \
    double t; \
    mpz_t a, d, r; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init_set_n (d, s->yp, s->size); \
    \
    /* high part less than d, low part a duplicate copied in */ \
    mpz_init_set_n (a, s->xp, s->size); \
    mpz_mod (a, a, d); \
    mpz_mul_2exp (a, a, BITS_PER_MP_LIMB * s->size); \
    MPN_COPY (PTR(a), s->xp, s->size); \
    \
    mpz_init (r); \
    \
    speed_operand_src (s, PTR(a), SIZ(a)); \
    speed_operand_src (s, PTR(d), SIZ(d)); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (r, a, d); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (a); \
    mpz_clear (d); \
    mpz_clear (r); \
    return t; \
  }
1074:
1.1.1.3 ! ohara 1075:
/* Time function (cp, mp, s->size, Nprim, tp).  tp is refilled from ap
   before every call, and that copy is inside the timed loop.  The modulus
   mp is s->yp with the low bit forced on, and Nprim comes from
   modlimb_invert (Nprim, mp[0]). */
#define SPEED_ROUTINE_REDC(function) \
  { \
    unsigned i; \
    mp_ptr cp; \
    mp_ptr mp; \
    mp_ptr tp; \
    mp_ptr ap; \
    mp_limb_t Nprim; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    ap = SPEED_TMP_ALLOC_LIMBS (2*s->size+1, s->align_xp); \
    mp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
    cp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    tp = SPEED_TMP_ALLOC_LIMBS (2*s->size+1, s->align_wp2); \
    \
    /* both halves of the 2*s->size limb input come from s->xp */ \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    modlimb_invert (Nprim, mp[0]); \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, mp, s->size, Nprim, tp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 1117:
! 1118:
/* Time function (s->xp, s->size).  Results are summed into dummy and
   passed to noop_1 so the calls are not discarded; the clock is stopped
   before noop_1 so it is not part of the measurement. */
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
  { \
    unsigned i; \
    double t; \
    unsigned long dummy = 0; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      dummy += function (s->xp, s->size); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    noop_1 ((mp_limb_t) dummy); \
    return t; \
  }
1138:
/* Time function (s->xp, s->yp, s->size).  Results are summed into dummy
   and passed to noop_1 so the calls are not discarded; the clock is
   stopped before noop_1 so it is not part of the measurement. */
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
  { \
    unsigned i; \
    double t; \
    unsigned long dummy = 0; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      dummy += function (s->xp, s->yp, s->size); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    noop_1 ((mp_limb_t) dummy); \
    return t; \
  }
1.1 maekawa 1159:
1160:
/* Time function (z, s->size): s->size is passed as the integer argument,
   not used as a limb count.  z is initialized before and cleared after
   the timed loop. */
#define SPEED_ROUTINE_MPZ_UI(function) \
  { \
    mpz_t z; \
    unsigned i; \
    double t; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (z, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (z); \
    return t; \
  }

/* Routines measured through SPEED_ROUTINE_MPZ_UI. */
#define SPEED_ROUTINE_MPZ_FAC_UI(function) \
  SPEED_ROUTINE_MPZ_UI (function)
#define SPEED_ROUTINE_MPZ_FIB_UI(function) \
  SPEED_ROUTINE_MPZ_UI (function)
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) \
  SPEED_ROUTINE_MPZ_UI (function)
! 1185:
! 1186:
/* Time function (z, z2, s->size): two mpz_t arguments, and s->size passed
   as the integer argument. */
#define SPEED_ROUTINE_MPZ_2_UI(function) \
  { \
    mpz_t z; \
    mpz_t z2; \
    unsigned i; \
    double t; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    mpz_init (z2); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (z, z2, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (z); \
    mpz_clear (z2); \
    return t; \
  }

/* Routines measured through SPEED_ROUTINE_MPZ_2_UI. */
#define SPEED_ROUTINE_MPZ_FIB2_UI(function) \
  SPEED_ROUTINE_MPZ_2_UI (function)
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) \
  SPEED_ROUTINE_MPZ_2_UI (function)
! 1212:
! 1213:
/* Time function (fp, f1p, s->size).  Both destinations are given
   MPN_FIB2_SIZE (s->size) limbs. */
#define SPEED_ROUTINE_MPN_FIB2_UI(function) \
  { \
    mp_ptr fp; \
    mp_ptr f1p; \
    mp_size_t alloc; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK (marker); \
    alloc = MPN_FIB2_SIZE (s->size); \
    fp = SPEED_TMP_ALLOC_LIMBS (alloc, s->align_xp); \
    f1p = SPEED_TMP_ALLOC_LIMBS (alloc, s->align_yp); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (fp, f1p, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 1239:
1.1 maekawa 1240:
1.1.1.3 ! ohara 1241:
! 1242: /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
! 1243: limbs. m is forced to odd so that redc can be used. e is limited in
! 1244: size so the calculation doesn't take too long. */
#define SPEED_ROUTINE_MPZ_POWM(function) \
  { \
    mpz_t r; \
    mpz_t b; \
    mpz_t e; \
    mpz_t m; \
    unsigned i; \
    double t; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init (r); \
    \
    /* base from s->xp, modulus from s->yp, both s->size limbs */ \
    mpz_init_set_n (b, s->xp, s->size); \
    mpz_init_set_n (m, s->yp, s->size); \
    mpz_setbit (m, 0); /* force m to odd */ \
    \
    /* exponent is the first 6 limbs of s->xp_block */ \
    mpz_init_set_n (e, s->xp_block, 6); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (r, b, e, m); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (e); \
    mpz_clear (m); \
    return t; \
  }
1272:
/* (m-2)^e mod m, where e = (~0UL)/3, i.e. 0x5555...5555 (0x55555555 for a
   32-bit unsigned long).  m is s->xp,s->size forced to odd.

   mpz_setbit is used to set the low bit, the same way as in
   SPEED_ROUTINE_MPZ_POWM, so that the size field of m is kept right even
   if the source limbs happen to be all zero; or-ing into PTR(m)[0]
   directly would leave m equal to zero in that case. */
#define SPEED_ROUTINE_MPZ_POWM_UI(function) \
  { \
    mpz_t r, b, m; \
    unsigned long e = (~ (unsigned long) 0) / 3; \
    unsigned i; \
    double t; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init (r); \
    \
    /* force m to odd */ \
    mpz_init (m); \
    mpz_set_n (m, s->xp, s->size); \
    mpz_setbit (m, 0); \
    \
    /* base is m-2 */ \
    mpz_init_set (b, m); \
    mpz_sub_ui (b, b, 2); \
    \
    i = s->reps; \
    speed_starttime (); \
    do \
      function (r, b, e, m); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (m); \
    return t; \
  }
! 1305:
1.1 maekawa 1306:
/* Time "call" s->reps times.  "call" may use the locals wp and wp2
   (destinations, s->size limbs each) and xp and yp (sources).  s->r
   selects how the sources overlap the destinations:
     0  xp = s->xp, yp = s->yp  (no overlap)
     1  xp = wp
     2  yp = wp2
     3  xp = wp,  yp = wp2
     4  xp = wp2, yp = wp
   Any other s->r returns -1.0. */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
  { \
    mp_ptr wp; \
    mp_ptr wp2; \
    mp_ptr xp; \
    mp_ptr yp; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    xp = s->xp; \
    yp = s->yp; \
    \
    switch (s->r) \
      { \
      case 0: \
        break; \
      case 1: \
        xp = wp; \
        break; \
      case 2: \
        yp = wp2; \
        break; \
      case 3: \
        xp = wp; \
        yp = wp2; \
        break; \
      case 4: \
        xp = wp2; \
        yp = wp; \
        break; \
      default: \
        TMP_FREE (marker); \
        return -1.0; \
      } \
    \
    /* a source redirected onto a destination gets the original data */ \
    if (xp != s->xp) \
      MPN_COPY (xp, s->xp, s->size); \
    if (yp != s->yp) \
      MPN_COPY (yp, s->yp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
1350:
/* Time function (wp, wp2, xp, yp, s->size), using the operands set up by
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  No trailing semicolon in the
   definition: the expansion is a complete braced body. */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL \
    (function (wp, wp2, xp, yp, s->size))

/* As SPEED_ROUTINE_MPN_ADDSUB_N, with an extra final argument of 0. */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL \
    (function (wp, wp2, xp, yp, s->size, 0))
1358:
1359:
1.1.1.3 ! ohara 1360: /* Doing an Nx1 gcd with the given r. */
/* Time function (xp, s->size, s->r), where xp is a private copy of
   s->xp,s->size made non-zero by or-ing refmpn_zero_p into the low limb.
   The private copy is what the timed call reads, so it is the operand
   registered for cache priming.  Results are summed into dummy and passed
   to noop_1 after the clock is stopped. */
#define SPEED_ROUTINE_MPN_GCD_1N(function) \
  { \
    mp_ptr xp; \
    unsigned i; \
    double t; \
    mp_limb_t dummy = 0; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    TMP_MARK (marker); \
    xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
    MPN_COPY (xp, s->xp, s->size); \
    xp[0] |= refmpn_zero_p (xp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      dummy += function (xp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    noop_1 (dummy); \
    TMP_FREE (marker); \
    return t; \
  }
1391:
1392:
/* SPEED_BLOCK_SIZE many single-limb GCDs, on operands of s->size bits each (y is s->r bits instead when s->r is non-zero). */
1394:
/* Time "call" over SPEED_BLOCK_SIZE pairs of limbs px[], py[], s->reps
   times.  x values are masked to s->size bits, y values to s->r bits (or
   s->size bits when s->r is zero), and zeros are replaced by 1.  "setup"
   runs once per element with index i after masking; "call" runs with
   index j counting from SPEED_BLOCK_SIZE down to 1 and must yield a value
   that can be added to dummy.  s->time_divisor is set to
   SPEED_BLOCK_SIZE. */
#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
  { \
    unsigned i, j; \
    mp_ptr px; \
    mp_ptr py; \
    mp_limb_t x_mask; \
    mp_limb_t y_mask; \
    mp_limb_t dummy = 0; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
    \
    TMP_MARK (marker); \
    px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
    py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
    \
    x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
      { \
        /* mask to the wanted bit size, and avoid zero operands */ \
        px[i] &= x_mask; \
        if (px[i] == 0) \
          px[i] = 1; \
        py[i] &= y_mask; \
        if (py[i] == 0) \
          py[i] = 1; \
        setup; \
      } \
    \
    speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            dummy += call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    \
    noop_1 (dummy); \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }
1446:
/* Time function (&px[j-1], 1, py[j-1]); no extra per-element setup. */
#define SPEED_ROUTINE_MPN_GCD_1(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL ( \
    , \
    function (&px[j-1], 1, py[j-1]))

/* Time function (px[j-1], py[j-1], 0).  The per-element setup adjusts
   each pair to the form the original comment requires: x<y, y odd,
   y!=1. */
#define SPEED_ROUTINE_MPN_JACBASE(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL ( \
    { \
      /* require x<y, y odd, y!=1 */ \
      px[i] %= py[i]; \
      px[i] |= 1; \
      py[i] |= 1; \
      if (py[i] == 1) \
        py[i] = 3; \
    }, \
    function (px[j-1], py[j-1], 0))
1460:
1461:
1462: /* Run some GCDs of s->size limbs each. The number of different data values
1463: is decreased as s->size**2, since GCD is a quadratic algorithm.
1464: SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
1465: though, because the plain gcd is about twice as fast as gcdext. */
1466:
/* Time "call" over "pieces" operand pairs of s->size limbs each, s->reps
   times.  Before every call the pair is copied into xtmp and ytmp, and
   those copies are inside the timed loop.  "call" may use the locals wp,
   wp2, xtmp and ytmp, each s->size+1 limbs.  s->time_divisor is set to
   the number of pieces. */
#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
  { \
    unsigned i; \
    mp_size_t j, pieces, psize; \
    mp_ptr wp, wp2, xtmp, ytmp, px, py; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
    ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp2); \
    \
    /* number of pairs: falls as s->size squared, limited to what fits */ \
    /* in a block, and at least 1 */ \
    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
    pieces = MAX (pieces, 1); \
    \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
    \
    /* y must be odd, x must have at least as many bits as y, \
       high limbs must be non-zero */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr x = px+j*s->size; \
        mp_ptr y = py+j*s->size; \
        mp_size_t top = s->size-1; \
        \
        y[0] |= 1; \
        if (x[top] == 0) \
          x[top] = 1; \
        if (y[top] == 0) \
          y[top] = 1; \
        x[top] = MAX (x[top], y[top]); \
      } \
    \
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            MPN_COPY (xtmp, px+(j-1)*s->size, s->size); \
            MPN_COPY (ytmp, py+(j-1)*s->size, s->size); \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    \
    s->time_divisor = pieces; \
    return t; \
  }
1533:
/* Time function (wp, xtmp, s->size, ytmp, s->size), data factor 8. */
#define SPEED_ROUTINE_MPN_GCD(function) \
  SPEED_ROUTINE_MPN_GCD_CALL ( \
    8, \
    function (wp, xtmp, s->size, ytmp, s->size))

/* Time function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size), data
   factor 4.  wp2size is a local that only receives the size output. */
#define SPEED_ROUTINE_MPN_GCDEXT(function) \
  SPEED_ROUTINE_MPN_GCD_CALL ( \
    4, \
    { \
      mp_size_t wp2size; \
      function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
    })
1541:
1542:
/* Time function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size) over
   SPEED_BLOCK_SIZE/3 data sets.  xtmp and ytmp start as copies of s->xp
   and s->yp; for each data set only their top 3 limbs are replaced, from
   px[] and py[], and the low limb of ytmp is forced odd.

   At least 3 limbs are required, since the top three limbs are addressed
   as xtmp[s->size-3] onwards; a smaller size would store in front of the
   start of the buffers.  s->time_divisor is set to the number of data
   sets. */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \
  { \
    unsigned i; \
    mp_size_t j, pieces, psize, wp2size; \
    mp_ptr wp, wp2, xtmp, ytmp, px, py; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 3); \
    \
    TMP_MARK (marker); \
    \
    xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
    ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
    MPN_COPY (xtmp, s->xp, s->size); \
    MPN_COPY (ytmp, s->yp, s->size); \
    \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp2); \
    \
    pieces = SPEED_BLOCK_SIZE / 3; \
    psize = 3 * pieces; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, s->xp_block, psize); \
    MPN_COPY (py, s->yp_block, psize); \
    \
    /* x must have at least as many bits as y, \
       high limbs must be non-zero */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr x = px+3*j; \
        mp_ptr y = py+3*j; \
        x[2] += (x[2] == 0); \
        y[2] += (y[2] == 0); \
        if (x[2] < y[2]) \
          MP_LIMB_T_SWAP (x[2], y[2]); \
      } \
    \
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        mp_ptr x = px; \
        mp_ptr y = py; \
        mp_ptr xth = &xtmp[s->size-3]; \
        mp_ptr yth = &ytmp[s->size-3]; \
        j = pieces; \
        do \
          { \
            /* install the next set of 3 high limbs */ \
            xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \
            yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \
            \
            ytmp[0] |= 1; /* y must be odd, */ \
            \
            function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
            \
            x += 3; \
            y += 3; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    \
    s->time_divisor = pieces; \
    return t; \
  }
! 1620:
/* Time function (a, b) over "pieces" operand pairs of s->size limbs each,
   s->reps times.  a and b are not allocated mpz_t's: only their size and
   pointer fields are set, pointing into px[] and py[].  Each y is forced
   odd and both high limbs are made non-zero.  s->time_divisor is set to
   the number of pieces. */
#define SPEED_ROUTINE_MPZ_JACOBI(function) \
  { \
    mpz_t a, b; \
    unsigned i; \
    mp_size_t j, pieces, psize; \
    mp_ptr px; \
    mp_ptr py; \
    double t; \
    int dummy = 0; \
    TMP_DECL (marker); \
    \
    TMP_MARK (marker); \
    pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \
    pieces = MAX (pieces, 1); \
    s->time_divisor = pieces; \
    \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
    \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr x = px+j*s->size; \
        mp_ptr y = py+j*s->size; \
        mp_size_t top = s->size-1; \
        \
        /* y odd */ \
        y[0] |= 1; \
        \
        /* high limbs non-zero */ \
        if (x[top] == 0) \
          x[top] = 1; \
        if (y[top] == 0) \
          y[top] = 1; \
      } \
    \
    SIZ(a) = s->size; \
    SIZ(b) = s->size; \
    \
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            PTR(a) = px+(j-1)*s->size; \
            PTR(b) = py+(j-1)*s->size; \
            dummy += function (a, b); \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    noop_1 ((mp_limb_t) dummy); \
    TMP_FREE (marker); \
    return t; \
  }
! 1682:
/* Time function (wp, 0, xp, s->size, yp).  xp is a private copy of s->xp
   made once before the timed loop, and yp is a 2 limb divisor taken from
   s->yp_block with its high bit forced on. */
#define SPEED_ROUTINE_MPN_DIVREM_2(function) \
  { \
    mp_ptr wp; \
    mp_ptr xp; \
    mp_limb_t yp[2]; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    TMP_MARK (marker); \
    xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    \
    /* source is destroyed */ \
    MPN_COPY (xp, s->xp, s->size); \
    \
    /* divisor must be normalized */ \
    MPN_COPY (yp, s->yp_block, 2); \
    yp[1] |= GMP_LIMB_HIGHBIT; \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, 2); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, 0, xp, s->size, yp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
1719:
1720:
/* Time function (n, n) SPEED_BLOCK_SIZE times per rep.  n starts at 1 and
   before each call has twice a limb of s->xp_block added to it, so each
   input depends on the previous result.  s->time_divisor is set to
   SPEED_BLOCK_SIZE. */
#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
  { \
    unsigned i, j; \
    mp_ptr xp; \
    mp_limb_t n = 1; \
    double t; \
    \
    /* offset by one so that xp[j], for j from SPEED_BLOCK_SIZE down */ \
    /* to 1, walks s->xp_block from its last limb to its first */ \
    xp = s->xp_block-1; \
    \
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            /* randomized but successively dependent */ \
            n += (xp[j] << 1); \
            \
            function (n, n); \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    /* make sure the compiler won't optimize away n */ \
    noop_1 (n); \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }
1756:
1.1.1.3 ! ohara 1757:
/* Time function (wp, wp2, s->xp, s->size), with two destinations of
   s->size limbs each. */
#define SPEED_ROUTINE_MPN_SQRTREM(function) \
  { \
    mp_ptr wp; \
    mp_ptr wp2; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, wp2, s->xp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 1786:
! 1787:
! 1788: /* s->size controls the number of limbs in the input, s->r is the base, or
! 1789: decimal by default. */
#define SPEED_ROUTINE_MPN_GET_STR(function) \
  { \
    unsigned char *wp; \
    mp_size_t wsize; \
    mp_ptr xp; \
    int base; \
    unsigned i; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* s->r of zero means decimal */ \
    if (s->r == 0) \
      base = 10; \
    else \
      base = s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
    \
    TMP_MARK (marker); \
    xp = SPEED_TMP_ALLOC_LIMBS (s->size + 1, s->align_xp); \
    \
    MPN_SIZEINBASE (wsize, s->xp, s->size, base); \
    wp = TMP_ALLOC (wsize); \
    \
    /* use this during development to guard against overflowing wp */ \
    /* \
    MPN_COPY (xp, s->xp, s->size); \
    ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wsize); \
    */ \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, xp, s->size); \
    speed_operand_dst (s, (mp_ptr) wp, wsize/BYTES_PER_MP_LIMB); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        /* refresh the working copy before each call; this copy is */ \
        /* inside the timed loop */ \
        MPN_COPY (xp, s->xp, s->size); \
        function (wp, base, xp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 1835:
! 1836: /* s->size controls the number of digits in the input, s->r is the base, or
! 1837: decimal by default. */
#define SPEED_ROUTINE_MPN_SET_STR(function) \
  { \
    unsigned char *xp; \
    mp_ptr wp; \
    mp_size_t wsize; \
    unsigned i; \
    int base; \
    double t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    /* s->r of zero means decimal */ \
    if (s->r == 0) \
      base = 10; \
    else \
      base = s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
    \
    TMP_MARK (marker); \
    \
    /* one digit per byte, each the matching limb of s->xp modulo base */ \
    xp = TMP_ALLOC (s->size); \
    for (i = 0; i < s->size; i++) \
      { \
        xp[i] = s->xp[i] % base; \
      } \
    \
    wsize = ((mp_size_t) (s->size / __mp_bases[base].chars_per_bit_exactly)) \
      / BITS_PER_MP_LIMB + 2; \
    wp = SPEED_TMP_ALLOC_LIMBS (wsize, s->align_wp); \
    \
    /* use this during development to check wsize is big enough */ \
    /* \
    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wsize); \
    */ \
    \
    speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB); \
    speed_operand_dst (s, wp, wsize); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, xp, s->size, base); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    return t; \
  }
! 1882:
! 1883:
/* Time an accel gcd find_a() function, called as function (ptr) on a
   two-limb operand.  SPEED_BLOCK_SIZE different operands are run on every
   repetition, in case some values are particularly fast or slow, and
   s->time_divisor is set to SPEED_BLOCK_SIZE to match.  s->size plays no
   part, the amount of data tested is fixed.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function)                           \
  {                                                                     \
    mp_limb_t  pairs[SPEED_BLOCK_SIZE][2];                              \
    mp_limb_t  sink = 0;                                                \
    unsigned   outer, inner;                                            \
    double     elapsed;                                                 \
    TMP_DECL (marker);                                                  \
                                                                        \
    TMP_MARK (marker);                                                  \
                                                                        \
    /* every pair gets an odd low limb and a non-zero high limb */      \
    for (outer = 0; outer < SPEED_BLOCK_SIZE; outer++)                  \
      {                                                                 \
        pairs[outer][0] = s->xp_block[outer] | 1;                       \
        pairs[outer][1] = (s->yp_block[outer] != 0                      \
                           ? s->yp_block[outer] : 1);                   \
      }                                                                 \
                                                                        \
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE);            \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    outer = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        /* walk the block from its last pair down to its first */       \
        inner = SPEED_BLOCK_SIZE;                                       \
        do                                                              \
          sink += function (pairs[inner-1]);                            \
        while (--inner != 0);                                           \
      }                                                                 \
    while (--outer != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    /* consume the results so the calls can't be discarded */           \
    noop_1 (sink);                                                      \
    TMP_FREE (marker);                                                  \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return elapsed;                                                     \
  }
! 1928:
! 1929:
/* Measuring count_leading_zeros / count_trailing_zeros style operations.

   SPEED_ROUTINE_COUNT_ZEROS_A and SPEED_ROUTINE_COUNT_ZEROS_B are the two
   halves of a single function body: A opens the body and the timing loops
   and loads the next operand into n, B closes the loops and finishes the
   measurement.  The operation being measured goes between the two, and
   SPEED_ROUTINE_COUNT_ZEROS_C puts the three pieces together.

   "call" should do "count_foo_zeros(c,n)", using the locals c and n.
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.
   leading and zero are handed to speed_routine_count_zeros_setup along with
   the operand block xp; if that reports failure the routine returns -1.0.

   SPEED_BLOCK_SIZE operands are run per repetition and s->time_divisor is
   set to match.  */

#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)                      \
  {                                                                     \
    mp_ptr     xp;                                                      \
    int        i, c;                                                    \
    unsigned   j;                                                       \
    mp_limb_t  n;                                                       \
    double     t;                                                       \
    TMP_DECL (marker);                                                  \
                                                                        \
    TMP_MARK (marker);                                                  \
    xp = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp);         \
                                                                        \
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))       \
      {                                                                 \
        /* don't leak the xp allocation on this early exit */           \
        TMP_FREE (marker);                                              \
        return -1.0;                                                    \
      }                                                                 \
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);                        \
    speed_cache_fill (s);                                               \
                                                                        \
    c = 0;                                                              \
    speed_starttime ();                                                 \
    j = s->reps;                                                        \
    do {                                                                \
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)                            \
        {                                                               \
          /* the previous result c is mixed into the next operand, */   \
          /* presumably so each call depends on the one before */       \
          n = xp[i];                                                    \
          n ^= c;

#define SPEED_ROUTINE_COUNT_ZEROS_B()                                   \
        }                                                               \
    } while (--j != 0);                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    /* don't let c go dead */                                           \
    noop_1 (c);                                                         \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
                                                                        \
    TMP_FREE (marker);                                                  \
    return t;                                                           \
  }

#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)                \
  do {                                                                  \
    SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);                        \
    call;                                                               \
    SPEED_ROUTINE_COUNT_ZEROS_B ();                                     \
  } while (0)

/* Convenience forms of SPEED_ROUTINE_COUNT_ZEROS_C.

   The _C variants take a complete "call" statement plus the zero flag.  The
   plain variants take the name of a function or macro "fun", measure
   "fun (c, n)" and pass zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero)                  \
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)                          \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)

#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)                 \
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
/* The parameter must be named fun to match the expansion, the same as the
   leading zeros form above.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)                         \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
! 1990:
! 1991:
/* Time an invert_limb style operation.  "call" is run once per operand and
   works on the locals here: d holds the operand, always with
   GMP_LIMB_HIGHBIT set, and call is expected to leave its result in dinv
   (presumably something like "invert_limb (dinv, d)").  Each dinv is xored
   into the next operand, so successive calls depend on each other.

   SPEED_BLOCK_SIZE operands from s->xp_block are run per repetition, from
   the last one down to the first, and s->time_divisor is set to match.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)                            \
  {                                                                     \
    unsigned   i, j;                                                    \
    mp_limb_t  d, dinv=0;                                               \
    mp_ptr     xp = s->xp_block;                                        \
    double     t;                                                       \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        j = SPEED_BLOCK_SIZE;                                           \
        do                                                              \
          {                                                             \
            /* j runs SPEED_BLOCK_SIZE down to 1, so index with j-1 */  \
            /* rather than forming a pointer before the block start */  \
            d = dinv ^ xp[j-1];                                         \
            d |= GMP_LIMB_HIGHBIT;                                      \
            do { call; } while (0);                                     \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    /* stop the clock first, so noop_1 is outside the measurement */    \
    t = speed_endtime ();                                               \
                                                                        \
    /* don't let the compiler optimize everything away */               \
    noop_1 (dinv);                                                      \
                                                                        \
    return t;                                                           \
  }
! 2020:
! 2021:
1.1 maekawa 2022: #endif
1.1.1.3 ! ohara 2023:
! 2024:
/* Time s->reps consecutive calls of function (), which takes no arguments.
   No operands are registered and no cache priming is done here.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)                        \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  reps_left;                                                \
                                                                        \
    speed_starttime ();                                                 \
    reps_left = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function ();                                                    \
      }                                                                 \
    while (--reps_left != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    return elapsed;                                                     \
  }
! 2035:
! 2036:
/* Time "call", a statement operating on a destination of s->size limbs.
   The destination is the local wp, allocated here with alignment
   s->align_wp, so call is written in terms of wp and s->size.  s->size
   must be non-negative.  The local names are kept as they are since the
   text of call can refer to them.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call)                               \
  {                                                                     \
    double    t;                                                        \
    unsigned  i;                                                        \
    mp_ptr    wp;                                                       \
    TMP_DECL (marker);                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK (marker);                                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);                  \
                                                                        \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        call;                                                           \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE (marker);                                                  \
    return t;                                                           \
  }

/* The usual form of the above: time function (wp, s->size).  */
#define SPEED_ROUTINE_MPN_ZERO(function)                                \
  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>