Annotation of OpenXM_contrib/gmp/tune/speed.h, Revision 1.1.1.2
1.1 maekawa 1: /* Header for speed and threshold things. */
2:
3: /*
4: Copyright (C) 1999, 2000 Free Software Foundation, Inc.
5:
6: This file is part of the GNU MP Library.
7:
8: The GNU MP Library is free software; you can redistribute it and/or modify
9: it under the terms of the GNU Lesser General Public License as published by
10: the Free Software Foundation; either version 2.1 of the License, or (at your
11: option) any later version.
12:
13: The GNU MP Library is distributed in the hope that it will be useful, but
14: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: License for more details.
17:
18: You should have received a copy of the GNU Lesser General Public License
19: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: MA 02111-1307, USA.
22: */
23:
24: #ifndef __SPEED_H__
25: #define __SPEED_H__
26:
27:
/* Temporary (scratch) space needed by the Karatsuba and Toom-3 multiply and
   square routines for an n-limb operand.  The results are used as limb
   counts when allocating the tspace buffers.  */
#define MPN_KARA_MUL_N_TSIZE(n)   (((n) + BITS_PER_MP_LIMB) * 2)
#define MPN_KARA_SQR_N_TSIZE(n)   (((n) + BITS_PER_MP_LIMB) * 2)
#define MPN_TOOM3_MUL_N_TSIZE(n)  (3*BITS_PER_MP_LIMB + 2*(n))
#define MPN_TOOM3_SQR_N_TSIZE(n)  (((n) + BITS_PER_MP_LIMB) * 2)
33:
34:
35: /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
36: newsize long.  newsize >= oldsize is checked with ASSERT.  oldsize and
   newsize are each expanded twice, so pass expressions without side
   effects. */
37: #define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \
38: do { \
39: ASSERT ((newsize) >= (oldsize)); \
40: MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \
41: } while (0)
42:
/* Mask with the n least significant bits set, 0 <= n <= BITS_PER_MP_LIMB.
   A full-width shift (eg. 1<<32 on a 32-bit limb) doesn't give zero on x86
   family CPUs, so n == BITS_PER_MP_LIMB is answered directly with
   MP_LIMB_T_MAX instead of shifting.  */
#define MP_LIMB_T_LOWBITMASK(n) \
  ((n) != BITS_PER_MP_LIMB ? (((mp_limb_t) 1 << (n)) - 1) : MP_LIMB_T_MAX)
47:
48:
49: /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
50:
/* TMP_ALLOC_ALIGNED requests align-1 extra bytes from TMP_ALLOC and passes
   the result through align_pointer.  TMP_ALLOC_LIMBS_ALIGNED is the same
   with the size given in limbs and the result cast to mp_ptr. */
51: #define TMP_ALLOC_ALIGNED(bytes, align) \
52: align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
53: #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
54: ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
55:
56: /* 32 for pentium, 64 for athlon, might want to configure this for other
57: CPUs. In truth though nothing has yet shown up that cares about cache
58: line boundaries. The only practical effect of this is to restrict the
59: range that s->align_xp can take. Perhaps this could be a variable
60: instead. */
61: #define CACHE_LINE_SIZE 64 /* bytes */
62:
/* Number of limbs in a cache line, minus one. */
63: #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
64:
/* Allocate limbs plus SPEED_TMP_ALLOC_ADJUST_MASK spare limbs with
   TMP_ALLOC_LIMBS and hand the block to speed_tmp_alloc_adjust together with
   the requested align value.  Must be used between TMP_MARK and TMP_FREE. */
65: #define SPEED_TMP_ALLOC_LIMBS(limbs, align) \
66: (speed_tmp_alloc_adjust \
67: (TMP_ALLOC_LIMBS((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), (align)))
68:
69:
70: /* This is the size for s->xp_block and s->yp_block, used in certain
71: routines that want to run across many different data values and use
72: s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
73:
74: With 4-byte limbs, 512 means 2kbytes of data for each of xp_block and
75: yp_block, making 4k total, which should fit easily in any L1 data cache. */
76:
77: #define SPEED_BLOCK_SIZE 512 /* limbs */
78:
79:
80: extern double speed_unittime;
81: extern double speed_cycletime;
82: extern int speed_precision;
83: extern const char *speed_time_string;
84: void speed_time_init _PROTO ((void));
85: void speed_starttime _PROTO ((void));
86: double speed_endtime _PROTO ((void));
87:
/* Parameters handed to every speed measuring routine (see speed_function_t),
   plus fields the routine may fill in for the caller. */
88: struct speed_params {
89: unsigned reps; /* how many times to run the routine */
90: mp_ptr xp; /* first argument */
91: mp_ptr yp; /* second argument */
92: mp_size_t size; /* size of both arguments */
93: long r; /* user supplied parameter, meaning depends on the routine */
94: mp_size_t align_xp; /* alignment of xp */
95: mp_size_t align_yp; /* alignment of yp */
96: mp_size_t align_wp; /* intended alignment of wp */
97: mp_size_t align_wp2; /* intended alignment of wp2 */
98: mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
99: mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
100:
101: double time_divisor; /* optionally set by the speed routine, eg. to the
   number of operations performed per rep */
102:
103: /* used by the cache priming things */
104: int cache;
105: unsigned src_num, dst_num; /* presumably the number of src[] and dst[]
   entries in use -- confirm against speed_operand_src/dst */
106: struct {
107: mp_ptr ptr;
108: mp_size_t size;
109: } src[2], dst[3]; /* room for at most 2 sources and 3 destinations */
110: };
111:
112: typedef double (*speed_function_t) _PROTO ((struct speed_params *s));
113:
114: double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s));
115:
116: /* Prototypes for speed measuring routines */
117:
118: double speed_malloc_free _PROTO ((struct speed_params *s));
119: double speed_malloc_realloc_free _PROTO ((struct speed_params *s));
120: double speed_memcpy _PROTO ((struct speed_params *s));
121: double speed_modlimb_invert _PROTO ((struct speed_params *s));
122: double speed_mp_allocate_free _PROTO ((struct speed_params *s));
123: double speed_mp_allocate_reallocate_free _PROTO ((struct speed_params *s));
124:
125: double speed_mpf_init_clear _PROTO ((struct speed_params *s));
126:
127: double speed_mpn_add_n _PROTO ((struct speed_params *s));
128: double speed_mpn_add_n_self _PROTO ((struct speed_params *s));
129: double speed_mpn_add_n_inplace _PROTO ((struct speed_params *s));
130: double speed_mpn_and_n _PROTO ((struct speed_params *s));
131: double speed_mpn_andn_n _PROTO ((struct speed_params *s));
132: double speed_mpn_addmul_1 _PROTO ((struct speed_params *s));
133: double speed_mpn_bz_divrem_n _PROTO ((struct speed_params *s));
134: double speed_mpn_bz_divrem_sb _PROTO ((struct speed_params *s));
135: double speed_mpn_bz_tdiv_qr _PROTO ((struct speed_params *s));
136: double speed_MPN_COPY _PROTO ((struct speed_params *s));
137: double speed_MPN_COPY_DECR _PROTO ((struct speed_params *s));
138: double speed_MPN_COPY_INCR _PROTO ((struct speed_params *s));
139: double speed_mpn_divexact_by3 _PROTO ((struct speed_params *s));
140: double speed_mpn_divmod_1 _PROTO ((struct speed_params *s));
141: double speed_mpn_divrem_1 _PROTO ((struct speed_params *s));
142: double speed_mpn_divrem_1f _PROTO ((struct speed_params *s));
143: double speed_mpn_divrem_1c _PROTO ((struct speed_params *s));
144: double speed_mpn_divrem_1cf _PROTO ((struct speed_params *s));
145: double speed_mpn_divrem_2 _PROTO ((struct speed_params *s));
146: double speed_mpn_gcd _PROTO ((struct speed_params *s));
147: double speed_mpn_gcd_1 _PROTO ((struct speed_params *s));
148: double speed_mpn_gcdext _PROTO ((struct speed_params *s));
149: double speed_mpn_hamdist _PROTO ((struct speed_params *s));
150: double speed_mpn_ior_n _PROTO ((struct speed_params *s));
151: double speed_mpn_iorn_n _PROTO ((struct speed_params *s));
152: double speed_mpn_jacobi_base _PROTO ((struct speed_params *s));
153: double speed_mpn_kara_mul_n _PROTO ((struct speed_params *s));
154: double speed_mpn_kara_sqr_n _PROTO ((struct speed_params *s));
155: double speed_mpn_lshift _PROTO ((struct speed_params *s));
156: double speed_mpn_mod_1 _PROTO ((struct speed_params *s));
157: double speed_mpn_mod_1c _PROTO ((struct speed_params *s));
158: double speed_mpn_mul_1 _PROTO ((struct speed_params *s));
159: double speed_mpn_mul_basecase _PROTO ((struct speed_params *s));
160: double speed_mpn_mul_fft _PROTO ((struct speed_params *s));
161: double speed_mpn_mul_fft_sqr _PROTO ((struct speed_params *s));
162: double speed_mpn_mul_fft_full _PROTO ((struct speed_params *s));
163: double speed_mpn_mul_fft_full_sqr _PROTO ((struct speed_params *s));
164: double speed_mpn_mul_n _PROTO ((struct speed_params *s));
165: double speed_mpn_mul_n_sqr _PROTO ((struct speed_params *s));
166: double speed_mpn_mul_n_toom3 _PROTO ((struct speed_params *s));
167: double speed_mpn_nand_n _PROTO ((struct speed_params *s));
168: double speed_mpn_nior_n _PROTO ((struct speed_params *s));
169: double speed_mpn_popcount _PROTO ((struct speed_params *s));
170: double speed_mpn_rshift _PROTO ((struct speed_params *s));
171: double speed_mpn_sqr_basecase _PROTO ((struct speed_params *s));
172: double speed_mpn_sqr_n _PROTO ((struct speed_params *s));
173: double speed_mpn_sqr_toom3 _PROTO ((struct speed_params *s));
174: double speed_mpn_sub_n _PROTO ((struct speed_params *s));
175: double speed_mpn_submul_1 _PROTO ((struct speed_params *s));
176: double speed_mpn_toom3_mul_n _PROTO ((struct speed_params *s));
177: double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s));
178: double speed_mpn_udiv_qrnnd _PROTO ((struct speed_params *s));
179: double speed_mpn_umul_ppmm _PROTO ((struct speed_params *s));
180: double speed_mpn_xnor_n _PROTO ((struct speed_params *s));
181: double speed_mpn_xor_n _PROTO ((struct speed_params *s));
182:
183: double speed_mpq_init_clear _PROTO ((struct speed_params *s));
184:
185: double speed_mpz_add _PROTO ((struct speed_params *s));
186: double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
187: double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
188: double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
189: double speed_mpz_init_clear _PROTO ((struct speed_params *s));
190: double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s));
191: double speed_mpz_powm _PROTO ((struct speed_params *s));
192:
193: double speed_noop _PROTO ((struct speed_params *s));
194: double speed_noop_wxs _PROTO ((struct speed_params *s));
195: double speed_noop_wxys _PROTO ((struct speed_params *s));
196:
197: double speed_udiv_qrnnd _PROTO ((struct speed_params *s));
198: double speed_udiv_qrnnd_preinv _PROTO ((struct speed_params *s));
199: double speed_udiv_qrnnd_preinv2norm _PROTO ((struct speed_params *s));
200: double speed_umul_ppmm _PROTO ((struct speed_params *s));
201:
202:
203: /* Prototypes for other routines */
204:
205: /* low 32-bits in p[0], high 32-bits in p[1] */
206: void speed_cyclecounter _PROTO ((unsigned p[2]));
207:
208: void pentium_wbinvd _PROTO ((void));
209:
210: void noop _PROTO ((void));
211: void noop_1 _PROTO ((mp_limb_t n));
212: void noop_wxs _PROTO ((mp_ptr wp, mp_srcptr xp, mp_size_t size));
213: void noop_wxys _PROTO ((mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
214: mp_size_t size));
215: void mpn_cache_fill _PROTO ((mp_srcptr ptr, mp_size_t size));
216: void mpn_cache_fill_dummy _PROTO ((mp_limb_t n));
217: mp_ptr speed_tmp_alloc_adjust _PROTO ((void *ptr, mp_size_t align));
218: void *_mp_allocate_or_reallocate _PROTO ((void *ptr,
219: size_t oldsize, size_t newsize));
220: void *align_pointer _PROTO ((void *p, size_t align));
221: void *_mp_allocate_func_aligned _PROTO ((size_t bytes, size_t align));
222: void speed_cache_fill _PROTO ((struct speed_params *s));
223: void speed_operand_src _PROTO ((struct speed_params *s,
224: mp_ptr ptr, mp_size_t size));
225: void speed_operand_dst _PROTO ((struct speed_params *s,
226: mp_ptr ptr, mp_size_t size));
227: void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
228:
229: extern int speed_option_addrs;
230: void speed_option_set _PROTO((const char *s));
231:
232:
/* Return -1.0 from the enclosing speed routine unless cond holds.  The
   expansion is a bare `if' that already ends in a semicolon, so use it only
   as a statement of its own, never as the body of an if/else. */
233: #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
234:
235: /* For mpn_copy or similar.  Expands to a braced block ending in `return',
   evidently meant as the body of a routine taking `struct speed_params *s'.
   Returns -1.0 if s->size is negative.  Otherwise allocates an s->size limb
   destination wp, registers s->xp and wp with speed_operand_src and
   speed_operand_dst, calls speed_cache_fill, and returns the time for
   s->reps executions of `call' measured between speed_starttime and
   speed_endtime.  `call' may refer to wp and s. */
236: #define SPEED_ROUTINE_MPN_COPY_CALL(call) \
237: { \
238: mp_ptr wp; \
239: unsigned i; \
240: double t; \
241: TMP_DECL (marker); \
242: \
243: SPEED_RESTRICT_COND (s->size >= 0); \
244: \
245: TMP_MARK (marker); \
246: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
247: \
248: speed_operand_src (s, s->xp, s->size); \
249: speed_operand_dst (s, wp, s->size); \
250: speed_cache_fill (s); \
251: \
252: speed_starttime (); \
253: i = s->reps; \
254: do \
255: call; \
256: while (--i != 0); \
257: t = speed_endtime (); \
258: \
259: TMP_FREE (marker); \
260: return t; \
261: }
262:
/* function (wp, xp, size), and the same with a trailing 0 argument. */
263: #define SPEED_ROUTINE_MPN_COPY(function) \
264: SPEED_ROUTINE_MPN_COPY_CALL(function (wp, s->xp, s->size))
265: #define SPEED_ROUTINE_MPN_COPYC(function) \
266: SPEED_ROUTINE_MPN_COPY_CALL(function (wp, s->xp, s->size, 0))
267:
268:
269: /* For mpn_add_n, mpn_sub_n, or similar.  Expands to a braced block ending
   in `return' that uses `struct speed_params *s' from the enclosing scope.
   Returns -1.0 unless s->size >= 1.  Otherwise allocates an s->size limb
   destination wp, registers s->xp, s->yp and wp as operands, calls
   speed_cache_fill, and returns the time for s->reps executions of `call'.
   `call' may refer to wp and s. */
270: #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
271: { \
272: mp_ptr wp; \
273: unsigned i; \
274: double t; \
275: TMP_DECL (marker); \
276: \
277: SPEED_RESTRICT_COND (s->size >= 1); \
278: \
279: TMP_MARK (marker); \
280: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
281: \
282: speed_operand_src (s, s->xp, s->size); \
283: speed_operand_src (s, s->yp, s->size); \
284: speed_operand_dst (s, wp, s->size); \
285: speed_cache_fill (s); \
286: \
287: speed_starttime (); \
288: i = s->reps; \
289: do \
290: call; \
291: while (--i != 0); \
292: t = speed_endtime (); \
293: \
294: TMP_FREE (marker); \
295: return t; \
296: }
297:
/* wp = function (xp, yp) */
298: #define SPEED_ROUTINE_MPN_BINARY_N(function) \
299: SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size))
300:
/* As above, with an extra trailing argument of 0. */
301: #define SPEED_ROUTINE_MPN_BINARY_NC(function) \
302: SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->yp, s->size, 0))
303:
/* Both sources the same operand, s->xp. */
304: #define SPEED_ROUTINE_MPN_BINARY_N_SELF(function) \
305: SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, s->xp, s->xp, s->size))
306:
/* Destination wp also used as the first source. */
307: #define SPEED_ROUTINE_MPN_BINARY_N_INPLACE(function) \
308: SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, wp, s->xp, s->size))
309:
310:
311: /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.  Expands to a
   braced block ending in `return' that uses `struct speed_params *s' from
   the enclosing scope.  Returns -1.0 unless s->size >= 1.  Otherwise
   allocates an s->size limb destination wp, registers s->xp and wp as
   operands, calls speed_cache_fill, and returns the time for s->reps
   executions of `call'.  `call' may refer to wp and s. */
312: #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
313: { \
314: mp_ptr wp; \
315: unsigned i; \
316: double t; \
317: TMP_DECL (marker); \
318: \
319: SPEED_RESTRICT_COND (s->size >= 1); \
320: \
321: TMP_MARK (marker); \
322: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
323: \
324: speed_operand_src (s, s->xp, s->size); \
325: speed_operand_dst (s, wp, s->size); \
326: speed_cache_fill (s); \
327: \
328: speed_starttime (); \
329: i = s->reps; \
330: do \
331: call; \
332: while (--i != 0); \
333: t = speed_endtime (); \
334: \
335: TMP_FREE (marker); \
336: return t; \
337: }
338:
/* function (wp, xp, size, r) */
339: #define SPEED_ROUTINE_MPN_UNARY_1(function) \
340: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
341:
/* As above, with an extra trailing argument of 0. */
342: #define SPEED_ROUTINE_MPN_UNARY_1C(function) \
343: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
344:
/* function (wp, 0, xp, size, r) */
345: #define SPEED_ROUTINE_MPN_DIVREM_1(function) \
346: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
347:
348: #define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
349: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
350:
/* The "F" forms pass s->size as the second argument and 0 as the fourth,
   ie. function (wp, size, xp, 0, r). */
351: #define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
352: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
353:
354: #define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
355: SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
356:
357:
358: /* For mpn_mul_basecase, xsize=r, ysize=s->size.  When s->r is 0 the x size
   defaults to s->size.  Returns -1.0 unless s->size >= 1 and
   xsize >= s->size.  The destination wp gets xsize + s->size limbs.
   Expands to a braced block ending in `return' that uses
   `struct speed_params *s' from the enclosing scope. */
359: #define SPEED_ROUTINE_MPN_MUL_BASECASE(function) \
360: { \
361: mp_ptr wp; \
362: mp_size_t size1; \
363: unsigned i; \
364: double t; \
365: TMP_DECL (marker); \
366: \
367: size1 = (s->r == 0 ? s->size : s->r); \
368: \
369: SPEED_RESTRICT_COND (s->size >= 1); \
370: SPEED_RESTRICT_COND (size1 >= s->size); \
371: \
372: TMP_MARK (marker); \
373: wp = SPEED_TMP_ALLOC_LIMBS (size1 + s->size, s->align_wp); \
374: \
375: speed_operand_src (s, s->xp, size1); \
376: speed_operand_src (s, s->yp, s->size); \
377: speed_operand_dst (s, wp, size1 + s->size); \
378: speed_cache_fill (s); \
379: \
380: speed_starttime (); \
381: i = s->reps; \
382: do \
383: function (wp, s->xp, size1, s->yp, s->size); \
384: while (--i != 0); \
385: t = speed_endtime (); \
386: \
387: TMP_FREE (marker); \
388: return t; \
389: }
390:
391:
/* For an s->size by s->size limb multiply with a 2*s->size limb result.
   Expands to a braced block ending in `return' that uses
   `struct speed_params *s' from the enclosing scope.  Returns -1.0 unless
   s->size >= 1.  Otherwise allocates the destination wp, registers s->xp,
   s->yp and wp as operands, calls speed_cache_fill, and returns the time
   for s->reps executions of `call'.  `call' may refer to wp and s. */
392: #define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
393: { \
394: mp_ptr wp; \
395: unsigned i; \
396: double t; \
397: TMP_DECL (marker); \
398: \
399: SPEED_RESTRICT_COND (s->size >= 1); \
400: \
401: TMP_MARK (marker); \
402: wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
403: \
404: speed_operand_src (s, s->xp, s->size); \
405: speed_operand_src (s, s->yp, s->size); \
406: speed_operand_dst (s, wp, 2*s->size); \
407: speed_cache_fill (s); \
408: \
409: speed_starttime (); \
410: i = s->reps; \
411: do \
412: call; \
413: while (--i != 0); \
414: t = speed_endtime (); \
415: \
416: TMP_FREE (marker); \
417: return t; \
418: }
419:
/* Time function (wp, xp, yp, size), a same-size multiply.  The expansion is
   exactly the braced block from SPEED_ROUTINE_MPN_MUL_N_CALL, with no
   trailing semicolon, so that when used as a function body it doesn't leave
   a stray `;' at file scope (matching the other SPEED_ROUTINE wrappers). */
#define SPEED_ROUTINE_MPN_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
422:
423:
/* Like a same-size multiply measurement, but for routines needing scratch
   space: besides the 2*s->size limb destination wp, a tspace buffer of
   tsize limbs is allocated (aligned per s->align_wp2) and registered as a
   destination operand.  Returns -1.0 unless s->size >= 1.  Expands to a
   braced block ending in `return' that uses `struct speed_params *s' from
   the enclosing scope.  `call' may refer to wp, tspace and s.  Note tsize
   is expanded twice. */
424: #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize) \
425: { \
426: mp_ptr wp, tspace; \
427: unsigned i; \
428: double t; \
429: TMP_DECL (marker); \
430: \
431: SPEED_RESTRICT_COND (s->size >= 1); \
432: \
433: TMP_MARK (marker); \
434: wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
435: tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
436: \
437: speed_operand_src (s, s->xp, s->size); \
438: speed_operand_src (s, s->yp, s->size); \
439: speed_operand_dst (s, wp, 2*s->size); \
440: speed_operand_dst (s, tspace, tsize); \
441: speed_cache_fill (s); \
442: \
443: speed_starttime (); \
444: i = s->reps; \
445: do \
446: call; \
447: while (--i != 0); \
448: t = speed_endtime (); \
449: \
450: TMP_FREE (marker); \
451: return t; \
452: }
453:
/* Time function (wp, xp, yp, size, tspace), a Karatsuba style multiply with
   MPN_KARA_MUL_N_TSIZE limbs of scratch space.  The two sources are s->xp
   and s->yp, matching the operands SPEED_ROUTINE_MPN_MUL_N_TSPACE registers
   for cache priming and the sibling Toom-3 wrapper (previously s->xp was
   passed twice, so a squaring was what got measured).  */
/* FIXME: size restrictions */
#define SPEED_ROUTINE_MPN_KARA_MUL_N(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->yp, s->size, tspace), \
     MPN_KARA_MUL_N_TSIZE (s->size))
459:
/* Time function (wp, xp, yp, size, tspace) with MPN_TOOM3_MUL_N_TSIZE limbs
   of scratch space. */
460: /* FIXME: size restrictions */
461: #define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function) \
462: SPEED_ROUTINE_MPN_MUL_N_TSPACE \
463: (function (wp, s->xp, s->yp, s->size, tspace), \
464: MPN_TOOM3_MUL_N_TSIZE (s->size))
465:
466:
/* For squaring s->size limbs into a 2*s->size limb result.  Expands to a
   braced block ending in `return' that uses `struct speed_params *s' from
   the enclosing scope.  Returns -1.0 unless s->size >= 1.  Otherwise
   allocates the destination wp, registers s->xp and wp as operands, calls
   speed_cache_fill, and returns the time for s->reps executions of `call'.
   `call' may refer to wp and s. */
467: #define SPEED_ROUTINE_MPN_SQR_CALL(call) \
468: { \
469: mp_ptr wp; \
470: unsigned i; \
471: double t; \
472: TMP_DECL (marker); \
473: \
474: SPEED_RESTRICT_COND (s->size >= 1); \
475: \
476: TMP_MARK (marker); \
477: wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
478: \
479: speed_operand_src (s, s->xp, s->size); \
480: speed_operand_dst (s, wp, 2*s->size); \
481: speed_cache_fill (s); \
482: \
483: speed_starttime (); \
484: i = s->reps; \
485: do \
486: call; \
487: while (--i != 0); \
488: t = speed_endtime (); \
489: \
490: TMP_FREE (marker); \
491: return t; \
492: }
493:
/* function (wp, xp, size) */
494: #define SPEED_ROUTINE_MPN_SQR(function) \
495: SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
496:
497:
/* Squaring measurement for routines needing scratch space: besides the
   2*s->size limb destination wp, a tspace buffer of tsize limbs is
   allocated (aligned per s->align_wp2) and registered as a destination
   operand.  Returns -1.0 unless s->size >= 1.  Expands to a braced block
   ending in `return' that uses `struct speed_params *s' from the enclosing
   scope.  `call' may refer to wp, tspace and s.  Note tsize is expanded
   twice. */
498: #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize) \
499: { \
500: mp_ptr wp, tspace; \
501: unsigned i; \
502: double t; \
503: TMP_DECL (marker); \
504: \
505: SPEED_RESTRICT_COND (s->size >= 1); \
506: \
507: TMP_MARK (marker); \
508: wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
509: tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
510: \
511: speed_operand_src (s, s->xp, s->size); \
512: speed_operand_dst (s, wp, 2*s->size); \
513: speed_operand_dst (s, tspace, tsize); \
514: speed_cache_fill (s); \
515: \
516: speed_starttime (); \
517: i = s->reps; \
518: do \
519: call; \
520: while (--i != 0); \
521: t = speed_endtime (); \
522: \
523: TMP_FREE (marker); \
524: return t; \
525: }
526:
/* function (wp, xp, size, tspace) with MPN_KARA_SQR_N_TSIZE scratch limbs. */
527: /* FIXME: size restrictions */
528: #define SPEED_ROUTINE_MPN_KARA_SQR_N(function) \
529: SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
530: MPN_KARA_SQR_N_TSIZE (s->size))
531:
/* function (wp, xp, size, tspace) with MPN_TOOM3_SQR_N_TSIZE scratch limbs. */
532: /* FIXME: size restrictions */
533: #define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function) \
534: SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
535: MPN_TOOM3_SQR_N_TSIZE (s->size))
536:
537:
/* For routines that only read s->xp and need no destination buffer.
   Expands to a braced block ending in `return' that uses
   `struct speed_params *s' from the enclosing scope.  Returns -1.0 if
   s->size is negative.  Otherwise registers s->xp as a source operand,
   calls speed_cache_fill, and returns the time for s->reps executions of
   `call'. */
538: #define SPEED_ROUTINE_MPN_MOD_CALL(call) \
539: { \
540: unsigned i; \
541: \
542: SPEED_RESTRICT_COND (s->size >= 0); \
543: \
544: speed_operand_src (s, s->xp, s->size); \
545: speed_cache_fill (s); \
546: \
547: speed_starttime (); \
548: i = s->reps; \
549: do \
550: call; \
551: while (--i != 0); \
552: return speed_endtime (); \
553: }
554:
/* function (xp, size, r) */
555: #define SPEED_ROUTINE_MPN_MOD_1(function) \
556: SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
557:
/* As above, with an extra trailing argument of 0. */
558: #define SPEED_ROUTINE_MPN_MOD_1C(function) \
559: SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r, 0))
560:
561:
562: /* A division of 2*s->size by s->size limbs.  The dividend a is two copies
   of s->xp back to back, the divisor d is a copy of s->yp with the high bit
   of its top limb forced on, and the top limb of a is then set to d's top
   limb minus 1.  q (s->size+1 limbs) and r (s->size limbs) are provided
   for results.  Returns -1.0 unless s->size >= 1.  Expands to a braced
   block ending in `return' that uses `struct speed_params *s' from the
   enclosing scope.  `call' may refer to a, d, q, r and s; a is not
   restored between repetitions. */
563:
564: #define SPEED_ROUTINE_MPN_BZ_DIVREM_CALL(call) \
565: { \
566: unsigned i; \
567: mp_ptr a, d, q, r; \
568: double t; \
569: TMP_DECL (marker); \
570: \
571: SPEED_RESTRICT_COND (s->size >= 1); \
572: \
573: TMP_MARK (marker); \
574: a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \
575: d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
576: q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
577: r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
578: \
579: MPN_COPY (a, s->xp, s->size); \
580: MPN_COPY (a+s->size, s->xp, s->size); \
581: \
582: MPN_COPY (d, s->yp, s->size); \
583: \
584: /* normalize the data */ \
585: d[s->size-1] |= MP_LIMB_T_HIGHBIT; \
586: a[2*s->size-1] = d[s->size-1] - 1; \
587: \
588: speed_operand_src (s, a, 2*s->size); \
589: speed_operand_src (s, d, s->size); \
590: speed_operand_dst (s, q, s->size+1); \
591: speed_operand_dst (s, r, s->size); \
592: speed_cache_fill (s); \
593: \
594: speed_starttime (); \
595: i = s->reps; \
596: do \
597: call; \
598: while (--i != 0); \
599: t = speed_endtime (); \
600: \
601: TMP_FREE (marker); \
602: return t; \
603: }
604:
/* function (q, a, d, size) */
605: #define SPEED_ROUTINE_MPN_BZ_DIVREM_N(function) \
606: SPEED_ROUTINE_MPN_BZ_DIVREM_CALL((*function) (q, a, d, s->size))
607:
/* function (q, a, 2*size, d, size) */
608: #define SPEED_ROUTINE_MPN_BZ_DIVREM_SB(function) \
609: SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
610: ((*function) (q, a, 2*s->size, d, s->size))
611:
/* function (q, r, 0, a, 2*size, d, size) */
612: #define SPEED_ROUTINE_MPN_BZ_TDIV_QR(function) \
613: SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
614: ((*function) (q, r, 0, a, 2*s->size, d, s->size))
615:
616:
/* Time function (xp, size); the function's return value is discarded.
   Returns -1.0 unless s->size >= 1.  Expands to a braced block ending in
   `return' that uses `struct speed_params *s' from the enclosing scope. */
617: #define SPEED_ROUTINE_MPN_POPCOUNT(function) \
618: { \
619: unsigned i; \
620: \
621: SPEED_RESTRICT_COND (s->size >= 1); \
622: \
623: speed_operand_src (s, s->xp, s->size); \
624: speed_cache_fill (s); \
625: \
626: speed_starttime (); \
627: i = s->reps; \
628: do \
629: function (s->xp, s->size); \
630: while (--i != 0); \
631: return speed_endtime (); \
632: }
633:
/* Time function (xp, yp, size); the function's return value is discarded.
   Returns -1.0 unless s->size >= 1.  Expands to a braced block ending in
   `return' that uses `struct speed_params *s' from the enclosing scope. */
634: #define SPEED_ROUTINE_MPN_HAMDIST(function) \
635: { \
636: unsigned i; \
637: \
638: SPEED_RESTRICT_COND (s->size >= 1); \
639: \
640: speed_operand_src (s, s->xp, s->size); \
641: speed_operand_src (s, s->yp, s->size); \
642: speed_cache_fill (s); \
643: \
644: speed_starttime (); \
645: i = s->reps; \
646: do \
647: function (s->xp, s->yp, s->size); \
648: while (--i != 0); \
649: return speed_endtime (); \
650: }
651:
652:
653: /* For mpz_fib_ui, mpz_fac_ui, etc.  Here s->size is not an operand size
   but the integer argument itself: function (z, s->size) is timed for
   s->reps repetitions, writing into a local mpz_t that is cleared
   afterwards.  Returns -1.0 if s->size is negative.  No operands are
   registered and speed_cache_fill is not called.  Expands to a braced
   block ending in `return' that uses `struct speed_params *s' from the
   enclosing scope. */
654:
655: #define SPEED_ROUTINE_MPZ_UI(function) \
656: { \
657: mpz_t z; \
658: unsigned i; \
659: double t; \
660: \
661: SPEED_RESTRICT_COND (s->size >= 0); \
662: \
663: mpz_init (z); \
664: \
665: speed_starttime (); \
666: i = s->reps; \
667: do \
668: function (z, s->size); \
669: while (--i != 0); \
670: t = speed_endtime (); \
671: \
672: mpz_clear (z); \
673: return t; \
674: }
675:
676:
677: /* Calculate 2^(m-1) mod m for random odd m of s->size limbs. Having m odd
678: allows redc to be used. Actually the exponent (m-1) is cut down to at
679: most 6 limbs so the calculation doesn't take too long.

   m is built from s->xp with its low bit forced on; the exponent is
   shortened by overwriting SIZ(e) directly.  Returns -1.0 unless
   s->size >= 1.  Expands to a braced block ending in `return' that uses
   `struct speed_params *s' from the enclosing scope. */
680: #define SPEED_ROUTINE_MPZ_POWM(function) \
681: { \
682: mpz_t r, b, e, m; \
683: unsigned i; \
684: double t; \
685: \
686: SPEED_RESTRICT_COND (s->size >= 1); \
687: \
688: mpz_init (r); \
689: mpz_init_set_ui (b, 2); \
690: \
691: /* force m to odd */ \
692: mpz_init (m); \
693: mpz_set_n (m, s->xp, s->size); \
694: PTR(m)[0] |= 1; \
695: \
696: mpz_init_set (e, m); \
697: mpz_sub_ui (e, e, 1); \
698: SIZ(e) = MIN (SIZ(e), 6); \
699: \
700: speed_starttime (); \
701: i = s->reps; \
702: do \
703: function (r, b, e, m); \
704: while (--i != 0); \
705: t = speed_endtime (); \
706: \
707: mpz_clear (r); \
708: mpz_clear (b); \
709: mpz_clear (e); \
710: mpz_clear (m); \
711: return t; \
712: }
713:
714:
/* For routines with two s->size limb sources and two s->size limb
   destinations wp and wp2.  s->r selects how sources overlap destinations:
     0  xp = s->xp, yp = s->yp (no overlap)
     1  xp = wp
     2  yp = wp2
     3  xp = wp,  yp = wp2
     4  xp = wp2, yp = wp
   Any other value prints a message to stderr and aborts.  Where a source
   is redirected, the data from s->xp or s->yp is copied into it once,
   before timing.  Returns -1.0 if s->size is negative.  Expands to a
   braced block ending in `return' that uses `struct speed_params *s' from
   the enclosing scope.  `call' may refer to wp, wp2, xp, yp and s. */
715: #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
716: { \
717: mp_ptr wp, wp2, xp, yp; \
718: unsigned i; \
719: double t; \
720: TMP_DECL (marker); \
721: \
722: SPEED_RESTRICT_COND (s->size >= 0); \
723: \
724: TMP_MARK (marker); \
725: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
726: wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
727: xp = s->xp; \
728: yp = s->yp; \
729: \
730: switch (s->r) { \
731: case 0: break; \
732: case 1: xp = wp; break; \
733: case 2: yp = wp2; break; \
734: case 3: xp = wp; yp = wp2; break; \
735: case 4: xp = wp2; yp = wp; break; \
736: default: \
737: fprintf (stderr, "Unrecognised r=%ld in addsub measuring\n", s->r); \
738: abort (); \
739: } \
740: if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \
741: if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \
742: \
743: speed_operand_src (s, xp, s->size); \
744: speed_operand_src (s, yp, s->size); \
745: speed_operand_dst (s, wp, s->size); \
746: speed_operand_dst (s, wp2, s->size); \
747: speed_cache_fill (s); \
748: \
749: speed_starttime (); \
750: i = s->reps; \
751: do \
752: call; \
753: while (--i != 0); \
754: t = speed_endtime (); \
755: \
756: TMP_FREE (marker); \
757: return t; \
758: }
759:
/* Time function (wp, wp2, xp, yp, size), and the same with a trailing 0
   argument.  Each expands to exactly the braced block from
   SPEED_ROUTINE_MPN_ADDSUB_CALL, with no trailing semicolon, so that when
   used as a function body no stray `;' is left at file scope (matching the
   other SPEED_ROUTINE wrappers). */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL \
    (function (wp, wp2, xp, yp, s->size))

#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL \
    (function (wp, wp2, xp, yp, s->size, 0))
767:
768:
/* Time function (xp, size, r), ie. an s->size limb operand against the
   single value s->r.  Returns -1.0 unless s->size >= 1 and s->r != 0.
   Expands to a braced block ending in `return' that uses
   `struct speed_params *s' from the enclosing scope.

   Nothing is allocated here, so no TMP_DECL/TMP_MARK/TMP_FREE is needed
   (same as SPEED_ROUTINE_MPN_POPCOUNT and SPEED_ROUTINE_MPN_MOD_CALL). */
#define SPEED_ROUTINE_MPN_GCD_1xN(function) \
  { \
    unsigned  i; \
    double    t; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (s->xp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    return t; \
  }
793:
794:
795: /* SPEED_BLOCK_SIZE many one-limb GCDs of s->size bits each.

   Here s->size is a bit count and must be between 1 and mp_bits_per_limb,
   else -1.0 is returned.  px and py are private copies of s->xp_block and
   s->yp_block; each px limb is masked to s->size bits and each py limb to
   s->r bits (or s->size bits when s->r is 0), and a limb that comes out
   zero is replaced by 1.  `setup' then runs once per element with index i.
   `call' runs SPEED_BLOCK_SIZE times per repetition with j counting down
   from SPEED_BLOCK_SIZE to 1, so it should index with j-1.
   s->time_divisor is set to SPEED_BLOCK_SIZE.  Expands to a braced block
   ending in `return' that uses `struct speed_params *s' from the enclosing
   scope. */
796:
797: #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
798: { \
799: unsigned i, j; \
800: mp_ptr px, py; \
801: mp_limb_t x_mask, y_mask; \
802: double t; \
803: TMP_DECL (marker); \
804: \
805: SPEED_RESTRICT_COND (s->size >= 1); \
806: SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
807: \
808: TMP_MARK (marker); \
809: px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
810: py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \
811: MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
812: MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
813: \
814: x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
815: y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
816: for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
817: { \
818: px[i] &= x_mask; px[i] += (px[i] == 0); \
819: py[i] &= y_mask; py[i] += (py[i] == 0); \
820: setup; \
821: } \
822: \
823: speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
824: speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
825: speed_cache_fill (s); \
826: \
827: speed_starttime (); \
828: i = s->reps; \
829: do \
830: { \
831: j = SPEED_BLOCK_SIZE; \
832: do \
833: { \
834: call; \
835: } \
836: while (--j != 0); \
837: } \
838: while (--i != 0); \
839: t = speed_endtime (); \
840: \
841: TMP_FREE (marker); \
842: \
843: s->time_divisor = SPEED_BLOCK_SIZE; \
844: return t; \
845: }
846:
/* No extra setup; function (&px[j-1], 1, py[j-1]). */
847: #define SPEED_ROUTINE_MPN_GCD_1(function) \
848: SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
849:
/* Setup reduces px[i] modulo py[i], forces both odd, and replaces a py[i]
   of 1 by 3; then function (px[j-1], py[j-1], 0) is timed. */
850: #define SPEED_ROUTINE_MPN_JACBASE(function) \
851: SPEED_ROUTINE_MPN_GCD_1_CALL \
852: ({ \
853: px[i] %= py[i]; \
854: px[i] |= 1; \
855: py[i] |= 1; \
856: if (py[i]==1) py[i]=3; \
857: }, \
858: function (px[j-1], py[j-1], 0))
859:
860:
/* Run some GCDs of s->size limbs each.  The number of different data values
   is decreased as s->size**2, since GCD is a quadratic algorithm.
   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
   though, because the plain gcd is about twice as fast as gcdext.

   The number of operand pairs ("pieces") is
   SPEED_BLOCK_SIZE * datafactor / s->size^2, limited to what fits in a
   SPEED_BLOCK_SIZE block and to at least 1.  With a single piece the data
   comes from s->xp and s->yp, otherwise from s->xp_block and s->yp_block.
   Each y is made odd, and the top limbs are adjusted to be non-zero with
   x's top limb at least y's.  Before every `call' the current pair is
   copied afresh into xtmp and ytmp.  `call' may refer to wp, wp2, xtmp,
   ytmp and s.  s->time_divisor is set to the number of pieces.  Returns
   -1.0 unless s->size >= 1.

   datafactor is parenthesized in the expansion so an expression argument
   cannot be regrouped by the surrounding multiply and divides.  */

#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
  { \
    unsigned   i; \
    mp_size_t  j, pieces, psize; \
    mp_ptr     wp, wp2, xtmp, ytmp, px, py; \
    double     t; \
    TMP_DECL (marker); \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK (marker); \
    xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
    ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
    wp   = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
    wp2  = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
    \
    pieces = SPEED_BLOCK_SIZE * (datafactor) / s->size / s->size; \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
    pieces = MAX (pieces, 1); \
    \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
    \
    /* y must be odd, x must have at least as many bits as y */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr x = px+j*s->size; \
        mp_ptr y = py+j*s->size; \
        y[0] |= 1; \
        if (x[s->size-1] == 0) x[s->size-1] = 1; \
        if (y[s->size-1] == 0) y[s->size-1] = 1; \
        x[s->size-1] = MAX (x[s->size-1], y[s->size-1]); \
      } \
    \
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            MPN_COPY (xtmp, px+(j-1)*s->size, s->size); \
            MPN_COPY (ytmp, py+(j-1)*s->size, s->size); \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE (marker); \
    \
    s->time_divisor = pieces; \
    return t; \
  }
931:
/* function (wp, xtmp, size, ytmp, size), with a datafactor of 8. */
932: #define SPEED_ROUTINE_MPN_GCD(function) \
933: SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
934:
/* function (wp, wp2, &wp2size, xtmp, size, ytmp, size), with a datafactor
   of 4.  wp2size is a local that receives a size from the function and is
   then discarded. */
935: #define SPEED_ROUTINE_MPN_GCDEXT(function) \
936: SPEED_ROUTINE_MPN_GCD_CALL \
937: (4, { mp_size_t wp2size; \
938: function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
939:
940:
/* Time function (wp, 0, xp, size, yp) with a two limb divisor.  xp here is
   a private copy of s->xp, made once before timing because the source is
   destroyed.  The divisor is the first two limbs of s->yp_block with the
   high bit of the upper limb forced on.  Returns -1.0 unless s->size >= 1.
   Expands to a braced block ending in `return' that uses
   `struct speed_params *s' from the enclosing scope. */
941: #define SPEED_ROUTINE_MPN_DIVREM_2(function) \
942: { \
943: mp_ptr wp, xp; \
944: mp_limb_t yp[2]; \
945: unsigned i; \
946: double t; \
947: TMP_DECL (marker); \
948: \
949: SPEED_RESTRICT_COND (s->size >= 1); \
950: \
951: TMP_MARK (marker); \
952: xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
953: wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
954: \
955: /* source is destroyed */ \
956: MPN_COPY (xp, s->xp, s->size); \
957: \
958: /* divisor must be normalized */ \
959: MPN_COPY (yp, s->yp_block, 2); \
960: yp[1] |= MP_LIMB_T_HIGHBIT; \
961: \
962: speed_operand_src (s, xp, s->size); \
963: speed_operand_src (s, yp, 2); \
964: speed_operand_dst (s, wp, s->size); \
965: speed_cache_fill (s); \
966: \
967: speed_starttime (); \
968: i = s->reps; \
969: do \
970: function (wp, 0, xp, s->size, yp); \
971: while (--i != 0); \
972: t = speed_endtime (); \
973: \
974: TMP_FREE (marker); \
975: return t; \
976: }
977:
978:
/* Time function (n, n) SPEED_BLOCK_SIZE times per repetition, where n
   starts at 1 and before each call has twice a limb of s->xp_block added,
   so n stays odd and each call depends on the one before.  function is
   expected to store its result back into n (its first argument) --
   presumably it is a macro such as modlimb_invert; confirm at the use
   site.  s->time_divisor is set to SPEED_BLOCK_SIZE.  Expands to a braced
   block ending in `return' that uses `struct speed_params *s' from the
   enclosing scope.

   The block is indexed as xp[j-1] with j counting down from
   SPEED_BLOCK_SIZE to 1.  This reads the same limbs in the same order as
   biasing the pointer to s->xp_block-1 would, but never forms a pointer
   before the start of the array, which is undefined in ISO C.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
  { \
    unsigned   i, j; \
    mp_ptr     xp; \
    mp_limb_t  n = 1; \
    double     t; \
    \
    xp = s->xp_block; \
    \
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            /* randomized but successively dependent */ \
            n += (xp[j-1] << 1); \
            \
            function (n, n); \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    /* make sure the compiler won't optimize away n */ \
    noop_1 (n); \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }
1014:
1015: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>