=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/Attic/longlong.h,v retrieving revision 1.1.1.2 retrieving revision 1.1.1.3 diff -u -p -r1.1.1.2 -r1.1.1.3 --- OpenXM_contrib/gmp/Attic/longlong.h 2000/09/09 14:12:17 1.1.1.2 +++ OpenXM_contrib/gmp/Attic/longlong.h 2003/08/25 16:06:00 1.1.1.3 @@ -1,7 +1,7 @@ /* longlong.h -- definitions for mixed size 32/64 bit arithmetic. -Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000 Free Software -Foundation, Inc. +Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002 Free +Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by @@ -101,7 +101,12 @@ MA 02111-1307, USA. */ Please add support for more CPUs here, or improve the current support for the CPUs below! */ +/* FIXME: The macros using external routines like __MPN(count_leading_zeros) + don't need to be under !NO_ASM */ +#if ! defined (NO_ASM) + #if defined (__alpha) && W_TYPE_SIZE == 64 +/* Most alpha-based machines, except Cray systems. */ #if defined (__GNUC__) #define umul_ppmm(ph, pl, m0, m1) \ do { \ @@ -112,46 +117,98 @@ MA 02111-1307, USA. */ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 18 +#else /* ! __GNUC__ */ +#include +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + (ph) = __UMULH (m0, m1); \ + (pl) = __m0 * __m1; \ + } while (0) +#endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ - do { UDItype __di; \ + do { UWtype __di; \ __di = __MPN(invert_limb) (d); \ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ } while (0) +#define UDIV_PREINV_ALWAYS 1 #define UDIV_NEEDS_NORMALIZATION 1 #define UDIV_TIME 220 -long __MPN(count_leading_zeros) (); +#endif /* LONGLONG_STANDALONE */ +/* clz_tab is required by mpn/alpha/cntlz.asm, and that file is built for + all alphas, even though ev67 and ev68 don't need it. */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#if defined (__GNUC__) && (HAVE_HOST_CPU_alphaev67 || HAVE_HOST_CPU_alphaev68) +#define count_leading_zeros(COUNT,X) \ + __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) +#define count_trailing_zeros(COUNT,X) \ + __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) +#else /* ! (ev67 || ev68) */ +#ifndef LONGLONG_STANDALONE +#if HAVE_ATTRIBUTE_CONST +long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const)); +#else +long __MPN(count_leading_zeros) _PROTO ((UDItype)); +#endif #define count_leading_zeros(count, x) \ ((count) = __MPN(count_leading_zeros) (x)) #endif /* LONGLONG_STANDALONE */ -#else /* ! __GNUC__ */ -#include +#endif /* ! 
(ev67 || ev68) */ +#endif /* __alpha */ + +#if defined (_CRAY) && W_TYPE_SIZE == 64 +#include +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 220 +long __MPN(count_leading_zeros) _PROTO ((UDItype)); +#define count_leading_zeros(count, x) \ + ((count) = _leadz ((UWtype) (x))) +#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ - (ph) = __UMULH (m0, m1); \ + (ph) = _int_mult_upper (m0, m1); \ (pl) = __m0 * __m1; \ } while (0) -#endif -#endif /* __alpha */ +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#endif /* LONGLONG_STANDALONE */ +#endif /* _CRAYIEEE */ +#endif /* _CRAY */ #if defined (__hppa) && W_TYPE_SIZE == 64 +#if defined (__GNUC__) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add %4,%5,%1\n\tadd,dc %2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "%rM" (ah), "rM" (bh), "%rM" (al), "rM" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub %4,%5,%1\n\tsub,db %2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rM" (ah), "rM" (bh), "rM" (al), "rM" (bl)) +#endif /* We put the result pointer parameter last here, since it makes passing of the other parameters more efficient. */ #ifndef LONGLONG_STANDALONE #define umul_ppmm(wh, wl, u, v) \ do { \ - UDItype __p0; \ + UWtype __p0; \ (wh) = __MPN(umul_ppmm) (u, v, &__p0); \ (wl) = __p0; \ } while (0) -extern UDItype __MPN(umul_ppmm) _PROTO ((UDItype, UDItype, UDItype *)); +extern UWtype __MPN(umul_ppmm) _PROTO ((UWtype, UWtype, UWtype *)); #define udiv_qrnnd(q, r, n1, n0, d) \ - do { UDItype __r; \ + do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (n1, n0, d, &__r); \ (r) = __r; \ } while (0) -extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDItype, UDItype, UDItype *)); +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype, UWtype, UWtype, UWtype *)); #define UMUL_TIME 8 #define UDIV_TIME 60 #endif /* LONGLONG_STANDALONE */ @@ -163,15 +220,40 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("xma.hu %0 = %1, %2, f0" \ - : "=e" (ph) \ - : "e" (m0), "e" (m1)); \ + : "=f" (ph) \ + : "f" (m0), "f" (m1)); \ (pl) = __m0 * __m1; \ } while (0) +#define UMUL_TIME 14 +#define count_leading_zeros(count, x) \ + do { \ + UWtype _x = (x), _y, _a, _c; \ + __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ + __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ + _c = (_a - 1) << 3; \ + _x >>= _c; \ + if (_x >= 1 << 4) \ + _x >>= 4, _c += 4; \ + if (_x >= 1 << 2) \ + _x >>= 2, _c += 2; \ + _c += _x >> 1; \ + (count) = W_TYPE_SIZE - 1 - _c; \ + } while (0) #endif +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 #endif +#define UDIV_TIME 220 +#endif -#if defined (__GNUC__) && !defined (NO_ASM) +#if defined (__GNUC__) /* We sometimes need to clobber "cc" with gcc2, but that would not be understood by gcc1. Use cpp to avoid major code duplication. 
*/ @@ -190,7 +272,7 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ - : "=r" (sh), "=&r" (sl) \ + : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) #define umul_ppmm(xh, xl, m0, m1) \ do { \ @@ -213,42 +295,115 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI #define COUNT_LEADING_ZEROS_0 32 #endif /* __a29k__ */ +#if defined (__arc__) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ + : "=r" ((USItype) (sh)), \ + "=&r" ((USItype) (sl)) \ + : "%r" ((USItype) (ah)), \ + "rIJ" ((USItype) (bh)), \ + "%r" ((USItype) (al)), \ + "rIJ" ((USItype) (bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" ((USItype) (sh)), \ + "=&r" ((USItype) (sl)) \ + : "r" ((USItype) (ah)), \ + "rIJ" ((USItype) (bh)), \ + "r" ((USItype) (al)), \ + "rIJ" ((USItype) (bl))) +#endif + #if defined (__arm__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ : "=r" (sh), "=&r" (sl) \ - : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) + : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ - __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ - : "=r" (sh), "=&r" (sl) \ - : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) -#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ + do { \ + if (__builtin_constant_p (al)) \ + { \ + if (__builtin_constant_p (ah)) \ + __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + } \ + else if (__builtin_constant_p (ah)) \ + { \ + if (__builtin_constant_p (bl)) \ + __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + } \ + else if (__builtin_constant_p (bl)) \ + { \ + if (__builtin_constant_p (bh)) \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + } \ + else /* only bh might be a constant */ \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\ + } while (0) +#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ #define umul_ppmm(xh, xl, a, b) \ __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) +#define UMUL_TIME 5 #define smul_ppmm(xh, xl, a, b) \ __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) -#define UMUL_TIME 5 +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 70 +#endif /* LONGLONG_STANDALONE */ #else 
#define umul_ppmm(xh, xl, a, b) \ - __asm__ ("%@ Inlined umul_ppmm - mov %|r0, %2, lsr #16 - mov %|r2, %3, lsr #16 - bic %|r1, %2, %|r0, lsl #16 - bic %|r2, %3, %|r2, lsl #16 - mul %1, %|r1, %|r2 - mul %|r2, %|r0, %|r2 - mul %|r1, %0, %|r1 - mul %0, %|r0, %0 - adds %|r1, %|r2, %|r1 - addcs %0, %0, #65536 - adds %1, %1, %|r1, lsl #16 - adc %0, %0, %|r1, lsr #16" \ + __asm__ ("%@ Inlined umul_ppmm\n" \ +" mov %|r0, %2, lsr #16\n" \ +" mov %|r2, %3, lsr #16\n" \ +" bic %|r1, %2, %|r0, lsl #16\n" \ +" bic %|r2, %3, %|r2, lsl #16\n" \ +" mul %1, %|r1, %|r2\n" \ +" mul %|r2, %|r0, %|r2\n" \ +" mul %|r1, %0, %|r1\n" \ +" mul %0, %|r0, %0\n" \ +" adds %|r1, %|r2, %|r1\n" \ +" addcs %0, %0, #65536\n" \ +" adds %1, %1, %|r1, lsl #16\n" \ +" adc %0, %0, %|r1, lsr #16" \ : "=&r" (xh), "=r" (xl) \ : "r" (a), "r" (b) \ : "r0", "r1", "r2") #define UMUL_TIME 20 +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __r; \ + (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); +#define UDIV_TIME 200 +#endif /* LONGLONG_STANDALONE */ #endif -#define UDIV_TIME 100 #endif /* __arm__ */ #if defined (__clipper__) && W_TYPE_SIZE == 32 @@ -348,44 +503,44 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI #endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ - do { USItype __r; \ + do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) -extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype)); +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #endif /* LONGLONG_STANDALONE */ #define count_leading_zeros(count, x) \ do { \ USItype __tmp; \ __asm__ ( \ - "ldi 1,%0 - extru,= %1,15,16,%%r0 ; Bits 31..16 zero? - extru,tr %1,15,16,%1 ; No. Shift down, skip add. - ldo 16(%0),%0 ; Yes. Perform add. - extru,= %1,23,8,%%r0 ; Bits 15..8 zero? - extru,tr %1,23,8,%1 ; No. Shift down, skip add. - ldo 8(%0),%0 ; Yes. Perform add. - extru,= %1,27,4,%%r0 ; Bits 7..4 zero? - extru,tr %1,27,4,%1 ; No. Shift down, skip add. - ldo 4(%0),%0 ; Yes. Perform add. - extru,= %1,29,2,%%r0 ; Bits 3..2 zero? - extru,tr %1,29,2,%1 ; No. Shift down, skip add. - ldo 2(%0),%0 ; Yes. Perform add. - extru %1,30,1,%1 ; Extract bit 1. - sub %0,%1,%0 ; Subtract it. - " : "=r" (count), "=r" (__tmp) : "1" (x)); \ + "ldi 1,%0\n" \ +" extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ +" extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ +" ldo 16(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ +" extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ +" ldo 8(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ +" extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ +" ldo 4(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ +" extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ +" ldo 2(%0),%0 ; Yes. 
Perform add.\n" \ +" extru %1,30,1,%1 ; Extract bit 1.\n" \ +" sub %0,%1,%0 ; Subtract it.\n" \ + : "=r" (count), "=r" (__tmp) : "1" (x)); \ } while (0) #endif /* hppa */ -#if (defined (__i370__) || defined (__mvs__)) && W_TYPE_SIZE == 32 +#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 #define smul_ppmm(xh, xl, m0, m1) \ do { \ union {DItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ - __asm__ ("mr %0,%3" \ - : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ - : "%1" (m0), "r" (m1)); \ + __asm__ ("lr %N0,%1\n\tmr %0,%2" \ + : "=&r" (__x.__ll) \ + : "r" (m0), "r" (m1)); \ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ } while (0) #define sdiv_qrnnd(q, r, n1, n0, d) \ @@ -416,18 +571,114 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U __asm__ ("mull %3" \ : "=a" (w0), "=d" (w1) \ : "%0" ((USItype)(u)), "rm" ((USItype)(v))) -#define udiv_qrnnd(q, r, n1, n0, d) \ - __asm__ ("divl %4" \ +#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ + __asm__ ("divl %4" /* stringification in K&R C */ \ : "=a" (q), "=d" (r) \ - : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(d))) -#define count_leading_zeros(count, x) \ + : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) + +/* P5 bsrl takes between 10 and 72 cycles depending where the most + significant 1 bit is, hence the use of the alternatives below. bsfl is + slow too, between 18 and 42 depending where the least significant 1 bit + is. The faster count_leading_zeros are pressed into service via the + generic count_trailing_zeros at the end of the file. */ + +#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium + +/* The following should be a fixed 14 cycles or so. Some scheduling + opportunities should be available between the float load/store too. This + is used (with "n&-n" to get trailing zeros) in gcc 3 for __builtin_ffs + and is apparently suggested by the Intel optimizing manual (don't know + exactly where). gcc 2.95 or up will be best for this, so the "double" is + correctly aligned on the stack. */ + +#define count_leading_zeros(c,n) \ do { \ + union { \ + double d; \ + unsigned a[2]; \ + } __u; \ + ASSERT ((n) != 0); \ + __u.d = (UWtype) (n); \ + (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ + } while (0) +#define COUNT_LEADING_ZEROS_0 (0x3FF + 31) + +#else /* ! pentium */ +#if HAVE_HOST_CPU_pentiummmx + +/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 + cache miss reading from __clz_tab. It's favoured over the float above so + as to avoid mixing MMX and x87, since the penalty for switching between + the two is about 100 cycles. + + The asm block sets __shift to -3 if the high 24 bits are clear, -2 for + 16, -1 for 8, or 0 otherwise. This could be written equivalently as + follows, but as of gcc 2.95.2 it results in conditional jumps. + + __shift = -(__n < 0x1000000); + __shift -= (__n < 0x10000); + __shift -= (__n < 0x100); + + The middle two sbbl and cmpl's pair, and with luck something gcc + generates might pair with the first cmpl and the last sbbl. The "32+1" + constant could be folded into __clz_tab[], but it doesn't seem worth + making a different table just for that. 
*/ + +#define count_leading_zeros(c,n) \ + do { \ + USItype __n = (n); \ + USItype __shift; \ + __asm__ ("cmpl $0x1000000, %1\n" \ + "sbbl %0, %0\n" \ + "cmpl $0x10000, %1\n" \ + "sbbl $0, %0\n" \ + "cmpl $0x100, %1\n" \ + "sbbl $0, %0\n" \ + : "=&r" (__shift) : "r" (__n)); \ + __shift = __shift*8 + 24 + 1; \ + (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ + } while (0) + +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ + +#else /* !pentiummmx */ +/* On P6, gcc prior to 3.0 generates a partial register stall for + __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former + being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the + cost of one extra instruction. Do this for "i386" too, since that means + generic x86. */ +#if __GNUC__ < 3 \ + && (HAVE_HOST_CPU_i386 \ + || HAVE_HOST_CPU_i686 \ + || HAVE_HOST_CPU_pentiumpro \ + || HAVE_HOST_CPU_pentium2 \ + || HAVE_HOST_CPU_pentium3) +#define count_leading_zeros(count, x) \ + do { \ USItype __cbtmp; \ + ASSERT ((x) != 0); \ __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ + (count) = 31 - __cbtmp; \ + } while (0) +#else +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ (count) = __cbtmp ^ 31; \ } while (0) -#define count_trailing_zeros(count, x) \ - __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))) +#endif + +#define count_trailing_zeros(count, x) \ + do { \ + ASSERT ((x) != 0); \ + __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \ + } while (0) +#endif /* ! pentiummmx */ +#endif /* ! pentium */ + #ifndef UMUL_TIME #define UMUL_TIME 10 #endif @@ -436,6 +687,39 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #endif #endif /* 80x86 */ +#if defined (__x86_64__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addq %5,%1\n\tadcq %3,%0" \ + : "=r" ((UDItype)(sh)), "=&r" ((UDItype)(sl)) \ + : "%0" ((UDItype)(ah)), "g" ((UDItype)(bh)), \ + "%1" ((UDItype)(al)), "g" ((UDItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subq %5,%1\n\tsbbq %3,%0" \ + : "=r" ((UDItype)(sh)), "=&r" ((UDItype)(sl)) \ + : "0" ((UDItype)(ah)), "g" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), "g" ((UDItype)(bl))) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulq %3" \ + : "=a" (w0), "=d" (w1) \ + : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) +#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ + __asm__ ("divq %4" /* stringification in K&R C */ \ + : "=a" (q), "=d" (r) \ + : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) +#define count_leading_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ + (count) = __cbtmp ^ 63; \ + } while (0) +#define count_trailing_zeros(count, x) \ + do { \ + ASSERT ((x) != 0); \ + __asm__ ("bsfq %1,%0" : "=r" (count) : "rm" ((UDItype)(x))); \ + } while (0) +#endif /* x86_64 */ + #if defined (__i860__) && W_TYPE_SIZE == 32 #define rshift_rhlc(r,h,l,c) \ __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ @@ -508,7 +792,7 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #if defined (__mc68020__) || defined(mc68020) \ || defined (__mc68030__) || defined (mc68030) \ || defined (__mc68040__) || defined (mc68040) \ - || defined (__mc68332__) || defined (mc68332) \ + || defined (__mcpu32__) || defined (mcpu32) \ || defined 
(__NeXT__) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("mulu%.l %3,%1:%0" \ @@ -527,30 +811,30 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #else /* for other 68k family members use 16x16->32 multiplication */ #define umul_ppmm(xh, xl, a, b) \ do { USItype __umul_tmp1, __umul_tmp2; \ - __asm__ ("| Inlined umul_ppmm - move%.l %5,%3 - move%.l %2,%0 - move%.w %3,%1 - swap %3 - swap %0 - mulu%.w %2,%1 - mulu%.w %3,%0 - mulu%.w %2,%3 - swap %2 - mulu%.w %5,%2 - add%.l %3,%2 - jcc 1f - add%.l %#0x10000,%0 -1: move%.l %2,%3 - clr%.w %2 - swap %2 - swap %3 - clr%.w %3 - add%.l %3,%1 - addx%.l %2,%0 - | End inlined umul_ppmm" \ + __asm__ ("| Inlined umul_ppmm\n" \ +" move%.l %5,%3\n" \ +" move%.l %2,%0\n" \ +" move%.w %3,%1\n" \ +" swap %3\n" \ +" swap %0\n" \ +" mulu%.w %2,%1\n" \ +" mulu%.w %3,%0\n" \ +" mulu%.w %2,%3\n" \ +" swap %2\n" \ +" mulu%.w %5,%2\n" \ +" add%.l %3,%2\n" \ +" jcc 1f\n" \ +" add%.l %#0x10000,%0\n" \ +"1: move%.l %2,%3\n" \ +" clr%.w %2\n" \ +" swap %2\n" \ +" swap %3\n" \ +" clr%.w %3\n" \ +" add%.l %3,%1\n" \ +" addx%.l %2,%0\n" \ +" | End inlined umul_ppmm" \ : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \ - "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ + "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ } while (0) #define UMUL_TIME 100 @@ -670,17 +954,25 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U } while (0) #endif /* __ns32000__ */ -/* We should test _IBMR2 here when we add assembly support for the system - vendor compilers. */ -#if (defined (_ARCH_PPC) || defined (_ARCH_PWR) || defined (__powerpc__)) && W_TYPE_SIZE == 32 +/* FIXME: We should test _IBMR2 here when we add assembly support for the + system vendor compilers. + FIXME: What's needed for gcc PowerPC VxWorks? __vxworks__ is not good + enough, since that hits ARM and m68k too. 
*/ +#if (defined (_ARCH_PPC) /* AIX */ \ + || defined (_ARCH_PWR) /* AIX */ \ + || defined (__powerpc__) /* gcc */ \ + || defined (__POWERPC__) /* BEOS */ \ + || defined (__ppc__) /* Darwin */ \ + || defined (PPC) /* GNU/Linux, SysV */ \ + ) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else \ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ @@ -708,7 +1000,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #define count_leading_zeros(count, x) \ __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x)) #define COUNT_LEADING_ZEROS_0 32 -#if defined (_ARCH_PPC) || defined (__powerpc__) +#if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \ + || defined (__ppc__) || defined (PPC) || defined (__vxworks__) #define umul_ppmm(ph, pl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ @@ -742,10 +1035,10 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else \ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ @@ -825,29 +1118,29 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U "1" ((USItype)(al)), "r" ((USItype)(bl))) #define smul_ppmm(ph, pl, m0, m1) \ __asm__ ( \ - "s r2,r2 - mts r10,%2 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - cas %0,r2,r0 - mfs r10,%1" \ + "s r2,r2\n" \ +" mts r10,%2\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" cas %0,r2,r0\n" \ +" mfs r10,%1" \ : "=r" ((USItype)(ph)), "=r" ((USItype)(pl)) \ : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ - : "r2"); \ + : "r2") #define UMUL_TIME 20 #define UDIV_TIME 200 #define count_leading_zeros(count, x) \ @@ -882,6 +1175,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \ __CLOBBER_CC) +/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h + doesn't define anything to indicate that to us, it only sets __sparcv8. */ #if defined (__sparc_v9__) || defined (__sparcv9) /* Perhaps we should use floating-point operations here? 
*/ #if 0 @@ -905,7 +1200,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U (q) = __q; \ } while (0) #else -#if defined (__sparc_v8__) +#if defined (__sparc_v8__) /* gcc normal */ \ + || defined (__sparcv8) /* gcc solaris */ /* Don't match immediate range because, 1) it is not often useful, 2) the 'I' flag thinks of the range as a 13 bit signed interval, while we want to match a 13 bit interval, sign extended to 32 bits, @@ -913,7 +1209,12 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #define umul_ppmm(w1, w0, u, v) \ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #define UMUL_TIME 5 -#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */ + +#if HAVE_HOST_CPU_supersparc +#define UDIV_TIME 60 /* SuperSPARC timing */ +#else +/* Don't use this on SuperSPARC because its udiv only handles 53 bit + dividends and will trap to the kernel for the rest. */ #define udiv_qrnnd(q, r, n1, n0, d) \ do { \ USItype __q; \ @@ -923,9 +1224,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U (q) = __q; \ } while (0) #define UDIV_TIME 25 -#else -#define UDIV_TIME 60 /* SuperSPARC timing */ -#endif /* SUPERSPARC */ +#endif /* HAVE_HOST_CPU_supersparc */ + #else /* ! __sparc_v8__ */ #if defined (__sparclite__) /* This has hardware multiply but not divide. It also has two additional @@ -934,50 +1234,50 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #define UMUL_TIME 5 #define udiv_qrnnd(q, r, n1, n0, d) \ - __asm__ ("! Inlined udiv_qrnnd - wr %%g0,%2,%%y ! Not a delayed write for sparclite - tst %%g0 - divscc %3,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%0 - rd %%y,%1 - bl,a 1f - add %1,%4,%1 -1: ! End of inline udiv_qrnnd" \ - : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) + __asm__ ("! Inlined udiv_qrnnd\n" \ +" wr %%g0,%2,%%y ! 
Not a delayed write for sparclite\n" \ +" tst %%g0\n" \ +" divscc %3,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%0\n" \ +" rd %%y,%1\n" \ +" bl,a 1f\n" \ +" add %1,%4,%1\n" \ +"1: ! End of inline udiv_qrnnd" \ + : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ : "%g1" __AND_CLOBBER_CC) #define UDIV_TIME 37 #define count_leading_zeros(count, x) \ - __asm__ ("scan %1,0,%0" : "=r" (x) : "r" (count)) + __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) /* Early sparclites return 63 for an argument of 0, but they warn that future implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 undefined. */ @@ -987,46 +1287,46 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ #ifndef umul_ppmm #define umul_ppmm(w1, w0, u, v) \ - __asm__ ("! Inlined umul_ppmm - wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr - sra %3,31,%%g2 ! Don't move this insn - and %2,%%g2,%%g2 ! Don't move this insn - andcc %%g0,0,%%g1 ! Don't move this insn - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,0,%%g1 - add %%g1,%%g2,%0 - rd %%y,%1" \ + __asm__ ("! Inlined umul_ppmm\n" \ +" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ +" sra %3,31,%%g2 ! Don't move this insn\n" \ +" and %2,%%g2,%%g2 ! Don't move this insn\n" \ +" andcc %%g0,0,%%g1 ! 
Don't move this insn\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,0,%%g1\n" \ +" add %%g1,%%g2,%0\n" \ +" rd %%y,%1" \ : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ : "%g1", "%g2" __AND_CLOBBER_CC) #define UMUL_TIME 39 /* 39 instructions */ @@ -1034,11 +1334,11 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #ifndef udiv_qrnnd #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ - do { USItype __r; \ + do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) -extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype)); +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #ifndef UDIV_TIME #define UDIV_TIME 140 #endif @@ -1046,6 +1346,27 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #endif /* udiv_qrnnd */ #endif /* __sparc__ */ +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ( \ + "addcc %r4,%5,%1\n" \ + " addccc %r6,%7,%%g0\n" \ + " addc %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "%rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ( \ + "subcc %r4,%5,%1\n" \ + " subccc %r6,%7,%%g0\n" \ + " subc %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#endif + #if defined (__vax__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ @@ -1076,6 +1397,16 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U __asm__ ("ediv %3,%2,%0,%1" \ : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ } while (0) +#if 0 +/* FIXME: This instruction appears to be unimplemented on some systems (vax + 8800 maybe). */ +#define count_trailing_zeros(count,x) \ + do { \ + __asm__ ("ffs 0, 31, %1, %0" \ + : "=g" ((USItype) (count)) \ + : "g" ((USItype) (x))); \ + } while (0) +#endif #endif /* __vax__ */ #if defined (__z8000__) && W_TYPE_SIZE == 16 @@ -1106,7 +1437,9 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #endif /* __GNUC__ */ +#endif /* NO_ASM */ + #if !defined (umul_ppmm) && defined (__umulsidi3) #define umul_ppmm(ph, pl, m0, m1) \ { \ @@ -1130,25 +1463,25 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #if ! 
defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm #define mpn_umul_ppmm __MPN(umul_ppmm) extern mp_limb_t mpn_umul_ppmm _PROTO ((mp_limb_t *, mp_limb_t, mp_limb_t)); -#define umul_ppmm(wh, wl, u, v) \ - do { \ - mp_limb_t __umul_ppmm__p0; \ - (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \ - (mp_limb_t) (u), (mp_limb_t) (v)); \ - (wl) = __umul_ppmm__p0; \ +#define umul_ppmm(wh, wl, u, v) \ + do { \ + mp_limb_t __umul_ppmm__p0; \ + (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \ + (mp_limb_t) (u), (mp_limb_t) (v)); \ + (wl) = __umul_ppmm__p0; \ } while (0) #endif #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd #define mpn_udiv_qrnnd __MPN(udiv_qrnnd) extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, - mp_limb_t, mp_limb_t, mp_limb_t)); -#define udiv_qrnnd(q, r, n1, n0, d) \ - do { \ - mp_limb_t __udiv_qrnnd__r; \ - (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \ - (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \ - (r) = __udiv_qrnnd__r; \ + mp_limb_t, mp_limb_t, mp_limb_t)); +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + mp_limb_t __udiv_qrnnd__r; \ + (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \ + (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \ + (r) = __udiv_qrnnd__r; \ } while (0) #endif @@ -1233,6 +1566,10 @@ extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, #define __udiv_qrnnd_c(q, r, n1, n0, d) \ do { \ UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ + \ + ASSERT ((d) != 0); \ + ASSERT ((n1) < (d)); \ + \ __d1 = __ll_highpart (d); \ __d0 = __ll_lowpart (d); \ \ @@ -1284,36 +1621,37 @@ extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, #endif #if !defined (count_leading_zeros) -extern -#if __STDC__ -const -#endif -unsigned char __clz_tab[]; #define count_leading_zeros(count, x) \ do { \ UWtype __xr = (x); \ UWtype __a; \ \ - if (W_TYPE_SIZE <= 32) \ + if (W_TYPE_SIZE == 32) \ { \ __a = __xr < ((UWtype) 1 << 2*__BITS4) \ - ? (__xr < ((UWtype) 1 << __BITS4) ? 0 : __BITS4) \ - : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 : 3*__BITS4);\ + ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \ + : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \ + : 3*__BITS4 + 1); \ } \ else \ { \ for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ if (((__xr >> __a) & 0xff) != 0) \ break; \ + ++__a; \ } \ \ - (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a); \ + (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ } while (0) /* This version gives a well-defined value for zero. */ -#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE +#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #endif +#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +extern const unsigned char __GMP_DECLSPEC __clz_tab[128]; +#endif + #if !defined (count_trailing_zeros) /* Define count_trailing_zeros using count_leading_zeros. The latter might be defined in asm, but if it is not, the C version above is good enough. */ @@ -1321,6 +1659,7 @@ unsigned char __clz_tab[]; do { \ UWtype __ctz_x = (x); \ UWtype __ctz_c; \ + ASSERT (__ctz_x != 0); \ count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ (count) = W_TYPE_SIZE - 1 - __ctz_c; \ } while (0) @@ -1330,6 +1669,12 @@ unsigned char __clz_tab[]; #define UDIV_NEEDS_NORMALIZATION 0 #endif +/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and + that hence the latter should always be used. */ +#ifndef UDIV_PREINV_ALWAYS +#define UDIV_PREINV_ALWAYS 0 +#endif + /* Give defaults for UMUL_TIME and UDIV_TIME. 
*/ #ifndef UMUL_TIME #define UMUL_TIME 1 @@ -1338,10 +1683,3 @@ unsigned char __clz_tab[]; #ifndef UDIV_TIME #define UDIV_TIME UMUL_TIME #endif - -/* count_trailing_zeros is often on the slow side, so make that the default */ -#ifndef COUNT_TRAILING_ZEROS_TIME -#define COUNT_TRAILING_ZEROS_TIME 15 /* cycles */ -#endif - -
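
The revision above adds several per-CPU count_leading_zeros variants; below is a minimal standalone sketch (not part of the patch) of the "double exponent" trick it introduces for plain Pentium, where converting n to an IEEE double and reading the biased exponent from the high word yields floor(log2(n)). It assumes a little-endian host with IEEE 754 doubles, and the function names are illustrative only.

#include <stdio.h>
#include <assert.h>

/* 31 - floor(log2(n)), i.e. the number of leading zero bits in a nonzero
   32-bit value, recovered from the biased exponent in the high word of the
   double representation (bits 30..20 on a little-endian host). */
static unsigned clz32_via_double (unsigned n)
{
  union { double d; unsigned a[2]; } u;
  assert (n != 0);
  u.d = (double) n;
  return 0x3FF + 31 - (u.a[1] >> 20);
}

/* Trailing zeros derived from leading zeros, mirroring the generic
   count_trailing_zeros fallback kept at the end of longlong.h:
   ctz(n) = 31 - clz(n & -n). */
static unsigned ctz32_via_double (unsigned n)
{
  return 31 - clz32_via_double (n & -n);
}

int main (void)
{
  printf ("%u %u\n", clz32_via_double (1), clz32_via_double (0x80000000u)); /* 31 0 */
  printf ("%u %u\n", ctz32_via_double (8), ctz32_via_double (0x80000000u)); /* 3 31 */
  return 0;
}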