=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/Attic/longlong.h,v retrieving revision 1.1.1.2 retrieving revision 1.1.1.3 diff -u -p -r1.1.1.2 -r1.1.1.3 --- OpenXM_contrib/gmp/Attic/longlong.h 2000/09/09 14:12:17 1.1.1.2 +++ OpenXM_contrib/gmp/Attic/longlong.h 2003/08/25 16:06:00 1.1.1.3 @@ -1,7 +1,7 @@ /* longlong.h -- definitions for mixed size 32/64 bit arithmetic. -Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000 Free Software -Foundation, Inc. +Copyright 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002 Free +Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by @@ -101,7 +101,12 @@ MA 02111-1307, USA. */ Please add support for more CPUs here, or improve the current support for the CPUs below! */ +/* FIXME: The macros using external routines like __MPN(count_leading_zeros) + don't need to be under !NO_ASM */ +#if ! defined (NO_ASM) + #if defined (__alpha) && W_TYPE_SIZE == 64 +/* Most alpha-based machines, except Cray systems. */ #if defined (__GNUC__) #define umul_ppmm(ph, pl, m0, m1) \ do { \ @@ -112,46 +117,98 @@ MA 02111-1307, USA. */ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 18 +#else /* ! __GNUC__ */ +#include +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + (ph) = __UMULH (m0, m1); \ + (pl) = __m0 * __m1; \ + } while (0) +#endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ - do { UDItype __di; \ + do { UWtype __di; \ __di = __MPN(invert_limb) (d); \ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ } while (0) +#define UDIV_PREINV_ALWAYS 1 #define UDIV_NEEDS_NORMALIZATION 1 #define UDIV_TIME 220 -long __MPN(count_leading_zeros) (); +#endif /* LONGLONG_STANDALONE */ +/* clz_tab is required by mpn/alpha/cntlz.asm, and that file is built for + all alphas, even though ev67 and ev68 don't need it. */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#if defined (__GNUC__) && (HAVE_HOST_CPU_alphaev67 || HAVE_HOST_CPU_alphaev68) +#define count_leading_zeros(COUNT,X) \ + __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) +#define count_trailing_zeros(COUNT,X) \ + __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) +#else /* ! (ev67 || ev68) */ +#ifndef LONGLONG_STANDALONE +#if HAVE_ATTRIBUTE_CONST +long __MPN(count_leading_zeros) _PROTO ((UDItype)) __attribute__ ((const)); +#else +long __MPN(count_leading_zeros) _PROTO ((UDItype)); +#endif #define count_leading_zeros(count, x) \ ((count) = __MPN(count_leading_zeros) (x)) #endif /* LONGLONG_STANDALONE */ -#else /* ! __GNUC__ */ -#include +#endif /* ! 
(ev67 || ev68) */ +#endif /* __alpha */ + +#if defined (_CRAY) && W_TYPE_SIZE == 64 +#include +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 220 +long __MPN(count_leading_zeros) _PROTO ((UDItype)); +#define count_leading_zeros(count, x) \ + ((count) = _leadz ((UWtype) (x))) +#if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */ #define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ - (ph) = __UMULH (m0, m1); \ + (ph) = _int_mult_upper (m0, m1); \ (pl) = __m0 * __m1; \ } while (0) -#endif -#endif /* __alpha */ +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#endif /* LONGLONG_STANDALONE */ +#endif /* _CRAYIEEE */ +#endif /* _CRAY */ #if defined (__hppa) && W_TYPE_SIZE == 64 +#if defined (__GNUC__) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add %4,%5,%1\n\tadd,dc %2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "%rM" (ah), "rM" (bh), "%rM" (al), "rM" (bl)) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub %4,%5,%1\n\tsub,db %2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rM" (ah), "rM" (bh), "rM" (al), "rM" (bl)) +#endif /* We put the result pointer parameter last here, since it makes passing of the other parameters more efficient. */ #ifndef LONGLONG_STANDALONE #define umul_ppmm(wh, wl, u, v) \ do { \ - UDItype __p0; \ + UWtype __p0; \ (wh) = __MPN(umul_ppmm) (u, v, &__p0); \ (wl) = __p0; \ } while (0) -extern UDItype __MPN(umul_ppmm) _PROTO ((UDItype, UDItype, UDItype *)); +extern UWtype __MPN(umul_ppmm) _PROTO ((UWtype, UWtype, UWtype *)); #define udiv_qrnnd(q, r, n1, n0, d) \ - do { UDItype __r; \ + do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (n1, n0, d, &__r); \ (r) = __r; \ } while (0) -extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDItype, UDItype, UDItype *)); +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype, UWtype, UWtype, UWtype *)); #define UMUL_TIME 8 #define UDIV_TIME 60 #endif /* LONGLONG_STANDALONE */ @@ -163,15 +220,40 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("xma.hu %0 = %1, %2, f0" \ - : "=e" (ph) \ - : "e" (m0), "e" (m1)); \ + : "=f" (ph) \ + : "f" (m0), "f" (m1)); \ (pl) = __m0 * __m1; \ } while (0) +#define UMUL_TIME 14 +#define count_leading_zeros(count, x) \ + do { \ + UWtype _x = (x), _y, _a, _c; \ + __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \ + __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \ + _c = (_a - 1) << 3; \ + _x >>= _c; \ + if (_x >= 1 << 4) \ + _x >>= 4, _c += 4; \ + if (_x >= 1 << 2) \ + _x >>= 2, _c += 2; \ + _c += _x >> 1; \ + (count) = W_TYPE_SIZE - 1 - _c; \ + } while (0) #endif +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 #endif +#define UDIV_TIME 220 +#endif -#if defined (__GNUC__) && !defined (NO_ASM) +#if defined (__GNUC__) /* We sometimes need to clobber "cc" with gcc2, but that would not be understood by gcc1. Use cpp to avoid major code duplication. 
*/ @@ -190,7 +272,7 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \ - : "=r" (sh), "=&r" (sl) \ + : "=r" (sh), "=&r" (sl) \ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) #define umul_ppmm(xh, xl, m0, m1) \ do { \ @@ -213,42 +295,115 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI #define COUNT_LEADING_ZEROS_0 32 #endif /* __a29k__ */ +#if defined (__arc__) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \ + : "=r" ((USItype) (sh)), \ + "=&r" ((USItype) (sl)) \ + : "%r" ((USItype) (ah)), \ + "rIJ" ((USItype) (bh)), \ + "%r" ((USItype) (al)), \ + "rIJ" ((USItype) (bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" ((USItype) (sh)), \ + "=&r" ((USItype) (sl)) \ + : "r" ((USItype) (ah)), \ + "rIJ" ((USItype) (bh)), \ + "r" ((USItype) (al)), \ + "rIJ" ((USItype) (bl))) +#endif + #if defined (__arm__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \ : "=r" (sh), "=&r" (sl) \ - : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl)) + : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ - __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ - : "=r" (sh), "=&r" (sl) \ - : "r" (ah), "rI" (bh), "r" (al), "rI" (bl)) -#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ + do { \ + if (__builtin_constant_p (al)) \ + { \ + if (__builtin_constant_p (ah)) \ + __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + } \ + else if (__builtin_constant_p (ah)) \ + { \ + if (__builtin_constant_p (bl)) \ + __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \ + } \ + else if (__builtin_constant_p (bl)) \ + { \ + if (__builtin_constant_p (bh)) \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + else \ + __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \ + : "=r" (sh), "=&r" (sl) \ + : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \ + } \ + else /* only bh might be a constant */ \ + __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\ + } while (0) +#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */ #define umul_ppmm(xh, xl, a, b) \ __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) +#define UMUL_TIME 5 #define smul_ppmm(xh, xl, a, b) \ __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b)) -#define UMUL_TIME 5 +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __di; \ + __di = __MPN(invert_limb) (d); \ + udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ + } while (0) +#define UDIV_PREINV_ALWAYS 1 +#define UDIV_NEEDS_NORMALIZATION 1 +#define UDIV_TIME 70 +#endif /* LONGLONG_STANDALONE */ #else 
#define umul_ppmm(xh, xl, a, b) \ - __asm__ ("%@ Inlined umul_ppmm - mov %|r0, %2, lsr #16 - mov %|r2, %3, lsr #16 - bic %|r1, %2, %|r0, lsl #16 - bic %|r2, %3, %|r2, lsl #16 - mul %1, %|r1, %|r2 - mul %|r2, %|r0, %|r2 - mul %|r1, %0, %|r1 - mul %0, %|r0, %0 - adds %|r1, %|r2, %|r1 - addcs %0, %0, #65536 - adds %1, %1, %|r1, lsl #16 - adc %0, %0, %|r1, lsr #16" \ + __asm__ ("%@ Inlined umul_ppmm\n" \ +" mov %|r0, %2, lsr #16\n" \ +" mov %|r2, %3, lsr #16\n" \ +" bic %|r1, %2, %|r0, lsl #16\n" \ +" bic %|r2, %3, %|r2, lsl #16\n" \ +" mul %1, %|r1, %|r2\n" \ +" mul %|r2, %|r0, %|r2\n" \ +" mul %|r1, %0, %|r1\n" \ +" mul %0, %|r0, %0\n" \ +" adds %|r1, %|r2, %|r1\n" \ +" addcs %0, %0, #65536\n" \ +" adds %1, %1, %|r1, lsl #16\n" \ +" adc %0, %0, %|r1, lsr #16" \ : "=&r" (xh), "=r" (xl) \ : "r" (a), "r" (b) \ : "r0", "r1", "r2") #define UMUL_TIME 20 +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UWtype __r; \ + (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); +#define UDIV_TIME 200 +#endif /* LONGLONG_STANDALONE */ #endif -#define UDIV_TIME 100 #endif /* __arm__ */ #if defined (__clipper__) && W_TYPE_SIZE == 32 @@ -348,44 +503,44 @@ extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDI #endif #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ - do { USItype __r; \ + do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) -extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype)); +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #endif /* LONGLONG_STANDALONE */ #define count_leading_zeros(count, x) \ do { \ USItype __tmp; \ __asm__ ( \ - "ldi 1,%0 - extru,= %1,15,16,%%r0 ; Bits 31..16 zero? - extru,tr %1,15,16,%1 ; No. Shift down, skip add. - ldo 16(%0),%0 ; Yes. Perform add. - extru,= %1,23,8,%%r0 ; Bits 15..8 zero? - extru,tr %1,23,8,%1 ; No. Shift down, skip add. - ldo 8(%0),%0 ; Yes. Perform add. - extru,= %1,27,4,%%r0 ; Bits 7..4 zero? - extru,tr %1,27,4,%1 ; No. Shift down, skip add. - ldo 4(%0),%0 ; Yes. Perform add. - extru,= %1,29,2,%%r0 ; Bits 3..2 zero? - extru,tr %1,29,2,%1 ; No. Shift down, skip add. - ldo 2(%0),%0 ; Yes. Perform add. - extru %1,30,1,%1 ; Extract bit 1. - sub %0,%1,%0 ; Subtract it. - " : "=r" (count), "=r" (__tmp) : "1" (x)); \ + "ldi 1,%0\n" \ +" extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \ +" extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ +" ldo 16(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \ +" extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ +" ldo 8(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \ +" extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ +" ldo 4(%0),%0 ; Yes. Perform add.\n" \ +" extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \ +" extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ +" ldo 2(%0),%0 ; Yes. 
Perform add.\n" \ +" extru %1,30,1,%1 ; Extract bit 1.\n" \ +" sub %0,%1,%0 ; Subtract it.\n" \ + : "=r" (count), "=r" (__tmp) : "1" (x)); \ } while (0) #endif /* hppa */ -#if (defined (__i370__) || defined (__mvs__)) && W_TYPE_SIZE == 32 +#if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32 #define smul_ppmm(xh, xl, m0, m1) \ do { \ union {DItype __ll; \ struct {USItype __h, __l;} __i; \ } __x; \ - __asm__ ("mr %0,%3" \ - : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \ - : "%1" (m0), "r" (m1)); \ + __asm__ ("lr %N0,%1\n\tmr %0,%2" \ + : "=&r" (__x.__ll) \ + : "r" (m0), "r" (m1)); \ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \ } while (0) #define sdiv_qrnnd(q, r, n1, n0, d) \ @@ -416,18 +571,114 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U __asm__ ("mull %3" \ : "=a" (w0), "=d" (w1) \ : "%0" ((USItype)(u)), "rm" ((USItype)(v))) -#define udiv_qrnnd(q, r, n1, n0, d) \ - __asm__ ("divl %4" \ +#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ + __asm__ ("divl %4" /* stringification in K&R C */ \ : "=a" (q), "=d" (r) \ - : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(d))) -#define count_leading_zeros(count, x) \ + : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) + +/* P5 bsrl takes between 10 and 72 cycles depending where the most + significant 1 bit is, hence the use of the alternatives below. bsfl is + slow too, between 18 and 42 depending where the least significant 1 bit + is. The faster count_leading_zeros are pressed into service via the + generic count_trailing_zeros at the end of the file. */ + +#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium + +/* The following should be a fixed 14 cycles or so. Some scheduling + opportunities should be available between the float load/store too. This + is used (with "n&-n" to get trailing zeros) in gcc 3 for __builtin_ffs + and is apparently suggested by the Intel optimizing manual (don't know + exactly where). gcc 2.95 or up will be best for this, so the "double" is + correctly aligned on the stack. */ + +#define count_leading_zeros(c,n) \ do { \ + union { \ + double d; \ + unsigned a[2]; \ + } __u; \ + ASSERT ((n) != 0); \ + __u.d = (UWtype) (n); \ + (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ + } while (0) +#define COUNT_LEADING_ZEROS_0 (0x3FF + 31) + +#else /* ! pentium */ +#if HAVE_HOST_CPU_pentiummmx + +/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 + cache miss reading from __clz_tab. It's favoured over the float above so + as to avoid mixing MMX and x87, since the penalty for switching between + the two is about 100 cycles. + + The asm block sets __shift to -3 if the high 24 bits are clear, -2 for + 16, -1 for 8, or 0 otherwise. This could be written equivalently as + follows, but as of gcc 2.95.2 it results in conditional jumps. + + __shift = -(__n < 0x1000000); + __shift -= (__n < 0x10000); + __shift -= (__n < 0x100); + + The middle two sbbl and cmpl's pair, and with luck something gcc + generates might pair with the first cmpl and the last sbbl. The "32+1" + constant could be folded into __clz_tab[], but it doesn't seem worth + making a different table just for that. 
*/ + +#define count_leading_zeros(c,n) \ + do { \ + USItype __n = (n); \ + USItype __shift; \ + __asm__ ("cmpl $0x1000000, %1\n" \ + "sbbl %0, %0\n" \ + "cmpl $0x10000, %1\n" \ + "sbbl $0, %0\n" \ + "cmpl $0x100, %1\n" \ + "sbbl $0, %0\n" \ + : "=&r" (__shift) : "r" (__n)); \ + __shift = __shift*8 + 24 + 1; \ + (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \ + } while (0) + +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB +#define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ + +#else /* !pentiummmx */ +/* On P6, gcc prior to 3.0 generates a partial register stall for + __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former + being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the + cost of one extra instruction. Do this for "i386" too, since that means + generic x86. */ +#if __GNUC__ < 3 \ + && (HAVE_HOST_CPU_i386 \ + || HAVE_HOST_CPU_i686 \ + || HAVE_HOST_CPU_pentiumpro \ + || HAVE_HOST_CPU_pentium2 \ + || HAVE_HOST_CPU_pentium3) +#define count_leading_zeros(count, x) \ + do { \ USItype __cbtmp; \ + ASSERT ((x) != 0); \ __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ + (count) = 31 - __cbtmp; \ + } while (0) +#else +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ (count) = __cbtmp ^ 31; \ } while (0) -#define count_trailing_zeros(count, x) \ - __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))) +#endif + +#define count_trailing_zeros(count, x) \ + do { \ + ASSERT ((x) != 0); \ + __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \ + } while (0) +#endif /* ! pentiummmx */ +#endif /* ! pentium */ + #ifndef UMUL_TIME #define UMUL_TIME 10 #endif @@ -436,6 +687,39 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #endif #endif /* 80x86 */ +#if defined (__x86_64__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addq %5,%1\n\tadcq %3,%0" \ + : "=r" ((UDItype)(sh)), "=&r" ((UDItype)(sl)) \ + : "%0" ((UDItype)(ah)), "g" ((UDItype)(bh)), \ + "%1" ((UDItype)(al)), "g" ((UDItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subq %5,%1\n\tsbbq %3,%0" \ + : "=r" ((UDItype)(sh)), "=&r" ((UDItype)(sl)) \ + : "0" ((UDItype)(ah)), "g" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), "g" ((UDItype)(bl))) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulq %3" \ + : "=a" (w0), "=d" (w1) \ + : "%0" ((UDItype)(u)), "rm" ((UDItype)(v))) +#define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\ + __asm__ ("divq %4" /* stringification in K&R C */ \ + : "=a" (q), "=d" (r) \ + : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx))) +#define count_leading_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + ASSERT ((x) != 0); \ + __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \ + (count) = __cbtmp ^ 63; \ + } while (0) +#define count_trailing_zeros(count, x) \ + do { \ + ASSERT ((x) != 0); \ + __asm__ ("bsfq %1,%0" : "=r" (count) : "rm" ((UDItype)(x))); \ + } while (0) +#endif /* x86_64 */ + #if defined (__i860__) && W_TYPE_SIZE == 32 #define rshift_rhlc(r,h,l,c) \ __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \ @@ -508,7 +792,7 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #if defined (__mc68020__) || defined(mc68020) \ || defined (__mc68030__) || defined (mc68030) \ || defined (__mc68040__) || defined (mc68040) \ - || defined (__mc68332__) || defined (mc68332) \ + || defined (__mcpu32__) || defined (mcpu32) \ || defined 
(__NeXT__) #define umul_ppmm(w1, w0, u, v) \ __asm__ ("mulu%.l %3,%1:%0" \ @@ -527,30 +811,30 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #else /* for other 68k family members use 16x16->32 multiplication */ #define umul_ppmm(xh, xl, a, b) \ do { USItype __umul_tmp1, __umul_tmp2; \ - __asm__ ("| Inlined umul_ppmm - move%.l %5,%3 - move%.l %2,%0 - move%.w %3,%1 - swap %3 - swap %0 - mulu%.w %2,%1 - mulu%.w %3,%0 - mulu%.w %2,%3 - swap %2 - mulu%.w %5,%2 - add%.l %3,%2 - jcc 1f - add%.l %#0x10000,%0 -1: move%.l %2,%3 - clr%.w %2 - swap %2 - swap %3 - clr%.w %3 - add%.l %3,%1 - addx%.l %2,%0 - | End inlined umul_ppmm" \ + __asm__ ("| Inlined umul_ppmm\n" \ +" move%.l %5,%3\n" \ +" move%.l %2,%0\n" \ +" move%.w %3,%1\n" \ +" swap %3\n" \ +" swap %0\n" \ +" mulu%.w %2,%1\n" \ +" mulu%.w %3,%0\n" \ +" mulu%.w %2,%3\n" \ +" swap %2\n" \ +" mulu%.w %5,%2\n" \ +" add%.l %3,%2\n" \ +" jcc 1f\n" \ +" add%.l %#0x10000,%0\n" \ +"1: move%.l %2,%3\n" \ +" clr%.w %2\n" \ +" swap %2\n" \ +" swap %3\n" \ +" clr%.w %3\n" \ +" add%.l %3,%1\n" \ +" addx%.l %2,%0\n" \ +" | End inlined umul_ppmm" \ : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \ - "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ + "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ } while (0) #define UMUL_TIME 100 @@ -670,17 +954,25 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U } while (0) #endif /* __ns32000__ */ -/* We should test _IBMR2 here when we add assembly support for the system - vendor compilers. */ -#if (defined (_ARCH_PPC) || defined (_ARCH_PWR) || defined (__powerpc__)) && W_TYPE_SIZE == 32 +/* FIXME: We should test _IBMR2 here when we add assembly support for the + system vendor compilers. + FIXME: What's needed for gcc PowerPC VxWorks? __vxworks__ is not good + enough, since that hits ARM and m68k too. 
*/ +#if (defined (_ARCH_PPC) /* AIX */ \ + || defined (_ARCH_PWR) /* AIX */ \ + || defined (__powerpc__) /* gcc */ \ + || defined (__POWERPC__) /* BEOS */ \ + || defined (__ppc__) /* Darwin */ \ + || defined (PPC) /* GNU/Linux, SysV */ \ + ) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else \ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ @@ -708,7 +1000,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #define count_leading_zeros(count, x) \ __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x)) #define COUNT_LEADING_ZEROS_0 32 -#if defined (_ARCH_PPC) || defined (__powerpc__) +#if defined (_ARCH_PPC) || defined (__powerpc__) || defined (__POWERPC__) \ + || defined (__ppc__) || defined (PPC) || defined (__vxworks__) #define umul_ppmm(ph, pl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ @@ -742,10 +1035,10 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ else \ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ : "=r" (sh), "=&r" (sl) \ @@ -825,29 +1118,29 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U "1" ((USItype)(al)), "r" ((USItype)(bl))) #define smul_ppmm(ph, pl, m0, m1) \ __asm__ ( \ - "s r2,r2 - mts r10,%2 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - m r2,%3 - cas %0,r2,r0 - mfs r10,%1" \ + "s r2,r2\n" \ +" mts r10,%2\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" m r2,%3\n" \ +" cas %0,r2,r0\n" \ +" mfs r10,%1" \ : "=r" ((USItype)(ph)), "=r" ((USItype)(pl)) \ : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \ - : "r2"); \ + : "r2") #define UMUL_TIME 20 #define UDIV_TIME 200 #define count_leading_zeros(count, x) \ @@ -882,6 +1175,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U : "=r" (sh), "=&r" (sl) \ : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \ __CLOBBER_CC) +/* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h + doesn't define anything to indicate that to us, it only sets __sparcv8. */ #if defined (__sparc_v9__) || defined (__sparcv9) /* Perhaps we should use floating-point operations here? 
*/ #if 0 @@ -905,7 +1200,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U (q) = __q; \ } while (0) #else -#if defined (__sparc_v8__) +#if defined (__sparc_v8__) /* gcc normal */ \ + || defined (__sparcv8) /* gcc solaris */ /* Don't match immediate range because, 1) it is not often useful, 2) the 'I' flag thinks of the range as a 13 bit signed interval, while we want to match a 13 bit interval, sign extended to 32 bits, @@ -913,7 +1209,12 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #define umul_ppmm(w1, w0, u, v) \ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #define UMUL_TIME 5 -#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */ + +#if HAVE_HOST_CPU_supersparc +#define UDIV_TIME 60 /* SuperSPARC timing */ +#else +/* Don't use this on SuperSPARC because its udiv only handles 53 bit + dividends and will trap to the kernel for the rest. */ #define udiv_qrnnd(q, r, n1, n0, d) \ do { \ USItype __q; \ @@ -923,9 +1224,8 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U (q) = __q; \ } while (0) #define UDIV_TIME 25 -#else -#define UDIV_TIME 60 /* SuperSPARC timing */ -#endif /* SUPERSPARC */ +#endif /* HAVE_HOST_CPU_supersparc */ + #else /* ! __sparc_v8__ */ #if defined (__sparclite__) /* This has hardware multiply but not divide. It also has two additional @@ -934,50 +1234,50 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v)) #define UMUL_TIME 5 #define udiv_qrnnd(q, r, n1, n0, d) \ - __asm__ ("! Inlined udiv_qrnnd - wr %%g0,%2,%%y ! Not a delayed write for sparclite - tst %%g0 - divscc %3,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%%g1 - divscc %%g1,%4,%0 - rd %%y,%1 - bl,a 1f - add %1,%4,%1 -1: ! End of inline udiv_qrnnd" \ - : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) + __asm__ ("! Inlined udiv_qrnnd\n" \ +" wr %%g0,%2,%%y ! 
Not a delayed write for sparclite\n" \ +" tst %%g0\n" \ +" divscc %3,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%%g1\n" \ +" divscc %%g1,%4,%0\n" \ +" rd %%y,%1\n" \ +" bl,a 1f\n" \ +" add %1,%4,%1\n" \ +"1: ! End of inline udiv_qrnnd" \ + : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \ : "%g1" __AND_CLOBBER_CC) #define UDIV_TIME 37 #define count_leading_zeros(count, x) \ - __asm__ ("scan %1,0,%0" : "=r" (x) : "r" (count)) + __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x)) /* Early sparclites return 63 for an argument of 0, but they warn that future implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 undefined. */ @@ -987,46 +1287,46 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ #ifndef umul_ppmm #define umul_ppmm(w1, w0, u, v) \ - __asm__ ("! Inlined umul_ppmm - wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr - sra %3,31,%%g2 ! Don't move this insn - and %2,%%g2,%%g2 ! Don't move this insn - andcc %%g0,0,%%g1 ! Don't move this insn - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,%3,%%g1 - mulscc %%g1,0,%%g1 - add %%g1,%%g2,%0 - rd %%y,%1" \ + __asm__ ("! Inlined umul_ppmm\n" \ +" wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \ +" sra %3,31,%%g2 ! Don't move this insn\n" \ +" and %2,%%g2,%%g2 ! Don't move this insn\n" \ +" andcc %%g0,0,%%g1 ! 
Don't move this insn\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,%3,%%g1\n" \ +" mulscc %%g1,0,%%g1\n" \ +" add %%g1,%%g2,%0\n" \ +" rd %%y,%1" \ : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \ : "%g1", "%g2" __AND_CLOBBER_CC) #define UMUL_TIME 39 /* 39 instructions */ @@ -1034,11 +1334,11 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #ifndef udiv_qrnnd #ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ - do { USItype __r; \ + do { UWtype __r; \ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) -extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype)); +extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #ifndef UDIV_TIME #define UDIV_TIME 140 #endif @@ -1046,6 +1346,27 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #endif /* udiv_qrnnd */ #endif /* __sparc__ */ +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ( \ + "addcc %r4,%5,%1\n" \ + " addccc %r6,%7,%%g0\n" \ + " addc %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "%rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ( \ + "subcc %r4,%5,%1\n" \ + " subccc %r6,%7,%%g0\n" \ + " subc %r2,%3,%0" \ + : "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#endif + #if defined (__vax__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \ @@ -1076,6 +1397,16 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U __asm__ ("ediv %3,%2,%0,%1" \ : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \ } while (0) +#if 0 +/* FIXME: This instruction appears to be unimplemented on some systems (vax + 8800 maybe). */ +#define count_trailing_zeros(count,x) \ + do { \ + __asm__ ("ffs 0, 31, %1, %0" \ + : "=g" ((USItype) (count)) \ + : "g" ((USItype) (x))); \ + } while (0) +#endif #endif /* __vax__ */ #if defined (__z8000__) && W_TYPE_SIZE == 16 @@ -1106,7 +1437,9 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #endif /* __GNUC__ */ +#endif /* NO_ASM */ + #if !defined (umul_ppmm) && defined (__umulsidi3) #define umul_ppmm(ph, pl, m0, m1) \ { \ @@ -1130,25 +1463,25 @@ extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, U #if ! 
defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm #define mpn_umul_ppmm __MPN(umul_ppmm) extern mp_limb_t mpn_umul_ppmm _PROTO ((mp_limb_t *, mp_limb_t, mp_limb_t)); -#define umul_ppmm(wh, wl, u, v) \ - do { \ - mp_limb_t __umul_ppmm__p0; \ - (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \ - (mp_limb_t) (u), (mp_limb_t) (v)); \ - (wl) = __umul_ppmm__p0; \ +#define umul_ppmm(wh, wl, u, v) \ + do { \ + mp_limb_t __umul_ppmm__p0; \ + (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \ + (mp_limb_t) (u), (mp_limb_t) (v)); \ + (wl) = __umul_ppmm__p0; \ } while (0) #endif #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd #define mpn_udiv_qrnnd __MPN(udiv_qrnnd) extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, - mp_limb_t, mp_limb_t, mp_limb_t)); -#define udiv_qrnnd(q, r, n1, n0, d) \ - do { \ - mp_limb_t __udiv_qrnnd__r; \ - (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \ - (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \ - (r) = __udiv_qrnnd__r; \ + mp_limb_t, mp_limb_t, mp_limb_t)); +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + mp_limb_t __udiv_qrnnd__r; \ + (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \ + (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \ + (r) = __udiv_qrnnd__r; \ } while (0) #endif @@ -1233,6 +1566,10 @@ extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, #define __udiv_qrnnd_c(q, r, n1, n0, d) \ do { \ UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ + \ + ASSERT ((d) != 0); \ + ASSERT ((n1) < (d)); \ + \ __d1 = __ll_highpart (d); \ __d0 = __ll_lowpart (d); \ \ @@ -1284,36 +1621,37 @@ extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, #endif #if !defined (count_leading_zeros) -extern -#if __STDC__ -const -#endif -unsigned char __clz_tab[]; #define count_leading_zeros(count, x) \ do { \ UWtype __xr = (x); \ UWtype __a; \ \ - if (W_TYPE_SIZE <= 32) \ + if (W_TYPE_SIZE == 32) \ { \ __a = __xr < ((UWtype) 1 << 2*__BITS4) \ - ? (__xr < ((UWtype) 1 << __BITS4) ? 0 : __BITS4) \ - : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 : 3*__BITS4);\ + ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \ + : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \ + : 3*__BITS4 + 1); \ } \ else \ { \ for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ if (((__xr >> __a) & 0xff) != 0) \ break; \ + ++__a; \ } \ \ - (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a); \ + (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ } while (0) /* This version gives a well-defined value for zero. */ -#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE +#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1) #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #endif +#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +extern const unsigned char __GMP_DECLSPEC __clz_tab[128]; +#endif + #if !defined (count_trailing_zeros) /* Define count_trailing_zeros using count_leading_zeros. The latter might be defined in asm, but if it is not, the C version above is good enough. */ @@ -1321,6 +1659,7 @@ unsigned char __clz_tab[]; do { \ UWtype __ctz_x = (x); \ UWtype __ctz_c; \ + ASSERT (__ctz_x != 0); \ count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ (count) = W_TYPE_SIZE - 1 - __ctz_c; \ } while (0) @@ -1330,6 +1669,12 @@ unsigned char __clz_tab[]; #define UDIV_NEEDS_NORMALIZATION 0 #endif +/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and + that hence the latter should always be used. */ +#ifndef UDIV_PREINV_ALWAYS +#define UDIV_PREINV_ALWAYS 0 +#endif + /* Give defaults for UMUL_TIME and UDIV_TIME. 
*/ #ifndef UMUL_TIME #define UMUL_TIME 1 @@ -1338,10 +1683,3 @@ unsigned char __clz_tab[]; #ifndef UDIV_TIME #define UDIV_TIME UMUL_TIME #endif - -/* count_trailing_zeros is often on the slow side, so make that the default */ -#ifndef COUNT_TRAILING_ZEROS_TIME -#define COUNT_TRAILING_ZEROS_TIME 15 /* cycles */ -#endif - -
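
The revision above adds several per-CPU count_leading_zeros variants; below is a minimal standalone sketch (not part of the patch) of the "double exponent" trick it introduces for plain Pentium, where converting n to an IEEE double and reading the biased exponent from the high word yields floor(log2(n)). It assumes a little-endian host with IEEE 754 doubles, and the function names are illustrative only.

#include <stdio.h>
#include <assert.h>

/* 31 - floor(log2(n)), i.e. the number of leading zero bits in a nonzero
   32-bit value, recovered from the biased exponent in the high word of the
   double representation (bits 30..20 on a little-endian host). */
static unsigned clz32_via_double (unsigned n)
{
  union { double d; unsigned a[2]; } u;
  assert (n != 0);
  u.d = (double) n;
  return 0x3FF + 31 - (u.a[1] >> 20);
}

/* Trailing zeros derived from leading zeros, mirroring the generic
   count_trailing_zeros fallback kept at the end of longlong.h:
   ctz(n) = 31 - clz(n & -n). */
static unsigned ctz32_via_double (unsigned n)
{
  return 31 - clz32_via_double (n & -n);
}

int main (void)
{
  printf ("%u %u\n", clz32_via_double (1), clz32_via_double (0x80000000u)); /* 31 0 */
  printf ("%u %u\n", ctz32_via_double (8), ctz32_via_double (0x80000000u)); /* 3 31 */
  return 0;
}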