OpenXM_contrib/gmp/mpn/x86/divrem_1.asm - diff

Return to divrem_1.asm CVS log

Up to [local] / OpenXM_contrib / gmp / mpn / x86

Diff for /OpenXM_contrib/gmp/mpn/x86/Attic/divrem_1.asm between version 1.1.1.1 and 1.1.1.2

version 1.1.1.1, 2000/09/09 14:12:42

version 1.1.1.2, 2003/08/25 16:06:27

Line 1

dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.

dnl

dnl This file is part of the GNU MP Library.

dnl

Line 19 dnl License along with the GNU MP Library; see the fi

dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -

dnl Suite 330, Boston, MA 02111-1307, USA.

include(`../config.m4')

dnl cycles/limb

dnl K6 20

dnl P5 44

dnl P6 39

dnl 486 approx 43 maybe

dnl

dnl The following have their own optimized divrem_1 implementations, but

dnl for reference the code here runs as follows.

dnl

dnl cycles/limb

dnl P6MMX 39

dnl K7 42

C cycles/limb

C 486 approx 43 maybe

C P5 44

C P6 39

C P6MMX 39

C K6 20

C K7 42

C P4 58

include(`../config.m4')

C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,

C mp_srcptr src, mp_size_t size, mp_limb_t divisor);

C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,

C mp_srcptr src, mp_size_t size, mp_limb_t divisor);

C mp_srcptr src, mp_size_t size, mp_limb_t divisor,

C mp_limb_t carry);

C Divide src,size by divisor and store the quotient in dst+xsize,size.

C Extend the division to fractional quotient limbs in dst,xsize. Return the

Line 58 C

Line 53 C

C - If gcc isn't being used then divrem_1.c will get the generic C

C udiv_qrnnd() and be rather slow.

C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't

C - On K6, using the loop instruction is a 10% speedup, but gcc prior to 3.0

C generate that instruction (as of gcc 2.95.2 at least).

C doesn't generate that instruction.

C A test is done to see if the high limb is less than the divisor, and if so

C one less div is done. A div is between 20 and 40 cycles on the various

Line 81 C but that algorithm has been found to suffer from

Line 76 C but that algorithm has been found to suffer from

C carry handling on K6 and too many auxiliary instructions. The

C fractional part however could be done at about 13 c/l.

C P5: Moving the load down to pair with the store might save 1 cycle, but

C P5: Again here the auxiliary instructions hinder a multiply-by-inverse,

C that doesn't seem worth bothering with, since it'd be only a 2.2%

C saving.

C Again here the auxiliary instructions hinder a multiply-by-inverse,

C though there might be a 10-15% speedup available

C It might be thought that moving the load down to pair with the store

C would save 1 cycle, but that doesn't seem to happen in practice, and

C in any case would be a mere 2.2% saving, so it's hardly worth bothering

C about.

defframe(PARAM_CARRY, 24)

defframe(PARAM_DIVISOR,20)

defframe(PARAM_SIZE, 16)

Line 96 defframe(PARAM_SRC, 12)

Line 91 defframe(PARAM_SRC, 12)

defframe(PARAM_XSIZE, 8)

defframe(PARAM_DST, 4)

.text

TEXT

ALIGN(16)

PROLOGUE(mpn_divrem_1c)

Line 118 deflit(`FRAME',0)

Line 113 deflit(`FRAME',0)

orl %ecx, %ecx

movl PARAM_CARRY, %edx

jz LF(mpn_divrem_1,fraction)

jz L(fraction)

leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part

jmp LF(mpn_divrem_1,integer_top)

jmp L(integer_top)

EPILOGUE()

Line 221 deflit(`FRAME',8)

Line 216 deflit(`FRAME',8)

movl PARAM_DST, %edi

cld C better safe than sorry, see mpn/x86/README.family

cld C better safe than sorry, see mpn/x86/README

rep

stosl

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>