version 1.1.1.1, 2000/09/09 14:12:42 |
version 1.1.1.2, 2003/08/25 16:06:27 |
|
|
dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. |
dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. |
|
|
dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. |
dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
dnl |
dnl |
dnl This file is part of the GNU MP Library. |
dnl This file is part of the GNU MP Library. |
dnl |
dnl |
Line 19 dnl License along with the GNU MP Library; see the file COPYING.LIB. If |
|
Line 19 dnl License along with the GNU MP Library; see the file COPYING.LIB. If |
|
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - |
dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - |
dnl Suite 330, Boston, MA 02111-1307, USA. |
dnl Suite 330, Boston, MA 02111-1307, USA. |
|
|
|
include(`../config.m4') |
|
|
dnl cycles/limb |
|
dnl K6 20 |
|
dnl P5 44 |
|
dnl P6 39 |
|
dnl 486 approx 43 maybe |
|
dnl |
|
dnl |
|
dnl The following have their own optimized divrem_1 implementations, but |
|
dnl for reference the code here runs as follows. |
|
dnl |
|
dnl cycles/limb |
|
dnl P6MMX 39 |
|
dnl K7 42 |
|
|
|
|
C cycles/limb |
|
C 486 approx 43 maybe |
|
C P5 44 |
|
C P6 39 |
|
C P6MMX 39 |
|
C K6 20 |
|
C K7 42 |
|
C P4 58 |
|
|
include(`../config.m4') |
|
|
|
|
|
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, |
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, |
C mp_srcptr src, mp_size_t size, mp_limb_t divisor); |
C mp_srcptr src, mp_size_t size, mp_limb_t divisor); |
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, |
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, |
C mp_srcptr src, mp_size_t size, mp_limb_t divisor); |
C mp_srcptr src, mp_size_t size, mp_limb_t divisor, |
|
C mp_limb_t carry); |
C |
C |
C Divide src,size by divisor and store the quotient in dst+xsize,size. |
C Divide src,size by divisor and store the quotient in dst+xsize,size. |
C Extend the division to fractional quotient limbs in dst,xsize. Return the |
C Extend the division to fractional quotient limbs in dst,xsize. Return the |
|
|
C - If gcc isn't being used then divrem_1.c will get the generic C |
C - If gcc isn't being used then divrem_1.c will get the generic C |
C udiv_qrnnd() and be rather slow. |
C udiv_qrnnd() and be rather slow. |
C |
C |
C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't |
C - On K6, using the loop instruction is a 10% speedup, but gcc prior to 3.0 |
C generate that instruction (as of gcc 2.95.2 at least). |
C doesn't generate that instruction. |
C |
C |
C A test is done to see if the high limb is less than the divisor, and if so |
C A test is done to see if the high limb is less than the divisor, and if so |
C one less div is done. A div is between 20 and 40 cycles on the various |
C one less div is done. A div is between 20 and 40 cycles on the various |
Line 81 C but that algorithm has been found to suffer from |
|
Line 76 C but that algorithm has been found to suffer from |
|
C carry handling on K6 and too many auxiliary instructions. The |
C carry handling on K6 and too many auxiliary instructions. The |
C fractional part however could be done at about 13 c/l. |
C fractional part however could be done at about 13 c/l. |
C |
C |
C P5: Moving the load down to pair with the store might save 1 cycle, but |
C P5: Again here the auxiliary instructions hinder a multiply-by-inverse, |
C that doesn't seem worth bothering with, since it'd be only a 2.2% |
|
C saving. |
|
C |
|
C Again here the auxiliary instructions hinder a multiply-by-inverse, |
|
C though there might be a 10-15% speedup available |
C though there might be a 10-15% speedup available |
|
C |
|
C It might be thought that moving the load down to pair with the store |
|
C would save 1 cycle, but that doesn't seem to happen in practice, and |
|
C in any case would be a mere 2.2% saving, so it's hardly worth bothering |
|
C about. |
|
|
|
|
defframe(PARAM_CARRY, 24) |
defframe(PARAM_CARRY, 24) |
defframe(PARAM_DIVISOR,20) |
defframe(PARAM_DIVISOR,20) |
defframe(PARAM_SIZE, 16) |
defframe(PARAM_SIZE, 16) |
Line 96 defframe(PARAM_SRC, 12) |
|
Line 91 defframe(PARAM_SRC, 12) |
|
defframe(PARAM_XSIZE, 8) |
defframe(PARAM_XSIZE, 8) |
defframe(PARAM_DST, 4) |
defframe(PARAM_DST, 4) |
|
|
.text |
TEXT |
ALIGN(16) |
ALIGN(16) |
|
|
PROLOGUE(mpn_divrem_1c) |
PROLOGUE(mpn_divrem_1c) |
Line 118 deflit(`FRAME',0) |
|
Line 113 deflit(`FRAME',0) |
|
orl %ecx, %ecx |
orl %ecx, %ecx |
|
|
movl PARAM_CARRY, %edx |
movl PARAM_CARRY, %edx |
jz LF(mpn_divrem_1,fraction) |
jz L(fraction) |
|
|
leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part |
leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part |
jmp LF(mpn_divrem_1,integer_top) |
jmp L(integer_top) |
|
|
EPILOGUE() |
EPILOGUE() |
|
|
Line 221 deflit(`FRAME',8) |
|
Line 216 deflit(`FRAME',8) |
|
|
|
movl PARAM_DST, %edi |
movl PARAM_DST, %edi |
|
|
cld C better safe than sorry, see mpn/x86/README.family |
cld C better safe than sorry, see mpn/x86/README |
|
|
rep |
rep |
stosl |
stosl |