[BACK]Return to divrem_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86

Diff for /OpenXM_contrib/gmp/mpn/x86/Attic/divrem_1.asm between version 1.1.1.1 and 1.1.1.2

version 1.1.1.1, 2000/09/09 14:12:42 version 1.1.1.2, 2003/08/25 16:06:27
Line 1 
Line 1 
 dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.  dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
   
 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.  dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
 dnl  dnl
 dnl  This file is part of the GNU MP Library.  dnl  This file is part of the GNU MP Library.
 dnl  dnl
Line 19  dnl  License along with the GNU MP Library; see the fi
Line 19  dnl  License along with the GNU MP Library; see the fi
 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -  dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
 dnl  Suite 330, Boston, MA 02111-1307, USA.  dnl  Suite 330, Boston, MA 02111-1307, USA.
   
   include(`../config.m4')
   
 dnl        cycles/limb  
 dnl  K6        20  
 dnl  P5        44  
 dnl  P6        39  
 dnl  486   approx 43 maybe  
 dnl  
 dnl  
 dnl  The following have their own optimized divrem_1 implementations, but  
 dnl  for reference the code here runs as follows.  
 dnl  
 dnl        cycles/limb  
 dnl  P6MMX     39  
 dnl  K7        42  
   
   C       cycles/limb
   C 486   approx 43 maybe
   C P5        44
   C P6        39
   C P6MMX     39
   C K6        20
   C K7        42
   C P4        58
   
 include(`../config.m4')  
   
   
 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,  C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
 C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);  C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,  C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
 C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor);  C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
   C                          mp_limb_t carry);
 C  C
 C Divide src,size by divisor and store the quotient in dst+xsize,size.  C Divide src,size by divisor and store the quotient in dst+xsize,size.
 C Extend the division to fractional quotient limbs in dst,xsize.  Return the  C Extend the division to fractional quotient limbs in dst,xsize.  Return the
Line 58  C
Line 53  C
 C - If gcc isn't being used then divrem_1.c will get the generic C  C - If gcc isn't being used then divrem_1.c will get the generic C
 C   udiv_qrnnd() and be rather slow.  C   udiv_qrnnd() and be rather slow.
 C  C
 C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't  C - On K6, using the loop instruction is a 10% speedup, but gcc prior to 3.0
 C   generate that instruction (as of gcc 2.95.2 at least).  C   doesn't generate that instruction.
 C  C
 C A test is done to see if the high limb is less than the divisor, and if so  C A test is done to see if the high limb is less than the divisor, and if so
 C one less div is done.  A div is between 20 and 40 cycles on the various  C one less div is done.  A div is between 20 and 40 cycles on the various
Line 81  C     but that algorithm has been found to suffer from
Line 76  C     but that algorithm has been found to suffer from
 C     carry handling on K6 and too many auxiliary instructions.  The  C     carry handling on K6 and too many auxiliary instructions.  The
 C     fractional part however could be done at about 13 c/l.  C     fractional part however could be done at about 13 c/l.
 C  C
 C P5: Moving the load down to pair with the store might save 1 cycle, but  C P5: Again here the auxiliary instructions hinder a multiply-by-inverse,
 C     that doesn't seem worth bothering with, since it'd be only a 2.2%  
 C     saving.  
 C  
 C     Again here the auxiliary instructions hinder a multiply-by-inverse,  
 C     though there might be a 10-15% speedup available  C     though there might be a 10-15% speedup available
   C
   C     It might be thought that moving the load down to pair with the store
   C     would save 1 cycle, but that doesn't seem to happen in practice, and
   C     in any case would be a mere 2.2% saving, so it is hardly worth bothering
   C     about.
   
   
 defframe(PARAM_CARRY,  24)  defframe(PARAM_CARRY,  24)
 defframe(PARAM_DIVISOR,20)  defframe(PARAM_DIVISOR,20)
 defframe(PARAM_SIZE,   16)  defframe(PARAM_SIZE,   16)
Line 96  defframe(PARAM_SRC,    12)
Line 91  defframe(PARAM_SRC,    12)
 defframe(PARAM_XSIZE,  8)  defframe(PARAM_XSIZE,  8)
 defframe(PARAM_DST,    4)  defframe(PARAM_DST,    4)
   
         .text          TEXT
         ALIGN(16)          ALIGN(16)
   
 PROLOGUE(mpn_divrem_1c)  PROLOGUE(mpn_divrem_1c)
Line 118  deflit(`FRAME',0)
Line 113  deflit(`FRAME',0)
         orl     %ecx, %ecx          orl     %ecx, %ecx
   
         movl    PARAM_CARRY, %edx          movl    PARAM_CARRY, %edx
         jz      LF(mpn_divrem_1,fraction)          jz      L(fraction)
   
         leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part          leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
         jmp     LF(mpn_divrem_1,integer_top)          jmp     L(integer_top)
   
 EPILOGUE()  EPILOGUE()
   
Line 221  deflit(`FRAME',8)
Line 216  deflit(`FRAME',8)
   
         movl    PARAM_DST, %edi          movl    PARAM_DST, %edi
   
         cld     C better safe than sorry, see mpn/x86/README.family          cld     C better safe than sorry, see mpn/x86/README
   
         rep          rep
         stosl          stosl

Legend:
Removed from v.1.1.1.1  
changed lines
  Added in v.1.1.1.2

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>