[BACK]Return to divrem_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86

Annotation of OpenXM_contrib/gmp/mpn/x86/divrem_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
        !             2:
        !             3: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22:
        !            23: dnl        cycles/limb
        !            24: dnl  K6        20
        !            25: dnl  P5        44
        !            26: dnl  P6        39
        !            27: dnl  486   approx 43 maybe
        !            28: dnl
        !            29: dnl
        !            30: dnl  The following have their own optimized divrem_1 implementations, but
        !            31: dnl  for reference the code here runs as follows.
        !            32: dnl
        !            33: dnl        cycles/limb
        !            34: dnl  P6MMX     39
        !            35: dnl  K7        42
        !            36:
        !            37:
        !            38: include(`../config.m4')
        !            39:
        !            40:
        !            41: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
        !            42: C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
        !            43: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
        !            44: C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor);
        !            45: C
        !            46: C Divide src,size by divisor and store the quotient in dst+xsize,size.
        !            47: C Extend the division to fractional quotient limbs in dst,xsize.  Return the
        !            48: C remainder.  Either or both xsize and size can be 0.
        !            49: C
        !            50: C mpn_divrem_1c takes a carry parameter which is an initial high limb,
        !            51: C effectively one extra limb at the top of src,size.  Must have
        !            52: C carry<divisor.
        !            53: C
        !            54: C
        !            55: C Essentially the code is the same as the division based part of
        !            56: C mpn/generic/divrem_1.c, but has the following advantages.
        !            57: C
        !            58: C - If gcc isn't being used then divrem_1.c will get the generic C
        !            59: C   udiv_qrnnd() and be rather slow.
        !            60: C
        !            61: C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
        !            62: C   generate that instruction (as of gcc 2.95.2 at least).
        !            63: C
        !            64: C A test is done to see if the high limb is less the the divisor, and if so
        !            65: C one less div is done.  A div is between 20 and 40 cycles on the various
        !            66: C x86s, so assuming high<divisor about half the time, then this test saves
        !            67: C half that amount.  The branch misprediction penalty on each chip is less
        !            68: C than half a div.
        !            69: C
        !            70: C
        !            71: C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
        !            72: C     here, so it seems there's nothing to gain by rearranging the loop.
        !            73: C     Pairing the mov and loop instructions was found to gain nothing.  (The
        !            74: C     same is true of the mpn/x86/mod_1.asm loop.)
        !            75: C
        !            76: C     With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
        !            77: C     The loop_or_decljnz macro is an easy way to get a 10% speedup.
        !            78: C
        !            79: C     The fast K6 multiply might be thought to suit a multiply-by-inverse,
        !            80: C     but that algorithm has been found to suffer from the releatively poor
        !            81: C     carry handling on K6 and too many auxiliary instructions.  The
        !            82: C     fractional part however could be done at about 13 c/l.
        !            83: C
        !            84: C P5: Moving the load down to pair with the store might save 1 cycle, but
        !            85: C     that doesn't seem worth bothering with, since it'd be only a 2.2%
        !            86: C     saving.
        !            87: C
        !            88: C     Again here the auxiliary instructions hinder a multiply-by-inverse,
        !            89: C     though there might be a 10-15% speedup available
        !            90:
        !            91:
        !            92: defframe(PARAM_CARRY,  24)
        !            93: defframe(PARAM_DIVISOR,20)
        !            94: defframe(PARAM_SIZE,   16)
        !            95: defframe(PARAM_SRC,    12)
        !            96: defframe(PARAM_XSIZE,  8)
        !            97: defframe(PARAM_DST,    4)
        !            98:
        !            99:        .text
        !           100:        ALIGN(16)
        !           101:
        !           102: PROLOGUE(mpn_divrem_1c)
        !           103: deflit(`FRAME',0)
        !           104:
        !           105:        movl    PARAM_SIZE, %ecx
        !           106:        pushl   %edi            FRAME_pushl()
        !           107:
        !           108:        movl    PARAM_SRC, %edi
        !           109:        pushl   %esi            FRAME_pushl()
        !           110:
        !           111:        movl    PARAM_DIVISOR, %esi
        !           112:        pushl   %ebx            FRAME_pushl()
        !           113:
        !           114:        movl    PARAM_DST, %ebx
        !           115:        pushl   %ebp            FRAME_pushl()
        !           116:
        !           117:        movl    PARAM_XSIZE, %ebp
        !           118:        orl     %ecx, %ecx
        !           119:
        !           120:        movl    PARAM_CARRY, %edx
        !           121:        jz      LF(mpn_divrem_1,fraction)
        !           122:
        !           123:        leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
        !           124:        jmp     LF(mpn_divrem_1,integer_top)
        !           125:
        !           126: EPILOGUE()
        !           127:
        !           128:
        !           129: PROLOGUE(mpn_divrem_1)
        !           130: deflit(`FRAME',0)
        !           131:
        !           132:        movl    PARAM_SIZE, %ecx
        !           133:        pushl   %edi            FRAME_pushl()
        !           134:
        !           135:        movl    PARAM_SRC, %edi
        !           136:        pushl   %esi            FRAME_pushl()
        !           137:
        !           138:        movl    PARAM_DIVISOR, %esi
        !           139:        orl     %ecx,%ecx
        !           140:
        !           141:        jz      L(size_zero)
        !           142:        pushl   %ebx            FRAME_pushl()
        !           143:
        !           144:        movl    -4(%edi,%ecx,4), %eax   C src high limb
        !           145:        xorl    %edx, %edx
        !           146:
        !           147:        movl    PARAM_DST, %ebx
        !           148:        pushl   %ebp            FRAME_pushl()
        !           149:
        !           150:        movl    PARAM_XSIZE, %ebp
        !           151:        cmpl    %esi, %eax
        !           152:
        !           153:        leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
        !           154:        jae     L(integer_entry)
        !           155:
        !           156:
        !           157:        C high<divisor, so high of dst is zero, and avoid one div
        !           158:
        !           159:        movl    %edx, (%ebx,%ecx,4)
        !           160:        decl    %ecx
        !           161:
        !           162:        movl    %eax, %edx
        !           163:        jz      L(fraction)
        !           164:
        !           165:
        !           166: L(integer_top):
        !           167:        C eax   scratch (quotient)
        !           168:        C ebx   dst+4*xsize-4
        !           169:        C ecx   counter
        !           170:        C edx   scratch (remainder)
        !           171:        C esi   divisor
        !           172:        C edi   src
        !           173:        C ebp   xsize
        !           174:
        !           175:        movl    -4(%edi,%ecx,4), %eax
        !           176: L(integer_entry):
        !           177:
        !           178:        divl    %esi
        !           179:
        !           180:        movl    %eax, (%ebx,%ecx,4)
        !           181:        loop_or_decljnz L(integer_top)
        !           182:
        !           183:
        !           184: L(fraction):
        !           185:        orl     %ebp, %ecx
        !           186:        jz      L(done)
        !           187:
        !           188:        movl    PARAM_DST, %ebx
        !           189:
        !           190:
        !           191: L(fraction_top):
        !           192:        C eax   scratch (quotient)
        !           193:        C ebx   dst
        !           194:        C ecx   counter
        !           195:        C edx   scratch (remainder)
        !           196:        C esi   divisor
        !           197:        C edi
        !           198:        C ebp
        !           199:
        !           200:        xorl    %eax, %eax
        !           201:
        !           202:        divl    %esi
        !           203:
        !           204:        movl    %eax, -4(%ebx,%ecx,4)
        !           205:        loop_or_decljnz L(fraction_top)
        !           206:
        !           207:
        !           208: L(done):
        !           209:        popl    %ebp
        !           210:        movl    %edx, %eax
        !           211:        popl    %ebx
        !           212:        popl    %esi
        !           213:        popl    %edi
        !           214:        ret
        !           215:
        !           216:
        !           217: L(size_zero):
        !           218: deflit(`FRAME',8)
        !           219:        movl    PARAM_XSIZE, %ecx
        !           220:        xorl    %eax, %eax
        !           221:
        !           222:        movl    PARAM_DST, %edi
        !           223:
        !           224:        cld     C better safe than sorry, see mpn/x86/README.family
        !           225:
        !           226:        rep
        !           227:        stosl
        !           228:
        !           229:        popl    %esi
        !           230:        popl    %edi
        !           231:        ret
        !           232: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>