Annotation of OpenXM_contrib/gmp/mpn/x86/divrem_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
! 2:
! 3: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22:
! 23: dnl cycles/limb
! 24: dnl K6 20
! 25: dnl P5 44
! 26: dnl P6 39
! 27: dnl 486 approx 43 maybe
! 28: dnl
! 29: dnl
! 30: dnl The following have their own optimized divrem_1 implementations, but
! 31: dnl for reference the code here runs as follows.
! 32: dnl
! 33: dnl cycles/limb
! 34: dnl P6MMX 39
! 35: dnl K7 42
! 36:
! 37:
! 38: include(`../config.m4')
! 39:
! 40:
! 41: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
! 42: C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
! 43: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
! 44: C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
! 45: C
! 46: C Divide src,size by divisor and store the quotient in dst+xsize,size.
! 47: C Extend the division to fractional quotient limbs in dst,xsize. Return the
! 48: C remainder. Either or both xsize and size can be 0.
! 49: C
! 50: C mpn_divrem_1c takes a carry parameter which is an initial high limb,
! 51: C effectively one extra limb at the top of src,size. Must have
! 52: C carry<divisor.
! 53: C
! 54: C
! 55: C Essentially the code is the same as the division based part of
! 56: C mpn/generic/divrem_1.c, but has the following advantages.
! 57: C
! 58: C - If gcc isn't being used then divrem_1.c will get the generic C
! 59: C udiv_qrnnd() and be rather slow.
! 60: C
! 61: C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
! 62: C generate that instruction (as of gcc 2.95.2 at least).
! 63: C
! 64: C A test is done to see if the high limb is less the the divisor, and if so
! 65: C one less div is done. A div is between 20 and 40 cycles on the various
! 66: C x86s, so assuming high<divisor about half the time, then this test saves
! 67: C half that amount. The branch misprediction penalty on each chip is less
! 68: C than half a div.
! 69: C
! 70: C
! 71: C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
! 72: C here, so it seems there's nothing to gain by rearranging the loop.
! 73: C Pairing the mov and loop instructions was found to gain nothing. (The
! 74: C same is true of the mpn/x86/mod_1.asm loop.)
! 75: C
! 76: C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
! 77: C The loop_or_decljnz macro is an easy way to get a 10% speedup.
! 78: C
! 79: C The fast K6 multiply might be thought to suit a multiply-by-inverse,
! 80: C but that algorithm has been found to suffer from the releatively poor
! 81: C carry handling on K6 and too many auxiliary instructions. The
! 82: C fractional part however could be done at about 13 c/l.
! 83: C
! 84: C P5: Moving the load down to pair with the store might save 1 cycle, but
! 85: C that doesn't seem worth bothering with, since it'd be only a 2.2%
! 86: C saving.
! 87: C
! 88: C Again here the auxiliary instructions hinder a multiply-by-inverse,
! 89: C though there might be a 10-15% speedup available
! 90:
! 91:
! 92: defframe(PARAM_CARRY, 24)
! 93: defframe(PARAM_DIVISOR,20)
! 94: defframe(PARAM_SIZE, 16)
! 95: defframe(PARAM_SRC, 12)
! 96: defframe(PARAM_XSIZE, 8)
! 97: defframe(PARAM_DST, 4)
! 98:
! 99: .text
! 100: ALIGN(16)
! 101:
! 102: PROLOGUE(mpn_divrem_1c)
! 103: deflit(`FRAME',0)
! 104:
! 105: movl PARAM_SIZE, %ecx
! 106: pushl %edi FRAME_pushl()
! 107:
! 108: movl PARAM_SRC, %edi
! 109: pushl %esi FRAME_pushl()
! 110:
! 111: movl PARAM_DIVISOR, %esi
! 112: pushl %ebx FRAME_pushl()
! 113:
! 114: movl PARAM_DST, %ebx
! 115: pushl %ebp FRAME_pushl()
! 116:
! 117: movl PARAM_XSIZE, %ebp
! 118: orl %ecx, %ecx
! 119:
! 120: movl PARAM_CARRY, %edx
! 121: jz LF(mpn_divrem_1,fraction)
! 122:
! 123: leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
! 124: jmp LF(mpn_divrem_1,integer_top)
! 125:
! 126: EPILOGUE()
! 127:
! 128:
! 129: PROLOGUE(mpn_divrem_1)
! 130: deflit(`FRAME',0)
! 131:
! 132: movl PARAM_SIZE, %ecx
! 133: pushl %edi FRAME_pushl()
! 134:
! 135: movl PARAM_SRC, %edi
! 136: pushl %esi FRAME_pushl()
! 137:
! 138: movl PARAM_DIVISOR, %esi
! 139: orl %ecx,%ecx
! 140:
! 141: jz L(size_zero)
! 142: pushl %ebx FRAME_pushl()
! 143:
! 144: movl -4(%edi,%ecx,4), %eax C src high limb
! 145: xorl %edx, %edx
! 146:
! 147: movl PARAM_DST, %ebx
! 148: pushl %ebp FRAME_pushl()
! 149:
! 150: movl PARAM_XSIZE, %ebp
! 151: cmpl %esi, %eax
! 152:
! 153: leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
! 154: jae L(integer_entry)
! 155:
! 156:
! 157: C high<divisor, so high of dst is zero, and avoid one div
! 158:
! 159: movl %edx, (%ebx,%ecx,4)
! 160: decl %ecx
! 161:
! 162: movl %eax, %edx
! 163: jz L(fraction)
! 164:
! 165:
! 166: L(integer_top):
! 167: C eax scratch (quotient)
! 168: C ebx dst+4*xsize-4
! 169: C ecx counter
! 170: C edx scratch (remainder)
! 171: C esi divisor
! 172: C edi src
! 173: C ebp xsize
! 174:
! 175: movl -4(%edi,%ecx,4), %eax
! 176: L(integer_entry):
! 177:
! 178: divl %esi
! 179:
! 180: movl %eax, (%ebx,%ecx,4)
! 181: loop_or_decljnz L(integer_top)
! 182:
! 183:
! 184: L(fraction):
! 185: orl %ebp, %ecx
! 186: jz L(done)
! 187:
! 188: movl PARAM_DST, %ebx
! 189:
! 190:
! 191: L(fraction_top):
! 192: C eax scratch (quotient)
! 193: C ebx dst
! 194: C ecx counter
! 195: C edx scratch (remainder)
! 196: C esi divisor
! 197: C edi
! 198: C ebp
! 199:
! 200: xorl %eax, %eax
! 201:
! 202: divl %esi
! 203:
! 204: movl %eax, -4(%ebx,%ecx,4)
! 205: loop_or_decljnz L(fraction_top)
! 206:
! 207:
! 208: L(done):
! 209: popl %ebp
! 210: movl %edx, %eax
! 211: popl %ebx
! 212: popl %esi
! 213: popl %edi
! 214: ret
! 215:
! 216:
! 217: L(size_zero):
! 218: deflit(`FRAME',8)
! 219: movl PARAM_XSIZE, %ecx
! 220: xorl %eax, %eax
! 221:
! 222: movl PARAM_DST, %edi
! 223:
! 224: cld C better safe than sorry, see mpn/x86/README.family
! 225:
! 226: rep
! 227: stosl
! 228:
! 229: popl %esi
! 230: popl %edi
! 231: ret
! 232: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>