Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/dive_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C          divisor
! 26: C        odd    even
! 27: C P54:   24.5   30.5   cycles/limb
! 28: C P55:   23.0   28.0
! 29:
! 30:
! 31: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 32: C mp_limb_t divisor);
! 33: C
! 34: C Plain divl is used for small sizes, since the inverse takes a while to
! 35: C set up. Multiplying works out faster for size>=3 when the divisor is odd,
! 36: C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or
! 37: C size==3 for even are about the same speed for both divl or mul, but the
! 38: C former is used since it will use up less code cache.
! 39: C
! 40: C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
! 41: C expected. On P54 in the even case the shrdl pairing nonsense (see
! 42: C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
! 43: C further 1.5 slowdown for both odd and even.
! 44:
! 45: defframe(PARAM_DIVISOR,16)
! 46: defframe(PARAM_SIZE, 12)
! 47: defframe(PARAM_SRC, 8)
! 48: defframe(PARAM_DST, 4)
! 49: C The four names above are the arguments of mpn_divexact_1: dst, src, size, divisor at offsets 4 to 16.
! 50: dnl re-use parameter space: the inverse is kept in the dst slot after dst has been loaded into a register
! 51: define(VAR_INVERSE,`PARAM_DST')
! 52:
! 53: TEXT
! 54:
! 55: ALIGN(32)
! 56: PROLOGUE(mpn_divexact_1)
! 57: deflit(`FRAME',0)
! 58: C Choose the method from size + (divisor & 1): below 4 uses the divl loop, otherwise the inverse.
! 59: movl PARAM_DIVISOR, %eax
! 60: movl PARAM_SIZE, %ecx
! 61:
! 62: pushl %esi FRAME_pushl()
! 63: pushl %edi FRAME_pushl()
! 64:
! 65: movl PARAM_SRC, %esi
! 66: andl $1, %eax C 1 if the divisor is odd, 0 if even
! 67:
! 68: movl PARAM_DST, %edi
! 69: addl %ecx, %eax C size if even, size+1 if odd
! 70:
! 71: cmpl $4, %eax
! 72: jae L(mul_by_inverse) C odd with size>=3, or even with size>=4
! 73:
! 74: C Small sizes: divide from the high limb down, each remainder feeding the next divl through edx.
! 75: xorl %edx, %edx C nothing above the high limb
! 76: L(div_top):
! 77: movl -4(%esi,%ecx,4), %eax C src limb, ecx runs from size down to 1
! 78:
! 79: divl PARAM_DIVISOR C eax = quotient limb, edx = remainder
! 80:
! 81: movl %eax, -4(%edi,%ecx,4)
! 82: decl %ecx
! 83:
! 84: jnz L(div_top) C no size==0 check is made, size is presumably at least 1
! 85:
! 86: popl %edi
! 87: popl %esi
! 88:
! 89: ret
! 90: C Multiply by inverse: strip the low zero bits from the divisor, then build a full-limb
! 91: C inverse of the odd part d from an 8-bit table entry refined by two Newton steps.
! 92: C Each quotient limb is then (src limb - carry) * inverse, low limb first.
! 93: L(mul_by_inverse):
! 94: movl PARAM_DIVISOR, %eax
! 95: movl $-1, %ecx C becomes 0 on the first pass of the loop below
! 96: C Shift right until a 1 bit drops into the carry flag; ecx counts the zero bits removed.
! 97: L(strip_twos):
! 98: ASSERT(nz, `orl %eax, %eax')
! 99: shrl %eax
! 100: incl %ecx C shift count (incl leaves the carry from shrl untouched)
! 101:
! 102: jnc L(strip_twos)
! 103:
! 104: leal 1(%eax,%eax), %edx C d, the odd part of the divisor (the shifted-out 1 bit put back)
! 105: andl $127, %eax C d/2, 7 bits
! 106:
! 107: pushl %ebx FRAME_pushl()
! 108: pushl %ebp FRAME_pushl()
! 109: C Fetch the 8-bit starting inverse from modlimb_invert_table, indexed by the 7 bits of d/2.
! 110: ifdef(`PIC',`
! 111: call L(here)
! 112: L(here):
! 113: popl %ebp C eip
! 114: C turn eip into the GOT address, then load the table address from the GOT
! 115: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
! 116: C AGI
! 117: movl modlimb_invert_table@GOT(%ebp), %ebp
! 118: C AGI
! 119: movzbl (%eax,%ebp), %eax C inv 8 bits
! 120: ',`
! 121:
! 122: dnl non-PIC
! 123: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
! 124: ')
! 125: C First Newton step, doubling the number of correct low bits of inv.
! 126: movl %eax, %ebp C inv
! 127: addl %eax, %eax C 2*inv
! 128:
! 129: imull %ebp, %ebp C inv*inv
! 130:
! 131: imull %edx, %ebp C inv*inv*d
! 132:
! 133: subl %ebp, %eax C inv = 2*inv - inv*inv*d
! 134: movl PARAM_SIZE, %ebx
! 135: C Second Newton step, same formula.
! 136: movl %eax, %ebp
! 137: addl %eax, %eax C 2*inv
! 138:
! 139: imull %ebp, %ebp C inv*inv
! 140:
! 141: imull %edx, %ebp C inv*inv*d
! 142:
! 143: subl %ebp, %eax C inv = 2*inv - inv*inv*d
! 144: movl %edx, PARAM_DIVISOR C d without twos, overwrites the divisor argument slot
! 145:
! 146: leal (%esi,%ebx,4), %esi C src end
! 147: leal (%edi,%ebx,4), %edi C dst end
! 148:
! 149: negl %ebx C -size
! 150:
! 151: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 152: pushl %eax FRAME_pushl()
! 153: imull PARAM_DIVISOR, %eax
! 154: cmpl $1, %eax
! 155: popl %eax FRAME_popl()')
! 156:
! 157: movl %eax, VAR_INVERSE C kept in the PARAM_DST stack slot, dst itself is already in edi
! 158: xorl %ebp, %ebp C initial carry bit
! 159:
! 160: movl (%esi,%ebx,4), %eax C src low limb
! 161: orl %ecx, %ecx C shift, zero means the divisor was odd
! 162:
! 163: movl 4(%esi,%ebx,4), %edx C src second limb (for even)
! 164: jz L(odd_entry)
! 165:
! 166: shrdl( %cl, %edx, %eax) C low limb of src shifted right by the twos count
! 167:
! 168: incl %ebx C the even loop addresses its limbs at offset -4
! 169: jmp L(even_entry)
! 170:
! 171:
! 172: ALIGN(8)
! 173: L(odd_top):
! 174: C eax scratch
! 175: C ebx counter, limbs, negative
! 176: C ecx
! 177: C edx
! 178: C esi src end
! 179: C edi dst end
! 180: C ebp carry bit, 0 or -1
! 181:
! 182: mull PARAM_DIVISOR C edx = high limb of (quotient limb just stored) * d
! 183:
! 184: movl (%esi,%ebx,4), %eax C next src limb
! 185: subl %ebp, %edx C add the carry bit into the carry limb
! 186:
! 187: subl %edx, %eax C src limb minus carry limb
! 188:
! 189: sbbl %ebp, %ebp C new carry bit, 0 or -1
! 190:
! 191: L(odd_entry):
! 192: imull VAR_INVERSE, %eax C quotient limb
! 193:
! 194: movl %eax, (%edi,%ebx,4)
! 195:
! 196: incl %ebx
! 197: jnz L(odd_top)
! 198:
! 199:
! 200: popl %ebp
! 201: popl %ebx
! 202:
! 203: popl %edi
! 204: popl %esi
! 205:
! 206: ret
! 207:
! 208:
! 209: L(even_top):
! 210: C eax scratch
! 211: C ebx counter, limbs, negative
! 212: C ecx twos
! 213: C edx
! 214: C esi src end
! 215: C edi dst end
! 216: C ebp carry bit, 0 or -1
! 217:
! 218: mull PARAM_DIVISOR C edx = high limb of (quotient limb just stored) * d
! 219:
! 220: subl %ebp, %edx C carry bit
! 221: movl -4(%esi,%ebx,4), %eax C src limb
! 222:
! 223: movl (%esi,%ebx,4), %ebp C and one above it
! 224:
! 225: shrdl( %cl, %ebp, %eax) C src limb with the low bits of the limb above shifted in
! 226:
! 227: subl %edx, %eax C carry limb
! 228:
! 229: sbbl %ebp, %ebp C new carry bit, 0 or -1 (ebp was scratch just above)
! 230:
! 231: L(even_entry):
! 232: imull VAR_INVERSE, %eax C quotient limb
! 233:
! 234: movl %eax, -4(%edi,%ebx,4)
! 235: incl %ebx
! 236:
! 237: jnz L(even_top)
! 238:
! 239: C High limb for the even case: there is no limb above it to shift in, so a plain shrl is used.
! 240:
! 241: mull PARAM_DIVISOR
! 242:
! 243: movl -4(%esi), %eax C src high limb
! 244: subl %ebp, %edx
! 245:
! 246: shrl %cl, %eax
! 247:
! 248: subl %edx, %eax C no carry if division is exact
! 249:
! 250: imull VAR_INVERSE, %eax
! 251:
! 252: movl %eax, -4(%edi) C dst high limb
! 253: nop C protect against cache bank clash
! 254:
! 255: popl %ebp
! 256: popl %ebx
! 257:
! 258: popl %edi
! 259: popl %esi
! 260:
! 261: ret
! 262:
! 263: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>