Annotation of OpenXM_contrib/gmp/mpn/x86/k6/pre_mod_1.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
! 2:
! 3: dnl Copyright 2000, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K6: 18.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
! 29: C mp_limb_t inverse);
! 30: C
! 31: C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
! 32: C considered worthwhile (just).
! 33: C
! 34: C Future:
! 35: C
! 36: C In theory this code should be made available in mod_1 and mod_1c, but it
! 37: C would take quite a while to overcome the time to calculate an inverse.
! 38: C The threshold would probably be around 20 limbs, or around 30 for an
! 39: C unnormalized divisor.
! 40:
! 41: defframe(PARAM_INVERSE,16) C 4th argument in the prototype: inverse
! 42: defframe(PARAM_DIVISOR,12) C 3rd argument: divisor
! 43: defframe(PARAM_SIZE, 8) C 2nd argument: size
! 44: defframe(PARAM_SRC, 4) C 1st argument: src; offsets presumably count from %esp at entry (defframe is defined outside this file, confirm)
! 45: C mpn_preinv_mod_1: remainder of the size-limb number at src divided by divisor, left in %eax.
! 46: TEXT
! 47: ALIGN(32)
! 48: PROLOGUE(mpn_preinv_mod_1)
! 49: deflit(`FRAME',0)
! 50: C Argument checks: size >= 1, and bit 31 of the divisor must be set.
! 51: ASSERT(ae,`cmpl $1, PARAM_SIZE')
! 52: ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
! 53: C Load the arguments while saving ebp, edi, esi and ebx (all popped again before ret).
! 54: movl PARAM_SIZE, %ecx C ecx = size
! 55: pushl %ebp FRAME_pushl()
! 56:
! 57: movl PARAM_SRC, %ebp C ebp = src
! 58: pushl %edi FRAME_pushl()
! 59:
! 60: movl PARAM_DIVISOR, %eax C eax = d
! 61: pushl %esi FRAME_pushl()
! 62:
! 63: movl -4(%ebp,%ecx,4), %esi C src high limb, src[size-1]
! 64: pushl %ebx FRAME_pushl()
! 65:
! 66: movl %edx, %edi C first n2 to cancel: edi = edx, so the first sbbl %edx,%edi gives just 0 or -1 from the carry
! 67: subl %eax, %esi C first n1 = high-divisor, carry set if high < divisor
! 68:
! 69: decl %ecx C limbs left after the high one; decl does not touch the carry from subl
! 70: jz L(done_sbbl) C size was 1: nothing more to reduce
! 71:
! 72: L(top):
! 73: C eax scratch
! 74: C ebx n10, nadj, q1
! 75: C ecx counter, size-1 to 1 (limbs still to process)
! 76: C edx scratch
! 77: C esi n2
! 78: C edi old high, for underflow test
! 79: C ebp src
! 80:
! 81: sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
! 82:
! 83: L(entry): C not branched to from anywhere in this listing
! 84: andl PARAM_DIVISOR, %edi C d if the subtraction went negative, else 0
! 85: L(q1_ff_top):
! 86: movl -4(%ebp,%ecx,4), %ebx C next limb down, src[ecx-1]
! 87:
! 88: addl %esi, %edi C possible addback
! 89: movl %ebx, %esi C n10
! 90:
! 91: sarl $31, %ebx C -n1 = 0 or -1
! 92: movl %edi, %eax C n2
! 93:
! 94: movl PARAM_INVERSE, %edx C m = inverse
! 95: subl %ebx, %eax C n2+n1
! 96:
! 97: mull %edx C m*(n2+n1)
! 98:
! 99: andl PARAM_DIVISOR, %ebx C -n1 & d
! 100: addl %esi, %ebx C nadj = n10 + (-n1&d), ignoring overflow
! 101:
! 102: addl %ebx, %eax C low m*(n2+n1) + nadj, giving carry flag
! 103: leal 1(%edi), %ebx C n2+1
! 104:
! 105: adcl %ebx, %edx C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
! 106:
! 107: movl PARAM_DIVISOR, %eax C d
! 108: jz L(q1_ff) C taken when q1+1 wrapped to 0; flags are still those of adcl since movl does not change them
! 109:
! 110: mull %edx C (q1+1)*d
! 111:
! 112: subl %eax, %esi C low n-(q1+1)*d
! 113: loop L(top) C ecx--, repeat while nonzero; loop leaves flags alone so the subl borrow feeds the sbbl
! 114:
! 115: C All limbs processed: final correction, then return esi+edi.
! 116:
! 117: L(done_sbbl):
! 118: sbbl %edx, %edi C high n-(q1+1)*d, 0 or -1
! 119:
! 120: andl PARAM_DIVISOR, %edi C d if the subtraction went negative, else 0
! 121: L(done_esi_edi):
! 122: popl %ebx
! 123:
! 124: leal (%esi,%edi), %eax C result = esi + edi, i.e. with the addback applied when needed
! 125: popl %esi
! 126:
! 127: popl %edi
! 128: popl %ebp
! 129:
! 130: ret
! 131:
! 132:
! 133: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 134: C of q*d is simply -d and the remainder n-q*d = n10+d. This is rarely
! 135: C reached.
! 136:
! 137: L(q1_ff):
! 138: movl PARAM_DIVISOR, %edi C edi = d, so that the next esi+edi sum forms n10+d
! 139: loop L(q1_ff_top) C more limbs: continue, skipping the sbbl/andl correction
! 140:
! 141: jmp L(done_esi_edi) C no limbs left: return n10+d
! 142:
! 143:
! 144: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>