Annotation of OpenXM_contrib/gmp/mpn/x86/k7/dive_1.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K7: 11.0 cycles/limb
! 26:
! 27:
! 28: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 29: C mp_limb_t divisor);
! 30: C
! 31: C The dependent chain is mul+imul+sub for 11 cycles and that speed is
! 32: C achieved with no special effort. The load and shrld latencies are hidden
! 33: C by out of order execution.
! 34: C
! 35: C It's a touch faster on size==1 to use the mul-by-inverse than divl.
! 36:
! 37: defframe(PARAM_DIVISOR,16) C divisor argument, later overwritten with the divisor stripped of its low zero bits
! 38: defframe(PARAM_SIZE, 12) C limb count, used to scale both the src and dst pointers
! 39: defframe(PARAM_SRC, 8) C source limb pointer
! 40: defframe(PARAM_DST, 4) C destination limb pointer
! 41: C Negative offsets: presumably slots below the entry stack pointer, covered by the STACK_SPACE bytes taken from esp -- defframe is defined in the included m4, confirm there.
! 42: defframe(SAVE_EBX, -4) C caller ebx, reloaded before each ret
! 43: defframe(SAVE_ESI, -8) C caller esi, reloaded before each ret
! 44: defframe(SAVE_EDI, -12) C caller edi, reloaded before each ret
! 45: defframe(SAVE_EBP, -16) C caller ebp, reloaded before each ret
! 46: defframe(VAR_INVERSE, -20) C inverse of the odd divisor, multiplied into every quotient limb
! 47: defframe(VAR_DST_END, -24) C copy of the dst end pointer, kept while edi serves as scratch in the loop
! 48: C Six 4-byte slots are named above, matching the 24 bytes below.
! 49: deflit(STACK_SPACE, 24) C bytes subtracted from esp on entry and added back before each ret
! 50:
! 51: TEXT
! 52: C mpn_divexact_1: stores PARAM_SIZE quotient limbs at PARAM_DST, each formed from the PARAM_SRC limbs by a shift and a multiply by the inverse of the odd part of PARAM_DIVISOR.
! 53: ALIGN(16)
! 54: PROLOGUE(mpn_divexact_1)
! 55: deflit(`FRAME',0)
! 56: C Needs divisor != 0 and size >= 1: the strip_twos loop only ends on a set bit, and src[0] is loaded unconditionally.
! 57: movl PARAM_DIVISOR, %eax C eax = divisor d
! 58: subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
! 59: movl $-1, %ecx C shift count, starts at -1 because the loop below increments before testing
! 60: C ebp, esi, edi and ebx are stored in the frame as they get reused and are reloaded before each ret.
! 61: movl %ebp, SAVE_EBP
! 62: movl PARAM_SIZE, %ebp C ebp = size in limbs
! 63:
! 64: movl %esi, SAVE_ESI
! 65: movl %edi, SAVE_EDI
! 66:
! 67: C If there's usually only one or two trailing zero bits then this
! 68: C should be faster than bsfl.
! 69: L(strip_twos):
! 70: incl %ecx
! 71: shrl %eax
! 72: jnc L(strip_twos)
! 73: C Now ecx = count of trailing zero bits of d, and eax = d shifted right by ecx+1 (the lowest set bit went out into carry).
! 74: movl %ebx, SAVE_EBX
! 75: leal 1(%eax,%eax), %ebx C d without twos
! 76: andl $127, %eax C d/2, 7 bits
! 77: C Those 7 bits index modlimb_invert_table, one byte per entry, giving the starting inverse.
! 78: ifdef(`PIC',`
! 79: call L(movl_eip_edx)
! 80:
! 81: addl $_GLOBAL_OFFSET_TABLE_, %edx
! 82:
! 83: movl modlimb_invert_table@GOT(%edx), %edx
! 84:
! 85: movzbl (%eax,%edx), %eax C inv 8 bits
! 86: ',`
! 87: dnl non-PIC
! 88: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
! 89: ')
! 90: C Two rounds of inv = 2*inv - inv*inv*d follow, interleaved with pointer setup. The ASSERT further down states the expected result d*inv == 1.
! 91: leal (%eax,%eax), %edx C 2*inv
! 92: movl %ebx, PARAM_DIVISOR C d without twos
! 93:
! 94: imull %eax, %eax C inv*inv
! 95:
! 96: movl PARAM_SRC, %esi C esi = src
! 97: movl PARAM_DST, %edi C edi = dst
! 98:
! 99: imull %ebx, %eax C inv*inv*d
! 100:
! 101: subl %eax, %edx C inv = 2*inv - inv*inv*d
! 102: leal (%edx,%edx), %eax C 2*inv
! 103:
! 104: imull %edx, %edx C inv*inv
! 105: C Pointers move to the end of each array so that the negative counter in ebp indexes upward toward zero.
! 106: leal (%esi,%ebp,4), %esi C src end
! 107: leal (%edi,%ebp,4), %edi C dst end
! 108: negl %ebp C -size
! 109:
! 110: imull %ebx, %edx C inv*inv*d
! 111:
! 112: subl %edx, %eax C inv = 2*inv - inv*inv*d
! 113:
! 114: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 115: pushl %eax FRAME_pushl()
! 116: imull PARAM_DIVISOR, %eax
! 117: cmpl $1, %eax
! 118: popl %eax FRAME_popl()')
! 119:
! 120: movl %eax, VAR_INVERSE C final inverse, reread from memory for every limb
! 121: movl (%esi,%ebp,4), %eax C src[0]
! 122:
! 123: incl %ebp
! 124: jz L(one) C counter hit zero after one step, so size == 1
! 125:
! 126: movl (%esi,%ebp,4), %edx C src[1]
! 127:
! 128: shrdl( %cl, %edx, %eax) C eax = src[0] shifted right, refilled from src[1]
! 129:
! 130: movl %edi, VAR_DST_END C saved because the loop borrows edi for a src limb
! 131: xorl %ebx, %ebx C carry bit starts at 0
! 132: jmp L(entry) C first limb needs no carry handling, go straight to the multiply
! 133: C PIC helper: copies its own return address from the stack top into edx.
! 134: ifdef(`PIC',`
! 135: L(movl_eip_edx):
! 136: movl (%esp), %edx
! 137: ret
! 138: ')
! 139:
! 140: ALIGN(8)
! 141: L(top):
! 142: C eax q
! 143: C ebx carry bit, 0 or 1
! 144: C ecx shift
! 145: C edx scratch, receives the carry limb from the mull
! 146: C esi src end
! 147: C edi dst end, reloaded from VAR_DST_END after holding a src limb
! 148: C ebp counter, limbs, negative
! 149: C Each pass: carry limb = high half of q*d, then q = (shifted src limb - carry bit - carry limb) * inverse.
! 150: mull PARAM_DIVISOR C carry limb in edx
! 151:
! 152: movl -4(%esi,%ebp,4), %eax C src limb for this pass
! 153: movl (%esi,%ebp,4), %edi C next higher src limb, supplies the bits shifted in
! 154:
! 155: shrdl( %cl, %edi, %eax)
! 156:
! 157: subl %ebx, %eax C apply carry bit
! 158: setc %bl C borrow out of that subtract
! 159: movl VAR_DST_END, %edi C edi back to dst end for the store at entry
! 160:
! 161: subl %edx, %eax C apply carry limb
! 162: adcl $0, %ebx C add the borrow out of the carry limb subtract
! 163:
! 164: L(entry):
! 165: imull VAR_INVERSE, %eax C eax = quotient limb q
! 166:
! 167: movl %eax, -4(%edi,%ebp,4) C store q one limb below the current counter position
! 168: incl %ebp
! 169: jnz L(top)
! 170:
! 171: C Last limb: there is no higher src limb, so a plain shrl is used and no new carry bit is recorded.
! 172: mull PARAM_DIVISOR C carry limb in edx
! 173:
! 174: movl -4(%esi), %eax C src high limb
! 175: shrl %cl, %eax
! 176: movl SAVE_ESI, %esi
! 177:
! 178: subl %ebx, %eax C apply carry bit
! 179: movl SAVE_EBX, %ebx
! 180: movl SAVE_EBP, %ebp
! 181:
! 182: subl %edx, %eax C apply carry limb
! 183:
! 184: imull VAR_INVERSE, %eax
! 185:
! 186: movl %eax, -4(%edi) C highest dst limb
! 187: movl SAVE_EDI, %edi
! 188: addl $STACK_SPACE, %esp
! 189:
! 190: ret
! 191:
! 192: C size == 1: the single quotient limb is src[0] shifted right by ecx, times the inverse.
! 193: L(one):
! 194: shrl %cl, %eax
! 195: movl SAVE_ESI, %esi
! 196: movl SAVE_EBX, %ebx
! 197:
! 198: imull VAR_INVERSE, %eax
! 199:
! 200: movl SAVE_EBP, %ebp
! 201: movl %eax, -4(%edi) C edi is still dst end here, so this is dst[0]
! 202:
! 203: movl SAVE_EDI, %edi
! 204: addl $STACK_SPACE, %esp
! 205:
! 206: ret
! 207:
! 208: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>