Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mode1o.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K7: 11.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
! 29: C mp_limb_t divisor);
! 30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor, mp_limb_t carry);
! 32: C
! 33: C With the loop running at just 11 cycles it doesn't seem worth bothering to
! 34: C check for high<divisor to save one step.
! 35: C
! 36: C Using a divl for size==1 measures slower than the modexact method, which
! 37: C is not too surprising since for the latter it's only about 24 cycles to
! 38: C calculate the modular inverse.
! 39:
! 40: defframe(PARAM_CARRY, 16) C fourth argument, read only by mpn_modexact_1c_odd
! 41: defframe(PARAM_DIVISOR,12)
! 42: defframe(PARAM_SIZE, 8)
! 43: defframe(PARAM_SRC, 4)
! 44: C Slots for the four registers the function body spills; NOTE(review): offsets presumably relative to esp at entry, confirm against defframe.
! 45: defframe(SAVE_EBX, -4)
! 46: defframe(SAVE_ESI, -8)
! 47: defframe(SAVE_EDI, -12)
! 48: defframe(SAVE_EBP, -16)
! 49: C 16 bytes, the span of the four SAVE_ slots above; subtracted from esp at L(start_1c).
! 50: deflit(STACK_SPACE, 16)
! 51:
! 52: TEXT
! 53:
! 54: ALIGN(16)
! 55: PROLOGUE(mpn_modexact_1c_odd)
! 56: deflit(`FRAME',0)
! 57: C Entry point with an explicit initial carry; the work is done by the body of mpn_modexact_1_odd.
! 58: movl PARAM_CARRY, %ecx C ecx = initial carry, consumed by the first pass of the main loop
! 59: jmp L(start_1c) C join mpn_modexact_1_odd just after its xorl of ecx
! 60:
! 61:
! 62: ifdef(`PIC',`
! 63: L(movl_eip_edi): C PIC helper, called from the table lookup in mpn_modexact_1_odd
! 64: movl (%esp), %edi C edi = return address found at the top of the stack
! 65: ret
! 66: ')
! 67:
! 68: EPILOGUE()
! 69:
! 70:
! 71: ALIGN(16)
! 72: PROLOGUE(mpn_modexact_1_odd)
! 73: deflit(`FRAME',0)
! 74: C Entry point without a carry argument, so the initial carry is zero.
! 75: xorl %ecx, %ecx C ecx = 0, initial carry
! 76: L(start_1c): C mpn_modexact_1c_odd jumps here with ecx = PARAM_CARRY
! 77: movl PARAM_DIVISOR, %eax
! 78: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 79: C esi, edi, ebp and ebx are stored to the SAVE_ slots below and reloaded before ret.
! 80: movl %esi, SAVE_ESI
! 81: movl PARAM_DIVISOR, %esi C esi = divisor, kept for the rest of the function
! 82:
! 83: movl %edi, SAVE_EDI
! 84:
! 85: shrl %eax C d/2
! 86: C Keep 7 bits of d/2 as the byte index into modlimb_invert_table.
! 87: andl $127, %eax
! 88:
! 89: ifdef(`PIC',`
! 90: call L(movl_eip_edi) C edi = address of the instruction after this call
! 91: addl $_GLOBAL_OFFSET_TABLE_, %edi
! 92: movl modlimb_invert_table@GOT(%edi), %edi C edi = table address loaded through the GOT
! 93: movzbl (%eax,%edi), %edi C inv 8 bits
! 94:
! 95: ',`
! 96: dnl non-PIC
! 97: movzbl modlimb_invert_table(%eax), %edi C inv 8 bits
! 98: ')
! 99: C Two passes of inv = 2*inv - inv*inv*d follow, interleaved with register saves and pointer setup.
! 100: xorl %edx, %edx C initial extra carry
! 101: leal (%edi,%edi), %eax C 2*inv
! 102:
! 103: imull %edi, %edi C inv*inv
! 104:
! 105: movl %ebp, SAVE_EBP
! 106: movl PARAM_SIZE, %ebp
! 107:
! 108: movl %ebx, SAVE_EBX
! 109: movl PARAM_SRC, %ebx
! 110:
! 111: imull %esi, %edi C inv*inv*d
! 112:
! 113: subl %edi, %eax C inv = 2*inv - inv*inv*d
! 114: leal (%eax,%eax), %edi C 2*inv
! 115:
! 116: imull %eax, %eax C inv*inv
! 117:
! 118: imull %esi, %eax C inv*inv*d
! 119: C Point ebx just past the last limb and let ebp count from -size up to zero.
! 120: leal (%ebx,%ebp,4), %ebx C src end
! 121: negl %ebp C -size
! 122:
! 123: subl %eax, %edi C inv = 2*inv - inv*inv*d
! 124:
! 125: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 126: movl %esi, %eax
! 127: imull %edi, %eax
! 128: cmpl $1, %eax')
! 129:
! 130: C NOTE(review): the loop body runs before the counter test, so size == 0 is not handled here; presumably callers guarantee size >= 1.
! 131: C The dependent chain here is
! 132: C
! 133: C cycles
! 134: C subl %edx, %eax 1
! 135: C imull %edi, %eax 4
! 136: C mull %esi 6 (high limb)
! 137: C ----
! 138: C total 11
! 139: C
! 140: C Out of order execution hides the load latency for the source data, so no
! 141: C special scheduling is required.
! 142:
! 143: L(top):
! 144: C eax src limb
! 145: C ebx src end ptr
! 146: C ecx next carry bit, 0 or 1 (or initial carry param)
! 147: C edx carry limb, high of last product
! 148: C esi divisor
! 149: C edi inverse
! 150: C ebp counter, limbs, negative
! 151: C Each pass: eax = (limb - ecx - edx) * inverse, then edx = high half of eax * divisor.
! 152: movl (%ebx,%ebp,4), %eax
! 153:
! 154: subl %ecx, %eax C apply carry bit
! 155: movl $0, %ecx C movl leaves the flags alone, so the borrow from subl is still there for setc
! 156:
! 157: setc %cl C new carry bit
! 158:
! 159: subl %edx, %eax C apply carry limb
! 160: adcl $0, %ecx C add in the borrow from the carry limb subtraction
! 161:
! 162: imull %edi, %eax C eax = low 32 bits of eax * inverse
! 163:
! 164: mull %esi C edx:eax = eax * divisor; only edx is read afterwards
! 165:
! 166: incl %ebp
! 167: jnz L(top) C repeat until the negative counter reaches zero
! 168:
! 169: C Reload the saved registers and form the result.
! 170: movl SAVE_ESI, %esi
! 171: movl SAVE_EDI, %edi
! 172: leal (%ecx,%edx), %eax C eax = carry bit + carry limb, the value left in eax at ret
! 173:
! 174: movl SAVE_EBX, %ebx
! 175: movl SAVE_EBP, %ebp
! 176: addl $STACK_SPACE, %esp
! 177:
! 178: ret
! 179:
! 180: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>