Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mode1o.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K6: 10.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
! 29: C mp_limb_t divisor);
! 30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor, mp_limb_t carry);
! 32: C
! 33: C A special case for high<divisor at the end measured only about 4 cycles
! 34: C faster, and so isn't used.
! 35: C
! 36: C A special case for size==1 using a divl rather than the inverse measured
! 37: C only about 5 cycles faster, and so isn't used. When size==1 and
! 38: C high<divisor it can skip a division and be a full 24 cycles faster, but
! 39: C this isn't an important case.
! 40:
! 41: defframe(PARAM_CARRY, 16) C carry: 4th argument, read only by mpn_modexact_1c_odd
! 42: defframe(PARAM_DIVISOR,12) C divisor: 3rd argument
! 43: defframe(PARAM_SIZE, 8) C size: 2nd argument
! 44: defframe(PARAM_SRC, 4) C src: 1st argument
! 45:
! 46: TEXT
! 47:
! 48: ALIGN(32)
! 49: PROLOGUE(mpn_modexact_1c_odd)
! 50: deflit(`FRAME',0)
! 51: C Entry with an explicit carry-in: load divisor and carry, then join the shared code at L(start_1c).
! 52: movl PARAM_DIVISOR, %ecx C ecx = divisor d
! 53: pushl %esi FRAME_pushl() C same first push as the mpn_modexact_1_odd entry
! 54:
! 55: movl PARAM_CARRY, %edx C edx = initial carry limb
! 56: jmp L(start_1c) C continue in mpn_modexact_1_odd
! 57: C PIC only: helper called by the shared code to get the current code address into edi.
! 58: ifdef(`PIC',`
! 59: L(movl_eip_edi):
! 60: movl (%esp), %edi C edi = return address, i.e. the address following the call
! 61: ret
! 62: ')
! 63:
! 64: EPILOGUE()
! 65:
! 66:
! 67: ALIGN(16)
! 68: PROLOGUE(mpn_modexact_1_odd)
! 69: deflit(`FRAME',0)
! 70: C Entry with no carry-in: the carry limb starts at zero and execution falls into the shared code.
! 71: movl PARAM_DIVISOR, %ecx C ecx = divisor d
! 72: pushl %esi FRAME_pushl()
! 73:
! 74: xorl %edx, %edx C carry limb = 0
! 75: L(start_1c): C shared code: ecx = divisor, edx = carry limb, esi already pushed
! 76: pushl %edi FRAME_pushl()
! 77: C Form the table index (d/2) & 127 in ecx while saving registers.
! 78: shrl %ecx C d/2
! 79: movl PARAM_DIVISOR, %esi C esi = divisor d, kept for the whole loop
! 80:
! 81: andl $127, %ecx C d/2, 7 bits
! 82: pushl %ebp FRAME_pushl()
! 83: C Fetch the 8-bit starting inverse from modlimb_invert_table, via the GOT when PIC.
! 84: ifdef(`PIC',`
! 85: call L(movl_eip_edi) C edi = address of the next instruction
! 86:
! 87: addl $_GLOBAL_OFFSET_TABLE_, %edi C edi = base used for the @GOT lookup below
! 88: C
! 89: movl modlimb_invert_table@GOT(%edi), %edi C edi = address of the table
! 90: C
! 91: Zdisp( movzbl, 0,(%ecx,%edi), %edi) C inv 8 bits
! 92: ',`
! 93:
! 94: dnl non-PIC
! 95: movzbl modlimb_invert_table(%ecx), %edi C inv 8 bits
! 96: ')
! 97: leal (%edi,%edi), %ecx C 2*inv
! 98: C Two rounds of inv = 2*inv - inv*inv*d refine the table value to a full limb inverse; parameter loads and loop setup are interleaved.
! 99: imull %edi, %edi C inv*inv
! 100:
! 101: movl PARAM_SRC, %eax C eax = src
! 102: movl PARAM_SIZE, %ebp C ebp = size
! 103:
! 104: imull %esi, %edi C inv*inv*d
! 105:
! 106: pushl %ebx FRAME_pushl()
! 107: leal (%eax,%ebp,4), %ebx C src end
! 108:
! 109: subl %edi, %ecx C inv = 2*inv - inv*inv*d
! 110: leal (%ecx,%ecx), %edi C 2*inv
! 111:
! 112: imull %ecx, %ecx C inv*inv
! 113:
! 114: movl (%eax), %eax C src low limb
! 115: negl %ebp C -size
! 116:
! 117: imull %esi, %ecx C inv*inv*d
! 118:
! 119: subl %ecx, %edi C inv = 2*inv - inv*inv*d
! 120: C Check the finished inverse; eax is saved and restored around the check. ASSERT is defined outside this file, presumably active only in debug builds.
! 121: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 122: pushl %eax
! 123: movl %esi, %eax
! 124: imull %edi, %eax
! 125: cmpl $1, %eax
! 126: popl %eax')
! 127:
! 128: jmp L(entry) C first limb is already in eax and the carry limb in edx
! 129:
! 130:
! 131: C Rotating the mul to the top of the loop saves 1 cycle, presumably by
! 132: C hiding the loop control under the imul latency.
! 133: C
! 134: C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
! 135: C only 8). It's not clear how to get down to 9 cycles.
! 136: C
! 137: C The xor and rcl to handle the carry bit could be an sbb instead, with the
! 138: C carry bit add becoming a sub, but that doesn't save anything.
! 139:
! 140: L(top):
! 141: C eax (low product)
! 142: C ebx src end
! 143: C ecx carry bit, 0 or 1
! 144: C edx (high product, being carry limb)
! 145: C esi divisor
! 146: C edi inverse
! 147: C ebp counter, limbs, negative
! 148:
! 149: mull %esi C edx:eax = q * d, q being the imull result from the loop bottom
! 150:
! 151: movl (%ebx,%ebp,4), %eax C next src limb, replacing the unused low product
! 152: addl %ecx, %edx C apply carry bit to carry limb
! 153: ASSERT(a, `cmpl %edx, %esi')
! 154: C NOTE(review): the ASSERT above appears to require esi > edx unsigned, i.e. carry limb below the divisor -- confirm against the ASSERT macro.
! 155: L(entry):
! 156: xorl %ecx, %ecx C ecx = 0, ready to collect the carry bit
! 157: subl %edx, %eax C apply carry limb
! 158:
! 159: rcll %ecx C ecx = carry flag from the subtract, 0 or 1
! 160:
! 161: imull %edi, %eax C q = (limb - carry limb) * inverse, low 32 bits
! 162:
! 163: incl %ebp C step the negative counter toward zero
! 164: jnz L(top) C loop while limbs remain
! 165: C All limbs consumed: eax = last q, ecx = last carry bit.
! 166: C The return value is the high limb of q*d plus that carry bit; saved registers are restored around the multiply.
! 167:
! 168: popl %ebx
! 169: popl %ebp
! 170:
! 171: mull %esi C edx = high limb of q * d
! 172:
! 173: popl %edi
! 174: popl %esi
! 175:
! 176: leal (%ecx,%edx), %eax C return carry bit + high product
! 177:
! 178: ret
! 179:
! 180: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>