Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mode1o.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P6: 10.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
! 29: C mp_limb_t divisor);
! 30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor, mp_limb_t carry);
! 32: C
! 33: C It's not worth skipping a step at the end when high<divisor since the main
! 34: C loop is only 10 cycles.
! 35:
! 36: defframe(PARAM_CARRY, 16) C read by mpn_modexact_1c_odd only
! 37: defframe(PARAM_DIVISOR,12)
! 38: defframe(PARAM_SIZE, 8)
! 39: defframe(PARAM_SRC, 4)
! 40:
! 41: dnl Not enough room under modexact_1 to make these re-use the parameter
! 42: dnl space, unfortunately.
! 43: defframe(SAVE_EBX, -4)
! 44: defframe(SAVE_ESI, -8)
! 45: defframe(SAVE_EDI, -12)
! 46: deflit(STACK_SPACE, 12) C bytes covering the three SAVE_ slots above
! 47:
! 48: TEXT
! 49:
! 50: ALIGN(16)
! 51: PROLOGUE(mpn_modexact_1c_odd)
! 52: deflit(`FRAME',0)
! 53:
! 54: movl PARAM_CARRY, %ecx C ecx = carry argument, used as the initial carry
! 55: jmp L(start_1c) C continue in mpn_modexact_1_odd, past its carry clear
! 56:
! 57: EPILOGUE()
! 58:
! 59: ALIGN(16)
! 60: PROLOGUE(mpn_modexact_1_odd)
! 61: deflit(`FRAME',0)
! 62:
! 63: xorl %ecx, %ecx C no carry argument here, start with carry 0
! 64: L(start_1c): C mpn_modexact_1c_odd jumps here with ecx = carry argument
! 65: movl PARAM_DIVISOR, %eax C eax = divisor
! 66:
! 67: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) C make room for the SAVE_ slots
! 68:
! 69: movl %esi, SAVE_ESI C save esi, restored before ret
! 70: movl PARAM_SRC, %esi C esi = src
! 71:
! 72: shrl %eax C d/2
! 73: movl %edi, SAVE_EDI C save edi, restored before ret
! 74:
! 75: andl $127, %eax C table index = d/2 masked to 7 bits
! 76:
! 77: ifdef(`PIC',`
! 78: call L(movl_eip_edi) C edi = address of the next instruction
! 79: addl $_GLOBAL_OFFSET_TABLE_, %edi C edi = GOT base
! 80: movl modlimb_invert_table@GOT(%edi), %edi C edi = table address read from the GOT
! 81: movzbl (%eax,%edi), %edi C inv 8 bits
! 82:
! 83: ',`
! 84: dnl non-PIC
! 85: movzbl modlimb_invert_table(%eax), %edi C inv 8 bits
! 86: ')
! 87:
! 88: xorl %edx, %edx C initial extra carry
! 89: leal (%edi,%edi), %eax C 2*inv
! 90:
! 91: imull %edi, %edi C inv*inv
! 92:
! 93: movl %ebx, SAVE_EBX C save ebx, restored before ret
! 94: movl PARAM_SIZE, %ebx C ebx = size
! 95:
! 96: imull PARAM_DIVISOR, %edi C inv*inv*d
! 97:
! 98: subl %edi, %eax C inv = 2*inv - inv*inv*d
! 99: leal (%eax,%eax), %edi C 2*inv
! 100:
! 101: imull %eax, %eax C inv*inv
! 102:
! 103: imull PARAM_DIVISOR, %eax C inv*inv*d
! 104:
! 105: leal (%esi,%ebx,4), %esi C src end
! 106: negl %ebx C -size
! 107:
! 108: subl %eax, %edi C inv = 2*inv - inv*inv*d
! 109:
! 110: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 111: movl PARAM_DIVISOR, %eax
! 112: imull %edi, %eax
! 113: cmpl $1, %eax')
! 114:
! 115:
! 116: C The dependent chain here is
! 117: C
! 118: C subl %edx, %eax 1
! 119: C imull %edi, %eax 4
! 120: C mull PARAM_DIVISOR 5
! 121: C ----
! 122: C total 10
! 123: C
! 124: C and this is the measured speed. No special scheduling is necessary, out
! 125: C of order execution hides the load latency.
! 126:
! 127: L(top):
! 128: C eax scratch (src limb)
! 129: C ebx counter, limbs, negative
! 130: C ecx carry bit, 0 or 1 (first pass from the 1c entry: the carry argument)
! 131: C edx carry limb, high of last product
! 132: C esi &src[size]
! 133: C edi inverse
! 134: C ebp
! 135:
! 136: movl (%esi,%ebx,4), %eax C eax = src[size + ebx], limbs are read low to high
! 137: subl %ecx, %eax C subtract the carry bit
! 138:
! 139: sbbl %ecx, %ecx C ecx = 0 or -1, minus the borrow of that subtract
! 140: subl %edx, %eax C subtract the carry limb
! 141:
! 142: sbbl $0, %ecx C ecx also takes the borrow of the second subtract
! 143:
! 144: imull %edi, %eax C eax = eax * inverse, low 32 bits kept
! 145:
! 146: negl %ecx C ecx = borrow as a non-negative carry for the next limb
! 147:
! 148: mull PARAM_DIVISOR C edx:eax = eax * divisor, edx is the next carry limb
! 149:
! 150: incl %ebx C one limb done, counter moves towards zero
! 151: jnz L(top) C loop until the counter reaches zero
! 152:
! 153:
! 154: movl SAVE_ESI, %esi C restore esi
! 155: leal (%ecx,%edx), %eax C eax = carry bit + carry limb, the value left in eax at ret
! 156:
! 157: movl SAVE_EDI, %edi C restore edi
! 158:
! 159: movl SAVE_EBX, %ebx C restore ebx
! 160: addl $STACK_SPACE, %esp C release the SAVE_ slots
! 161:
! 162: ret
! 163:
! 164:
! 165: ifdef(`PIC',`
! 166: L(movl_eip_edi):
! 167: movl (%esp), %edi C edi = return address of the call that got here
! 168: ret
! 169: ')
! 170:
! 171: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>