Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/sse2/mode1o.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P4: 19.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
! 29: C mp_limb_t divisor);
! 30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor, mp_limb_t carry);
! 32: C
! 33: C Stack arguments in prototype order: src at 4, size at 8, divisor at 12, carry at 16 (carry is read only by mpn_modexact_1c_odd).
! 34: defframe(PARAM_CARRY, 16)
! 35: defframe(PARAM_DIVISOR,12)
! 36: defframe(PARAM_SIZE, 8)
! 37: defframe(PARAM_SRC, 4)
! 38:
! 39: TEXT
! 40:
! 41: ALIGN(16)
! 42: PROLOGUE(mpn_modexact_1c_odd)
! 43: deflit(`FRAME',0)
! 44: C Carry-in entry point: seed the carry limb from the 4th argument, then share the body of mpn_modexact_1_odd.
! 45: movd PARAM_CARRY, %mm1 C mm1 = caller-supplied carry limb
! 46: jmp L(start_1c) C joins mpn_modexact_1_odd just after the pxor that zeroes mm1
! 47: C The jmp above is unconditional, so the PIC helper below is reached only through a call.
! 48: ifdef(`PIC',`
! 49: L(movl_eip_edx):
! 50: movl (%esp), %edx C edx = return address, i.e. the address of the instruction following the call
! 51: ret
! 52: ')
! 53:
! 54: EPILOGUE()
! 55:
! 56:
! 57: ALIGN(16)
! 58: PROLOGUE(mpn_modexact_1_odd)
! 59: deflit(`FRAME',0)
! 60: C No-carry entry point. Also the shared body: mpn_modexact_1c_odd jumps to L(start_1c) with mm1 already holding its carry argument.
! 61: pxor %mm1, %mm1 C carry limb starts at zero
! 62: L(start_1c):
! 63: movl PARAM_DIVISOR, %eax C eax = d, used only to form the table index
! 64:
! 65: movd PARAM_DIVISOR, %mm7 C mm7 = d, kept for the rest of the function
! 66:
! 67: shrl %eax C eax = d >> 1
! 68:
! 69: andl $127, %eax C d/2, 7 bits, index into modlimb_invert_table
! 70:
! 71: ifdef(`PIC',`
! 72: call L(movl_eip_edx) C edx = address of the addl that follows
! 73:
! 74: addl $_GLOBAL_OFFSET_TABLE_, %edx C edx = GOT base, as used by the @GOT operand on the next instruction
! 75:
! 76: movl modlimb_invert_table@GOT(%edx), %edx C edx = address of the table, fetched from its GOT slot
! 77: C
! 78: movzbl (%eax,%edx), %eax C inv 8 bits
! 79: ',`
! 80: dnl non-PIC
! 81: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
! 82: ')
! 83: C Two rounds of inv = 2*inv - inv*inv*d follow, refining the 8-bit table value; the ASSERT further down expects a full-limb inverse of d.
! 84: C
! 85:
! 86: movd %eax, %mm6 C inv
! 87:
! 88: movd %eax, %mm0 C inv
! 89:
! 90: pmuludq %mm6, %mm6 C inv*inv
! 91:
! 92: C
! 93:
! 94: pmuludq %mm7, %mm6 C inv*inv*d, pmuludq uses the low 32 bits of each operand
! 95: paddd %mm0, %mm0 C 2*inv
! 96:
! 97: C
! 98:
! 99: psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
! 100: pxor %mm6, %mm6 C clear mm6 so the paddd below acts as a copy
! 101:
! 102: paddd %mm0, %mm6 C mm6 = inv, copied before mm0 is squared
! 103: pmuludq %mm0, %mm0 C inv*inv
! 104:
! 105: C
! 106:
! 107: pmuludq %mm7, %mm0 C inv*inv*d
! 108: paddd %mm6, %mm6 C 2*inv
! 109:
! 110:
! 111: movl PARAM_SRC, %eax C eax = src pointer; the table index in eax is no longer needed
! 112: movl PARAM_SIZE, %ecx C ecx = number of limbs to process
! 113:
! 114: C
! 115:
! 116: psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d, final inverse lives in mm6
! 117:
! 118: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 119: pushl %eax FRAME_pushl() C save src pointer; FRAME_pushl presumably keeps the PARAM_ offsets valid after the push - confirm
! 120: movd %mm6, %eax C eax = inv
! 121: imul PARAM_DIVISOR, %eax C eax = low 32 bits of inv*d
! 122: cmpl $1, %eax C sets the flags tested by the e condition; the popl below does not alter them
! 123: popl %eax FRAME_popl()')
! 124:
! 125: pxor %mm0, %mm0 C carry bit
! 126:
! 127:
! 128: C The dependent chain here is as follows.
! 129: C
! 130: C latency
! 131: C psubq s = (src-cbit) - climb 2
! 132: C pmuludq q = s*inverse 8
! 133: C pmuludq prod = q*divisor 8
! 134: C psrlq climb = high(prod) 2
! 135: C --
! 136: C 20
! 137: C
! 138: C Yet the loop measures 19.0 c/l, so obviously there's something gained
! 139: C there over a straight reading of the chip documentation.
! 140:
! 141: L(top):
! 142: C eax src, incrementing
! 143: C ebx
! 144: C ecx counter, limbs
! 145: C edx
! 146: C
! 147: C mm0 carry bit
! 148: C mm1 carry limb
! 149: C mm6 inverse
! 150: C mm7 divisor
! 151:
! 152: movd (%eax), %mm2 C mm2 = next source limb, zero-extended to 64 bits
! 153: addl $4, %eax C step src to the next 4-byte limb
! 154:
! 155: psubq %mm0, %mm2 C src - cbit
! 156:
! 157: psubq %mm1, %mm2 C src - cbit - climb
! 158: movq %mm2, %mm0 C copy the 64-bit difference so the borrow can be extracted
! 159: psrlq $63, %mm0 C new cbit = bit 63 of the difference, set when the subtraction borrowed
! 160:
! 161: pmuludq %mm6, %mm2 C s*inverse
! 162:
! 163: movq %mm7, %mm1 C mm1 = d, ready to be multiplied by q
! 164: pmuludq %mm2, %mm1 C q*divisor, using the low 32 bits of q
! 165: psrlq $32, %mm1 C new climb = high 32 bits of the product
! 166:
! 167: subl $1, %ecx C one limb done; size is not checked on entry, so size 0 would wrap - presumably callers pass size >= 1, confirm
! 168: jnz L(top)
! 169:
! 170:
! 171: L(done):
! 172: paddq %mm1, %mm0 C result = final carry limb + final carry bit
! 173: movd %mm0, %eax C return the low 32 bits in eax
! 174: emms C clear MMX state before returning to the caller
! 175: ret
! 176:
! 177: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>