Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mode1o.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P5: 23.0 cycles/limb
! 26:
! 27:
! 28: C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
! 29: C mp_limb_t divisor);
! 30: C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor, mp_limb_t carry);
! 32: C
! 33: C There seems to be no way to pair up the two lone instructions in the main loop.
! 34: C
! 35: C The special case for size==1 saves about 20 cycles (non-PIC), making it
! 36: C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
! 37: C all sizes.
! 38: C
! 39: C Alternatives:
! 40: C
! 41: C Using mmx for the multiplies might be possible, with pmullw and pmulhw
! 42: C having just 3 cycle latencies, but carry bit handling would probably be
! 43: C complicated.
! 44:
! 45: defframe(PARAM_CARRY, 16)
! 46: defframe(PARAM_DIVISOR,12)
! 47: defframe(PARAM_SIZE, 8)
! 48: defframe(PARAM_SRC, 4)
! 49: dnl Names follow the C prototypes above. PARAM_CARRY is only read by the mpn_modexact_1c_odd entry point.
! 50: dnl VAR_INVERSE re-uses the PARAM_SIZE slot: size is copied to ebx before the inverse is stored there.
! 51: define(VAR_INVERSE,`PARAM_SIZE')
! 52:
! 53: TEXT
! 54:
! 55: ALIGN(16)
! 56: PROLOGUE(mpn_modexact_1c_odd)
! 57: deflit(`FRAME',0)
! 58: C Entry with a carry-in: load divisor and carry, then share the body of mpn_modexact_1_odd.
! 59: movl PARAM_DIVISOR, %eax
! 60: movl PARAM_CARRY, %edx C initial carry
! 61:
! 62: jmp L(start_1c) C label lives inside mpn_modexact_1_odd
! 63:
! 64: EPILOGUE()
! 65:
! 66: ALIGN(16)
! 67: PROLOGUE(mpn_modexact_1_odd)
! 68: deflit(`FRAME',0)
! 69: C Entry without a carry-in: eax = divisor, edx = 0, then fall into the shared body.
! 70: movl PARAM_DIVISOR, %eax
! 71: xorl %edx, %edx C carry
! 72:
! 73: L(start_1c):
! 74: C Both entry points arrive here with eax = divisor and edx = initial carry.
! 75: ifdef(`PIC',`
! 76: call L(here) FRAME_pushl() C pushes eip, the slot is reused below to save ebx
! 77: L(here):
! 78:
! 79: shrl %eax C d/2
! 80: movl (%esp), %ecx C eip
! 81:
! 82: addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx C ecx = GOT base
! 83: movl %ebx, (%esp) C push ebx (overwrites the pushed eip)
! 84:
! 85: andl $127, %eax C table index = (d/2) mod 128
! 86: movl PARAM_SIZE, %ebx
! 87:
! 88: movl modlimb_invert_table@GOT(%ecx), %ecx C ecx = &modlimb_invert_table
! 89: subl $2, %ebx C size-2, borrow if size<2
! 90:
! 91: movb (%eax,%ecx), %cl C inv 8 bits
! 92: jc L(one_limb) C tests the borrow from subl (movb leaves flags alone)
! 93:
! 94: ',`
! 95: dnl non-PIC
! 96: shrl %eax C d/2
! 97: pushl %ebx FRAME_pushl()
! 98:
! 99: movl PARAM_SIZE, %ebx
! 100: andl $127, %eax C table index = (d/2) mod 128
! 101:
! 102: subl $2, %ebx C size-2, borrow if size<2
! 103: jc L(one_limb)
! 104:
! 105: movb modlimb_invert_table(%eax), %cl C inv 8 bits
! 106: ')
! 107: C Only the low 8 bits of ecx are valid here. Each step inv = 2*inv - inv*inv*d doubles the valid low bits (8, 16, 32).
! 108: movl %ecx, %eax
! 109: addl %ecx, %ecx C 2*inv
! 110:
! 111: imull %eax, %eax C inv*inv
! 112:
! 113: imull PARAM_DIVISOR, %eax C inv*inv*d
! 114:
! 115: subl %eax, %ecx C inv = 2*inv - inv*inv*d
! 116:
! 117: movl %ecx, %eax
! 118: addl %ecx, %ecx C 2*inv
! 119:
! 120: imull %eax, %eax C inv*inv
! 121:
! 122: imull PARAM_DIVISOR, %eax C inv*inv*d
! 123:
! 124: subl %eax, %ecx C inv = 2*inv - inv*inv*d
! 125: pushl %esi FRAME_pushl()
! 126:
! 127: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 128: movl %ecx, %eax
! 129: imull PARAM_DIVISOR, %eax
! 130: cmpl $1, %eax')
! 131:
! 132: movl PARAM_SRC, %esi
! 133: movl %ecx, VAR_INVERSE C overwrites the size argument slot, size-2 is already in ebx
! 134:
! 135: movl (%esi), %eax C src[0]
! 136: leal 4(%esi,%ebx,4), %esi C &src[size-1]
! 137:
! 138: xorl $-1, %ebx C -(size-1)
! 139: ASSERT(nz)
! 140: jmp L(entry) C first limb: eax = src[0], edx = initial carry
! 141:
! 142:
! 143: C The use of VAR_INVERSE means only a store is needed for that value, rather
! 144: C than a push and pop of say %edi.
! 145:
! 146: ALIGN(16)
! 147: L(top):
! 148: C eax scratch, low product
! 149: C ebx counter, limbs, negative
! 150: C ecx -carry (0 or -1)
! 151: C edx scratch, high product
! 152: C esi &src[size-1]
! 153: C edi
! 154: C ebp
! 155:
! 156: mull PARAM_DIVISOR C h:dummy = q*d
! 157:
! 158: movl (%esi,%ebx,4), %eax C src[i]
! 159: subl %ecx, %edx C h -= -c
! 160:
! 161: L(entry):
! 162: subl %edx, %eax C s = src[i] - h
! 163:
! 164: sbbl %ecx, %ecx C new -c (0 or -1)
! 165:
! 166: imull VAR_INVERSE, %eax C q = s*inverse
! 167:
! 168: incl %ebx
! 169: jnz L(top) C loop covers src[0] .. src[size-2]
! 170:
! 171:
! 172: mull PARAM_DIVISOR C h:dummy = q*d
! 173:
! 174: movl (%esi), %eax C src high
! 175: subl %ecx, %edx C h -= -c
! 176:
! 177: cmpl PARAM_DIVISOR, %eax C src high versus d
! 178:
! 179: jbe L(skip_last) C unsigned src high <= d
! 180: deflit(FRAME_LAST,FRAME)
! 181:
! 182:
! 183: subl %edx, %eax C s = src[i] - h
! 184: popl %esi FRAME_popl()
! 185:
! 186: sbbl %ecx, %ecx C -c (0 or -1)
! 187: popl %ebx FRAME_popl()
! 188:
! 189: imull VAR_INVERSE, %eax C q = s*inverse
! 190:
! 191: mull PARAM_DIVISOR C h:dummy = q*d
! 192:
! 193: movl %edx, %eax C h
! 194:
! 195: subl %ecx, %eax C h -= -c, the return value
! 196:
! 197: ret
! 198:
! 199:
! 200: C When high<=divisor (the jbe above) the last step can be skipped.
! 201:
! 202: L(skip_last):
! 203: deflit(`FRAME',FRAME_LAST)
! 204: C eax src high
! 205: C ebx
! 206: C ecx
! 207: C edx r
! 208: C esi
! 209:
! 210: subl %eax, %edx C r-s
! 211: popl %esi FRAME_popl()
! 212:
! 213: sbbl %eax, %eax C -1 if underflow
! 214: movl PARAM_DIVISOR, %ebx
! 215:
! 216: andl %ebx, %eax C divisor if underflow
! 217: popl %ebx FRAME_popl()
! 218:
! 219: addl %edx, %eax C addback if underflow
! 220:
! 221: ret
! 222:
! 223:
! 224: C Special case for size==1 using a division for r = c-a mod d.
! 225: C Could look for a-c<d and save a division sometimes, but that doesn't seem
! 226: C worth bothering about.
! 227:
! 228: L(one_limb):
! 229: deflit(`FRAME',4)
! 230: C eax
! 231: C ebx size-2 (==-1)
! 232: C ecx
! 233: C edx carry
! 234: C esi (not yet loaded or saved on this path)
! 235: C edi
! 236: C ebp
! 237:
! 238: movl %edx, %eax C c
! 239: movl PARAM_SRC, %edx
! 240:
! 241: movl PARAM_DIVISOR, %ecx
! 242: popl %ebx FRAME_popl()
! 243:
! 244: subl (%edx), %eax C c-a
! 245:
! 246: sbbl %edx, %edx C -1 if c<a, else 0
! 247: decl %ecx C d-1
! 248:
! 249: andl %ecx, %edx C edx:eax = b*d+c-a if c<a, or c-a if c>=a (b=2^32)
! 250:
! 251: divl PARAM_DIVISOR C remainder in edx
! 252:
! 253: movl %edx, %eax C return the remainder
! 254:
! 255: ret
! 256:
! 257: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>