Annotation of OpenXM_contrib/gmp/mpn/x86/p6/dive_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel P6 mpn_divexact_1 -- mpn by limb exact division.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C       odd   even   divisor
! 26: C P6:  10.0   12.0   cycles/limb
! 27:
! 28:
! 29: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 30: C mp_limb_t divisor);
! 31: C
! 32: C The odd case is basically the same as mpn_modexact_1_odd, just with an
! 33: C extra store, and it runs at the same 10 cycles which is the dependent
! 34: C chain.
! 35: C
! 36: C The shifts for the even case aren't on the dependent chain so in principle
! 37: C it could run the same too, but nothing running at 10 has been found.
! 38: C Perhaps there are too many uops (an extra 4 over the odd case).
! 39:
! 40: defframe(PARAM_DIVISOR,16) C divisor argument, later overwritten with its odd part
! 41: defframe(PARAM_SIZE, 12) C size argument, counted in limbs
! 42: defframe(PARAM_SRC, 8) C src argument
! 43: defframe(PARAM_DST, 4) C dst argument, later overwritten with &dst[size]
! 44:
! 45: defframe(SAVE_EBX, -4) C save slot, ebx is restored from here before ret
! 46: defframe(SAVE_ESI, -8) C save slot, esi is restored from here before ret
! 47: defframe(SAVE_EDI, -12) C save slot, edi is restored from here before ret
! 48: defframe(SAVE_EBP, -16) C save slot, ebp is restored from here before ret
! 49: defframe(VAR_INVERSE, -20) C inverse of the odd part of d mod 2^BITS_PER_MP_LIMB
! 50: deflit(STACK_SPACE, 20) C bytes subtracted from esp to hold the five slots above
! 51:
! 52: TEXT
! 53:
! 54: ALIGN(16)
! 55: PROLOGUE(mpn_divexact_1)
! 56: deflit(`FRAME',0)
! 57:
! 58: movl PARAM_DIVISOR, %eax C eax = d
! 59: subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
! 60:
! 61: movl %esi, SAVE_ESI
! 62: movl PARAM_SRC, %esi C esi = src
! 63:
! 64: movl %ebx, SAVE_EBX
! 65: movl PARAM_SIZE, %ebx C ebx = size
! 66:
! 67: bsfl %eax, %ecx C ecx = number of trailing twos in d
! 68:
! 69: movl %ebp, SAVE_EBP
! 70:
! 71: shrl %cl, %eax C eax = d without twos (odd part)
! 72:
! 73: movl %eax, %edx C edx = odd part of d
! 74: shrl %eax C d/2 without twos
! 75:
! 76: movl %edx, PARAM_DIVISOR C PARAM_DIVISOR now holds the odd part of d
! 77: andl $127, %eax C table index, low 7 bits of d/2
! 78:
! 79: ifdef(`PIC',`
! 80: call L(movl_eip_ebp) C ebp = address of the next instruction
! 81: addl $_GLOBAL_OFFSET_TABLE_, %ebp C ebp = GOT base
! 82: movl modlimb_invert_table@GOT(%ebp), %ebp C ebp = &modlimb_invert_table
! 83: movzbl (%eax,%ebp), %ebp C inv 8 bits
! 84:
! 85: ',`
! 86: dnl non-PIC
! 87: movzbl modlimb_invert_table(%eax), %ebp C inv 8 bits
! 88: ')
! 89:
! 90: leal (%ebp,%ebp), %eax C 2*inv
! 91:
! 92: imull %ebp, %ebp C inv*inv
! 93:
! 94: movl %edi, SAVE_EDI
! 95: movl PARAM_DST, %edi C edi = dst
! 96:
! 97: leal (%esi,%ebx,4), %esi C src end, esi = &src[size]
! 98:
! 99: imull PARAM_DIVISOR, %ebp C inv*inv*d
! 100:
! 101: subl %ebp, %eax C inv = 2*inv - inv*inv*d (first refinement step)
! 102: leal (%eax,%eax), %ebp C 2*inv
! 103:
! 104: imull %eax, %eax C inv*inv
! 105:
! 106: leal (%edi,%ebx,4), %edi C dst end, edi = &dst[size]
! 107: negl %ebx C -size, counts up towards zero
! 108:
! 109: movl %edi, PARAM_DST C keep &dst[size], the even loop reloads edi from here
! 110:
! 111: imull PARAM_DIVISOR, %eax C inv*inv*d
! 112:
! 113: subl %eax, %ebp C inv = 2*inv - inv*inv*d (second refinement step)
! 114:
! 115: ASSERT(e,` C d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 116: movl PARAM_DIVISOR, %eax
! 117: imull %ebp, %eax
! 118: cmpl $1, %eax')
! 119:
! 120: movl %ebp, VAR_INVERSE C both loops read the inverse from memory
! 121: movl (%esi,%ebx,4), %eax C src[0]
! 122:
! 123: orl %ecx, %ecx C sets ZF when there are no twos to strip
! 124: jnz L(even)
! 125:
! 126: C ecx initial carry is zero
! 127: jmp L(odd_entry)
! 128:
! 129:
! 130: C The dependent chain here is
! 131: C
! 132: C subl %edx, %eax 1
! 133: C imull VAR_INVERSE, %eax 4
! 134: C mull PARAM_DIVISOR 5
! 135: C ----
! 136: C total 10
! 137: C
! 138: C and this is the measured speed. No special scheduling is necessary, out
! 139: C of order execution hides the load latency.
! 140:
! 141: L(odd_top):
! 142: C eax scratch (src limb)
! 143: C ebx counter, limbs, negative
! 144: C ecx carry bit
! 145: C edx carry limb, high of last product
! 146: C esi &src[size]
! 147: C edi &dst[size]
! 148: C ebp inverse (not read here, VAR_INVERSE is used)
! 149:
! 150: mull PARAM_DIVISOR C edx:eax = previous quotient limb * d, edx is the carry limb
! 151:
! 152: movl (%esi,%ebx,4), %eax C next src limb
! 153: subl %ecx, %eax C subtract carry bit
! 154:
! 155: sbbl %ecx, %ecx C ecx = 0 or -1 from that borrow
! 156: subl %edx, %eax C subtract carry limb
! 157:
! 158: sbbl $0, %ecx C fold in the second borrow, ecx = -carry
! 159:
! 160: L(odd_entry):
! 161: imull VAR_INVERSE, %eax C quotient limb
! 162:
! 163: movl %eax, (%edi,%ebx,4) C store quotient limb to dst
! 164: negl %ecx C carry bit back to non-negative form
! 165:
! 166: incl %ebx
! 167: jnz L(odd_top)
! 168:
! 169:
! 170: movl SAVE_ESI, %esi
! 171:
! 172: movl SAVE_EDI, %edi
! 173:
! 174: movl SAVE_EBP, %ebp
! 175:
! 176: movl SAVE_EBX, %ebx
! 177: addl $STACK_SPACE, %esp
! 178:
! 179: ret
! 180:
! 181:
! 182: L(even):
! 183: C eax src[0]
! 184: C ebx counter, limbs, negative
! 185: C ecx shift
! 186: C edx
! 187: C esi &src[size]
! 188: C edi &dst[size]
! 189: C ebp inverse, about to be reused as the carry bit
! 190:
! 191: xorl %ebp, %ebp C initial carry bit
! 192: xorl %edx, %edx C initial carry limb (for size==1)
! 193:
! 194: incl %ebx C counter runs one limb ahead, hence the -4 offsets below
! 195: jz L(even_one) C size==1, src[0] is also the top limb
! 196:
! 197: movl (%esi,%ebx,4), %edi C src[1]
! 198:
! 199: shrdl( %cl, %edi, %eax) C shift src[0] right, filling from src[1]
! 200:
! 201: jmp L(even_entry)
! 202:
! 203:
! 204: L(even_top):
! 205: C eax scratch
! 206: C ebx counter, limbs, negative
! 207: C ecx shift
! 208: C edx scratch
! 209: C esi &src[size]
! 210: C edi &dst[size] and scratch
! 211: C ebp carry bit
! 212:
! 213: movl (%esi,%ebx,4), %edi C next higher src limb, supplies the bits shifted in
! 214:
! 215: mull PARAM_DIVISOR C edx:eax = previous quotient limb * d, edx is the carry limb
! 216:
! 217: movl -4(%esi,%ebx,4), %eax C current src limb
! 218: shrdl( %cl, %edi, %eax) C current limb shifted right, filled from the next one
! 219:
! 220: subl %ebp, %eax C subtract carry bit
! 221:
! 222: sbbl %ebp, %ebp C ebp = 0 or -1 from that borrow
! 223: subl %edx, %eax C subtract carry limb
! 224:
! 225: sbbl $0, %ebp C fold in the second borrow, ebp = -carry
! 226:
! 227: L(even_entry):
! 228: imull VAR_INVERSE, %eax C quotient limb
! 229:
! 230: movl PARAM_DST, %edi C reload &dst[size], edi was used as scratch
! 231: negl %ebp C carry bit back to non-negative form
! 232:
! 233: movl %eax, -4(%edi,%ebx,4) C store quotient limb to dst
! 234: incl %ebx
! 235: jnz L(even_top)
! 236:
! 237:
! 238:
! 239: mull PARAM_DIVISOR C carry limb for the final limb
! 240:
! 241: movl -4(%esi), %eax C src[size-1]
! 242:
! 243: L(even_one):
! 244: shrl %cl, %eax C top limb, no higher limb to fill from
! 245: movl SAVE_ESI, %esi
! 246:
! 247: subl %ebp, %eax C subtract carry bit
! 248: movl SAVE_EBP, %ebp
! 249:
! 250: subl %edx, %eax C subtract carry limb
! 251: movl SAVE_EBX, %ebx
! 252:
! 253: imull VAR_INVERSE, %eax C final quotient limb
! 254:
! 255: movl %eax, -4(%edi) C store to dst[size-1]
! 256: movl SAVE_EDI, %edi
! 257: addl $STACK_SPACE, %esp
! 258:
! 259: ret
! 260:
! 261:
! 262: ifdef(`PIC',`
! 263: L(movl_eip_ebp):
! 264: movl (%esp), %ebp C ebp = return address of the call into this helper
! 265: ret
! 266: ')
! 267:
! 268: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>