Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/dive_1.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C         divisor
! 26: C       odd    even
! 27: C K6:   10.0   12.0  cycles/limb
! 28: C K6-2: 10.0   11.5
! 29:
! 30:
! 31: C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 32: C mp_limb_t divisor);
! 33: C
! 34: C A simple divl is used for size==1. This is about 10 cycles faster for an
! 35: C odd divisor or 20 cycles for an even divisor.
! 36: C
! 37: C The loops are quite sensitive to code alignment, speeds should be
! 38: C rechecked (odd and even divisor, pic and non-pic) if contemplating
! 39: C changing anything.
! 40:
! 41: defframe(PARAM_DIVISOR,16)
! 42: defframe(PARAM_SIZE, 12)
! 43: defframe(PARAM_SRC, 8)
! 44: defframe(PARAM_DST, 4)
! 45:
! 46: dnl re-use parameter space
! 47: define(VAR_INVERSE,`PARAM_DST')
! 48:
! 49: TEXT
! 50:
! 51: ALIGN(32)
! 52: PROLOGUE(mpn_divexact_1)
! 53: deflit(`FRAME',0)
! 54: C Small case first: size == 1 is done with a single divl and returns early.
! 55: movl PARAM_SIZE, %ecx C ecx = size
! 56:
! 57: movl PARAM_SRC, %eax C eax = src
! 58: xorl %edx, %edx C edx = 0: high half for divl, and twos counter for strip_twos
! 59:
! 60: cmpl $1, %ecx
! 61: jnz L(two_or_more) C any size other than 1 uses the multiply-by-inverse code
! 62: C NOTE(review): size == 0 also branches away here; presumably callers guarantee size >= 1 -- confirm
! 63: movl (%eax), %eax C eax = src[0]
! 64:
! 65: divl PARAM_DIVISOR C eax = edx:eax / divisor; the remainder in edx is not used
! 66:
! 67: movl PARAM_DST, %ecx C ecx = dst
! 68: movl %eax, (%ecx) C dst[0] = quotient
! 69:
! 70: ret
! 71:
! 72:
! 73: L(two_or_more):
! 74: movl PARAM_DIVISOR, %eax C eax = divisor
! 75: pushl %ebx FRAME_pushl() C popped again before the ret of either loop exit
! 76:
! 77: movl PARAM_SRC, %ebx C ebx = src
! 78: pushl %ebp FRAME_pushl()
! 79: C Strip low zero bits from the divisor: edx ends as count+1, eax as d >> (count+1).
! 80: L(strip_twos):
! 81: shrl %eax C lowest bit of eax goes to carry
! 82: incl %edx C will get shift+1
! 83: C NOTE(review): a zero divisor never produces a carry, so this loop would not terminate
! 84: jnc L(strip_twos) C repeat until a 1 bit was shifted out (incl leaves carry untouched)
! 85: pushl %esi FRAME_pushl()
! 86:
! 87: leal 1(%eax,%eax), %esi C d without twos
! 88: andl $127, %eax C d/2, 7 bits
! 89:
! 90: ifdef(`PIC',`
! 91: call L(movl_eip_ebp) C ebp = address of the following addl
! 92:
! 93: addl $_GLOBAL_OFFSET_TABLE_, %ebp
! 94: C
! 95: movl modlimb_invert_table@GOT(%ebp), %ebp C ebp = table address taken from its GOT entry
! 96: C
! 97: Zdisp( movzbl, 0,(%eax,%ebp), %eax) C inv 8 bits
! 98: ',`
! 99:
! 100: dnl non-PIC
! 101: movzbl modlimb_invert_table(%eax), %eax C inv 8 bits
! 102: ')
! 103: pushl %edi FRAME_pushl()
! 104: C Two steps of inv = 2*inv - inv*inv*d refine the table value, interleaved with pointer setup.
! 105: leal (%eax,%eax), %ebp C 2*inv
! 106:
! 107: imull %eax, %eax C inv*inv
! 108:
! 109: movl PARAM_DST, %edi C edi = dst, fetched before VAR_INVERSE reuses this stack slot
! 110:
! 111: imull %esi, %eax C inv*inv*d
! 112:
! 113: subl %eax, %ebp C inv = 2*inv - inv*inv*d
! 114: leal (%ebp,%ebp), %eax C 2*inv
! 115:
! 116: imull %ebp, %ebp C inv*inv
! 117:
! 118: movl %esi, PARAM_DIVISOR C d without twos
! 119: leal (%ebx,%ecx,4), %ebx C src end
! 120:
! 121: imull %esi, %ebp C inv*inv*d
! 122:
! 123: leal (%edi,%ecx,4), %edi C dst end
! 124: negl %ecx C -size
! 125:
! 126: subl %ebp, %eax C inv = 2*inv - inv*inv*d
! 127: subl $1, %edx C shift amount, and clear carry
! 128: C The jnz L(even) below tests ZF from this subl (movl leaves flags alone). NOTE(review): assumes ASSERT preserves flags -- confirm
! 129: ASSERT(e,` C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
! 130: pushl %eax FRAME_pushl()
! 131: imull PARAM_DIVISOR, %eax
! 132: cmpl $1, %eax
! 133: popl %eax FRAME_popl()')
! 134:
! 135: movl %eax, VAR_INVERSE C keep the inverse in the reused PARAM_DST slot
! 136: jnz L(even) C shift != 0 means the divisor was even
! 137:
! 138: movl (%ebx,%ecx,4), %esi C src low limb
! 139: jmp L(odd_entry)
! 140:
! 141:
! 142: ALIGN(16)
! 143: nop C code alignment
! 144: L(odd_top):
! 145: C eax scratch
! 146: C ebx src end
! 147: C ecx counter, limbs, negative
! 148: C edx inverse
! 149: C esi next limb, adjusted for carry
! 150: C edi dst end
! 151: C ebp carry bit, 0 or -1
! 152:
! 153: imull %edx, %esi C quotient limb = adjusted src limb * inverse
! 154:
! 155: movl PARAM_DIVISOR, %eax C eax = d
! 156: movl %esi, -4(%edi,%ecx,4) C store quotient limb; -4 since ecx was already incremented
! 157:
! 158: mull %esi C carry limb in edx
! 159:
! 160: subl %ebp, %edx C apply carry bit
! 161: movl (%ebx,%ecx,4), %esi C next src limb
! 162: C On the first entry at L(odd_entry) edx is 0 (the shift amount), so nothing is subtracted.
! 163: L(odd_entry):
! 164: subl %edx, %esi C apply carry limb
! 165: movl VAR_INVERSE, %edx C edx = inverse for the next imull
! 166:
! 167: sbbl %ebp, %ebp C 0 or -1
! 168:
! 169: incl %ecx
! 170: jnz L(odd_top)
! 171:
! 172: C ecx == 0 here: only the final quotient limb is left, no further carry is needed.
! 173: imull %edx, %esi
! 174:
! 175: movl %esi, -4(%edi,%ecx,4) C dst[size-1]
! 176:
! 177: popl %edi C restore the registers pushed above, in reverse order
! 178: popl %esi
! 179:
! 180: popl %ebp
! 181: popl %ebx
! 182:
! 183: ret
! 184:
! 185:
! 186: ifdef(`PIC',`
! 187: L(movl_eip_ebp):
! 188: movl (%esp), %ebp C ebp = return address of the call that got here
! 189: ret
! 190:
! 191: ALIGN(8)
! 192: nop C code alignment, necessary for claimed speed
! 193: nop
! 194: ',`
! 195: C non-PIC code alignment already ok at 0x9a
! 196: ')
! 197:
! 198: L(even):
! 199: C eax
! 200: C ebx src end
! 201: C ecx -size
! 202: C edx twos
! 203: C esi
! 204: C edi dst end
! 205: C ebp
! 206:
! 207: xorl %ebp, %ebp C no carry bit yet
! 208: Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
! 209:
! 210: movd %edx, %mm7 C mm7 = shift count
! 211: movl VAR_INVERSE, %edx C edx = inverse
! 212:
! 213: addl $2, %ecx C ecx = 2-size, zero when size == 2 (tested by the jz below)
! 214: psrlq %mm7, %mm0 C low half = src[0] >> shift with low bits of src[1] shifted in
! 215:
! 216: movd %mm0, %esi C esi = first shifted limb
! 217: jz L(even_two) C if only two limbs
! 218:
! 219:
! 220: C Out-of-order execution is good enough to hide the load/rshift/movd
! 221: C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
! 222: C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
! 223: C been found. Maybe the fact every second movq is unaligned costs the extra
! 224: C 0.5.
! 225:
! 226: L(even_top):
! 227: C eax scratch
! 228: C ebx src end
! 229: C ecx counter, limbs, negative
! 230: C edx inverse
! 231: C esi next limb, adjusted for carry
! 232: C edi dst end
! 233: C ebp carry bit, 0 or -1
! 234: C
! 235: C mm0 scratch, source limbs
! 236: C mm7 twos
! 237:
! 238: imull %edx, %esi C quotient limb = adjusted shifted limb * inverse
! 239:
! 240: movl %esi, -8(%edi,%ecx,4) C store quotient limb (offset -8 compensates the addl $2 / incl)
! 241: movl PARAM_DIVISOR, %eax C eax = d without twos
! 242:
! 243: mull %esi C carry limb in edx
! 244:
! 245: movq -4(%ebx,%ecx,4), %mm0 C next src limb together with the limb above it
! 246: psrlq %mm7, %mm0 C low half becomes the next shifted limb
! 247:
! 248: movd %mm0, %esi
! 249: subl %ebp, %edx C apply carry bit
! 250:
! 251: subl %edx, %esi C apply carry limb
! 252: movl VAR_INVERSE, %edx C edx = inverse for the next imull
! 253:
! 254: sbbl %ebp, %ebp C 0 or -1
! 255:
! 256: incl %ecx
! 257: jnz L(even_top)
! 258:
! 259: C Final two quotient limbs. Only the top src limb is loaded by movd, so zeros are shifted in above it.
! 260: L(even_two):
! 261: movd -4(%ebx), %mm0 C src high limb
! 262: psrlq %mm7, %mm0
! 263:
! 264: imull %edx, %esi
! 265:
! 266: movl %esi, -8(%edi) C dst[size-2]
! 267: movl PARAM_DIVISOR, %eax C eax = d without twos
! 268:
! 269: mull %esi C carry limb in edx
! 270:
! 271: movd %mm0, %esi C esi = shifted high limb
! 272: subl %ebp, %edx C apply carry bit
! 273:
! 274: movl VAR_INVERSE, %eax C eax = inverse
! 275: subl %edx, %esi C apply carry limb
! 276:
! 277: imull %eax, %esi
! 278:
! 279: movl %esi, -4(%edi) C dst[size-1]
! 280:
! 281: popl %edi C restore the registers pushed above, in reverse order
! 282: popl %esi
! 283:
! 284: popl %ebp
! 285: popl %ebx
! 286:
! 287: emms_or_femms C presumably emms or femms to leave MMX state (only this even path used MMX) -- confirm macro
! 288:
! 289: ret
! 290:
! 291: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>