Annotation of OpenXM_contrib/gmp/mpn/x86/k6/gcd_1.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K6 mpn_mod_1 -- mpn by 1 gcd.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K6: 9.5 cycles/bit (approx) 1x1 gcd
! 26: C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
! 27:
! 28:
! 29: C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
! 30: C
! 31: C This code is nothing very special, but offers a speedup over what gcc 2.95
! 32: C can do with mpn/generic/gcd_1.c.
! 33: C
! 34: C Future:
! 35: C
! 36: C Using a lookup table to count trailing zeros seems a touch quicker, but
! 37: C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
! 38: C it too.
! 39:
! 40:
! 41: dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
! 42: dnl bigger than y, then a division x%y is done to reduce it.
! 43: dnl
! 44: dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
! 45: dnl there should be an advantage in the divl at about 4 or 5 bits, which is
! 46: dnl what's found.
! 47:
! 48: deflit(DIV_THRESHOLD, 5)
! 49:
! 50:
! 51: defframe(PARAM_LIMB, 12)
! 52: defframe(PARAM_SIZE, 8)
! 53: defframe(PARAM_SRC, 4)
! 54:
! 55: TEXT
! 56: ALIGN(16)
! 57:
! 58: PROLOGUE(mpn_gcd_1)
! 59: deflit(`FRAME',0)
! 60:
! 61: ASSERT(ne, `cmpl $0, PARAM_LIMB')
! 62: ASSERT(ae, `cmpl $1, PARAM_SIZE')
! 63:
! 64:
! 65: movl PARAM_SRC, %eax
! 66: pushl %ebx FRAME_pushl()
! 67:
! 68: movl PARAM_LIMB, %edx
! 69: movl $-1, %ecx
! 70:
! 71: movl (%eax), %ebx C src low limb
! 72:
! 73: movl %ebx, %eax C src low limb
! 74: orl %edx, %ebx
! 75:
! 76: L(common_twos):
! 77: shrl %ebx
! 78: incl %ecx
! 79:
! 80: jnc L(common_twos) C 1/4 chance on random data
! 81: shrl %cl, %edx C y
! 82:
! 83: cmpl $1, PARAM_SIZE
! 84: ja L(size_two_or_more)
! 85:
! 86:
! 87: ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
! 88:
! 89: shrl %cl, %eax C x
! 90:
! 91:
! 92: C Swap if necessary to make x>=y. Measures a touch quicker as a
! 93: C jump than a branch free calculation.
! 94: C
! 95: C eax x
! 96: C ebx
! 97: C ecx common twos
! 98: C edx y
! 99:
! 100: movl %eax, %ebx
! 101: cmpl %eax, %edx
! 102:
! 103: jb L(noswap)
! 104: movl %edx, %eax
! 105:
! 106: movl %ebx, %edx
! 107: movl %eax, %ebx
! 108: L(noswap):
! 109:
! 110:
! 111: C See if it's worth reducing x with a divl.
! 112: C
! 113: C eax x
! 114: C ebx x
! 115: C ecx common twos
! 116: C edx y
! 117:
! 118: shrl $DIV_THRESHOLD, %ebx
! 119:
! 120: cmpl %ebx, %edx
! 121: ja L(nodiv)
! 122:
! 123:
! 124: C Reduce x to x%y.
! 125: C
! 126: C eax x
! 127: C ebx
! 128: C ecx common twos
! 129: C edx y
! 130:
! 131: movl %edx, %ebx
! 132: xorl %edx, %edx
! 133:
! 134: divl %ebx
! 135:
! 136: orl %edx, %edx C y
! 137: nop C code alignment
! 138:
! 139: movl %ebx, %eax C x
! 140: jz L(done_shll)
! 141: L(nodiv):
! 142:
! 143:
! 144: C eax x
! 145: C ebx
! 146: C ecx common twos
! 147: C edx y
! 148: C esi
! 149: C edi
! 150: C ebp
! 151:
! 152: L(strip_y):
! 153: shrl %edx
! 154: jnc L(strip_y)
! 155:
! 156: leal 1(%edx,%edx), %edx
! 157: movl %ecx, %ebx C common twos
! 158:
! 159: leal 1(%eax), %ecx
! 160: jmp L(strip_x_and)
! 161:
! 162:
! 163: C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
! 164: C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
! 165: C only 1/4.
! 166: C
! 167: C A second computed %cl shift was tried, but that measured a touch slower
! 168: C than branching back.
! 169: C
! 170: C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
! 171: C measured about 1 cycle/bit slower.
! 172:
! 173: C eax x
! 174: C ebx common twos
! 175: C ecx scratch
! 176: C edx y
! 177:
! 178: ALIGN(4)
! 179: L(swap):
! 180: addl %eax, %edx C x-y+y = x
! 181: negl %eax C -(x-y) = y-x
! 182:
! 183: L(strip_x):
! 184: shrl %eax C odd-odd = even, so always one to strip
! 185: ASSERT(nz)
! 186:
! 187: L(strip_x_leal):
! 188: leal 1(%eax), %ecx
! 189:
! 190: L(strip_x_and):
! 191: andl $1, %ecx C (x^1)&1
! 192:
! 193: shrl %cl, %eax C shift if x even
! 194:
! 195: testb $1, %al
! 196: jz L(strip_x)
! 197:
! 198: ASSERT(nz,`testl $1, %eax') C x, y odd
! 199: ASSERT(nz,`testl $1, %edx')
! 200:
! 201: subl %edx, %eax
! 202: jb L(swap)
! 203: ja L(strip_x)
! 204:
! 205:
! 206: movl %edx, %eax
! 207: movl %ebx, %ecx
! 208:
! 209: L(done_shll):
! 210: shll %cl, %eax
! 211: popl %ebx
! 212:
! 213: ret
! 214:
! 215:
! 216: C -----------------------------------------------------------------------------
! 217: C Two or more limbs.
! 218: C
! 219: C x={src,size} is reduced modulo y using either a plain mod_1 style
! 220: C remainder, or a modexact_1 style exact division.
! 221:
! 222: deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
! 223:
! 224: ALIGN(8)
! 225: L(size_two_or_more):
! 226: C eax
! 227: C ebx
! 228: C ecx common twos
! 229: C edx y, without common twos
! 230: C esi
! 231: C edi
! 232: C ebp
! 233:
! 234: deflit(FRAME_TWO_OR_MORE, FRAME)
! 235:
! 236: pushl %edi defframe_pushl(SAVE_EDI)
! 237: movl PARAM_SRC, %ebx
! 238:
! 239: L(y_twos):
! 240: shrl %edx
! 241: jnc L(y_twos)
! 242:
! 243: movl %ecx, %edi C common twos
! 244: movl PARAM_SIZE, %ecx
! 245:
! 246: pushl %esi defframe_pushl(SAVE_ESI)
! 247: leal 1(%edx,%edx), %esi C y (odd)
! 248:
! 249: movl -4(%ebx,%ecx,4), %eax C src high limb
! 250:
! 251: cmpl %edx, %eax C carry if high<divisor
! 252:
! 253: sbbl %edx, %edx C -1 if high<divisor
! 254:
! 255: addl %edx, %ecx C skip one limb if high<divisor
! 256: andl %eax, %edx
! 257:
! 258: cmpl $MODEXACT_THRESHOLD, %ecx
! 259: jae L(modexact)
! 260:
! 261:
! 262: L(divide_top):
! 263: C eax scratch (quotient)
! 264: C ebx src
! 265: C ecx counter, size-1 to 1
! 266: C edx carry (remainder)
! 267: C esi divisor (odd)
! 268: C edi
! 269: C ebp
! 270:
! 271: movl -4(%ebx,%ecx,4), %eax
! 272: divl %esi
! 273: loop L(divide_top)
! 274:
! 275:
! 276: movl %edx, %eax C x
! 277: movl %esi, %edx C y (odd)
! 278:
! 279: movl %edi, %ebx C common twos
! 280: popl %esi
! 281:
! 282: popl %edi
! 283: leal 1(%eax), %ecx
! 284:
! 285: orl %eax, %eax
! 286: jnz L(strip_x_and)
! 287:
! 288:
! 289: movl %ebx, %ecx
! 290: movl %edx, %eax
! 291:
! 292: shll %cl, %eax
! 293: popl %ebx
! 294:
! 295: ret
! 296:
! 297:
! 298: ALIGN(8)
! 299: L(modexact):
! 300: C eax
! 301: C ebx src ptr
! 302: C ecx size or size-1
! 303: C edx
! 304: C esi y odd
! 305: C edi common twos
! 306: C ebp
! 307:
! 308: movl PARAM_SIZE, %eax
! 309: pushl %esi FRAME_pushl()
! 310:
! 311: pushl %eax FRAME_pushl()
! 312:
! 313: pushl %ebx FRAME_pushl()
! 314:
! 315: ifdef(`PIC',`
! 316: nop C code alignment
! 317: call L(movl_eip_ebx)
! 318: L(here):
! 319: addl $_GLOBAL_OFFSET_TABLE_, %ebx
! 320: call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
! 321: ',`
! 322: call GSYM_PREFIX`'mpn_modexact_1_odd
! 323: ')
! 324:
! 325: movl %esi, %edx C y odd
! 326: movl SAVE_ESI, %esi
! 327:
! 328: movl %edi, %ebx C common twos
! 329: movl SAVE_EDI, %edi
! 330:
! 331: addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp
! 332: orl %eax, %eax
! 333:
! 334: leal 1(%eax), %ecx
! 335: jnz L(strip_x_and)
! 336:
! 337:
! 338: movl %ebx, %ecx
! 339: movl %edx, %eax
! 340:
! 341: shll %cl, %eax
! 342: popl %ebx
! 343:
! 344: ret
! 345:
! 346:
! 347: ifdef(`PIC',`
! 348: L(movl_eip_ebx):
! 349: movl (%esp), %ebx
! 350: ret
! 351: ')
! 352:
! 353: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>