Annotation of OpenXM_contrib/gmp/mpn/x86/k7/gcd_1.asm, Revision 1.1
1.1 ! ohara 1: dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C K7: 6.75 cycles/bit (approx) 1x1 gcd
! 26: C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
! 27:
! 28:
! 29: dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
! 30: dnl where x is the larger of the two. See tune/README for more.
! 31: dnl
! 32: dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
! 33: dnl suggests 40/7*2=11.4 but 7 seems to be about right.
! 34:
! 35: deflit(DIV_THRESHOLD, 7) C the 1x1 code compares y against x shifted right by this many bits
! 36:
! 37:
! 38: C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
! 39: C
! 40: C This is mixed in with the code, but as per the k7 optimization manual it's
! 41: C a full cache line and suitably aligned so it won't get swapped between
! 42: C code and data. Having it in TEXT rather than RODATA saves needing a GOT
! 43: C entry when PIC.
! 44: C
! 45: C Actually, there doesn't seem to be a measurable difference between this in
! 46: C its own cache line or plonked in the middle of the code. Presumably
! 47: C since TEXT is read-only there are no worries about coherency.
! 48:
! 49: deflit(MASK, 63) C x is ANDed with this to form the table index 0..63
! 50: deflit(MAXSHIFT, 6) C table[0]: strip 6 bits and look up again
! 51:
! 52: TEXT
! 53: ALIGN(64) C 64 one-byte entries starting on a 64-byte boundary
! 54: L(table):
! 55: .byte MAXSHIFT C entry for index 0
! 56: forloop(i,1,MASK,
! 57: ` .byte m4_count_trailing_zeros(i)
! 58: ')
! 59:
! 60:
! 61: C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
! 62: C Computes the gcd of the size-limb number at src and the single limb; the code ASSERTs limb!=0 and size>=1.
! 63: C Stack frame follows. defframe is defined outside this file; offsets are presumably relative to %esp at entry - TODO confirm.
! 64: defframe(PARAM_LIMB, 12) C y, the single limb
! 65: defframe(PARAM_SIZE, 8) C number of limbs at src
! 66: defframe(PARAM_SRC, 4) C pointer to the limbs of x
! 67:
! 68: defframe(SAVE_EBX, -4) C save slot for the caller's ebx
! 69: defframe(SAVE_ESI, -8) C save slot for the caller's esi
! 70: defframe(SAVE_EDI, -12) C save slot for the caller's edi, written only when PIC
! 71: defframe(SAVE_EBP, -16) C save slot for the caller's ebp, written only on the size>1 path
! 72: defframe(CALL_DIVISOR,-20) C outgoing divisor argument for the mpn_modexact_1_odd call
! 73: defframe(CALL_SIZE, -24) C outgoing size argument for the mpn_modexact_1_odd call
! 74: defframe(CALL_SRC, -28) C outgoing src argument for the mpn_modexact_1_odd call
! 75:
! 76: deflit(STACK_SPACE, 28) C bytes subtracted from esp on entry, covers the slots down to -28
! 77:
! 78: TEXT
! 79: ALIGN(16)
! 80: C mpn_gcd_1: returns in eax the gcd of the size-limb number at src and the single limb y.
! 81: PROLOGUE(mpn_gcd_1)
! 82: deflit(`FRAME',0)
! 83:
! 84: ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
! 85: ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
! 86:
! 87: movl PARAM_SRC, %eax C eax = src pointer
! 88: movl PARAM_LIMB, %edx C edx = y
! 89: subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
! 90:
! 91: movl %esi, SAVE_ESI
! 92: movl %ebx, SAVE_EBX
! 93:
! 94: movl (%eax), %esi C src low limb
! 95:
! 96: ifdef(`PIC',`
! 97: movl %edi, SAVE_EDI
! 98: call L(movl_eip_to_edi)
! 99: L(here):
! 100: addl $L(table)-L(here), %edi
! 101: ')
! 102:
! 103: movl %esi, %ebx C ebx = x, the low limb
! 104: orl %edx, %esi C x|y
! 105: movl $-1, %ecx C the first incl below makes the count 0
! 106:
! 107: L(twos):
! 108: incl %ecx
! 109: shrl %esi
! 110: jnc L(twos) C 3/4 chance of x or y odd already
! 111:
! 112: shrl %cl, %ebx C drop the common twos from x
! 113: shrl %cl, %edx C and from y
! 114: movl %ecx, %esi C common twos
! 115:
! 116: movl PARAM_SIZE, %ecx
! 117: cmpl $1, %ecx
! 118: ja L(divide) C size>1: reduce src modulo y first
! 119:
! 120:
! 121: C eax
! 122: C ebx x
! 123: C ecx
! 124: C edx y
! 125: C esi common twos
! 126: C edi [PIC] L(table)
! 127: C ebp
! 128:
! 129: movl %edx, %eax
! 130: cmpl %ebx, %edx C flags from y-x
! 131:
! 132: cmovb( %ebx, %eax) C swap to make x bigger than y
! 133: cmovb( %edx, %ebx)
! 134:
! 135:
! 136: L(strip_y):
! 137: C eax x
! 138: C ebx y
! 139: C ecx
! 140: C edx
! 141: C esi common twos
! 142: C edi [PIC] L(table)
! 143: C ebp
! 144:
! 145: ASSERT(nz,`orl %ebx,%ebx')
! 146: shrl %ebx
! 147: jnc L(strip_y)
! 148: rcll %ebx C put back the 1 bit just shifted out, y now odd
! 149:
! 150:
! 151: C eax x
! 152: C ebx y (odd)
! 153: C ecx
! 154: C edx
! 155: C esi common twos
! 156: C edi [PIC] L(table)
! 157: C ebp
! 158:
! 159: movl %eax, %ecx C ecx = x, kept while eax is shifted
! 160: movl %ebx, %edx C edx = y, as the loop expects
! 161: shrl $DIV_THRESHOLD, %eax
! 162:
! 163: cmpl %eax, %ebx C flags from y - (x >> DIV_THRESHOLD)
! 164: movl %ecx, %eax C eax = x again, flags unchanged
! 165: ja L(strip_x_entry) C skip the x%y below unless x is much bigger than y
! 166:
! 167:
! 168: xorl %edx, %edx C high half of the dividend is zero
! 169:
! 170: divl %ebx C eax = x/y, edx = x%y
! 171:
! 172: orl %edx, %edx C ZF set if the remainder is zero
! 173: movl %edx, %ecx C remainder -> x, in ecx because L(strip_x) reloads eax from ecx
! 174: movl %ebx, %edx C y
! 175:
! 176: jz L(done_ebx) C y divides x, so the odd part of the gcd is y
! 177: jmp L(strip_x)
! 178:
! 179:
! 180: C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by
! 181: C ensuring the end of the jnz at the end of this loop doesn't cross
! 182: C into the next cache line at 0xC0.
! 183: C
! 184: C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
! 185: C it crosses but doesn't suffer any measurable slowdown.
! 186:
! 187: L(top):
! 188: C eax x
! 189: C ebx y-x
! 190: C ecx x-y
! 191: C edx y
! 192: C esi twos, for use at end
! 193: C edi [PIC] L(table)
! 194:
! 195: cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x
! 196: cmovc( %eax, %edx)
! 197:
! 198: L(strip_x):
! 199: movl %ecx, %eax C eax = value to strip, ecx keeps a copy for the table index
! 200: L(strip_x_entry):
! 201: andl $MASK, %ecx C table index = low bits of x
! 202:
! 203: ASSERT(nz, `orl %eax, %eax')
! 204:
! 205: ifdef(`PIC',`
! 206: movb (%ecx,%edi), %cl
! 207: ',`
! 208: movb L(table) (%ecx), %cl
! 209: ')
! 210:
! 211: shrl %cl, %eax C strip the trailing zeros found by the table
! 212: cmpb $MAXSHIFT, %cl
! 213:
! 214: movl %eax, %ecx
! 215: movl %edx, %ebx
! 216: je L(strip_x) C table gave MAXSHIFT, more twos may remain
! 217:
! 218: ASSERT(nz, `testl $1, %eax') C both odd
! 219: ASSERT(nz, `testl $1, %edx')
! 220:
! 221: subl %eax, %ebx C ebx = y-x
! 222: subl %edx, %ecx C ecx = x-y, carry set if x<y
! 223: jnz L(top) C falls through when x==y, eax then holds the odd part of the gcd
! 224:
! 225:
! 226: L(done):
! 227: movl %esi, %ecx C shift count = common twos
! 228: movl SAVE_ESI, %esi
! 229: ifdef(`PIC',`
! 230: movl SAVE_EDI, %edi
! 231: ')
! 232:
! 233: shll %cl, %eax C put the common twos back into the result
! 234: movl SAVE_EBX, %ebx
! 235: addl $FRAME, %esp
! 236:
! 237: ret
! 238:
! 239:
! 240:
! 241: C -----------------------------------------------------------------------------
! 242: C two or more limbs
! 243:
! 244: dnl MODEXACT_THRESHOLD is the size at which it's better to call
! 245: dnl mpn_modexact_1_odd than do an inline loop.
! 246:
! 247: deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
! 248:
! 249: L(divide):
! 250: C eax src
! 251: C ebx
! 252: C ecx size
! 253: C edx y
! 254: C esi common twos
! 255: C edi [PIC] L(table)
! 256: C ebp
! 257:
! 258: L(divide_strip_y):
! 259: ASSERT(nz,`orl %edx,%edx')
! 260: shrl %edx
! 261: jnc L(divide_strip_y)
! 262: leal 1(%edx,%edx), %ebx C y now odd
! 263:
! 264: movl %ebp, SAVE_EBP
! 265: movl %eax, %ebp C ebp = src
! 266: movl -4(%eax,%ecx,4), %eax C src high limb
! 267:
! 268: cmp $MODEXACT_THRESHOLD, %ecx
! 269: jae L(modexact) C sizes at or above the threshold call mpn_modexact_1_odd
! 270:
! 271: cmpl %ebx, %eax C high cmp divisor
! 272: movl $0, %edx
! 273:
! 274: cmovc( %eax, %edx) C skip a div if high<divisor
! 275: sbbl $0, %ecx C and in that case one limb fewer to divide
! 276:
! 277:
! 278: L(divide_top):
! 279: C eax scratch (quotient)
! 280: C ebx y
! 281: C ecx counter (size to 1, inclusive)
! 282: C edx carry (remainder)
! 283: C esi common twos
! 284: C edi [PIC] L(table)
! 285: C ebp src
! 286:
! 287: movl -4(%ebp,%ecx,4), %eax C next limb, working from high to low
! 288:
! 289: divl %ebx C edx:eax / y, remainder left in edx for the next limb
! 290:
! 291: decl %ecx
! 292: jnz L(divide_top)
! 293:
! 294:
! 295: C eax
! 296: C ebx y (odd)
! 297: C ecx
! 298: C edx x
! 299: C esi common twos
! 300: C edi [PIC] L(table)
! 301: C ebp
! 302:
! 303: orl %edx, %edx C ZF set if the remainder is zero
! 304: movl SAVE_EBP, %ebp
! 305: movl %edx, %eax C eax = x
! 306:
! 307: movl %edx, %ecx C ecx = x too, as L(strip_x_entry) expects
! 308: movl %ebx, %edx C edx = y
! 309: jnz L(strip_x_entry)
! 310:
! 311:
! 312: L(done_ebx):
! 313: movl %ebx, %eax C remainder was zero, odd part of the gcd is y
! 314: jmp L(done)
! 315:
! 316:
! 317:
! 318: L(modexact):
! 319: C eax
! 320: C ebx y
! 321: C ecx size
! 322: C edx
! 323: C esi common twos
! 324: C edi [PIC] L(table)
! 325: C ebp src
! 326:
! 327: ifdef(`PIC',`
! 328: movl %ebp, CALL_SRC
! 329: movl %ebx, %ebp C y
! 330: movl %edi, %ebx C L(table)
! 331:
! 332: addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
! 333: movl %ebp, CALL_DIVISOR
! 334: movl %ecx, CALL_SIZE
! 335:
! 336: call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
! 337: ',`
! 338: dnl non-PIC
! 339: movl %ebx, CALL_DIVISOR
! 340: movl %ebp, CALL_SRC
! 341: movl %ecx, CALL_SIZE
! 342:
! 343: call GSYM_PREFIX`'mpn_modexact_1_odd
! 344: ')
! 345:
! 346: C eax x
! 347: C ebx [non-PIC] y
! 348: C ecx
! 349: C edx
! 350: C esi common twos
! 351: C edi [PIC] L(table)
! 352: C ebp [PIC] y
! 353:
! 354: orl %eax, %eax C ZF set if the value returned by the call is zero
! 355: movl ifdef(`PIC',`%ebp',`%ebx'), %edx
! 356: movl SAVE_EBP, %ebp
! 357:
! 358: movl %eax, %ecx C ecx = x, as L(strip_x_entry) expects
! 359: jnz L(strip_x_entry)
! 360:
! 361: movl %edx, %eax C x is zero, odd part of the gcd is y
! 362: jmp L(done)
! 363:
! 364:
! 365: ifdef(`PIC', `
! 366: L(movl_eip_to_edi):
! 367: movl (%esp), %edi
! 368: ret
! 369: ')
! 370:
! 371: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>