Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/divrem_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
! 2: dnl
! 3: dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
! 30: C mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor);
! 32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
! 33: C mp_srcptr src, mp_size_t size,
! 34: C mp_limb_t divisor, mp_limb_t carry);
! 35: C
! 36: C The method and nomenclature follow part 8 of "Division by Invariant
! 37: C Integers using Multiplication" by Granlund and Montgomery, reference in
! 38: C gmp.texi.
! 39: C
! 40: C The "and"s shown in the paper are done here with "cmov"s. "m" is written
! 41: C for m', and "d" for d_norm, which won't cause any confusion since it's
! 42: C only the normalized divisor that's of any use in the code. "b" is written
! 43: C for 2^N, the size of a limb, N being 32 here.
! 44: C
! 45: C mpn_divrem_1 avoids one division if the src high limb is less than the
! 46: C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
! 47: C circumstances that will be a very rare event.
! 48: C
! 49: C There's a small bias towards expecting xsize==0, by having code for
! 50: C xsize==0 in a straight line and xsize!=0 under forward jumps.
! 51:
! 52:
! 53: dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
! 54: dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
! 55: dnl
! 56: dnl The inverse takes about 50 cycles to calculate, but after that the
! 57: dnl multiply is 17 c/l versus division at 42 c/l.
! 58: dnl
! 59: dnl At 3 limbs the mul is a touch faster than div on the integer part, and
! 60: dnl even more so on the fractional part.
! 61:
! 62: deflit(MUL_THRESHOLD, 3)
! 63:
! 64:
! 65: defframe(PARAM_CARRY, 24)
! 66: defframe(PARAM_DIVISOR,20)
! 67: defframe(PARAM_SIZE, 16)
! 68: defframe(PARAM_SRC, 12)
! 69: defframe(PARAM_XSIZE, 8)
! 70: defframe(PARAM_DST, 4)
! 71:
! 72: defframe(SAVE_EBX, -4)
! 73: defframe(SAVE_ESI, -8)
! 74: defframe(SAVE_EDI, -12)
! 75: defframe(SAVE_EBP, -16)
! 76:
! 77: defframe(VAR_NORM, -20)
! 78: defframe(VAR_INVERSE, -24)
! 79: defframe(VAR_SRC, -28)
! 80: defframe(VAR_DST, -32)
! 81: defframe(VAR_DST_STOP,-36)
! 82:
! 83: deflit(STACK_SPACE, 36)
! 84:
! 85: .text
! 86: ALIGN(32)
! 87:
! 88: PROLOGUE(mpn_divrem_1c)
! 89: deflit(`FRAME',0)
! 90: movl PARAM_CARRY, %edx
! 91: movl PARAM_SIZE, %ecx
! 92: subl $STACK_SPACE, %esp
! 93: deflit(`FRAME',STACK_SPACE)
! 94:
! 95: movl %ebx, SAVE_EBX
! 96: movl PARAM_XSIZE, %ebx
! 97:
! 98: movl %edi, SAVE_EDI
! 99: movl PARAM_DST, %edi
! 100:
! 101: movl %ebp, SAVE_EBP
! 102: movl PARAM_DIVISOR, %ebp
! 103:
! 104: movl %esi, SAVE_ESI
! 105: movl PARAM_SRC, %esi
! 106:
! 107: leal -4(%edi,%ebx,4), %edi
! 108: jmp LF(mpn_divrem_1,start_1c)
! 109:
! 110: EPILOGUE()
! 111:
! 112:
! 113: C offset 0x31, close enough to aligned
! 114: PROLOGUE(mpn_divrem_1)
! 115: deflit(`FRAME',0)
! 116:
! 117: movl PARAM_SIZE, %ecx
! 118: movl $0, %edx C initial carry (if can't skip a div)
! 119: subl $STACK_SPACE, %esp
! 120: deflit(`FRAME',STACK_SPACE)
! 121:
! 122: movl %ebp, SAVE_EBP
! 123: movl PARAM_DIVISOR, %ebp
! 124:
! 125: movl %ebx, SAVE_EBX
! 126: movl PARAM_XSIZE, %ebx
! 127:
! 128: movl %esi, SAVE_ESI
! 129: movl PARAM_SRC, %esi
! 130: orl %ecx, %ecx
! 131:
! 132: movl %edi, SAVE_EDI
! 133: movl PARAM_DST, %edi
! 134: leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
! 135:
! 136: jz L(no_skip_div)
! 137: movl -4(%esi,%ecx,4), %eax C src high limb
! 138:
! 139: cmpl %ebp, %eax C one less div if high<divisor
! 140: jnb L(no_skip_div)
! 141:
! 142: movl $0, (%edi,%ecx,4) C dst high limb
! 143: decl %ecx C size-1
! 144: movl %eax, %edx C src high limb as initial carry
! 145: L(no_skip_div):
! 146:
! 147:
! 148: L(start_1c):
! 149: C eax
! 150: C ebx xsize
! 151: C ecx size
! 152: C edx carry
! 153: C esi src
! 154: C edi &dst[xsize-1]
! 155: C ebp divisor
! 156:
! 157: leal (%ebx,%ecx), %eax C size+xsize
! 158: cmpl $MUL_THRESHOLD, %eax
! 159: jae L(mul_by_inverse)
! 160:
! 161:
! 162: C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
! 163: C It'd be possible to write them out without the looping, but no speedup
! 164: C would be expected.
! 165: C
! 166: C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
! 167: C integer part, but curiously not on the fractional part, where %ebp is a
! 168: C (fixed) couple of cycles faster.
! 169:
! 170: orl %ecx, %ecx
! 171: jz L(divide_no_integer)
! 172:
! 173: L(divide_integer):
! 174: C eax scratch (quotient)
! 175: C ebx xsize
! 176: C ecx counter
! 177: C edx scratch (remainder)
! 178: C esi src
! 179: C edi &dst[xsize-1]
! 180: C ebp divisor
! 181:
! 182: movl -4(%esi,%ecx,4), %eax
! 183:
! 184: divl PARAM_DIVISOR
! 185:
! 186: movl %eax, (%edi,%ecx,4)
! 187: decl %ecx
! 188: jnz L(divide_integer)
! 189:
! 190:
! 191: L(divide_no_integer):
! 192: movl PARAM_DST, %edi
! 193: orl %ebx, %ebx
! 194: jnz L(divide_fraction)
! 195:
! 196: L(divide_done):
! 197: movl SAVE_ESI, %esi
! 198: movl SAVE_EDI, %edi
! 199: movl %edx, %eax
! 200:
! 201: movl SAVE_EBX, %ebx
! 202: movl SAVE_EBP, %ebp
! 203: addl $STACK_SPACE, %esp
! 204:
! 205: ret
! 206:
! 207:
! 208: L(divide_fraction):
! 209: C eax scratch (quotient)
! 210: C ebx counter
! 211: C ecx
! 212: C edx scratch (remainder)
! 213: C esi
! 214: C edi dst
! 215: C ebp divisor
! 216:
! 217: movl $0, %eax
! 218:
! 219: divl %ebp
! 220:
! 221: movl %eax, -4(%edi,%ebx,4)
! 222: decl %ebx
! 223: jnz L(divide_fraction)
! 224:
! 225: jmp L(divide_done)
! 226:
! 227:
! 228:
! 229: C -----------------------------------------------------------------------------
! 230:
! 231: L(mul_by_inverse):
! 232: C eax
! 233: C ebx xsize
! 234: C ecx size
! 235: C edx carry
! 236: C esi src
! 237: C edi &dst[xsize-1]
! 238: C ebp divisor
! 239:
! 240: bsrl %ebp, %eax C 31-l
! 241:
! 242: leal 12(%edi), %ebx
! 243: leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
! 244:
! 245: movl %edi, VAR_DST
! 246: movl %ebx, VAR_DST_STOP
! 247:
! 248: movl %ecx, %ebx C size
! 249: movl $31, %ecx
! 250:
! 251: movl %edx, %edi C carry
! 252: movl $-1, %edx
! 253:
! 254: C
! 255:
! 256: xorl %eax, %ecx C l
! 257: incl %eax C 32-l
! 258:
! 259: shll %cl, %ebp C d normalized
! 260: movl %ecx, VAR_NORM
! 261:
! 262: movd %eax, %mm7
! 263:
! 264: movl $-1, %eax
! 265: subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
! 266:
! 267: divl %ebp C floor (b*(b-d)-1) / d
! 268:
! 269: orl %ebx, %ebx C size
! 270: movl %eax, VAR_INVERSE
! 271: leal -12(%esi,%ebx,4), %eax C &src[size-3]
! 272:
! 273: jz L(start_zero)
! 274: movl %eax, VAR_SRC
! 275: cmpl $1, %ebx
! 276:
! 277: movl 8(%eax), %esi C src high limb
! 278: jz L(start_one)
! 279:
! 280: L(start_two_or_more):
! 281: movl 4(%eax), %edx C src second highest limb
! 282:
! 283: shldl( %cl, %esi, %edi) C n2 = carry,high << l
! 284:
! 285: shldl( %cl, %edx, %esi) C n10 = high,second << l
! 286:
! 287: cmpl $2, %ebx
! 288: je L(integer_two_left)
! 289: jmp L(integer_top)
! 290:
! 291:
! 292: L(start_one):
! 293: shldl( %cl, %esi, %edi) C n2 = carry,high << l
! 294:
! 295: shll %cl, %esi C n10 = high << l
! 296: movl %eax, VAR_SRC
! 297: jmp L(integer_one_left)
! 298:
! 299:
! 300: L(start_zero):
! 301: shll %cl, %edi C n2 = carry << l
! 302: movl $0, %esi C n10 = 0
! 303:
! 304: C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
! 305: C must have xsize!=0
! 306: jmp L(fraction_some)
! 307:
! 308:
! 309:
! 310: C -----------------------------------------------------------------------------
! 311: C
! 312: C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
! 313: C execution. The instruction scheduling is important, with various
! 314: C apparently equivalent forms running 1 to 5 cycles slower.
! 315: C
! 316: C A lower bound for the time would seem to be 16 cycles, based on the
! 317: C following successive dependencies.
! 318: C
! 319: C cycles
! 320: C n2+n1 1
! 321: C mul 6
! 322: C q1+1 1
! 323: C mul 6
! 324: C sub 1
! 325: C addback 1
! 326: C ---
! 327: C 16
! 328: C
! 329: C This chain is what the loop has already, but 16 cycles isn't achieved.
! 330: C K7 has enough decode, and probably enough execute (depending maybe on what
! 331: C a mul actually consumes), but nothing running under 17 has been found.
! 332: C
! 333: C In theory n2+n1 could be done in the sub and addback stages (by
! 334: C calculating both n2 and n2+n1 there), but lack of registers makes this an
! 335: C unlikely proposition.
! 336: C
! 337: C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
! 338: C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
! 339: C chain, and nothing better than 18 cycles has been found when using it.
! 340: C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
! 341: C be an extremely rare event.
! 342: C
! 343: C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
! 344: C if some special data is coming out with this always, the q1_ff special
! 345: C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
! 346: C induce the q1_ff case, for speed measurements or testing. Note that
! 347: C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
! 348: C
! 349: C The instruction groupings and empty comments show the cycles for a naive
! 350: C in-order view of the code (conveniently ignoring the load latency on
! 351: C VAR_INVERSE). This shows some of where the time is going, but is nonsense
! 352: C to the extent that out-of-order execution rearranges it. In this case
! 353: C there's 19 cycles shown, but it executes at 17.
! 354:
! 355: ALIGN(16)
! 356: L(integer_top):
! 357: C eax scratch
! 358: C ebx scratch (nadj, q1)
! 359: C ecx scratch (src, dst)
! 360: C edx scratch
! 361: C esi n10
! 362: C edi n2
! 363: C ebp divisor
! 364: C
! 365: C mm0 scratch (src qword)
! 366: C mm7 rshift for normalization
! 367:
! 368: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
! 369: movl %edi, %eax C n2
! 370: movl VAR_SRC, %ecx
! 371:
! 372: leal (%ebp,%esi), %ebx
! 373: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
! 374: sbbl $-1, %eax C n2+n1
! 375:
! 376: mull VAR_INVERSE C m*(n2+n1)
! 377:
! 378: movq (%ecx), %mm0 C next limb and the one below it
! 379: subl $4, %ecx
! 380:
! 381: movl %ecx, VAR_SRC
! 382:
! 383: C
! 384:
! 385: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 386: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 387: movl %ebp, %eax C d
! 388:
! 389: C
! 390:
! 391: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 392: jz L(q1_ff)
! 393: movl VAR_DST, %ecx
! 394:
! 395: mull %ebx C (q1+1)*d
! 396:
! 397: psrlq %mm7, %mm0
! 398:
! 399: leal -4(%ecx), %ecx
! 400:
! 401: C
! 402:
! 403: subl %eax, %esi
! 404: movl VAR_DST_STOP, %eax
! 405:
! 406: C
! 407:
! 408: sbbl %edx, %edi C n - (q1+1)*d
! 409: movl %esi, %edi C remainder -> n2
! 410: leal (%ebp,%esi), %edx
! 411:
! 412: movd %mm0, %esi
! 413:
! 414: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 415: sbbl $0, %ebx C q
! 416: cmpl %eax, %ecx
! 417:
! 418: movl %ebx, (%ecx)
! 419: movl %ecx, VAR_DST
! 420: jne L(integer_top)
! 421:
! 422:
! 423: L(integer_loop_done):
! 424:
! 425:
! 426: C -----------------------------------------------------------------------------
! 427: C
! 428: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
! 429: C q1_ff special case. This make the code a bit smaller and simpler, and
! 430: C costs only 1 cycle (each).
! 431:
! 432: L(integer_two_left):
! 433: C eax scratch
! 434: C ebx scratch (nadj, q1)
! 435: C ecx scratch (src, dst)
! 436: C edx scratch
! 437: C esi n10
! 438: C edi n2
! 439: C ebp divisor
! 440: C
! 441: C mm0 src limb, shifted
! 442: C mm7 rshift
! 443:
! 444: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
! 445: movl %edi, %eax C n2
! 446: movl PARAM_SRC, %ecx
! 447:
! 448: leal (%ebp,%esi), %ebx
! 449: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
! 450: sbbl $-1, %eax C n2+n1
! 451:
! 452: mull VAR_INVERSE C m*(n2+n1)
! 453:
! 454: movd (%ecx), %mm0 C src low limb
! 455:
! 456: movl VAR_DST_STOP, %ecx
! 457:
! 458: C
! 459:
! 460: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 461: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 462: movl %ebp, %eax C d
! 463:
! 464: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 465:
! 466: sbbl $0, %ebx
! 467:
! 468: mull %ebx C (q1+1)*d
! 469:
! 470: psllq $32, %mm0
! 471:
! 472: psrlq %mm7, %mm0
! 473:
! 474: C
! 475:
! 476: subl %eax, %esi
! 477:
! 478: C
! 479:
! 480: sbbl %edx, %edi C n - (q1+1)*d
! 481: movl %esi, %edi C remainder -> n2
! 482: leal (%ebp,%esi), %edx
! 483:
! 484: movd %mm0, %esi
! 485:
! 486: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 487: sbbl $0, %ebx C q
! 488:
! 489: movl %ebx, -4(%ecx)
! 490:
! 491:
! 492: C -----------------------------------------------------------------------------
! 493: L(integer_one_left):
! 494: C eax scratch
! 495: C ebx scratch (nadj, q1)
! 496: C ecx dst
! 497: C edx scratch
! 498: C esi n10
! 499: C edi n2
! 500: C ebp divisor
! 501: C
! 502: C mm0 src limb, shifted
! 503: C mm7 rshift
! 504:
! 505: movl VAR_DST_STOP, %ecx
! 506: cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
! 507: movl %edi, %eax C n2
! 508:
! 509: leal (%ebp,%esi), %ebx
! 510: cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
! 511: sbbl $-1, %eax C n2+n1
! 512:
! 513: mull VAR_INVERSE C m*(n2+n1)
! 514:
! 515: C
! 516:
! 517: C
! 518:
! 519: C
! 520:
! 521: addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
! 522: leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
! 523: movl %ebp, %eax C d
! 524:
! 525: C
! 526:
! 527: adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 528:
! 529: sbbl $0, %ebx C q1 if q1+1 overflowed
! 530:
! 531: mull %ebx
! 532:
! 533: C
! 534:
! 535: C
! 536:
! 537: C
! 538:
! 539: subl %eax, %esi
! 540:
! 541: C
! 542:
! 543: sbbl %edx, %edi C n - (q1+1)*d
! 544: movl %esi, %edi C remainder -> n2
! 545: leal (%ebp,%esi), %edx
! 546:
! 547: cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
! 548: sbbl $0, %ebx C q
! 549:
! 550: movl %ebx, -8(%ecx)
! 551: subl $8, %ecx
! 552:
! 553:
! 554:
! 555: L(integer_none):
! 556: cmpl $0, PARAM_XSIZE
! 557: jne L(fraction_some)
! 558:
! 559: movl %edi, %eax
! 560: L(fraction_done):
! 561: movl VAR_NORM, %ecx
! 562: movl SAVE_EBP, %ebp
! 563:
! 564: movl SAVE_EDI, %edi
! 565: movl SAVE_ESI, %esi
! 566:
! 567: movl SAVE_EBX, %ebx
! 568: addl $STACK_SPACE, %esp
! 569:
! 570: shrl %cl, %eax
! 571: emms
! 572:
! 573: ret
! 574:
! 575:
! 576: C -----------------------------------------------------------------------------
! 577: C
! 578: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 579: C of q*d is simply -d and the remainder n-q*d = n10+d
! 580:
! 581: L(q1_ff):
! 582: C eax (divisor)
! 583: C ebx (q1+1 == 0)
! 584: C ecx
! 585: C edx
! 586: C esi n10
! 587: C edi n2
! 588: C ebp divisor
! 589:
! 590: movl VAR_DST, %ecx
! 591: movl VAR_DST_STOP, %edx
! 592: subl $4, %ecx
! 593:
! 594: psrlq %mm7, %mm0
! 595: leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
! 596: movl %ecx, VAR_DST
! 597:
! 598: movd %mm0, %esi C next n10
! 599:
! 600: movl $-1, (%ecx)
! 601: cmpl %ecx, %edx
! 602: jne L(integer_top)
! 603:
! 604: jmp L(integer_loop_done)
! 605:
! 606:
! 607:
! 608: C -----------------------------------------------------------------------------
! 609: C
! 610: C Being the fractional part, the "source" limbs are all zero, meaning
! 611: C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
! 612: C
! 613: C The loop runs at 15 cycles. The dependent chain is the same as the
! 614: C general case above, but without the n2+n1 stage (due to n1==0), so 15
! 615: C would seem to be the lower bound.
! 616: C
! 617: C A not entirely obvious simplification is that q1+1 never overflows a limb,
! 618: C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
! 619: C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
! 620: C rnd() means rounding down to a multiple of d.
! 621: C
! 622: C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
! 623: C = m*d + b*d - m - b
! 624: C = floor((b(b-d)-1)/d)*d + b*d - m - b
! 625: C = rnd(b(b-d)-1) + b*d - m - b
! 626: C = rnd(b(b-d)-1 + b*d) - m - b
! 627: C = rnd(b*b-1) - m - b
! 628: C <= (b-2)*b
! 629: C
! 630: C Unchanged from the general case is that the final quotient limb q can be
! 631: C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
! 632: C equation 8.4 of the paper which simplifies as follows when n1==0 and
! 633: C n0==0.
! 634: C
! 635: C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
! 636: C
! 637: C As before, the instruction groupings and empty comments show a naive
! 638: C in-order view of the code, which is made a nonsense by out of order
! 639: C execution. There's 17 cycles shown, but it executes at 15.
! 640: C
! 641: C Rotating the store q and remainder->n2 instructions up to the top of the
! 642: C loop gets the run time down from 16 to 15.
! 643:
! 644: ALIGN(16)
! 645: L(fraction_some):
! 646: C eax
! 647: C ebx
! 648: C ecx
! 649: C edx
! 650: C esi
! 651: C edi carry
! 652: C ebp divisor
! 653:
! 654: movl PARAM_DST, %esi
! 655: movl VAR_DST_STOP, %ecx
! 656: movl %edi, %eax
! 657:
! 658: subl $8, %ecx
! 659:
! 660: jmp L(fraction_entry)
! 661:
! 662:
! 663: ALIGN(16)
! 664: L(fraction_top):
! 665: C eax n2 carry, then scratch
! 666: C ebx scratch (nadj, q1)
! 667: C ecx dst, decrementing
! 668: C edx scratch
! 669: C esi dst stop point
! 670: C edi (will be n2)
! 671: C ebp divisor
! 672:
! 673: movl %ebx, (%ecx) C previous q
! 674: movl %eax, %edi C remainder->n2
! 675:
! 676: L(fraction_entry):
! 677: mull VAR_INVERSE C m*n2
! 678:
! 679: movl %ebp, %eax C d
! 680: subl $4, %ecx C dst
! 681: leal 1(%edi), %ebx
! 682:
! 683: C
! 684:
! 685: C
! 686:
! 687: C
! 688:
! 689: C
! 690:
! 691: addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
! 692:
! 693: mull %ebx C (q1+1)*d
! 694:
! 695: C
! 696:
! 697: C
! 698:
! 699: C
! 700:
! 701: negl %eax C low of n - (q1+1)*d
! 702:
! 703: C
! 704:
! 705: sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
! 706: leal (%ebp,%eax), %edx
! 707:
! 708: cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
! 709: sbbl $0, %ebx C q
! 710: cmpl %esi, %ecx
! 711:
! 712: jne L(fraction_top)
! 713:
! 714:
! 715: movl %ebx, (%ecx)
! 716: jmp L(fraction_done)
! 717:
! 718: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>