Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mmx/divrem_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
! 2: dnl
! 3: dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
! 30: C mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor);
! 32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
! 33: C mp_srcptr src, mp_size_t size,
! 34: C mp_limb_t divisor, mp_limb_t carry);
! 35: C
! 36: C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
! 37: C see that file for some comments. It's likely what's here can be improved.
! 38:
! 39:
! 40: dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
! 41: dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
! 42: dnl
! 43: dnl The different speeds of the integer and fraction parts means that using
! 44: dnl xsize+size isn't quite right. The threshold wants to be a bit higher
! 45: dnl for the integer part and a bit lower for the fraction part. (Or what's
! 46: dnl really wanted is to speed up the integer part!)
! 47: dnl
! 48: dnl The threshold is set to make the integer part right. At 4 limbs the
! 49: dnl div and mul are about the same there, but on the fractional part the
! 50: dnl mul is much faster.
! 51:
!  52: deflit(MUL_THRESHOLD, 4)
!  53:
!  54:
!  55: defframe(PARAM_CARRY,  24) C mpn_divrem_1c only
!  56: defframe(PARAM_DIVISOR,20)
!  57: defframe(PARAM_SIZE,   16)
!  58: defframe(PARAM_SRC,    12)
!  59: defframe(PARAM_XSIZE,  8)
!  60: defframe(PARAM_DST,    4)
!  61:
!  62: defframe(SAVE_EBX,    -4) C callee-saved register save slots
!  63: defframe(SAVE_ESI,    -8)
!  64: defframe(SAVE_EDI,    -12)
!  65: defframe(SAVE_EBP,    -16)
!  66:
!  67: defframe(VAR_NORM,    -20) C normalization shift count l
!  68: defframe(VAR_INVERSE, -24) C multiply-by-inverse constant m
!  69: defframe(VAR_SRC,     -28)
!  70: defframe(VAR_DST,     -32)
!  71: defframe(VAR_DST_STOP,-36)
!  72:
!  73: deflit(STACK_SPACE, 36) C 4 reg saves + 5 locals, 4 bytes each
!  74:
!  75: 	.text
!  76: 	ALIGN(16)
!  77:
! 77:
!  78: PROLOGUE(mpn_divrem_1c)
!  79: deflit(`FRAME',0)
!  80: 	movl	PARAM_CARRY, %edx   C carry limb = initial remainder
!  81:
!  82: 	movl	PARAM_SIZE, %ecx
!  83: 	subl	$STACK_SPACE, %esp
!  84: deflit(`FRAME',STACK_SPACE)
!  85:
!  86: 	movl	%ebx, SAVE_EBX
!  87: 	movl	PARAM_XSIZE, %ebx
!  88:
!  89: 	movl	%edi, SAVE_EDI
!  90: 	movl	PARAM_DST, %edi
!  91:
!  92: 	movl	%ebp, SAVE_EBP
!  93: 	movl	PARAM_DIVISOR, %ebp
!  94:
!  95: 	movl	%esi, SAVE_ESI
!  96: 	movl	PARAM_SRC, %esi
!  97:
!  98: 	leal	-4(%edi,%ebx,4), %edi   C &dst[xsize-1]
!  99: 	jmp	LF(mpn_divrem_1,start_1c)   C join common code, regs set up as start_1c expects
! 100:
! 101: EPILOGUE()
! 102:
! 102:
! 103:
! 104: C offset 0x31, close enough to aligned
! 105: PROLOGUE(mpn_divrem_1)
! 106: deflit(`FRAME',0)
! 107:
! 108: 	movl	PARAM_SIZE, %ecx
! 109: 	movl	$0, %edx           C initial carry (if can't skip a div)
! 110: 	subl	$STACK_SPACE, %esp
! 111: deflit(`FRAME',STACK_SPACE)
! 112:
! 113: 	movl	%ebp, SAVE_EBP
! 114: 	movl	PARAM_DIVISOR, %ebp
! 115:
! 116: 	movl	%ebx, SAVE_EBX
! 117: 	movl	PARAM_XSIZE, %ebx
! 118:
! 119: 	movl	%esi, SAVE_ESI
! 120: 	movl	PARAM_SRC, %esi
! 121: 	orl	%ecx, %ecx         C size==0? (flags stay live across the movs/lea below)
! 122:
! 123: 	movl	%edi, SAVE_EDI
! 124: 	movl	PARAM_DST, %edi
! 125:
! 126: 	leal	-4(%edi,%ebx,4), %edi   C &dst[xsize-1]
! 127: 	jz	L(no_skip_div)
! 128:
! 129: 	movl	-4(%esi,%ecx,4), %eax   C src high limb
! 130: 	cmpl	%ebp, %eax              C one less div if high<divisor
! 131: 	jnb	L(no_skip_div)
! 132:
! 133: 	movl	$0, (%edi,%ecx,4)       C dst high limb
! 134: 	decl	%ecx                    C size-1
! 135: 	movl	%eax, %edx              C src high limb as initial carry
! 136: L(no_skip_div):
! 137:
! 138:
! 139: L(start_1c):
! 140: 	C eax
! 141: 	C ebx	xsize
! 142: 	C ecx	size
! 143: 	C edx	carry
! 144: 	C esi	src
! 145: 	C edi	&dst[xsize-1]
! 146: 	C ebp	divisor
! 147:
! 148: 	leal	(%ebx,%ecx), %eax   C size+xsize
! 149: 	cmpl	$MUL_THRESHOLD, %eax
! 150: 	jae	L(mul_by_inverse)
! 151:
! 152: 	orl	%ecx, %ecx          C size==0 means no integer part
! 153: 	jz	L(divide_no_integer)
! 154:
! 155: L(divide_integer):
! 156: 	C eax	scratch (quotient)
! 157: 	C ebx	xsize
! 158: 	C ecx	counter
! 159: 	C edx	scratch (remainder)
! 160: 	C esi	src
! 161: 	C edi	&dst[xsize-1]
! 162: 	C ebp	divisor
! 163:
! 164: 	movl	-4(%esi,%ecx,4), %eax
! 165:
! 166: 	divl	%ebp               C edx:eax / divisor, remainder stays in edx for next iteration
! 167:
! 168: 	movl	%eax, (%edi,%ecx,4)
! 169: 	decl	%ecx
! 170: 	jnz	L(divide_integer)
! 171:
! 172:
! 173: L(divide_no_integer):
! 174: 	movl	PARAM_DST, %edi
! 175: 	orl	%ebx, %ebx         C xsize==0 means no fraction part either
! 176: 	jnz	L(divide_fraction)
! 177:
! 178: L(divide_done):
! 179: 	movl	SAVE_ESI, %esi
! 180:
! 181: 	movl	SAVE_EDI, %edi
! 182:
! 183: 	movl	SAVE_EBX, %ebx
! 184: 	movl	%edx, %eax         C return the remainder
! 185:
! 186: 	movl	SAVE_EBP, %ebp
! 187: 	addl	$STACK_SPACE, %esp
! 188:
! 189: 	ret
! 190:
! 191:
! 192: L(divide_fraction):
! 193: 	C eax	scratch (quotient)
! 194: 	C ebx	counter
! 195: 	C ecx
! 196: 	C edx	scratch (remainder)
! 197: 	C esi
! 198: 	C edi	dst
! 199: 	C ebp	divisor
! 200:
! 201: 	movl	$0, %eax           C fraction limbs are zero below the radix point
! 202:
! 203: 	divl	%ebp
! 204:
! 205: 	movl	%eax, -4(%edi,%ebx,4)
! 206: 	decl	%ebx
! 207: 	jnz	L(divide_fraction)
! 208:
! 209: 	jmp	L(divide_done)
! 210:
! 211:
! 212:
! 213: C -----------------------------------------------------------------------------
! 214:
! 215: L(mul_by_inverse):
! 216: 	C eax
! 217: 	C ebx	xsize
! 218: 	C ecx	size
! 219: 	C edx	carry
! 220: 	C esi	src
! 221: 	C edi	&dst[xsize-1]
! 222: 	C ebp	divisor
! 223:
! 224: 	leal	12(%edi), %ebx     C &dst[xsize+2], integer-part loop stop
! 225:
! 226: 	movl	%ebx, VAR_DST_STOP
! 227: 	leal	4(%edi,%ecx,4), %edi C &dst[xsize+size]
! 228:
! 229: 	movl	%edi, VAR_DST
! 230: 	movl	%ecx, %ebx         C size
! 231:
! 232: 	bsrl	%ebp, %ecx         C 31-l
! 233: 	movl	%edx, %edi         C carry
! 234:
! 235: 	leal	1(%ecx), %eax      C 32-l
! 236: 	xorl	$31, %ecx          C l
! 237:
! 238: 	movl	%ecx, VAR_NORM
! 239: 	movl	$-1, %edx
! 240:
! 241: 	shll	%cl, %ebp          C d normalized
! 242: 	movd	%eax, %mm7         C rshift count kept in mm7 for psrlq in the loops
! 243:
! 244: 	movl	$-1, %eax
! 245: 	subl	%ebp, %edx         C (b-d)-1 giving edx:eax = b*(b-d)-1
! 246:
! 247: 	divl	%ebp               C floor (b*(b-d)-1) / d
! 248:
! 249: 	movl	%eax, VAR_INVERSE
! 250: 	orl	%ebx, %ebx         C size
! 251: 	leal	-12(%esi,%ebx,4), %eax C &src[size-3]
! 252:
! 253: 	movl	%eax, VAR_SRC
! 254: 	jz	L(start_zero)
! 255:
! 256: 	movl	8(%eax), %esi      C src high limb
! 257: 	cmpl	$1, %ebx
! 258: 	jz	L(start_one)
! 259:
! 260: L(start_two_or_more):
! 261: 	movl	4(%eax), %edx      C src second highest limb
! 262:
! 263: 	shldl(	%cl, %esi, %edi)   C n2 = carry,high << l
! 264:
! 265: 	shldl(	%cl, %edx, %esi)   C n10 = high,second << l
! 266:
! 267: 	cmpl	$2, %ebx
! 268: 	je	L(integer_two_left)
! 269: 	jmp	L(integer_top)
! 270:
! 271:
! 272: L(start_one):
! 273: 	shldl(	%cl, %esi, %edi)   C n2 = carry,high << l
! 274:
! 275: 	shll	%cl, %esi          C n10 = high << l
! 276: 	jmp	L(integer_one_left)
! 277:
! 278:
! 279: L(start_zero):
! 280: 	shll	%cl, %edi          C n2 = carry << l
! 281: 	movl	$0, %esi           C n10 = 0
! 282:
! 283: 	C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
! 284: 	C must have xsize!=0
! 285: 	jmp	L(fraction_some)
! 286:
! 287:
! 288:
! 289: C -----------------------------------------------------------------------------
! 290: C
! 291: C This loop runs at about 25 cycles, which is probably sub-optimal, and
! 292: C certainly more than the dependent chain would suggest.  A better loop, or
! 293: C a better rough analysis of what's possible, would be welcomed.
! 294: C
! 295: C In the current implementation, the following successively dependent
! 296: C micro-ops seem to exist.
! 297: C
! 298: C 		       uops
! 299: C 	n2+n1	1   (addl)
! 300: C 	mul	5
! 301: C 	q1+1	3   (addl/adcl)
! 302: C 	mul	5
! 303: C 	sub	3   (subl/sbbl)
! 304: C 	addback	2   (cmov)
! 305: C 		---
! 306: C 		19
! 307: C
! 308: C Lack of registers hinders explicit scheduling and it might be that the
! 309: C normal out of order execution isn't able to hide enough under the mul
! 310: C latencies.
! 311: C
! 312: C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
! 313: C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
! 314: C combination was tried for the addback (despite the fact it would lengthen
! 315: C the dependent chain) but found to be no faster.
! 316:
! 317:
! 318: 	ALIGN(16)
! 319: L(integer_top):
! 320: 	C eax	scratch
! 321: 	C ebx	scratch (nadj, q1)
! 322: 	C ecx	scratch (src, dst)
! 323: 	C edx	scratch
! 324: 	C esi	n10
! 325: 	C edi	n2
! 326: 	C ebp	d
! 327: 	C
! 328: 	C mm0	scratch (src qword)
! 329: 	C mm7	rshift for normalization
! 330:
! 331: 	movl	%esi, %eax
! 332: 	movl	%ebp, %ebx
! 333:
! 334: 	sarl	$31, %eax          C -n1
! 335: 	movl	VAR_SRC, %ecx
! 336:
! 337: 	andl	%eax, %ebx         C -n1 & d
! 338: 	negl	%eax               C n1
! 339:
! 340: 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
! 341: 	addl	%edi, %eax         C n2+n1
! 342: 	movq	(%ecx), %mm0       C next src limb and the one below it
! 343:
! 344: 	mull	VAR_INVERSE        C m*(n2+n1)
! 345:
! 346: 	subl	$4, %ecx           C src-- (one limb)
! 347:
! 348: 	movl	%ecx, VAR_SRC
! 349:
! 350: 	C
! 351:
! 352: 	C
! 353:
! 354: 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
! 355: 	movl	%ebp, %eax         C d
! 356: 	leal	1(%edi), %ebx      C n2<<32 + m*(n2+n1))
! 357:
! 358: 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 359: 	jz	L(q1_ff)
! 360:
! 361: 	mull	%ebx               C (q1+1)*d
! 362:
! 363: 	movl	VAR_DST, %ecx
! 364: 	psrlq	%mm7, %mm0         C shift next src limbs down for the coming n10
! 365:
! 366: 	C
! 367:
! 368: 	C
! 369:
! 370: 	C
! 371:
! 372: 	subl	%eax, %esi
! 373: 	movl	VAR_DST_STOP, %eax
! 374:
! 375: 	sbbl	%edx, %edi         C n - (q1+1)*d
! 376: 	movl	%esi, %edi         C remainder -> n2
! 377: 	leal	(%ebp,%esi), %edx
! 378:
! 379: 	cmovc(	%edx, %edi)        C n - q1*d if underflow from using q1+1
! 380: 	movd	%mm0, %esi
! 381:
! 382: 	sbbl	$0, %ebx           C q
! 383: 	subl	$4, %ecx           C dst-- (one limb)
! 384:
! 385: 	movl	%ebx, (%ecx)
! 386: 	cmpl	%eax, %ecx
! 387:
! 388: 	movl	%ecx, VAR_DST
! 389: 	jne	L(integer_top)
! 390:
! 391:
! 392: L(integer_loop_done):
! 393:
! 394:
! 395: C -----------------------------------------------------------------------------
! 396: C
! 397: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
! 398: C q1_ff special case.  This makes the code a bit smaller and simpler, and
! 399: C costs only 2 cycles (each).
! 400:
! 401: L(integer_two_left):
! 402: 	C eax	scratch
! 403: 	C ebx	scratch (nadj, q1)
! 404: 	C ecx	scratch (src, dst)
! 405: 	C edx	scratch
! 406: 	C esi	n10
! 407: 	C edi	n2
! 408: 	C ebp	divisor
! 409: 	C
! 410: 	C mm0	src limb, shifted
! 411: 	C mm7	rshift
! 412:
! 413:
! 414: 	movl	%esi, %eax
! 415: 	movl	%ebp, %ebx
! 416:
! 417: 	sarl	$31, %eax          C -n1
! 418: 	movl	PARAM_SRC, %ecx
! 419:
! 420: 	andl	%eax, %ebx         C -n1 & d
! 421: 	negl	%eax               C n1
! 422:
! 423: 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
! 424: 	addl	%edi, %eax         C n2+n1
! 425:
! 426: 	mull	VAR_INVERSE        C m*(n2+n1)
! 427:
! 428: 	movd	(%ecx), %mm0       C src low limb
! 429:
! 430: 	movl	VAR_DST_STOP, %ecx
! 431:
! 432: 	C
! 433:
! 434: 	C
! 435:
! 436: 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
! 437: 	leal	1(%edi), %ebx      C n2<<32 + m*(n2+n1))
! 438: 	movl	%ebp, %eax         C d
! 439:
! 440: 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 441:
! 442: 	sbbl	$0, %ebx           C q1 if q1+1 overflowed
! 443:
! 444: 	mull	%ebx               C (q1+1)*d
! 445:
! 446: 	psllq	$32, %mm0
! 447:
! 448: 	psrlq	%mm7, %mm0
! 449:
! 450: 	C
! 451:
! 452: 	C
! 453:
! 454: 	subl	%eax, %esi
! 455:
! 456: 	sbbl	%edx, %edi         C n - (q1+1)*d
! 457: 	movl	%esi, %edi         C remainder -> n2
! 458: 	leal	(%ebp,%esi), %edx
! 459:
! 460: 	cmovc(	%edx, %edi)        C n - q1*d if underflow from using q1+1
! 461: 	movd	%mm0, %esi
! 462:
! 463: 	sbbl	$0, %ebx           C q
! 464:
! 465: 	movl	%ebx, -4(%ecx)     C q at dst[xsize+1]
! 466:
! 467:
! 468: C -----------------------------------------------------------------------------
! 469: L(integer_one_left):
! 470: 	C eax	scratch
! 471: 	C ebx	scratch (nadj, q1)
! 472: 	C ecx	scratch (dst)
! 473: 	C edx	scratch
! 474: 	C esi	n10
! 475: 	C edi	n2
! 476: 	C ebp	divisor
! 477: 	C
! 478: 	C mm0	src limb, shifted
! 479: 	C mm7	rshift
! 480:
! 481:
! 482: 	movl	%esi, %eax
! 483: 	movl	%ebp, %ebx
! 484:
! 485: 	sarl	$31, %eax          C -n1
! 486: 	movl	VAR_DST_STOP, %ecx
! 487:
! 488: 	andl	%eax, %ebx         C -n1 & d
! 489: 	negl	%eax               C n1
! 490:
! 491: 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
! 492: 	addl	%edi, %eax         C n2+n1
! 493:
! 494: 	mull	VAR_INVERSE        C m*(n2+n1)
! 495:
! 496: 	C
! 497:
! 498: 	C
! 499:
! 500: 	C
! 501:
! 502: 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
! 503: 	leal	1(%edi), %ebx      C n2<<32 + m*(n2+n1))
! 504: 	movl	%ebp, %eax         C d
! 505:
! 506: 	C
! 507:
! 508: 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 509:
! 510: 	sbbl	$0, %ebx           C q1 if q1+1 overflowed
! 511:
! 512: 	mull	%ebx
! 513:
! 514: 	C
! 515:
! 516: 	C
! 517:
! 518: 	C
! 519:
! 520: 	C
! 521:
! 522: 	subl	%eax, %esi
! 523: 	movl	PARAM_XSIZE, %eax
! 524:
! 525: 	sbbl	%edx, %edi         C n - (q1+1)*d
! 526: 	movl	%esi, %edi         C remainder -> n2
! 527: 	leal	(%ebp,%esi), %edx
! 528:
! 529: 	cmovc(	%edx, %edi)        C n - q1*d if underflow from using q1+1
! 530:
! 531: 	sbbl	$0, %ebx           C q
! 532:
! 533: 	movl	%ebx, -8(%ecx)     C q at dst[xsize], last integer-part limb
! 534: 	subl	$8, %ecx
! 535:
! 536:
! 537:
! 538: 	orl	%eax, %eax         C xsize
! 539: 	jnz	L(fraction_some)
! 540:
! 541: 	movl	%edi, %eax
! 542: L(fraction_done):
! 543: 	movl	VAR_NORM, %ecx
! 544: 	movl	SAVE_EBP, %ebp
! 545:
! 546: 	movl	SAVE_EDI, %edi
! 547:
! 548: 	movl	SAVE_ESI, %esi
! 549:
! 550: 	movl	SAVE_EBX, %ebx
! 551: 	addl	$STACK_SPACE, %esp
! 552:
! 553: 	shrl	%cl, %eax          C denormalize remainder for return
! 554: 	emms                       C clear MMX state before returning to FPU-using code
! 555:
! 556: 	ret
! 557:
! 558:
! 559: C -----------------------------------------------------------------------------
! 560: C
! 561: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 562: C of q*d is simply -d and the remainder n-q*d = n10+d
! 563:
! 564: L(q1_ff):
! 565: 	C eax	(divisor)
! 566: 	C ebx	(q1+1 == 0)
! 567: 	C ecx
! 568: 	C edx
! 569: 	C esi	n10
! 570: 	C edi	n2
! 571: 	C ebp	divisor
! 572:
! 573: 	movl	VAR_DST, %ecx
! 574: 	movl	VAR_DST_STOP, %edx
! 575: 	subl	$4, %ecx           C dst-- (one limb)
! 576:
! 577: 	movl	%ecx, VAR_DST
! 578: 	psrlq	%mm7, %mm0
! 579: 	leal	(%ebp,%esi), %edi  C n-q*d remainder -> next n2
! 580:
! 581: 	movl	$-1, (%ecx)        C q = 0xFFFFFFFF
! 582: 	movd	%mm0, %esi         C next n10
! 583:
! 584: 	cmpl	%ecx, %edx
! 585: 	jne	L(integer_top)
! 586:
! 587: 	jmp	L(integer_loop_done)
! 588:
! 589:
! 590:
! 591: C -----------------------------------------------------------------------------
! 592: C
! 593: C In the current implementation, the following successively dependent
! 594: C micro-ops seem to exist.
! 595: C
! 596: C 		       uops
! 597: C 	mul	5
! 598: C 	q1+1	1   (addl)
! 599: C 	mul	5
! 600: C 	sub	3   (negl/sbbl)
! 601: C 	addback	2   (cmov)
! 602: C 		---
! 603: C 		16
! 604: C
! 605: C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
! 606: C the addback was found to be a touch slower.
! 607:
! 608:
! 609: 	ALIGN(16)
! 610: L(fraction_some):
! 611: 	C eax
! 612: 	C ebx
! 613: 	C ecx
! 614: 	C edx
! 615: 	C esi
! 616: 	C edi	carry
! 617: 	C ebp	divisor
! 618:
! 619: 	movl	PARAM_DST, %esi
! 620: 	movl	VAR_DST_STOP, %ecx
! 621: 	movl	%edi, %eax
! 622:
! 623: 	subl	$8, %ecx           C &dst[xsize]
! 624:
! 625:
! 626: 	ALIGN(16)
! 627: L(fraction_top):
! 628: 	C eax	n2, then scratch
! 629: 	C ebx	scratch (nadj, q1)
! 630: 	C ecx	dst, decrementing
! 631: 	C edx	scratch
! 632: 	C esi	dst stop point
! 633: 	C edi	n2
! 634: 	C ebp	divisor
! 635:
! 636: 	mull	VAR_INVERSE        C m*n2
! 637:
! 638: 	movl	%ebp, %eax         C d
! 639: 	subl	$4, %ecx           C dst
! 640: 	leal	1(%edi), %ebx
! 641:
! 642: 	C
! 643:
! 644: 	C
! 645:
! 646: 	C
! 647:
! 648: 	addl	%edx, %ebx         C 1 + high(n2<<32 + m*n2) = q1+1
! 649:
! 650: 	mull	%ebx               C (q1+1)*d
! 651:
! 652: 	C
! 653:
! 654: 	C
! 655:
! 656: 	C
! 657:
! 658: 	C
! 659:
! 660: 	negl	%eax               C low of n - (q1+1)*d
! 661:
! 662: 	sbbl	%edx, %edi         C high of n - (q1+1)*d, caring only about carry
! 663: 	leal	(%ebp,%eax), %edx
! 664:
! 665: 	cmovc(	%edx, %eax)        C n - q1*d if underflow from using q1+1
! 666:
! 667: 	sbbl	$0, %ebx           C q
! 668: 	movl	%eax, %edi         C remainder->n2
! 669: 	cmpl	%esi, %ecx
! 670:
! 671: 	movl	%ebx, (%ecx)       C previous q
! 672: 	jne	L(fraction_top)
! 673:
! 674:
! 675: 	jmp	L(fraction_done)
! 676:
! 677: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>