
dnl  OpenXM_contrib/gmp/mpn/x86/p6/mmx/divrem_1.asm, Revision 1.1.1.1

dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
dnl
dnl  P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.


include(`../config.m4')


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C
C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments.  It's likely what's here can be improved.
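C
C Not part of the original source: the following usage sketch is added for
C illustration only, assuming the 32-bit limbs this code targets (B = 2^32).
C The array names a[] and q[] are hypothetical.
C
C     #include <gmp.h>
C
C     mp_limb_t a[2] = { 5, 7 };             /* dividend, low limb first  */
C     mp_limb_t q[3];                        /* size+xsize = 2+1 limbs    */
C     mp_limb_t r = mpn_divrem_1 (q, 1, a, 2, 7);
C
C This leaves the integer quotient in q[1],q[2], one "fraction" limb in
C q[0], and returns the remainder r, so that
C {a,2} * B^1 == {q,3} * 7 + r, with 0 <= r < 7.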


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts mean that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same for the integer part, but on the
dnl  fraction part the mul is much faster.

deflit(MUL_THRESHOLD, 4)
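C Not in the original source: in C terms, the size test at L(start_1c) below
C dispatches roughly as
C
C     if (size + xsize < MUL_THRESHOLD)
C       ... plain divl loops (divide_integer / divide_fraction) ...
C     else
C       ... multiply by inverse (mul_by_inverse) ...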


defframe(PARAM_CARRY,  24)
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

        .text
        ALIGN(16)

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
        movl    PARAM_CARRY, %edx

        movl    PARAM_SIZE, %ecx
        subl    $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

        movl    %ebx, SAVE_EBX
        movl    PARAM_XSIZE, %ebx

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        leal    -4(%edi,%ebx,4), %edi
        jmp     LF(mpn_divrem_1,start_1c)

EPILOGUE()


        C offset 0x31, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx
        movl    $0, %edx                C initial carry (if can't skip a div)
        subl    $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %ebx, SAVE_EBX
        movl    PARAM_XSIZE, %ebx

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi
        orl     %ecx, %ecx

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
        jz      L(no_skip_div)

        movl    -4(%esi,%ecx,4), %eax   C src high limb
        cmpl    %ebp, %eax              C one less div if high<divisor
        jnb     L(no_skip_div)

        movl    $0, (%edi,%ecx,4)       C dst high limb
        decl    %ecx                    C size-1
        movl    %eax, %edx              C src high limb as initial carry
L(no_skip_div):


L(start_1c):
        C eax
        C ebx   xsize
        C ecx   size
        C edx   carry
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        leal    (%ebx,%ecx), %eax       C size+xsize
        cmpl    $MUL_THRESHOLD, %eax
        jae     L(mul_by_inverse)

        orl     %ecx, %ecx
        jz      L(divide_no_integer)

L(divide_integer):
        C eax   scratch (quotient)
        C ebx   xsize
        C ecx   counter
        C edx   scratch (remainder)
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        movl    -4(%esi,%ecx,4), %eax

        divl    %ebp

        movl    %eax, (%edi,%ecx,4)
        decl    %ecx
        jnz     L(divide_integer)


L(divide_no_integer):
        movl    PARAM_DST, %edi
        orl     %ebx, %ebx
        jnz     L(divide_fraction)

L(divide_done):
        movl    SAVE_ESI, %esi

        movl    SAVE_EDI, %edi

        movl    SAVE_EBX, %ebx
        movl    %edx, %eax

        movl    SAVE_EBP, %ebp
        addl    $STACK_SPACE, %esp

        ret


L(divide_fraction):
        C eax   scratch (quotient)
        C ebx   counter
        C ecx
        C edx   scratch (remainder)
        C esi
        C edi   dst
        C ebp   divisor

        movl    $0, %eax

        divl    %ebp

        movl    %eax, -4(%edi,%ebx,4)
        decl    %ebx
        jnz     L(divide_fraction)

        jmp     L(divide_done)
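C Not in the original source: a rough C equivalent of the two divl loops
C above, assuming 32-bit limbs (B = 2^32).  "carry" is the initial edx value
C (0, the caller's carry for mpn_divrem_1c, or the skipped high src limb).
C
C     uint32_t rem = carry;
C     for (mp_size_t i = size; i > 0; i--)       /* integer part */
C       {
C         uint64_t n = ((uint64_t) rem << 32) | src[i-1];
C         dst[xsize + i - 1] = (uint32_t) (n / divisor);
C         rem = (uint32_t) (n % divisor);
C       }
C     for (mp_size_t i = xsize; i > 0; i--)      /* fraction part */
C       {
C         uint64_t n = (uint64_t) rem << 32;     /* dividend limbs are 0 */
C         dst[i-1] = (uint32_t) (n / divisor);
C         rem = (uint32_t) (n % divisor);
C       }
C     return rem;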



C -----------------------------------------------------------------------------

L(mul_by_inverse):
        C eax
        C ebx   xsize
        C ecx   size
        C edx   carry
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        leal    12(%edi), %ebx

        movl    %ebx, VAR_DST_STOP
        leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]

        movl    %edi, VAR_DST
        movl    %ecx, %ebx              C size

        bsrl    %ebp, %ecx              C 31-l
        movl    %edx, %edi              C carry

        leal    1(%ecx), %eax           C 32-l
        xorl    $31, %ecx               C l

        movl    %ecx, VAR_NORM
        movl    $-1, %edx

        shll    %cl, %ebp               C d normalized
        movd    %eax, %mm7

        movl    $-1, %eax
        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1

        divl    %ebp                    C floor (b*(b-d)-1) / d

        movl    %eax, VAR_INVERSE
        orl     %ebx, %ebx              C size
        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]

        movl    %eax, VAR_SRC
        jz      L(start_zero)

        movl    8(%eax), %esi           C src high limb
        cmpl    $1, %ebx
        jz      L(start_one)

L(start_two_or_more):
        movl    4(%eax), %edx           C src second highest limb

        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l

        shldl(  %cl, %edx, %esi)        C n10 = high,second << l

        cmpl    $2, %ebx
        je      L(integer_two_left)
        jmp     L(integer_top)


L(start_one):
        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l

        shll    %cl, %esi               C n10 = high << l
        jmp     L(integer_one_left)


L(start_zero):
        shll    %cl, %edi               C n2 = carry << l
        movl    $0, %esi                C n10 = 0

        C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 we
        C must have xsize!=0
        jmp     L(fraction_some)
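C Not in the original source: a rough C sketch of the setup above, assuming
C 32-bit limbs (B = 2^32).  "l" becomes VAR_NORM, "m" becomes VAR_INVERSE,
C and mm7 holds the complementary shift 32-l used to extract source limbs.
C
C     int l = 0;
C     while (! (d & 0x80000000u))        /* count leading zeros, VAR_NORM */
C       { d <<= 1; l++; }                /* d is now normalized           */
C     uint32_t m = (uint32_t) ((((uint64_t) (0u - d) << 32) - 1) / d);
C                                        /* m = floor ((B*(B-d) - 1) / d) */
C
C The carry and the source limbs are shifted left by l as well (the shldl/
C shll above), so the loops below always divide by the normalized d, and the
C final remainder is shifted back right by l at L(fraction_done).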



C -----------------------------------------------------------------------------
C
C This loop runs at about 25 cycles, which is probably sub-optimal, and
C certainly more than the dependent chain would suggest.  A better loop, or
C a better rough analysis of what's possible, would be welcomed.
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C                     uops
C              n2+n1   1   (addl)
C              mul     5
C              q1+1    3   (addl/adcl)
C              mul     5
C              sub     3   (subl/sbbl)
C              addback 2   (cmov)
C                     ---
C                     19
C
C Lack of registers hinders explicit scheduling and it might be that the
C normal out of order execution isn't able to hide enough under the mul
C latencies.
C
C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.
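C
C Not in the original source: a rough C transcription of one pass of the
C loop below, assuming 32-bit limbs (B = 2^32).  d is the normalized
C divisor, m is VAR_INVERSE, and n = n2*B + n10 is the current two-limb
C chunk with n2 < d.  The estimate q1 is either the true quotient q or q-1,
C which is why a single cmovc addback suffices.
C
C     uint32_t n1   = n10 >> 31;                   /* top bit of n10       */
C     uint32_t nadj = n10 + (n1 ? d : 0);          /* mod B, as in the asm */
C     uint64_t t    = (uint64_t) m * (n2 + n1) + nadj;
C     uint32_t q1   = n2 + (uint32_t) (t >> 32);
C
C     uint64_t n    = ((uint64_t) n2 << 32) | n10;
C     uint64_t r    = n - ((uint64_t) q1 + 1) * d; /* try q = q1+1         */
C     uint32_t q    = q1 + 1;                      /* 0 in the q1_ff case  */
C     if (r > n)                                   /* borrow: q1+1 too big */
C       { r += d; q--; }
C     *--dst = q;
C     n2 = (uint32_t) r;                           /* n10 refills from the */
C                                                  /* next shifted src limb */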


        ALIGN(16)
L(integer_top):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (src, dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   d
        C
        C mm0   scratch (src qword)
        C mm7   rshift for normalization

        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax          C -n1
        movl    VAR_SRC, %ecx

        andl    %eax, %ebx         C -n1 & d
        negl    %eax               C n1

        addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax         C n2+n1
        movq    (%ecx), %mm0       C next src limb and the one below it

        mull    VAR_INVERSE        C m*(n2+n1)

        subl    $4, %ecx

        movl    %ecx, VAR_SRC

        C

        C

        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
        movl    %ebp, %eax         C d
        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1)

        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        jz      L(q1_ff)

        mull    %ebx               C (q1+1)*d

        movl    VAR_DST, %ecx
        psrlq   %mm7, %mm0

        C

        C

        C

        subl    %eax, %esi
        movl    VAR_DST_STOP, %eax

        sbbl    %edx, %edi         C n - (q1+1)*d
        movl    %esi, %edi         C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
        movd    %mm0, %esi

        sbbl    $0, %ebx           C q
        subl    $4, %ecx

        movl    %ebx, (%ecx)
        cmpl    %eax, %ecx

        movl    %ecx, VAR_DST
        jne     L(integer_top)


L(integer_loop_done):

C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 2 cycles (each).
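C
C Not in the original source: in C terms the adcl / sbbl $0 pairs below
C amount to
C
C     uint32_t q1p1 = q1 + 1;      /* wraps to 0 only when q1 = 0xffffffff */
C     if (q1p1 == 0)
C       q1p1 = 0xffffffff;         /* clamp back to q1, the sbbl $0 effect */
C
C so these final iterations need no separate q1_ff branch.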

L(integer_two_left):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (src, dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor
        C
        C mm0   src limb, shifted
        C mm7   rshift


        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax          C -n1
        movl    PARAM_SRC, %ecx

        andl    %eax, %ebx         C -n1 & d
        negl    %eax               C n1

        addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax         C n2+n1

        mull    VAR_INVERSE        C m*(n2+n1)

        movd    (%ecx), %mm0       C src low limb

        movl    VAR_DST_STOP, %ecx

        C

        C

        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1)
        movl    %ebp, %eax         C d

        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbbl    $0, %ebx

        mull    %ebx               C (q1+1)*d

        psllq   $32, %mm0

        psrlq   %mm7, %mm0

        C

        C

        subl    %eax, %esi

        sbbl    %edx, %edi         C n - (q1+1)*d
        movl    %esi, %edi         C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
        movd    %mm0, %esi

        sbbl    $0, %ebx           C q

        movl    %ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor
        C
        C mm0   src limb, shifted
        C mm7   rshift


        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax          C -n1
        movl    VAR_DST_STOP, %ecx

        andl    %eax, %ebx         C -n1 & d
        negl    %eax               C n1

        addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax         C n2+n1

        mull    VAR_INVERSE        C m*(n2+n1)

        C

        C

        C

        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1)
        movl    %ebp, %eax         C d

        C

        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbbl    $0, %ebx           C q1 if q1+1 overflowed

        mull    %ebx

        C

        C

        C

        C

        subl    %eax, %esi
        movl    PARAM_XSIZE, %eax

        sbbl    %edx, %edi         C n - (q1+1)*d
        movl    %esi, %edi         C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1

        sbbl    $0, %ebx           C q

        movl    %ebx, -8(%ecx)
        subl    $8, %ecx



        orl     %eax, %eax         C xsize
        jnz     L(fraction_some)

        movl    %edi, %eax
L(fraction_done):
        movl    VAR_NORM, %ecx
        movl    SAVE_EBP, %ebp

        movl    SAVE_EDI, %edi

        movl    SAVE_ESI, %esi

        movl    SAVE_EBX, %ebx
        addl    $STACK_SPACE, %esp

        shrl    %cl, %eax
        emms

        ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d
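C
C (Not in the original source: with B = 2^32 and q = B-1, q*d = d*B - d, so
C n - q*d = (n2 - d)*B + (n10 + d).  The true remainder lies in [0,d), so it
C is just n10 + d reduced mod B, which the leal below computes.)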

L(q1_ff):
        C eax   (divisor)
        C ebx   (q1+1 == 0)
        C ecx
        C edx
        C esi   n10
        C edi   n2
        C ebp   divisor

        movl    VAR_DST, %ecx
        movl    VAR_DST_STOP, %edx
        subl    $4, %ecx

        movl    %ecx, VAR_DST
        psrlq   %mm7, %mm0
        leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2

        movl    $-1, (%ecx)
        movd    %mm0, %esi              C next n10

        cmpl    %ecx, %edx
        jne     L(integer_top)

        jmp     L(integer_loop_done)



C -----------------------------------------------------------------------------
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C                     uops
C              mul     5
C              q1+1    1   (addl)
C              mul     5
C              sub     3   (negl/sbbl)
C              addback 2   (cmov)
C                     ---
C                     16
C
C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
C the addback was found to be a touch slower.
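C
C Not in the original source: the same step in rough C for the fraction
C part, assuming 32-bit limbs.  Here the incoming low limbs are all zero
C (n10 = 0), so the nadj term disappears, and q1+1 cannot wrap because
C q <= B-2 whenever n2 < d.
C
C     uint32_t q1 = n2 + (uint32_t) (((uint64_t) m * n2) >> 32);
C     uint64_t n  = (uint64_t) n2 << 32;
C     uint64_t r  = n - (uint64_t) (q1 + 1) * d;   /* try q = q1+1     */
C     uint32_t q  = q1 + 1;
C     if (r > n)                                   /* borrow: overshot */
C       { r += d; q--; }
C     *--dst = q;
C     n2 = (uint32_t) r;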


        ALIGN(16)
L(fraction_some):
        C eax
        C ebx
        C ecx
        C edx
        C esi
        C edi   carry
        C ebp   divisor

        movl    PARAM_DST, %esi
        movl    VAR_DST_STOP, %ecx
        movl    %edi, %eax

        subl    $8, %ecx


        ALIGN(16)
L(fraction_top):
        C eax   n2, then scratch
        C ebx   scratch (nadj, q1)
        C ecx   dst, decrementing
        C edx   scratch
        C esi   dst stop point
        C edi   n2
        C ebp   divisor

        mull    VAR_INVERSE     C m*n2

        movl    %ebp, %eax      C d
        subl    $4, %ecx        C dst
        leal    1(%edi), %ebx

        C

        C

        C

        addl    %edx, %ebx      C 1 + high(n2<<32 + m*n2) = q1+1

        mull    %ebx            C (q1+1)*d

        C

        C

        C

        C

        negl    %eax            C low of n - (q1+1)*d

        sbbl    %edx, %edi      C high of n - (q1+1)*d, caring only about carry
        leal    (%ebp,%eax), %edx

        cmovc(  %edx, %eax)     C n - q1*d if underflow from using q1+1

        sbbl    $0, %ebx        C q
        movl    %eax, %edi      C remainder->n2
        cmpl    %esi, %ecx

        movl    %ebx, (%ecx)    C previous q
        jne     L(fraction_top)


        jmp     L(fraction_done)

EPILOGUE()
