Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mmx/divrem_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
! 2: dnl
! 3: dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
! 30: C mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t divisor);
! 32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
! 33: C mp_srcptr src, mp_size_t size,
! 34: C mp_limb_t divisor, mp_limb_t carry);
! 35: C
! 36: C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
! 37: C see that file for some comments. It's likely what's here can be improved.
! 38:
! 39:
! 40: dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
! 41: dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
! 42: dnl
! 43: dnl The different speeds of the integer and fraction parts means that using
! 44: dnl xsize+size isn't quite right. The threshold wants to be a bit higher
! 45: dnl for the integer part and a bit lower for the fraction part. (Or what's
! 46: dnl really wanted is to speed up the integer part!)
! 47: dnl
! 48: dnl The threshold is set to make the integer part right. At 4 limbs the
! 49: dnl div and mul are about the same there, but on the fractional part the
! 50: dnl mul is much faster.
! 51:
!  52: deflit(MUL_THRESHOLD, 4)
!  53:
!  54:
!  55: defframe(PARAM_CARRY,  24) C mpn_divrem_1c only
!  56: defframe(PARAM_DIVISOR,20)
!  57: defframe(PARAM_SIZE,   16)
!  58: defframe(PARAM_SRC,    12)
!  59: defframe(PARAM_XSIZE,  8)
!  60: defframe(PARAM_DST,    4)
!  61:
!  62: defframe(SAVE_EBX,    -4) C callee-saved register save slots
!  63: defframe(SAVE_ESI,    -8)
!  64: defframe(SAVE_EDI,    -12)
!  65: defframe(SAVE_EBP,    -16)
!  66:
!  67: defframe(VAR_NORM,    -20) C normalization shift count l
!  68: defframe(VAR_INVERSE, -24) C multiply-by-inverse constant m
!  69: defframe(VAR_SRC,     -28)
!  70: defframe(VAR_DST,     -32)
!  71: defframe(VAR_DST_STOP,-36)
!  72:
!  73: deflit(STACK_SPACE, 36) C 4 reg saves + 5 locals, 4 bytes each
!  74:
!  75: 	.text
!  76: 	ALIGN(16)
!  77:
! 77:
!  78: PROLOGUE(mpn_divrem_1c)
!  79: deflit(`FRAME',0)
!  80: 	movl	PARAM_CARRY, %edx   C carry limb = initial remainder
!  81:
!  82: 	movl	PARAM_SIZE, %ecx
!  83: 	subl	$STACK_SPACE, %esp
!  84: deflit(`FRAME',STACK_SPACE)
!  85:
!  86: 	movl	%ebx, SAVE_EBX
!  87: 	movl	PARAM_XSIZE, %ebx
!  88:
!  89: 	movl	%edi, SAVE_EDI
!  90: 	movl	PARAM_DST, %edi
!  91:
!  92: 	movl	%ebp, SAVE_EBP
!  93: 	movl	PARAM_DIVISOR, %ebp
!  94:
!  95: 	movl	%esi, SAVE_ESI
!  96: 	movl	PARAM_SRC, %esi
!  97:
!  98: 	leal	-4(%edi,%ebx,4), %edi   C &dst[xsize-1]
!  99: 	jmp	LF(mpn_divrem_1,start_1c)   C join common code, regs set up as start_1c expects
! 100:
! 101: EPILOGUE()
! 102:
! 102:
! 103:
! 104: C offset 0x31, close enough to aligned
! 105: PROLOGUE(mpn_divrem_1)
! 106: deflit(`FRAME',0)
! 107:
! 108: 	movl	PARAM_SIZE, %ecx
! 109: 	movl	$0, %edx           C initial carry (if can't skip a div)
! 110: 	subl	$STACK_SPACE, %esp
! 111: deflit(`FRAME',STACK_SPACE)
! 112:
! 113: 	movl	%ebp, SAVE_EBP
! 114: 	movl	PARAM_DIVISOR, %ebp
! 115:
! 116: 	movl	%ebx, SAVE_EBX
! 117: 	movl	PARAM_XSIZE, %ebx
! 118:
! 119: 	movl	%esi, SAVE_ESI
! 120: 	movl	PARAM_SRC, %esi
! 121: 	orl	%ecx, %ecx         C size==0? (flags stay live across the movs/lea below)
! 122:
! 123: 	movl	%edi, SAVE_EDI
! 124: 	movl	PARAM_DST, %edi
! 125:
! 126: 	leal	-4(%edi,%ebx,4), %edi   C &dst[xsize-1]
! 127: 	jz	L(no_skip_div)
! 128:
! 129: 	movl	-4(%esi,%ecx,4), %eax   C src high limb
! 130: 	cmpl	%ebp, %eax              C one less div if high<divisor
! 131: 	jnb	L(no_skip_div)
! 132:
! 133: 	movl	$0, (%edi,%ecx,4)       C dst high limb
! 134: 	decl	%ecx                    C size-1
! 135: 	movl	%eax, %edx              C src high limb as initial carry
! 136: L(no_skip_div):
! 137:
! 138:
! 139: L(start_1c):
! 140: 	C eax
! 141: 	C ebx	xsize
! 142: 	C ecx	size
! 143: 	C edx	carry
! 144: 	C esi	src
! 145: 	C edi	&dst[xsize-1]
! 146: 	C ebp	divisor
! 147:
! 148: 	leal	(%ebx,%ecx), %eax   C size+xsize
! 149: 	cmpl	$MUL_THRESHOLD, %eax
! 150: 	jae	L(mul_by_inverse)
! 151:
! 152: 	orl	%ecx, %ecx          C size==0 means no integer part
! 153: 	jz	L(divide_no_integer)
! 154:
! 155: L(divide_integer):
! 156: 	C eax	scratch (quotient)
! 157: 	C ebx	xsize
! 158: 	C ecx	counter
! 159: 	C edx	scratch (remainder)
! 160: 	C esi	src
! 161: 	C edi	&dst[xsize-1]
! 162: 	C ebp	divisor
! 163:
! 164: 	movl	-4(%esi,%ecx,4), %eax
! 165:
! 166: 	divl	%ebp               C edx:eax / divisor, remainder stays in edx for next iteration
! 167:
! 168: 	movl	%eax, (%edi,%ecx,4)
! 169: 	decl	%ecx
! 170: 	jnz	L(divide_integer)
! 171:
! 172:
! 173: L(divide_no_integer):
! 174: 	movl	PARAM_DST, %edi
! 175: 	orl	%ebx, %ebx         C xsize==0 means no fraction part either
! 176: 	jnz	L(divide_fraction)
! 177:
! 178: L(divide_done):
! 179: 	movl	SAVE_ESI, %esi
! 180:
! 181: 	movl	SAVE_EDI, %edi
! 182:
! 183: 	movl	SAVE_EBX, %ebx
! 184: 	movl	%edx, %eax         C return the remainder
! 185:
! 186: 	movl	SAVE_EBP, %ebp
! 187: 	addl	$STACK_SPACE, %esp
! 188:
! 189: 	ret
! 190:
! 191:
! 192: L(divide_fraction):
! 193: 	C eax	scratch (quotient)
! 194: 	C ebx	counter
! 195: 	C ecx
! 196: 	C edx	scratch (remainder)
! 197: 	C esi
! 198: 	C edi	dst
! 199: 	C ebp	divisor
! 200:
! 201: 	movl	$0, %eax           C fraction limbs are zero below the radix point
! 202:
! 203: 	divl	%ebp
! 204:
! 205: 	movl	%eax, -4(%edi,%ebx,4)
! 206: 	decl	%ebx
! 207: 	jnz	L(divide_fraction)
! 208:
! 209: 	jmp	L(divide_done)
! 210:
! 211:
! 212:
! 213: C -----------------------------------------------------------------------------
! 214:
! 215: L(mul_by_inverse):
! 216: 	C eax
! 217: 	C ebx	xsize
! 218: 	C ecx	size
! 219: 	C edx	carry
! 220: 	C esi	src
! 221: 	C edi	&dst[xsize-1]
! 222: 	C ebp	divisor
! 223:
! 224: 	leal	12(%edi), %ebx     C &dst[xsize+2], integer-part loop stop
! 225:
! 226: 	movl	%ebx, VAR_DST_STOP
! 227: 	leal	4(%edi,%ecx,4), %edi C &dst[xsize+size]
! 228:
! 229: 	movl	%edi, VAR_DST
! 230: 	movl	%ecx, %ebx         C size
! 231:
! 232: 	bsrl	%ebp, %ecx         C 31-l
! 233: 	movl	%edx, %edi         C carry
! 234:
! 235: 	leal	1(%ecx), %eax      C 32-l
! 236: 	xorl	$31, %ecx          C l
! 237:
! 238: 	movl	%ecx, VAR_NORM
! 239: 	movl	$-1, %edx
! 240:
! 241: 	shll	%cl, %ebp          C d normalized
! 242: 	movd	%eax, %mm7         C rshift count kept in mm7 for psrlq in the loops
! 243:
! 244: 	movl	$-1, %eax
! 245: 	subl	%ebp, %edx         C (b-d)-1 giving edx:eax = b*(b-d)-1
! 246:
! 247: 	divl	%ebp               C floor (b*(b-d)-1) / d
! 248:
! 249: 	movl	%eax, VAR_INVERSE
! 250: 	orl	%ebx, %ebx         C size
! 251: 	leal	-12(%esi,%ebx,4), %eax C &src[size-3]
! 252:
! 253: 	movl	%eax, VAR_SRC
! 254: 	jz	L(start_zero)
! 255:
! 256: 	movl	8(%eax), %esi      C src high limb
! 257: 	cmpl	$1, %ebx
! 258: 	jz	L(start_one)
! 259:
! 260: L(start_two_or_more):
! 261: 	movl	4(%eax), %edx      C src second highest limb
! 262:
! 263: 	shldl(	%cl, %esi, %edi)   C n2 = carry,high << l
! 264:
! 265: 	shldl(	%cl, %edx, %esi)   C n10 = high,second << l
! 266:
! 267: 	cmpl	$2, %ebx
! 268: 	je	L(integer_two_left)
! 269: 	jmp	L(integer_top)
! 270:
! 271:
! 272: L(start_one):
! 273: 	shldl(	%cl, %esi, %edi)   C n2 = carry,high << l
! 274:
! 275: 	shll	%cl, %esi          C n10 = high << l
! 276: 	jmp	L(integer_one_left)
! 277:
! 278:
! 279: L(start_zero):
! 280: 	shll	%cl, %edi          C n2 = carry << l
! 281: 	movl	$0, %esi           C n10 = 0
! 282:
! 283: 	C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
! 284: 	C must have xsize!=0
! 285: 	jmp	L(fraction_some)
! 286:
! 287:
! 288:
! 289: C -----------------------------------------------------------------------------
! 290: C
! 291: C This loop runs at about 25 cycles, which is probably sub-optimal, and
! 292: C certainly more than the dependent chain would suggest.  A better loop, or
! 293: C a better rough analysis of what's possible, would be welcomed.
! 294: C
! 295: C In the current implementation, the following successively dependent
! 296: C micro-ops seem to exist.
! 297: C
! 298: C 		       uops
! 299: C 	n2+n1	1   (addl)
! 300: C 	mul	5
! 301: C 	q1+1	3   (addl/adcl)
! 302: C 	mul	5
! 303: C 	sub	3   (subl/sbbl)
! 304: C 	addback	2   (cmov)
! 305: C 		---
! 306: C 		19
! 307: C
! 308: C Lack of registers hinders explicit scheduling and it might be that the
! 309: C normal out of order execution isn't able to hide enough under the mul
! 310: C latencies.
! 311: C
! 312: C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
! 313: C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
! 314: C combination was tried for the addback (despite the fact it would lengthen
! 315: C the dependent chain) but found to be no faster.
! 316:
! 317:
! 318: 	ALIGN(16)
! 319: L(integer_top):
! 320: 	C eax	scratch
! 321: 	C ebx	scratch (nadj, q1)
! 322: 	C ecx	scratch (src, dst)
! 323: 	C edx	scratch
! 324: 	C esi	n10
! 325: 	C edi	n2
! 326: 	C ebp	d
! 327: 	C
! 328: 	C mm0	scratch (src qword)
! 329: 	C mm7	rshift for normalization
! 330:
! 331: 	movl	%esi, %eax
! 332: 	movl	%ebp, %ebx
! 333:
! 334: 	sarl	$31, %eax          C -n1
! 335: 	movl	VAR_SRC, %ecx
! 336:
! 337: 	andl	%eax, %ebx         C -n1 & d
! 338: 	negl	%eax               C n1
! 339:
! 340: 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
! 341: 	addl	%edi, %eax         C n2+n1
! 342: 	movq	(%ecx), %mm0       C next src limb and the one below it
! 343:
! 344: 	mull	VAR_INVERSE        C m*(n2+n1)
! 345:
! 346: 	subl	$4, %ecx           C src-- (one limb)
! 347:
! 348: 	movl	%ecx, VAR_SRC
! 349:
! 350: 	C
! 351:
! 352: 	C
! 353:
! 354: 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
! 355: 	movl	%ebp, %eax         C d
! 356: 	leal	1(%edi), %ebx      C n2<<32 + m*(n2+n1))
! 357:
! 358: 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 359: 	jz	L(q1_ff)
! 360:
! 361: 	mull	%ebx               C (q1+1)*d
! 362:
! 363: 	movl	VAR_DST, %ecx
! 364: 	psrlq	%mm7, %mm0         C shift next src limbs down for the coming n10
! 365:
! 366: 	C
! 367:
! 368: 	C
! 369:
! 370: 	C
! 371:
! 372: 	subl	%eax, %esi
! 373: 	movl	VAR_DST_STOP, %eax
! 374:
! 375: 	sbbl	%edx, %edi         C n - (q1+1)*d
! 376: 	movl	%esi, %edi         C remainder -> n2
! 377: 	leal	(%ebp,%esi), %edx
! 378:
! 379: 	cmovc(	%edx, %edi)        C n - q1*d if underflow from using q1+1
! 380: 	movd	%mm0, %esi
! 381:
! 382: 	sbbl	$0, %ebx           C q
! 383: 	subl	$4, %ecx           C dst-- (one limb)
! 384:
! 385: 	movl	%ebx, (%ecx)
! 386: 	cmpl	%eax, %ecx
! 387:
! 388: 	movl	%ecx, VAR_DST
! 389: 	jne	L(integer_top)
! 390:
! 391:
! 392: L(integer_loop_done):
! 393:
! 394:
! 395: C -----------------------------------------------------------------------------
! 396: C
! 397: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
! 398: C q1_ff special case.  This makes the code a bit smaller and simpler, and
! 399: C costs only 2 cycles (each).
! 400:
! 401: L(integer_two_left):
! 402: 	C eax	scratch
! 403: 	C ebx	scratch (nadj, q1)
! 404: 	C ecx	scratch (src, dst)
! 405: 	C edx	scratch
! 406: 	C esi	n10
! 407: 	C edi	n2
! 408: 	C ebp	divisor
! 409: 	C
! 410: 	C mm0	src limb, shifted
! 411: 	C mm7	rshift
! 412:
! 413:
! 414: 	movl	%esi, %eax
! 415: 	movl	%ebp, %ebx
! 416:
! 417: 	sarl	$31, %eax          C -n1
! 418: 	movl	PARAM_SRC, %ecx
! 419:
! 420: 	andl	%eax, %ebx         C -n1 & d
! 421: 	negl	%eax               C n1
! 422:
! 423: 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
! 424: 	addl	%edi, %eax         C n2+n1
! 425:
! 426: 	mull	VAR_INVERSE        C m*(n2+n1)
! 427:
! 428: 	movd	(%ecx), %mm0       C src low limb
! 429:
! 430: 	movl	VAR_DST_STOP, %ecx
! 431:
! 432: 	C
! 433:
! 434: 	C
! 435:
! 436: 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
! 437: 	leal	1(%edi), %ebx      C n2<<32 + m*(n2+n1))
! 438: 	movl	%ebp, %eax         C d
! 439:
! 440: 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 441:
! 442: 	sbbl	$0, %ebx           C q1 if q1+1 overflowed
! 443:
! 444: 	mull	%ebx               C (q1+1)*d
! 445:
! 446: 	psllq	$32, %mm0
! 447:
! 448: 	psrlq	%mm7, %mm0
! 449:
! 450: 	C
! 451:
! 452: 	C
! 453:
! 454: 	subl	%eax, %esi
! 455:
! 456: 	sbbl	%edx, %edi         C n - (q1+1)*d
! 457: 	movl	%esi, %edi         C remainder -> n2
! 458: 	leal	(%ebp,%esi), %edx
! 459:
! 460: 	cmovc(	%edx, %edi)        C n - q1*d if underflow from using q1+1
! 461: 	movd	%mm0, %esi
! 462:
! 463: 	sbbl	$0, %ebx           C q
! 464:
! 465: 	movl	%ebx, -4(%ecx)     C q at dst[xsize+1]
! 466:
! 467:
! 468: C -----------------------------------------------------------------------------
! 469: L(integer_one_left):
! 470: 	C eax	scratch
! 471: 	C ebx	scratch (nadj, q1)
! 472: 	C ecx	scratch (dst)
! 473: 	C edx	scratch
! 474: 	C esi	n10
! 475: 	C edi	n2
! 476: 	C ebp	divisor
! 477: 	C
! 478: 	C mm0	src limb, shifted
! 479: 	C mm7	rshift
! 480:
! 481:
! 482: 	movl	%esi, %eax
! 483: 	movl	%ebp, %ebx
! 484:
! 485: 	sarl	$31, %eax          C -n1
! 486: 	movl	VAR_DST_STOP, %ecx
! 487:
! 488: 	andl	%eax, %ebx         C -n1 & d
! 489: 	negl	%eax               C n1
! 490:
! 491: 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
! 492: 	addl	%edi, %eax         C n2+n1
! 493:
! 494: 	mull	VAR_INVERSE        C m*(n2+n1)
! 495:
! 496: 	C
! 497:
! 498: 	C
! 499:
! 500: 	C
! 501:
! 502: 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
! 503: 	leal	1(%edi), %ebx      C n2<<32 + m*(n2+n1))
! 504: 	movl	%ebp, %eax         C d
! 505:
! 506: 	C
! 507:
! 508: 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
! 509:
! 510: 	sbbl	$0, %ebx           C q1 if q1+1 overflowed
! 511:
! 512: 	mull	%ebx
! 513:
! 514: 	C
! 515:
! 516: 	C
! 517:
! 518: 	C
! 519:
! 520: 	C
! 521:
! 522: 	subl	%eax, %esi
! 523: 	movl	PARAM_XSIZE, %eax
! 524:
! 525: 	sbbl	%edx, %edi         C n - (q1+1)*d
! 526: 	movl	%esi, %edi         C remainder -> n2
! 527: 	leal	(%ebp,%esi), %edx
! 528:
! 529: 	cmovc(	%edx, %edi)        C n - q1*d if underflow from using q1+1
! 530:
! 531: 	sbbl	$0, %ebx           C q
! 532:
! 533: 	movl	%ebx, -8(%ecx)     C q at dst[xsize], last integer-part limb
! 534: 	subl	$8, %ecx
! 535:
! 536:
! 537:
! 538: 	orl	%eax, %eax         C xsize
! 539: 	jnz	L(fraction_some)
! 540:
! 541: 	movl	%edi, %eax
! 542: L(fraction_done):
! 543: 	movl	VAR_NORM, %ecx
! 544: 	movl	SAVE_EBP, %ebp
! 545:
! 546: 	movl	SAVE_EDI, %edi
! 547:
! 548: 	movl	SAVE_ESI, %esi
! 549:
! 550: 	movl	SAVE_EBX, %ebx
! 551: 	addl	$STACK_SPACE, %esp
! 552:
! 553: 	shrl	%cl, %eax          C denormalize remainder for return
! 554: 	emms                       C clear MMX state before returning to FPU-using code
! 555:
! 556: 	ret
! 557:
! 558:
! 559: C -----------------------------------------------------------------------------
! 560: C
! 561: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
! 562: C of q*d is simply -d and the remainder n-q*d = n10+d
! 563:
! 564: L(q1_ff):
! 565: 	C eax	(divisor)
! 566: 	C ebx	(q1+1 == 0)
! 567: 	C ecx
! 568: 	C edx
! 569: 	C esi	n10
! 570: 	C edi	n2
! 571: 	C ebp	divisor
! 572:
! 573: 	movl	VAR_DST, %ecx
! 574: 	movl	VAR_DST_STOP, %edx
! 575: 	subl	$4, %ecx           C dst-- (one limb)
! 576:
! 577: 	movl	%ecx, VAR_DST
! 578: 	psrlq	%mm7, %mm0
! 579: 	leal	(%ebp,%esi), %edi  C n-q*d remainder -> next n2
! 580:
! 581: 	movl	$-1, (%ecx)        C q = 0xFFFFFFFF
! 582: 	movd	%mm0, %esi         C next n10
! 583:
! 584: 	cmpl	%ecx, %edx
! 585: 	jne	L(integer_top)
! 586:
! 587: 	jmp	L(integer_loop_done)
! 588:
! 589:
! 590:
! 591: C -----------------------------------------------------------------------------
! 592: C
! 593: C In the current implementation, the following successively dependent
! 594: C micro-ops seem to exist.
! 595: C
! 596: C 		       uops
! 597: C 	mul	5
! 598: C 	q1+1	1   (addl)
! 599: C 	mul	5
! 600: C 	sub	3   (negl/sbbl)
! 601: C 	addback	2   (cmov)
! 602: C 		---
! 603: C 		16
! 604: C
! 605: C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
! 606: C the addback was found to be a touch slower.
! 607:
! 608:
! 609: 	ALIGN(16)
! 610: L(fraction_some):
! 611: 	C eax
! 612: 	C ebx
! 613: 	C ecx
! 614: 	C edx
! 615: 	C esi
! 616: 	C edi	carry
! 617: 	C ebp	divisor
! 618:
! 619: 	movl	PARAM_DST, %esi
! 620: 	movl	VAR_DST_STOP, %ecx
! 621: 	movl	%edi, %eax
! 622:
! 623: 	subl	$8, %ecx           C &dst[xsize]
! 624:
! 625:
! 626: 	ALIGN(16)
! 627: L(fraction_top):
! 628: 	C eax	n2, then scratch
! 629: 	C ebx	scratch (nadj, q1)
! 630: 	C ecx	dst, decrementing
! 631: 	C edx	scratch
! 632: 	C esi	dst stop point
! 633: 	C edi	n2
! 634: 	C ebp	divisor
! 635:
! 636: 	mull	VAR_INVERSE        C m*n2
! 637:
! 638: 	movl	%ebp, %eax         C d
! 639: 	subl	$4, %ecx           C dst
! 640: 	leal	1(%edi), %ebx
! 641:
! 642: 	C
! 643:
! 644: 	C
! 645:
! 646: 	C
! 647:
! 648: 	addl	%edx, %ebx         C 1 + high(n2<<32 + m*n2) = q1+1
! 649:
! 650: 	mull	%ebx               C (q1+1)*d
! 651:
! 652: 	C
! 653:
! 654: 	C
! 655:
! 656: 	C
! 657:
! 658: 	C
! 659:
! 660: 	negl	%eax               C low of n - (q1+1)*d
! 661:
! 662: 	sbbl	%edx, %edi         C high of n - (q1+1)*d, caring only about carry
! 663: 	leal	(%ebp,%eax), %edx
! 664:
! 665: 	cmovc(	%edx, %eax)        C n - q1*d if underflow from using q1+1
! 666:
! 667: 	sbbl	$0, %ebx           C q
! 668: 	movl	%eax, %edi         C remainder->n2
! 669: 	cmpl	%esi, %ecx
! 670:
! 671: 	movl	%ebx, (%ecx)       C previous q
! 672: 	jne	L(fraction_top)
! 673:
! 674:
! 675: 	jmp	L(fraction_done)
! 676:
! 677: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>