[BACK]Return to divrem_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/divrem_1.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K7 mpn_divrem_1 -- mpn by limb division.
                      2: dnl
                      3: dnl  K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
                      4:
                      5:
                      6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      7: dnl
                      8: dnl  This file is part of the GNU MP Library.
                      9: dnl
                     10: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     11: dnl  modify it under the terms of the GNU Lesser General Public License as
                     12: dnl  published by the Free Software Foundation; either version 2.1 of the
                     13: dnl  License, or (at your option) any later version.
                     14: dnl
                     15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     18: dnl  Lesser General Public License for more details.
                     19: dnl
                     20: dnl  You should have received a copy of the GNU Lesser General Public
                     21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     23: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     24:
                     25:
                     26: include(`../config.m4')
                     27:
                     28:
                     29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
                     30: C                         mp_srcptr src, mp_size_t size,
                     31: C                         mp_limb_t divisor);
                     32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
                     33: C                          mp_srcptr src, mp_size_t size,
                     34: C                          mp_limb_t divisor, mp_limb_t carry);
                     35: C
                     36: C The method and nomenclature follow part 8 of "Division by Invariant
                     37: C Integers using Multiplication" by Granlund and Montgomery, reference in
                     38: C gmp.texi.
                     39: C
                     40: C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
                     41: C for m', and "d" for d_norm, which won't cause any confusion since it's
                     42: C only the normalized divisor that's of any use in the code.  "b" is written
                     43: C for 2^N, the size of a limb, N being 32 here.
                     44: C
                     45: C mpn_divrem_1 avoids one division if the src high limb is less than the
                     46: C divisor.  mpn_divrem_1c doesn't check for a zero carry, since in normal
                     47: C circumstances that will be a very rare event.
                     48: C
                     49: C There's a small bias towards expecting xsize==0, by having code for
                     50: C xsize==0 in a straight line and xsize!=0 under forward jumps.
                     51:
                     52:
                     53: dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
                     54: dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
                     55: dnl
                     56: dnl  The inverse takes about 50 cycles to calculate, but after that the
                     57: dnl  multiply is 17 c/l versus division at 42 c/l.
                     58: dnl
                     59: dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
                     60: dnl  even more so on the fractional part.
                     61:
                     62: deflit(MUL_THRESHOLD, 3)
                     63:
                     64:
                      65: defframe(PARAM_CARRY,  24)   C incoming stack args; defframe offsets are
                      66: defframe(PARAM_DIVISOR,20)   C relative to %esp at entry, adjusted by the
                      67: defframe(PARAM_SIZE,   16)   C current FRAME value when referenced
                      68: defframe(PARAM_SRC,    12)
                      69: defframe(PARAM_XSIZE,  8)
                      70: defframe(PARAM_DST,    4)
                      71:
                          C slots for the callee-saved registers, below the return address
                      72: defframe(SAVE_EBX,    -4)
                      73: defframe(SAVE_ESI,    -8)
                      74: defframe(SAVE_EDI,    -12)
                      75: defframe(SAVE_EBP,    -16)
                      76:
                          C locals used by the multiply-by-inverse code
                      77: defframe(VAR_NORM,    -20)
                      78: defframe(VAR_INVERSE, -24)
                      79: defframe(VAR_SRC,     -28)
                      80: defframe(VAR_DST,     -32)
                      81: defframe(VAR_DST_STOP,-36)
                      82:
                          C total bytes subtracted from %esp, covering all SAVE_* and VAR_* slots
                      83: deflit(STACK_SPACE, 36)
                     84:
                     85:        .text
                     86:        ALIGN(32)
                     87:
                      88: PROLOGUE(mpn_divrem_1c)
                      89: deflit(`FRAME',0)
                          C Entry for the carry variant: the carry parameter becomes the
                          C initial remainder in %edx, then control joins the common code
                          C at start_1c inside mpn_divrem_1 (register roles listed there).
                      90:        movl    PARAM_CARRY, %edx       C initial remainder
                      91:        movl    PARAM_SIZE, %ecx
                      92:        subl    $STACK_SPACE, %esp      C room for SAVE_* and VAR_* slots
                      93: deflit(`FRAME',STACK_SPACE)
                      94:
                      95:        movl    %ebx, SAVE_EBX          C preserve callee-saved registers,
                      96:        movl    PARAM_XSIZE, %ebx       C interleaved with parameter loads
                      97:
                      98:        movl    %edi, SAVE_EDI
                      99:        movl    PARAM_DST, %edi
                     100:
                     101:        movl    %ebp, SAVE_EBP
                     102:        movl    PARAM_DIVISOR, %ebp
                     103:
                     104:        movl    %esi, SAVE_ESI
                     105:        movl    PARAM_SRC, %esi
                     106:
                     107:        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
                     108:        jmp     LF(mpn_divrem_1,start_1c)
                     109:
                     110: EPILOGUE()
                    111:
                    112:
                    113:        C offset 0x31, close enough to aligned
                    114: PROLOGUE(mpn_divrem_1)
                    115: deflit(`FRAME',0)
                    116:
                    117:        movl    PARAM_SIZE, %ecx
                    118:        movl    $0, %edx                C initial carry (if can't skip a div)
                    119:        subl    $STACK_SPACE, %esp
                    120: deflit(`FRAME',STACK_SPACE)
                    121:
                    122:        movl    %ebp, SAVE_EBP
                    123:        movl    PARAM_DIVISOR, %ebp
                    124:
                    125:        movl    %ebx, SAVE_EBX
                    126:        movl    PARAM_XSIZE, %ebx
                    127:
                    128:        movl    %esi, SAVE_ESI
                    129:        movl    PARAM_SRC, %esi
                    130:        orl     %ecx, %ecx
                    131:
                    132:        movl    %edi, SAVE_EDI
                    133:        movl    PARAM_DST, %edi
                    134:        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
                    135:
                    136:        jz      L(no_skip_div)
                    137:        movl    -4(%esi,%ecx,4), %eax   C src high limb
                    138:
                    139:        cmpl    %ebp, %eax              C one less div if high<divisor
                    140:        jnb     L(no_skip_div)
                    141:
                    142:        movl    $0, (%edi,%ecx,4)       C dst high limb
                    143:        decl    %ecx                    C size-1
                    144:        movl    %eax, %edx              C src high limb as initial carry
                    145: L(no_skip_div):
                    146:
                    147:
                    148: L(start_1c):
                    149:        C eax
                    150:        C ebx   xsize
                    151:        C ecx   size
                    152:        C edx   carry
                    153:        C esi   src
                    154:        C edi   &dst[xsize-1]
                    155:        C ebp   divisor
                    156:
                    157:        leal    (%ebx,%ecx), %eax       C size+xsize
                    158:        cmpl    $MUL_THRESHOLD, %eax
                    159:        jae     L(mul_by_inverse)
                    160:
                    161:
                    162: C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
                    163: C It'd be possible to write them out without the looping, but no speedup
                    164: C would be expected.
                    165: C
                    166: C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
                    167: C integer part, but curiously not on the fractional part, where %ebp is a
                    168: C (fixed) couple of cycles faster.
                    169:
                    170:        orl     %ecx, %ecx
                    171:        jz      L(divide_no_integer)
                    172:
                    173: L(divide_integer):
                    174:        C eax   scratch (quotient)
                    175:        C ebx   xsize
                    176:        C ecx   counter
                    177:        C edx   scratch (remainder)
                    178:        C esi   src
                    179:        C edi   &dst[xsize-1]
                    180:        C ebp   divisor
                    181:
                    182:        movl    -4(%esi,%ecx,4), %eax
                    183:
                    184:        divl    PARAM_DIVISOR
                    185:
                    186:        movl    %eax, (%edi,%ecx,4)
                    187:        decl    %ecx
                    188:        jnz     L(divide_integer)
                    189:
                    190:
                    191: L(divide_no_integer):
                    192:        movl    PARAM_DST, %edi
                    193:        orl     %ebx, %ebx
                    194:        jnz     L(divide_fraction)
                    195:
                    196: L(divide_done):
                    197:        movl    SAVE_ESI, %esi
                    198:        movl    SAVE_EDI, %edi
                    199:        movl    %edx, %eax
                    200:
                    201:        movl    SAVE_EBX, %ebx
                    202:        movl    SAVE_EBP, %ebp
                    203:        addl    $STACK_SPACE, %esp
                    204:
                    205:        ret
                    206:
                    207:
                    208: L(divide_fraction):
                    209:        C eax   scratch (quotient)
                    210:        C ebx   counter
                    211:        C ecx
                    212:        C edx   scratch (remainder)
                    213:        C esi
                    214:        C edi   dst
                    215:        C ebp   divisor
                    216:
                    217:        movl    $0, %eax
                    218:
                    219:        divl    %ebp
                    220:
                    221:        movl    %eax, -4(%edi,%ebx,4)
                    222:        decl    %ebx
                    223:        jnz     L(divide_fraction)
                    224:
                    225:        jmp     L(divide_done)
                    226:
                    227:
                    228:
                    229: C -----------------------------------------------------------------------------
                    230:
                    231: L(mul_by_inverse):
                    232:        C eax
                    233:        C ebx   xsize
                    234:        C ecx   size
                    235:        C edx   carry
                    236:        C esi   src
                    237:        C edi   &dst[xsize-1]
                    238:        C ebp   divisor
                    239:
                    240:        bsrl    %ebp, %eax              C 31-l
                    241:
                    242:        leal    12(%edi), %ebx
                    243:        leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]
                    244:
                    245:        movl    %edi, VAR_DST
                    246:        movl    %ebx, VAR_DST_STOP
                    247:
                    248:        movl    %ecx, %ebx              C size
                    249:        movl    $31, %ecx
                    250:
                    251:        movl    %edx, %edi              C carry
                    252:        movl    $-1, %edx
                    253:
                    254:        C
                    255:
                    256:        xorl    %eax, %ecx              C l
                    257:        incl    %eax                    C 32-l
                    258:
                    259:        shll    %cl, %ebp               C d normalized
                    260:        movl    %ecx, VAR_NORM
                    261:
                    262:        movd    %eax, %mm7
                    263:
                    264:        movl    $-1, %eax
                    265:        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1
                    266:
                    267:        divl    %ebp                    C floor (b*(b-d)-1) / d
                    268:
                    269:        orl     %ebx, %ebx              C size
                    270:        movl    %eax, VAR_INVERSE
                    271:        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
                    272:
                    273:        jz      L(start_zero)
                    274:        movl    %eax, VAR_SRC
                    275:        cmpl    $1, %ebx
                    276:
                    277:        movl    8(%eax), %esi           C src high limb
                    278:        jz      L(start_one)
                    279:
                    280: L(start_two_or_more):
                    281:        movl    4(%eax), %edx           C src second highest limb
                    282:
                    283:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
                    284:
                    285:        shldl(  %cl, %edx, %esi)        C n10 = high,second << l
                    286:
                    287:        cmpl    $2, %ebx
                    288:        je      L(integer_two_left)
                    289:        jmp     L(integer_top)
                    290:
                    291:
                    292: L(start_one):
                    293:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
                    294:
                    295:        shll    %cl, %esi               C n10 = high << l
                    296:        movl    %eax, VAR_SRC
                    297:        jmp     L(integer_one_left)
                    298:
                    299:
                    300: L(start_zero):
                    301:        shll    %cl, %edi               C n2 = carry << l
                    302:        movl    $0, %esi                C n10 = 0
                    303:
                    304:        C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
                    305:        C must have xsize!=0
                    306:        jmp     L(fraction_some)
                    307:
                    308:
                    309:
                    310: C -----------------------------------------------------------------------------
                    311: C
                    312: C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
                    313: C execution.  The instruction scheduling is important, with various
                    314: C apparently equivalent forms running 1 to 5 cycles slower.
                    315: C
                    316: C A lower bound for the time would seem to be 16 cycles, based on the
                    317: C following successive dependencies.
                    318: C
                    319: C                    cycles
                    320: C              n2+n1   1
                    321: C              mul     6
                    322: C              q1+1    1
                    323: C              mul     6
                    324: C              sub     1
                    325: C              addback 1
                    326: C                     ---
                    327: C                     16
                    328: C
                    329: C This chain is what the loop has already, but 16 cycles isn't achieved.
                    330: C K7 has enough decode, and probably enough execute (depending maybe on what
                    331: C a mul actually consumes), but nothing running under 17 has been found.
                    332: C
                    333: C In theory n2+n1 could be done in the sub and addback stages (by
                    334: C calculating both n2 and n2+n1 there), but lack of registers makes this an
                    335: C unlikely proposition.
                    336: C
                    337: C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
                    338: C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
                    339: C chain, and nothing better than 18 cycles has been found when using it.
                    340: C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
                    341: C be an extremely rare event.
                    342: C
                     343: C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
                    344: C if some special data is coming out with this always, the q1_ff special
                    345: C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
                    346: C induce the q1_ff case, for speed measurements or testing.  Note that
                    347: C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
                    348: C
                    349: C The instruction groupings and empty comments show the cycles for a naive
                    350: C in-order view of the code (conveniently ignoring the load latency on
                    351: C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
                    352: C to the extent that out-of-order execution rearranges it.  In this case
                    353: C there's 19 cycles shown, but it executes at 17.
                    354:
                    355:        ALIGN(16)
                    356: L(integer_top):
                    357:        C eax   scratch
                    358:        C ebx   scratch (nadj, q1)
                    359:        C ecx   scratch (src, dst)
                    360:        C edx   scratch
                    361:        C esi   n10
                    362:        C edi   n2
                    363:        C ebp   divisor
                    364:        C
                    365:        C mm0   scratch (src qword)
                    366:        C mm7   rshift for normalization
                    367:
                    368:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
                    369:        movl    %edi, %eax         C n2
                    370:        movl    VAR_SRC, %ecx
                    371:
                    372:        leal    (%ebp,%esi), %ebx
                    373:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
                    374:        sbbl    $-1, %eax          C n2+n1
                    375:
                    376:        mull    VAR_INVERSE        C m*(n2+n1)
                    377:
                    378:        movq    (%ecx), %mm0       C next limb and the one below it
                    379:        subl    $4, %ecx
                    380:
                    381:        movl    %ecx, VAR_SRC
                    382:
                    383:        C
                    384:
                    385:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
                    386:        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
                    387:        movl    %ebp, %eax         C d
                    388:
                    389:        C
                    390:
                    391:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
                    392:        jz      L(q1_ff)
                    393:        movl    VAR_DST, %ecx
                    394:
                    395:        mull    %ebx               C (q1+1)*d
                    396:
                    397:        psrlq   %mm7, %mm0
                    398:
                    399:        leal    -4(%ecx), %ecx
                    400:
                    401:        C
                    402:
                    403:        subl    %eax, %esi
                    404:        movl    VAR_DST_STOP, %eax
                    405:
                    406:        C
                    407:
                    408:        sbbl    %edx, %edi         C n - (q1+1)*d
                    409:        movl    %esi, %edi         C remainder -> n2
                    410:        leal    (%ebp,%esi), %edx
                    411:
                    412:        movd    %mm0, %esi
                    413:
                    414:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
                    415:        sbbl    $0, %ebx           C q
                    416:        cmpl    %eax, %ecx
                    417:
                    418:        movl    %ebx, (%ecx)
                    419:        movl    %ecx, VAR_DST
                    420:        jne     L(integer_top)
                    421:
                    422:
                    423: L(integer_loop_done):
                    424:
                    425:
                    426: C -----------------------------------------------------------------------------
                    427: C
                    428: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
                    429: C q1_ff special case.  This make the code a bit smaller and simpler, and
                    430: C costs only 1 cycle (each).
                    431:
                    432: L(integer_two_left):
                    433:        C eax   scratch
                    434:        C ebx   scratch (nadj, q1)
                    435:        C ecx   scratch (src, dst)
                    436:        C edx   scratch
                    437:        C esi   n10
                    438:        C edi   n2
                    439:        C ebp   divisor
                    440:        C
                    441:        C mm0   src limb, shifted
                    442:        C mm7   rshift
                    443:
                    444:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
                    445:        movl    %edi, %eax         C n2
                    446:        movl    PARAM_SRC, %ecx
                    447:
                    448:        leal    (%ebp,%esi), %ebx
                    449:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
                    450:        sbbl    $-1, %eax          C n2+n1
                    451:
                    452:        mull    VAR_INVERSE        C m*(n2+n1)
                    453:
                    454:        movd    (%ecx), %mm0       C src low limb
                    455:
                    456:        movl    VAR_DST_STOP, %ecx
                    457:
                    458:        C
                    459:
                    460:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
                    461:        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
                    462:        movl    %ebp, %eax         C d
                    463:
                    464:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
                    465:
                    466:        sbbl    $0, %ebx
                    467:
                    468:        mull    %ebx               C (q1+1)*d
                    469:
                    470:        psllq   $32, %mm0
                    471:
                    472:        psrlq   %mm7, %mm0
                    473:
                    474:        C
                    475:
                    476:        subl    %eax, %esi
                    477:
                    478:        C
                    479:
                    480:        sbbl    %edx, %edi         C n - (q1+1)*d
                    481:        movl    %esi, %edi         C remainder -> n2
                    482:        leal    (%ebp,%esi), %edx
                    483:
                    484:        movd    %mm0, %esi
                    485:
                    486:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
                    487:        sbbl    $0, %ebx           C q
                    488:
                    489:        movl    %ebx, -4(%ecx)
                    490:
                    491:
                    492: C -----------------------------------------------------------------------------
                    493: L(integer_one_left):
                    494:        C eax   scratch
                    495:        C ebx   scratch (nadj, q1)
                    496:        C ecx   dst
                    497:        C edx   scratch
                    498:        C esi   n10
                    499:        C edi   n2
                    500:        C ebp   divisor
                    501:        C
                    502:        C mm0   src limb, shifted
                    503:        C mm7   rshift
                    504:
                    505:        movl    VAR_DST_STOP, %ecx
                    506:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
                    507:        movl    %edi, %eax         C n2
                    508:
                    509:        leal    (%ebp,%esi), %ebx
                    510:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
                    511:        sbbl    $-1, %eax          C n2+n1
                    512:
                    513:        mull    VAR_INVERSE        C m*(n2+n1)
                    514:
                    515:        C
                    516:
                    517:        C
                    518:
                    519:        C
                    520:
                    521:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
                    522:        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
                    523:        movl    %ebp, %eax         C d
                    524:
                    525:        C
                    526:
                    527:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
                    528:
                    529:        sbbl    $0, %ebx           C q1 if q1+1 overflowed
                    530:
                    531:        mull    %ebx
                    532:
                    533:        C
                    534:
                    535:        C
                    536:
                    537:        C
                    538:
                    539:        subl    %eax, %esi
                    540:
                    541:        C
                    542:
                    543:        sbbl    %edx, %edi         C n - (q1+1)*d
                    544:        movl    %esi, %edi         C remainder -> n2
                    545:        leal    (%ebp,%esi), %edx
                    546:
                    547:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
                    548:        sbbl    $0, %ebx           C q
                    549:
                    550:        movl    %ebx, -8(%ecx)
                    551:        subl    $8, %ecx
                    552:
                    553:
                    554:
                    555: L(integer_none):
                    556:        cmpl    $0, PARAM_XSIZE
                    557:        jne     L(fraction_some)
                    558:
                    559:        movl    %edi, %eax
                    560: L(fraction_done):
                    561:        movl    VAR_NORM, %ecx
                    562:        movl    SAVE_EBP, %ebp
                    563:
                    564:        movl    SAVE_EDI, %edi
                    565:        movl    SAVE_ESI, %esi
                    566:
                    567:        movl    SAVE_EBX, %ebx
                    568:        addl    $STACK_SPACE, %esp
                    569:
                    570:        shrl    %cl, %eax
                    571:        emms
                    572:
                    573:        ret
                    574:
                    575:
                    576: C -----------------------------------------------------------------------------
                    577: C
                    578: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
                    579: C of q*d is simply -d and the remainder n-q*d = n10+d
                    580:
                    581: L(q1_ff):
                    582:        C eax   (divisor)
                    583:        C ebx   (q1+1 == 0)
                    584:        C ecx
                    585:        C edx
                    586:        C esi   n10
                    587:        C edi   n2
                    588:        C ebp   divisor
                    589:
                    590:        movl    VAR_DST, %ecx
                    591:        movl    VAR_DST_STOP, %edx
                    592:        subl    $4, %ecx
                    593:
                    594:        psrlq   %mm7, %mm0
                    595:        leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
                    596:        movl    %ecx, VAR_DST
                    597:
                    598:        movd    %mm0, %esi              C next n10
                    599:
                    600:        movl    $-1, (%ecx)
                    601:        cmpl    %ecx, %edx
                    602:        jne     L(integer_top)
                    603:
                    604:        jmp     L(integer_loop_done)
                    605:
                    606:
                    607:
C -----------------------------------------------------------------------------
C
C Being the fractional part, the "source" limbs are all zero, meaning
C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
C
C The loop runs at 15 cycles.  The dependent chain is the same as the
C general case above, but without the n2+n1 stage (due to n1==0), so 15
C would seem to be the lower bound.
C
C A not entirely obvious simplification is that q1+1 never overflows a limb,
C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
C		     = m*d + b*d - m - b
C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
C		     = rnd(b(b-d)-1) + b*d - m - b
C		     = rnd(b(b-d)-1 + b*d) - m - b
C		     = rnd(b*b-1) - m - b
C		     <= (b-2)*b
C
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
C equation 8.4 of the paper which simplifies as follows when n1==0 and
C n0==0.
C
C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
C
C As before, the instruction groupings and empty comments show a naive
C in-order view of the code, which is made a nonsense by out of order
C execution.  There's 17 cycles shown, but it executes at 15.
C
C Rotating the store q and remainder->n2 instructions up to the top of the
C loop gets the run time down from 16 to 15.

	ALIGN(16)
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi   carry
	C ebp   divisor

	movl	PARAM_DST, %esi		C dst low end = loop stop point
	movl	VAR_DST_STOP, %ecx	C one past the fraction-part dst
	movl	%edi, %eax		C carry -> n2 for the first iteration

	subl	$8, %ecx		C adjust for the loop's own subl $4
					C pre-decrement before the first store

	jmp	L(fraction_entry)	C enter mid-loop, skipping the store


	ALIGN(16)
L(fraction_top):
	C eax   n2 carry, then scratch
	C ebx   scratch (nadj, q1)
	C ecx   dst, decrementing
	C edx   scratch
	C esi   dst stop point
	C edi   (will be n2)
	C ebp   divisor

	C store rotated to the loop top (see comment above): saves a cycle
	movl	%ebx, (%ecx)	C previous q
	movl	%eax, %edi	C remainder->n2

L(fraction_entry):
	mull	VAR_INVERSE	C m*n2; edx:eax = m*n2, high half feeds q1

	movl	%ebp, %eax	C d, ready for the (q1+1)*d multiply
	subl	$4, %ecx	C dst, step down one limb
	leal	1(%edi), %ebx	C n2+1, low word of q1+1 before the carry-in

	C

	C

	C

	C

	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx		C (q1+1)*d

	C

	C

	C

	negl	%eax		C low of n - (q1+1)*d  (n's low limb is 0 here)

	C

	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx	C remainder + d, in case q1+1 overshot

	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx	C q, i.e. q1+1 corrected back to q1 on carry
	cmpl	%esi, %ecx	C reached dst low end?

	jne	L(fraction_top)


	movl	%ebx, (%ecx)	C store the final (lowest) quotient limb
	jmp	L(fraction_done)

EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>