[BACK]Return to divrem_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/divrem_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_divrem_1 -- mpn by limb division.
        !             2: dnl
        !             3: dnl  K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
        !            30: C                         mp_srcptr src, mp_size_t size,
        !            31: C                         mp_limb_t divisor);
        !            32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
        !            33: C                          mp_srcptr src, mp_size_t size,
        !            34: C                          mp_limb_t divisor, mp_limb_t carry);
        !            35: C
        !            36: C The method and nomenclature follow part 8 of "Division by Invariant
        !            37: C Integers using Multiplication" by Granlund and Montgomery, reference in
        !            38: C gmp.texi.
        !            39: C
        !            40: C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
        !            41: C for m', and "d" for d_norm, which won't cause any confusion since it's
        !            42: C only the normalized divisor that's of any use in the code.  "b" is written
        !            43: C for 2^N, the size of a limb, N being 32 here.
        !            44: C
        !            45: C mpn_divrem_1 avoids one division if the src high limb is less than the
        !            46: C divisor.  mpn_divrem_1c doesn't check for a zero carry, since in normal
        !            47: C circumstances that will be a very rare event.
        !            48: C
        !            49: C There's a small bias towards expecting xsize==0, by having code for
        !            50: C xsize==0 in a straight line and xsize!=0 under forward jumps.
        !            51:
        !            52:
        !            53: dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
        !            54: dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
        !            55: dnl
        !            56: dnl  The inverse takes about 50 cycles to calculate, but after that the
        !            57: dnl  multiply is 17 c/l versus division at 42 c/l.
        !            58: dnl
        !            59: dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
        !            60: dnl  even more so on the fractional part.
        !            61:
        !            62: deflit(MUL_THRESHOLD, 3)
        !            63:
        !            64:
        !            65: defframe(PARAM_CARRY,  24)
        !            66: defframe(PARAM_DIVISOR,20)
        !            67: defframe(PARAM_SIZE,   16)
        !            68: defframe(PARAM_SRC,    12)
        !            69: defframe(PARAM_XSIZE,  8)
        !            70: defframe(PARAM_DST,    4)
        !            71:
        !            72: defframe(SAVE_EBX,    -4)
        !            73: defframe(SAVE_ESI,    -8)
        !            74: defframe(SAVE_EDI,    -12)
        !            75: defframe(SAVE_EBP,    -16)
        !            76:
        !            77: defframe(VAR_NORM,    -20)
        !            78: defframe(VAR_INVERSE, -24)
        !            79: defframe(VAR_SRC,     -28)
        !            80: defframe(VAR_DST,     -32)
        !            81: defframe(VAR_DST_STOP,-36)
        !            82:
        !            83: deflit(STACK_SPACE, 36)
        !            84:
        !            85:        .text
        !            86:        ALIGN(32)
        !            87:
        !            88: PROLOGUE(mpn_divrem_1c)
        !            89: deflit(`FRAME',0)
        !            90:        movl    PARAM_CARRY, %edx
        !            91:        movl    PARAM_SIZE, %ecx
        !            92:        subl    $STACK_SPACE, %esp
        !            93: deflit(`FRAME',STACK_SPACE)
        !            94:
        !            95:        movl    %ebx, SAVE_EBX
        !            96:        movl    PARAM_XSIZE, %ebx
        !            97:
        !            98:        movl    %edi, SAVE_EDI
        !            99:        movl    PARAM_DST, %edi
        !           100:
        !           101:        movl    %ebp, SAVE_EBP
        !           102:        movl    PARAM_DIVISOR, %ebp
        !           103:
        !           104:        movl    %esi, SAVE_ESI
        !           105:        movl    PARAM_SRC, %esi
        !           106:
        !           107:        leal    -4(%edi,%ebx,4), %edi
        !           108:        jmp     LF(mpn_divrem_1,start_1c)
        !           109:
        !           110: EPILOGUE()
        !           111:
        !           112:
        !           113:        C offset 0x31, close enough to aligned
        !           114: PROLOGUE(mpn_divrem_1)
        !           115: deflit(`FRAME',0)
        !           116:
        !           117:        movl    PARAM_SIZE, %ecx
        !           118:        movl    $0, %edx                C initial carry (if can't skip a div)
        !           119:        subl    $STACK_SPACE, %esp
        !           120: deflit(`FRAME',STACK_SPACE)
        !           121:
        !           122:        movl    %ebp, SAVE_EBP
        !           123:        movl    PARAM_DIVISOR, %ebp
        !           124:
        !           125:        movl    %ebx, SAVE_EBX
        !           126:        movl    PARAM_XSIZE, %ebx
        !           127:
        !           128:        movl    %esi, SAVE_ESI
        !           129:        movl    PARAM_SRC, %esi
        !           130:        orl     %ecx, %ecx
        !           131:
        !           132:        movl    %edi, SAVE_EDI
        !           133:        movl    PARAM_DST, %edi
        !           134:        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
        !           135:
        !           136:        jz      L(no_skip_div)
        !           137:        movl    -4(%esi,%ecx,4), %eax   C src high limb
        !           138:
        !           139:        cmpl    %ebp, %eax              C one less div if high<divisor
        !           140:        jnb     L(no_skip_div)
        !           141:
        !           142:        movl    $0, (%edi,%ecx,4)       C dst high limb
        !           143:        decl    %ecx                    C size-1
        !           144:        movl    %eax, %edx              C src high limb as initial carry
        !           145: L(no_skip_div):
        !           146:
        !           147:
        !           148: L(start_1c):
        !           149:        C eax
        !           150:        C ebx   xsize
        !           151:        C ecx   size
        !           152:        C edx   carry
        !           153:        C esi   src
        !           154:        C edi   &dst[xsize-1]
        !           155:        C ebp   divisor
        !           156:
        !           157:        leal    (%ebx,%ecx), %eax       C size+xsize
        !           158:        cmpl    $MUL_THRESHOLD, %eax
        !           159:        jae     L(mul_by_inverse)
        !           160:
        !           161:
        !           162: C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
        !           163: C It'd be possible to write them out without the looping, but no speedup
        !           164: C would be expected.
        !           165: C
        !           166: C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
        !           167: C integer part, but curiously not on the fractional part, where %ebp is a
        !           168: C (fixed) couple of cycles faster.
        !           169:
        !           170:        orl     %ecx, %ecx
        !           171:        jz      L(divide_no_integer)
        !           172:
        !           173: L(divide_integer):
        !           174:        C eax   scratch (quotient)
        !           175:        C ebx   xsize
        !           176:        C ecx   counter
        !           177:        C edx   scratch (remainder)
        !           178:        C esi   src
        !           179:        C edi   &dst[xsize-1]
        !           180:        C ebp   divisor
        !           181:
        !           182:        movl    -4(%esi,%ecx,4), %eax
        !           183:
        !           184:        divl    PARAM_DIVISOR
        !           185:
        !           186:        movl    %eax, (%edi,%ecx,4)
        !           187:        decl    %ecx
        !           188:        jnz     L(divide_integer)
        !           189:
        !           190:
        !           191: L(divide_no_integer):
        !           192:        movl    PARAM_DST, %edi
        !           193:        orl     %ebx, %ebx
        !           194:        jnz     L(divide_fraction)
        !           195:
        !           196: L(divide_done):
        !           197:        movl    SAVE_ESI, %esi
        !           198:        movl    SAVE_EDI, %edi
        !           199:        movl    %edx, %eax
        !           200:
        !           201:        movl    SAVE_EBX, %ebx
        !           202:        movl    SAVE_EBP, %ebp
        !           203:        addl    $STACK_SPACE, %esp
        !           204:
        !           205:        ret
        !           206:
        !           207:
        !           208: L(divide_fraction):
        !           209:        C eax   scratch (quotient)
        !           210:        C ebx   counter
        !           211:        C ecx
        !           212:        C edx   scratch (remainder)
        !           213:        C esi
        !           214:        C edi   dst
        !           215:        C ebp   divisor
        !           216:
        !           217:        movl    $0, %eax
        !           218:
        !           219:        divl    %ebp
        !           220:
        !           221:        movl    %eax, -4(%edi,%ebx,4)
        !           222:        decl    %ebx
        !           223:        jnz     L(divide_fraction)
        !           224:
        !           225:        jmp     L(divide_done)
        !           226:
        !           227:
        !           228:
        !           229: C -----------------------------------------------------------------------------
        !           230:
        !           231: L(mul_by_inverse):
        !           232:        C eax
        !           233:        C ebx   xsize
        !           234:        C ecx   size
        !           235:        C edx   carry
        !           236:        C esi   src
        !           237:        C edi   &dst[xsize-1]
        !           238:        C ebp   divisor
        !           239:
        !           240:        bsrl    %ebp, %eax              C 31-l
        !           241:
        !           242:        leal    12(%edi), %ebx
        !           243:        leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]
        !           244:
        !           245:        movl    %edi, VAR_DST
        !           246:        movl    %ebx, VAR_DST_STOP
        !           247:
        !           248:        movl    %ecx, %ebx              C size
        !           249:        movl    $31, %ecx
        !           250:
        !           251:        movl    %edx, %edi              C carry
        !           252:        movl    $-1, %edx
        !           253:
        !           254:        C
        !           255:
        !           256:        xorl    %eax, %ecx              C l
        !           257:        incl    %eax                    C 32-l
        !           258:
        !           259:        shll    %cl, %ebp               C d normalized
        !           260:        movl    %ecx, VAR_NORM
        !           261:
        !           262:        movd    %eax, %mm7
        !           263:
        !           264:        movl    $-1, %eax
        !           265:        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1
        !           266:
        !           267:        divl    %ebp                    C floor (b*(b-d)-1) / d
        !           268:
        !           269:        orl     %ebx, %ebx              C size
        !           270:        movl    %eax, VAR_INVERSE
        !           271:        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
        !           272:
        !           273:        jz      L(start_zero)
        !           274:        movl    %eax, VAR_SRC
        !           275:        cmpl    $1, %ebx
        !           276:
        !           277:        movl    8(%eax), %esi           C src high limb
        !           278:        jz      L(start_one)
        !           279:
        !           280: L(start_two_or_more):
        !           281:        movl    4(%eax), %edx           C src second highest limb
        !           282:
        !           283:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
        !           284:
        !           285:        shldl(  %cl, %edx, %esi)        C n10 = high,second << l
        !           286:
        !           287:        cmpl    $2, %ebx
        !           288:        je      L(integer_two_left)
        !           289:        jmp     L(integer_top)
        !           290:
        !           291:
        !           292: L(start_one):
        !           293:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
        !           294:
        !           295:        shll    %cl, %esi               C n10 = high << l
        !           296:        movl    %eax, VAR_SRC
        !           297:        jmp     L(integer_one_left)
        !           298:
        !           299:
        !           300: L(start_zero):
        !           301:        shll    %cl, %edi               C n2 = carry << l
        !           302:        movl    $0, %esi                C n10 = 0
        !           303:
        !           304:        C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
        !           305:        C must have xsize!=0
        !           306:        jmp     L(fraction_some)
        !           307:
        !           308:
        !           309:
        !           310: C -----------------------------------------------------------------------------
        !           311: C
        !           312: C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
        !           313: C execution.  The instruction scheduling is important, with various
        !           314: C apparently equivalent forms running 1 to 5 cycles slower.
        !           315: C
        !           316: C A lower bound for the time would seem to be 16 cycles, based on the
        !           317: C following successive dependencies.
        !           318: C
        !           319: C                    cycles
        !           320: C              n2+n1   1
        !           321: C              mul     6
        !           322: C              q1+1    1
        !           323: C              mul     6
        !           324: C              sub     1
        !           325: C              addback 1
        !           326: C                     ---
        !           327: C                     16
        !           328: C
        !           329: C This chain is what the loop has already, but 16 cycles isn't achieved.
        !           330: C K7 has enough decode, and probably enough execute (depending maybe on what
        !           331: C a mul actually consumes), but nothing running under 17 has been found.
        !           332: C
        !           333: C In theory n2+n1 could be done in the sub and addback stages (by
        !           334: C calculating both n2 and n2+n1 there), but lack of registers makes this an
        !           335: C unlikely proposition.
        !           336: C
        !           337: C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
        !           338: C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
        !           339: C chain, and nothing better than 18 cycles has been found when using it.
        !           340: C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
        !           341: C be an extremely rare event.
        !           342: C
        !           343: C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but
        !           344: C if some special data is coming out with this always, the q1_ff special
        !           345: C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
        !           346: C induce the q1_ff case, for speed measurements or testing.  Note that
        !           347: C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
        !           348: C
        !           349: C The instruction groupings and empty comments show the cycles for a naive
        !           350: C in-order view of the code (conveniently ignoring the load latency on
        !           351: C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
        !           352: C to the extent that out-of-order execution rearranges it.  In this case
        !           353: C there's 19 cycles shown, but it executes at 17.
        !           354:
        !           355:        ALIGN(16)
        !           356: L(integer_top):
        !           357:        C eax   scratch
        !           358:        C ebx   scratch (nadj, q1)
        !           359:        C ecx   scratch (src, dst)
        !           360:        C edx   scratch
        !           361:        C esi   n10
        !           362:        C edi   n2
        !           363:        C ebp   divisor
        !           364:        C
        !           365:        C mm0   scratch (src qword)
        !           366:        C mm7   rshift for normalization
        !           367:
        !           368:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
        !           369:        movl    %edi, %eax         C n2
        !           370:        movl    VAR_SRC, %ecx
        !           371:
        !           372:        leal    (%ebp,%esi), %ebx
        !           373:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
        !           374:        sbbl    $-1, %eax          C n2+n1
        !           375:
        !           376:        mull    VAR_INVERSE        C m*(n2+n1)
        !           377:
        !           378:        movq    (%ecx), %mm0       C next limb and the one below it
        !           379:        subl    $4, %ecx
        !           380:
        !           381:        movl    %ecx, VAR_SRC
        !           382:
        !           383:        C
        !           384:
        !           385:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
        !           386:        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
        !           387:        movl    %ebp, %eax         C d
        !           388:
        !           389:        C
        !           390:
        !           391:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        !           392:        jz      L(q1_ff)
        !           393:        movl    VAR_DST, %ecx
        !           394:
        !           395:        mull    %ebx               C (q1+1)*d
        !           396:
        !           397:        psrlq   %mm7, %mm0
        !           398:
        !           399:        leal    -4(%ecx), %ecx
        !           400:
        !           401:        C
        !           402:
        !           403:        subl    %eax, %esi
        !           404:        movl    VAR_DST_STOP, %eax
        !           405:
        !           406:        C
        !           407:
        !           408:        sbbl    %edx, %edi         C n - (q1+1)*d
        !           409:        movl    %esi, %edi         C remainder -> n2
        !           410:        leal    (%ebp,%esi), %edx
        !           411:
        !           412:        movd    %mm0, %esi
        !           413:
        !           414:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
        !           415:        sbbl    $0, %ebx           C q
        !           416:        cmpl    %eax, %ecx
        !           417:
        !           418:        movl    %ebx, (%ecx)
        !           419:        movl    %ecx, VAR_DST
        !           420:        jne     L(integer_top)
        !           421:
        !           422:
        !           423: L(integer_loop_done):
        !           424:
        !           425:
        !           426: C -----------------------------------------------------------------------------
        !           427: C
        !           428: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
        !           429: C q1_ff special case.  This make the code a bit smaller and simpler, and
        !           430: C costs only 1 cycle (each).
        !           431:
        !           432: L(integer_two_left):
        !           433:        C eax   scratch
        !           434:        C ebx   scratch (nadj, q1)
        !           435:        C ecx   scratch (src, dst)
        !           436:        C edx   scratch
        !           437:        C esi   n10
        !           438:        C edi   n2
        !           439:        C ebp   divisor
        !           440:        C
        !           441:        C mm0   src limb, shifted
        !           442:        C mm7   rshift
        !           443:
        !           444:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
        !           445:        movl    %edi, %eax         C n2
        !           446:        movl    PARAM_SRC, %ecx
        !           447:
        !           448:        leal    (%ebp,%esi), %ebx
        !           449:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
        !           450:        sbbl    $-1, %eax          C n2+n1
        !           451:
        !           452:        mull    VAR_INVERSE        C m*(n2+n1)
        !           453:
        !           454:        movd    (%ecx), %mm0       C src low limb
        !           455:
        !           456:        movl    VAR_DST_STOP, %ecx
        !           457:
        !           458:        C
        !           459:
        !           460:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
        !           461:        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
        !           462:        movl    %ebp, %eax         C d
        !           463:
        !           464:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        !           465:
        !           466:        sbbl    $0, %ebx
        !           467:
        !           468:        mull    %ebx               C (q1+1)*d
        !           469:
        !           470:        psllq   $32, %mm0
        !           471:
        !           472:        psrlq   %mm7, %mm0
        !           473:
        !           474:        C
        !           475:
        !           476:        subl    %eax, %esi
        !           477:
        !           478:        C
        !           479:
        !           480:        sbbl    %edx, %edi         C n - (q1+1)*d
        !           481:        movl    %esi, %edi         C remainder -> n2
        !           482:        leal    (%ebp,%esi), %edx
        !           483:
        !           484:        movd    %mm0, %esi
        !           485:
        !           486:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
        !           487:        sbbl    $0, %ebx           C q
        !           488:
        !           489:        movl    %ebx, -4(%ecx)
        !           490:
        !           491:
        !           492: C -----------------------------------------------------------------------------
        !           493: L(integer_one_left):
        !           494:        C eax   scratch
        !           495:        C ebx   scratch (nadj, q1)
        !           496:        C ecx   dst
        !           497:        C edx   scratch
        !           498:        C esi   n10
        !           499:        C edi   n2
        !           500:        C ebp   divisor
        !           501:        C
        !           502:        C mm0   src limb, shifted
        !           503:        C mm7   rshift
        !           504:
        !           505:        movl    VAR_DST_STOP, %ecx
        !           506:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
        !           507:        movl    %edi, %eax         C n2
        !           508:
        !           509:        leal    (%ebp,%esi), %ebx
        !           510:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
        !           511:        sbbl    $-1, %eax          C n2+n1
        !           512:
        !           513:        mull    VAR_INVERSE        C m*(n2+n1)
        !           514:
        !           515:        C
        !           516:
        !           517:        C
        !           518:
        !           519:        C
        !           520:
        !           521:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
        !           522:        leal    1(%edi), %ebx      C n2<<32 + m*(n2+n1))
        !           523:        movl    %ebp, %eax         C d
        !           524:
        !           525:        C
        !           526:
        !           527:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        !           528:
        !           529:        sbbl    $0, %ebx           C q1 if q1+1 overflowed
        !           530:
        !           531:        mull    %ebx
        !           532:
        !           533:        C
        !           534:
        !           535:        C
        !           536:
        !           537:        C
        !           538:
        !           539:        subl    %eax, %esi
        !           540:
        !           541:        C
        !           542:
        !           543:        sbbl    %edx, %edi         C n - (q1+1)*d
        !           544:        movl    %esi, %edi         C remainder -> n2
        !           545:        leal    (%ebp,%esi), %edx
        !           546:
        !           547:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
        !           548:        sbbl    $0, %ebx           C q
        !           549:
        !           550:        movl    %ebx, -8(%ecx)
        !           551:        subl    $8, %ecx
        !           552:
        !           553:
        !           554:
        !           555: L(integer_none):
        !           556:        cmpl    $0, PARAM_XSIZE
        !           557:        jne     L(fraction_some)
        !           558:
        !           559:        movl    %edi, %eax
        !           560: L(fraction_done):
        !           561:        movl    VAR_NORM, %ecx
        !           562:        movl    SAVE_EBP, %ebp
        !           563:
        !           564:        movl    SAVE_EDI, %edi
        !           565:        movl    SAVE_ESI, %esi
        !           566:
        !           567:        movl    SAVE_EBX, %ebx
        !           568:        addl    $STACK_SPACE, %esp
        !           569:
        !           570:        shrl    %cl, %eax
        !           571:        emms
        !           572:
        !           573:        ret
        !           574:
        !           575:
        !           576: C -----------------------------------------------------------------------------
        !           577: C
        !           578: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
        !           579: C of q*d is simply -d and the remainder n-q*d = n10+d
        !           580:
        !           581: L(q1_ff):
        !           582:        C eax   (divisor)
        !           583:        C ebx   (q1+1 == 0)
        !           584:        C ecx
        !           585:        C edx
        !           586:        C esi   n10
        !           587:        C edi   n2
        !           588:        C ebp   divisor
        !           589:
        !           590:        movl    VAR_DST, %ecx
        !           591:        movl    VAR_DST_STOP, %edx
        !           592:        subl    $4, %ecx
        !           593:
        !           594:        psrlq   %mm7, %mm0
        !           595:        leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
        !           596:        movl    %ecx, VAR_DST
        !           597:
        !           598:        movd    %mm0, %esi              C next n10
        !           599:
        !           600:        movl    $-1, (%ecx)
        !           601:        cmpl    %ecx, %edx
        !           602:        jne     L(integer_top)
        !           603:
        !           604:        jmp     L(integer_loop_done)
        !           605:
        !           606:
        !           607:
        !           608: C -----------------------------------------------------------------------------
        !           609: C
        !           610: C Being the fractional part, the "source" limbs are all zero, meaning
        !           611: C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
        !           612: C
        !           613: C The loop runs at 15 cycles.  The dependent chain is the same as the
        !           614: C general case above, but without the n2+n1 stage (due to n1==0), so 15
        !           615: C would seem to be the lower bound.
        !           616: C
        !           617: C A not entirely obvious simplification is that q1+1 never overflows a limb,
        !           618: C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
        !           619: C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
        !           620: C rnd() means rounding down to a multiple of d.
        !           621: C
        !           622: C      m*n2 + b*n2 <= m*(d-1) + b*(d-1)
        !           623: C                   = m*d + b*d - m - b
        !           624: C                   = floor((b(b-d)-1)/d)*d + b*d - m - b
        !           625: C                   = rnd(b(b-d)-1) + b*d - m - b
        !           626: C                   = rnd(b(b-d)-1 + b*d) - m - b
        !           627: C                   = rnd(b*b-1) - m - b
        !           628: C                   <= (b-2)*b
        !           629: C
        !           630: C Unchanged from the general case is that the final quotient limb q can be
        !           631: C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
        !           632: C equation 8.4 of the paper which simplifies as follows when n1==0 and
        !           633: C n0==0.
        !           634: C
        !           635: C      n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
        !           636: C
        !           637: C As before, the instruction groupings and empty comments show a naive
        !           638: C in-order view of the code, which is made a nonsense by out of order
        !           639: C execution.  There's 17 cycles shown, but it executes at 15.
        !           640: C
        !           641: C Rotating the store q and remainder->n2 instructions up to the top of the
        !           642: C loop gets the run time down from 16 to 15.
        !           643:
        !           644:        ALIGN(16)
        !           645: L(fraction_some):
        !           646:        C eax
        !           647:        C ebx
        !           648:        C ecx
        !           649:        C edx
        !           650:        C esi
        !           651:        C edi   carry
        !           652:        C ebp   divisor
        !           653:
        !           654:        movl    PARAM_DST, %esi
        !           655:        movl    VAR_DST_STOP, %ecx
        !           656:        movl    %edi, %eax
        !           657:
        !           658:        subl    $8, %ecx
        !           659:
        !           660:        jmp     L(fraction_entry)
        !           661:
        !           662:
        !           663:        ALIGN(16)
        !           664: L(fraction_top):
        !           665:        C eax   n2 carry, then scratch
        !           666:        C ebx   scratch (nadj, q1)
        !           667:        C ecx   dst, decrementing
        !           668:        C edx   scratch
        !           669:        C esi   dst stop point
        !           670:        C edi   (will be n2)
        !           671:        C ebp   divisor
        !           672:
        !           673:        movl    %ebx, (%ecx)    C previous q
        !           674:        movl    %eax, %edi      C remainder->n2
        !           675:
        !           676: L(fraction_entry):
        !           677:        mull    VAR_INVERSE     C m*n2
        !           678:
        !           679:        movl    %ebp, %eax      C d
        !           680:        subl    $4, %ecx        C dst
        !           681:        leal    1(%edi), %ebx
        !           682:
        !           683:        C
        !           684:
        !           685:        C
        !           686:
        !           687:        C
        !           688:
        !           689:        C
        !           690:
        !           691:        addl    %edx, %ebx      C 1 + high(n2<<32 + m*n2) = q1+1
        !           692:
        !           693:        mull    %ebx            C (q1+1)*d
        !           694:
        !           695:        C
        !           696:
        !           697:        C
        !           698:
        !           699:        C
        !           700:
        !           701:        negl    %eax            C low of n - (q1+1)*d
        !           702:
        !           703:        C
        !           704:
        !           705:        sbbl    %edx, %edi      C high of n - (q1+1)*d, caring only about carry
        !           706:        leal    (%ebp,%eax), %edx
        !           707:
        !           708:        cmovc(  %edx, %eax)     C n - q1*d if underflow from using q1+1
        !           709:        sbbl    $0, %ebx        C q
        !           710:        cmpl    %esi, %ecx
        !           711:
        !           712:        jne     L(fraction_top)
        !           713:
        !           714:
        !           715:        movl    %ebx, (%ecx)
        !           716:        jmp     L(fraction_done)
        !           717:
        !           718: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>