Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/divrem_1.asm, Revision 1.1.1.2
1.1.1.2 ! ohara 1: dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
! 2: dnl division.
1.1 maekawa 3:
1.1.1.2 ! ohara 4: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
! 5: dnl
1.1 maekawa 6: dnl This file is part of the GNU MP Library.
1.1.1.2 ! ohara 7: dnl
1.1 maekawa 8: dnl The GNU MP Library is free software; you can redistribute it and/or
9: dnl modify it under the terms of the GNU Lesser General Public License as
10: dnl published by the Free Software Foundation; either version 2.1 of the
11: dnl License, or (at your option) any later version.
1.1.1.2 ! ohara 12: dnl
1.1 maekawa 13: dnl The GNU MP Library is distributed in the hope that it will be useful,
14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16: dnl Lesser General Public License for more details.
1.1.1.2 ! ohara 17: dnl
1.1 maekawa 18: dnl You should have received a copy of the GNU Lesser General Public
19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21: dnl Suite 330, Boston, MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25:
1.1.1.2 ! ohara 26: C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
! 27:
! 28:
1.1 maekawa 29: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
30: C mp_srcptr src, mp_size_t size,
31: C mp_limb_t divisor);
32: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
33: C mp_srcptr src, mp_size_t size,
34: C mp_limb_t divisor, mp_limb_t carry);
1.1.1.2 ! ohara 35: C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
! 36: C mp_srcptr src, mp_size_t size,
! 37: C mp_limb_t divisor, mp_limb_t inverse,
! 38: C unsigned shift);
1.1 maekawa 39: C
40: C The method and nomenclature follow part 8 of "Division by Invariant
41: C Integers using Multiplication" by Granlund and Montgomery, reference in
42: C gmp.texi.
43: C
44: C The "and"s shown in the paper are done here with "cmov"s. "m" is written
45: C for m', and "d" for d_norm, which won't cause any confusion since it's
46: C only the normalized divisor that's of any use in the code. "b" is written
47: C for 2^N, the size of a limb, N being 32 here.
48: C
1.1.1.2 ! ohara 49: C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
! 50: C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
! 51: C carry, since in normal circumstances that will be a very rare event.
! 52: C
! 53: C The test for skipping a division is branch free (once size>=1 is tested).
! 54: C The store to the destination high limb is 0 when a divide is skipped, or
! 55: C if it's not skipped then a copy of the src high limb is used. The latter
! 56: C is in case src==dst.
1.1 maekawa 57: C
58: C There's a small bias towards expecting xsize==0, by having code for
59: C xsize==0 in a straight line and xsize!=0 under forward jumps.
1.1.1.2 ! ohara 60: C
! 61: C Alternatives:
! 62: C
! 63: C If the divisor is normalized (high bit set) then a division step can
! 64: C always be skipped, since the high destination limb is always 0 or 1 in
! 65: C that case. It doesn't seem worth checking for this though, since it
! 66: C probably occurs infrequently, in particular note that big_base for a
! 67: C decimal mpn_get_str is not normalized in a 32-bit limb.
1.1 maekawa 68:
69:
70: dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
71: dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
72: dnl
73: dnl The inverse takes about 50 cycles to calculate, but after that the
74: dnl multiply is 17 c/l versus division at 42 c/l.
75: dnl
76: dnl At 3 limbs the mul is a touch faster than div on the integer part, and
77: dnl even more so on the fractional part.
78:
     79: deflit(MUL_THRESHOLD, 3)
     80:
     81:
dnl Stack frame layout.  Positive offsets are incoming stack parameters
dnl (relative to the return address), negative offsets are locals within
dnl STACK_SPACE.  PARAM_PREINV_INVERSE and PARAM_CARRY share offset 24
dnl because they belong to different entry points (mpn_preinv_divrem_1
dnl and mpn_divrem_1c respectively) with different argument lists.
1.1.1.2 ! ohara    82: defframe(PARAM_PREINV_SHIFT,   28) dnl mpn_preinv_divrem_1
        !    83: defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
        !    84: defframe(PARAM_CARRY,  24) dnl mpn_divrem_1c
1.1     maekawa    85: defframe(PARAM_DIVISOR,20)
     86: defframe(PARAM_SIZE,   16)
     87: defframe(PARAM_SRC,    12)
     88: defframe(PARAM_XSIZE,   8)
     89: defframe(PARAM_DST,     4)
     90:
dnl Slots for the callee-saved registers preserved by every entry point.
     91: defframe(SAVE_EBX,    -4)
     92: defframe(SAVE_ESI,    -8)
     93: defframe(SAVE_EDI,   -12)
     94: defframe(SAVE_EBP,   -16)
     95:
dnl Working variables: the normalization shift count, the computed (or
dnl caller-supplied) inverse, and running src/dst pointers plus the dst
dnl stop point used to terminate the integer-part loop.
     96: defframe(VAR_NORM,    -20)
     97: defframe(VAR_INVERSE, -24)
     98: defframe(VAR_SRC,     -28)
     99: defframe(VAR_DST,     -32)
    100: defframe(VAR_DST_STOP,-36)
    101:
dnl Total bytes reserved below the return address for SAVE_* and VAR_*.
    102: deflit(STACK_SPACE, 36)
103:
1.1.1.2 ! ohara 104: TEXT
1.1 maekawa 105: ALIGN(32)
106:
C -----------------------------------------------------------------------------
C mpn_preinv_divrem_1: same operation as mpn_divrem_1, but the caller
C supplies a pre-computed inverse (PARAM_PREINV_INVERSE) and shift count
C (PARAM_PREINV_SHIFT), so the bsrl/divl inverse calculation is skipped
C and control jumps straight to L(start_preinv) in the main routine.
C The skip-a-division test below is branch free: the cmpl's carry feeds
C both cmov's and the sbbl that adjusts the size.
C
PROLOGUE(mpn_preinv_divrem_1)
        !   108: deflit(`FRAME',0)
        !   109:        movl    PARAM_XSIZE, %ecx
        !   110:        movl    PARAM_DST, %edx
        !   111:        subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
        !   112:
        !   113:        movl    %esi, SAVE_ESI
        !   114:        movl    PARAM_SRC, %esi
        !   115:
        !   116:        movl    %ebx, SAVE_EBX
        !   117:        movl    PARAM_SIZE, %ebx
        !   118:
        !   119:        leal    8(%edx,%ecx,4), %edx    C &dst[xsize+2]
        !   120:        movl    %ebp, SAVE_EBP
        !   121:        movl    PARAM_DIVISOR, %ebp
        !   122:
        !   123:        movl    %edx, VAR_DST_STOP      C &dst[xsize+2]
        !   124:        movl    %edi, SAVE_EDI
        !   125:        xorl    %edi, %edi              C carry
        !   126:
        !   127:        movl    -4(%esi,%ebx,4), %eax   C src high limb
        !   128:        xor     %ecx, %ecx
        !   129:
        !   130:        C
        !   131:
        !   132:        C
        !   133:
        !   134:        cmpl    %ebp, %eax              C high cmp divisor
        !   135:
        !   136:        cmovc(  %eax, %edi)             C high is carry if high<divisor
        !   137:        cmovnc( %eax, %ecx)             C 0 if skip div, src high if not
        !   138:                                        C (the latter in case src==dst)
        !   139:
C edx is &dst[xsize+2], so -12(%edx,%ebx,4) is &dst[xsize+size-1].
        !   140:        movl    %ecx, -12(%edx,%ebx,4)  C dst high limb
        !   141:        sbbl    $0, %ebx                C skip one division if high<divisor
        !   142:        movl    PARAM_PREINV_SHIFT, %ecx
        !   143:
        !   144:        leal    -8(%edx,%ebx,4), %edx   C &dst[xsize+size]
        !   145:        movl    $32, %eax
        !   146:
        !   147:        movl    %edx, VAR_DST           C &dst[xsize+size]
        !   148:
        !   149:        shll    %cl, %ebp               C d normalized
        !   150:        subl    %ecx, %eax
        !   151:        movl    %ecx, VAR_NORM
        !   152:
        !   153:        movd    %eax, %mm7              C rshift
        !   154:        movl    PARAM_PREINV_INVERSE, %eax
        !   155:        jmp     L(start_preinv)
        !   156:
        !   157: EPILOGUE()
! 158:
! 159:
! 160: ALIGN(16)
! 161:
C -----------------------------------------------------------------------------
C mpn_divrem_1c: entry with an initial carry limb in PARAM_CARRY, loaded
C into edx.  No skip-a-division test is made here since (per the comments
C at the top of the file) a zero carry is expected to be rare.  Falls
C into the common code at L(start_1c).
C
PROLOGUE(mpn_divrem_1c)
    163: deflit(`FRAME',0)
    164:        movl    PARAM_CARRY, %edx
    165:        movl    PARAM_SIZE, %ecx
    166:        subl    $STACK_SPACE, %esp
    167: deflit(`FRAME',STACK_SPACE)
    168:
    169:        movl    %ebx, SAVE_EBX
    170:        movl    PARAM_XSIZE, %ebx
    171:
    172:        movl    %edi, SAVE_EDI
    173:        movl    PARAM_DST, %edi
    174:
    175:        movl    %ebp, SAVE_EBP
    176:        movl    PARAM_DIVISOR, %ebp
    177:
    178:        movl    %esi, SAVE_ESI
    179:        movl    PARAM_SRC, %esi
    180:
1.1.1.2 ! ohara   181:  leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
        !   182:        jmp     L(start_1c)
1.1     maekawa   183:
    184: EPILOGUE()
185:
186:
1.1.1.2 ! ohara 187: C offset 0xa1, close enough to aligned
C -----------------------------------------------------------------------------
C mpn_divrem_1 entry, and the common code shared by all three entry
C points.  If size+xsize is below MUL_THRESHOLD plain "divl" loops are
C used; otherwise the divisor is normalized, the inverse computed, and
C the multiply-by-inverse loops take over.  A division step is skipped
C (branch free, via the cmpl carry feeding the cmov's and sbbl) when the
C src high limb is less than the divisor.
C
PROLOGUE(mpn_divrem_1)
    189: deflit(`FRAME',0)
    190:
    191:        movl    PARAM_SIZE, %ecx
    192:        movl    $0, %edx                C initial carry (if can't skip a div)
    193:        subl    $STACK_SPACE, %esp
    194: deflit(`FRAME',STACK_SPACE)
    195:
1.1.1.2 ! ohara   196:  movl    %esi, SAVE_ESI
        !   197:        movl    PARAM_SRC, %esi
1.1     maekawa   198:
    199:        movl    %ebx, SAVE_EBX
    200:        movl    PARAM_XSIZE, %ebx
    201:
1.1.1.2 ! ohara   202:  movl    %ebp, SAVE_EBP
        !   203:        movl    PARAM_DIVISOR, %ebp
        !   204:        orl     %ecx, %ecx              C size
1.1     maekawa   205:
    206:        movl    %edi, SAVE_EDI
    207:        movl    PARAM_DST, %edi
    208:        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
    209:
1.1.1.2 ! ohara   210:  jz      L(no_skip_div)          C if size==0
1.1     maekawa   211:  movl    -4(%esi,%ecx,4), %eax   C src high limb
1.1.1.2 ! ohara   212:  xorl    %esi, %esi
1.1     maekawa   213:
1.1.1.2 ! ohara   214:  cmpl    %ebp, %eax              C high cmp divisor
1.1     maekawa   215:
1.1.1.2 ! ohara   216:  cmovc(  %eax, %edx)             C high is carry if high<divisor
        !   217:        cmovnc( %eax, %esi)             C 0 if skip div, src high if not
        !   218:
        !   219:        movl    %esi, (%edi,%ecx,4)     C dst high limb
        !   220:        sbbl    $0, %ecx                C size-1 if high<divisor
        !   221:        movl    PARAM_SRC, %esi         C reload
1.1     maekawa   222: L(no_skip_div):
    223:
    224:
    225: L(start_1c):
1.1.1.2 ! ohara   226:  C eax
1.1     maekawa   227:  C ebx   xsize
    228:        C ecx   size
    229:        C edx   carry
    230:        C esi   src
    231:        C edi   &dst[xsize-1]
    232:        C ebp   divisor
    233:
    234:        leal    (%ebx,%ecx), %eax       C size+xsize
    235:        cmpl    $MUL_THRESHOLD, %eax
    236:        jae     L(mul_by_inverse)
    237:
    238:
    239: C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
    240: C It'd be possible to write them out without the looping, but no speedup
    241: C would be expected.
    242: C
    243: C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
    244: C integer part, but curiously not on the fractional part, where %ebp is a
    245: C (fixed) couple of cycles faster.
    246:
    247:        orl     %ecx, %ecx
    248:        jz      L(divide_no_integer)
    249:
    250: L(divide_integer):
    251:        C eax   scratch (quotient)
    252:        C ebx   xsize
    253:        C ecx   counter
    254:        C edx   scratch (remainder)
    255:        C esi   src
    256:        C edi   &dst[xsize-1]
    257:        C ebp   divisor
    258:
    259:        movl    -4(%esi,%ecx,4), %eax
    260:
    261:        divl    PARAM_DIVISOR
    262:
    263:        movl    %eax, (%edi,%ecx,4)
    264:        decl    %ecx
    265:        jnz     L(divide_integer)
    266:
    267:
    268: L(divide_no_integer):
    269:        movl    PARAM_DST, %edi
    270:        orl     %ebx, %ebx
    271:        jnz     L(divide_fraction)
    272:
    273: L(divide_done):
    274:        movl    SAVE_ESI, %esi
    275:        movl    SAVE_EDI, %edi
    276:        movl    %edx, %eax
    277:
    278:        movl    SAVE_EBX, %ebx
    279:        movl    SAVE_EBP, %ebp
    280:        addl    $STACK_SPACE, %esp
    281:
    282:        ret
    283:
    284:
    285: L(divide_fraction):
    286:        C eax   scratch (quotient)
    287:        C ebx   counter
    288:        C ecx
    289:        C edx   scratch (remainder)
    290:        C esi
    291:        C edi   dst
    292:        C ebp   divisor
    293:
    294:        movl    $0, %eax
    295:
    296:        divl    %ebp
    297:
    298:        movl    %eax, -4(%edi,%ebx,4)
    299:        decl    %ebx
    300:        jnz     L(divide_fraction)
    301:
    302:        jmp     L(divide_done)
    303:
    304:
    305:
    306: C -----------------------------------------------------------------------------
    307:
    308: L(mul_by_inverse):
    309:        C eax
    310:        C ebx   xsize
    311:        C ecx   size
    312:        C edx   carry
    313:        C esi   src
    314:        C edi   &dst[xsize-1]
    315:        C ebp   divisor
    316:
    317:        bsrl    %ebp, %eax              C 31-l
    318:
1.1.1.2 ! ohara   319:  leal    12(%edi), %ebx          C &dst[xsize+2], loop dst stop
1.1     maekawa   320:  leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]
    321:
    322:        movl    %edi, VAR_DST
    323:        movl    %ebx, VAR_DST_STOP
    324:
    325:        movl    %ecx, %ebx              C size
    326:        movl    $31, %ecx
    327:
    328:        movl    %edx, %edi              C carry
    329:        movl    $-1, %edx
    330:
    331:        C
    332:
    333:        xorl    %eax, %ecx              C l
    334:        incl    %eax                    C 32-l
    335:
    336:        shll    %cl, %ebp               C d normalized
    337:        movl    %ecx, VAR_NORM
    338:
    339:        movd    %eax, %mm7
    340:
    341:        movl    $-1, %eax
    342:        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1
    343:
    344:        divl    %ebp                    C floor (b*(b-d)-1) / d
    345:
1.1.1.2 ! ohara   346: L(start_preinv):
        !   347:        C eax   inverse
        !   348:        C ebx   size
        !   349:        C ecx   shift
        !   350:        C edx
        !   351:        C esi   src
        !   352:        C edi   carry
        !   353:        C ebp   divisor
        !   354:        C
        !   355:        C mm7   rshift
        !   356:
1.1     maekawa   357:  orl     %ebx, %ebx              C size
    358:        movl    %eax, VAR_INVERSE
    359:        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
    360:
    361:        jz      L(start_zero)
    362:        movl    %eax, VAR_SRC
    363:        cmpl    $1, %ebx
    364:
    365:        movl    8(%eax), %esi           C src high limb
    366:        jz      L(start_one)
    367:
    368: L(start_two_or_more):
    369:        movl    4(%eax), %edx           C src second highest limb
    370:
    371:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
    372:
    373:        shldl(  %cl, %edx, %esi)        C n10 = high,second << l
    374:
    375:        cmpl    $2, %ebx
    376:        je      L(integer_two_left)
    377:        jmp     L(integer_top)
    378:
    379:
    380: L(start_one):
    381:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
    382:
    383:        shll    %cl, %esi               C n10 = high << l
    384:        movl    %eax, VAR_SRC
    385:        jmp     L(integer_one_left)
    386:
    387:
    388: L(start_zero):
1.1.1.2 ! ohara   389:          C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
        !   390:        C skipped a division.
        !   391:
1.1     maekawa   392:  shll    %cl, %edi               C n2 = carry << l
1.1.1.2 ! ohara   393:  movl    %edi, %eax              C return value for zero_done
        !   394:        cmpl    $0, PARAM_XSIZE
1.1     maekawa   395:
1.1.1.2 ! ohara   396:  je      L(zero_done)
1.1     maekawa   397:  jmp     L(fraction_some)
    398:
    399:
    400:
    401: C -----------------------------------------------------------------------------
    402: C
    403: C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
    404: C execution.  The instruction scheduling is important, with various
    405: C apparently equivalent forms running 1 to 5 cycles slower.
    406: C
    407: C A lower bound for the time would seem to be 16 cycles, based on the
    408: C following successive dependencies.
    409: C
    410: C                     cycles
    411: C             n2+n1   1
    412: C             mul     6
    413: C             q1+1    1
    414: C             mul     6
    415: C             sub     1
    416: C             addback 1
    417: C                    ---
    418: C                    16
    419: C
    420: C This chain is what the loop has already, but 16 cycles isn't achieved.
    421: C K7 has enough decode, and probably enough execute (depending maybe on what
    422: C a mul actually consumes), but nothing running under 17 has been found.
    423: C
    424: C In theory n2+n1 could be done in the sub and addback stages (by
    425: C calculating both n2 and n2+n1 there), but lack of registers makes this an
    426: C unlikely proposition.
    427: C
    428: C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
    429: C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
    430: C chain, and nothing better than 18 cycles has been found when using it.
    431: C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
    432: C be an extremely rare event.
    433: C
    434: C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
    435: C if some special data is coming out with this always, the q1_ff special
    436: C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
    437: C induce the q1_ff case, for speed measurements or testing.  Note that
    438: C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
    439: C
    440: C The instruction groupings and empty comments show the cycles for a naive
    441: C in-order view of the code (conveniently ignoring the load latency on
    442: C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
    443: C to the extent that out-of-order execution rearranges it.  In this case
    444: C there's 19 cycles shown, but it executes at 17.
    445:
    446:        ALIGN(16)
    447: L(integer_top):
    448:        C eax   scratch
    449:        C ebx   scratch (nadj, q1)
    450:        C ecx   scratch (src, dst)
    451:        C edx   scratch
    452:        C esi   n10
    453:        C edi   n2
    454:        C ebp   divisor
    455:        C
    456:        C mm0   scratch (src qword)
    457:        C mm7   rshift for normalization
    458:
    459:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
    460:        movl    %edi, %eax         C n2
    461:        movl    VAR_SRC, %ecx
    462:
    463:        leal    (%ebp,%esi), %ebx
    464:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
    465:        sbbl    $-1, %eax          C n2+n1
    466:
    467:        mull    VAR_INVERSE        C m*(n2+n1)
    468:
    469:        movq    (%ecx), %mm0       C next limb and the one below it
    470:        subl    $4, %ecx
    471:
    472:        movl    %ecx, VAR_SRC
    473:
    474:        C
    475:
    476:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara   477:  leal    1(%edi), %ebx      C n2+1
1.1     maekawa   478:  movl    %ebp, %eax         C d
    479:
    480:        C
    481:
    482:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
    483:        jz      L(q1_ff)
    484:        movl    VAR_DST, %ecx
    485:
    486:        mull    %ebx               C (q1+1)*d
    487:
    488:        psrlq   %mm7, %mm0
    489:
    490:        leal    -4(%ecx), %ecx
    491:
    492:        C
    493:
    494:        subl    %eax, %esi
    495:        movl    VAR_DST_STOP, %eax
    496:
    497:        C
    498:
    499:        sbbl    %edx, %edi         C n - (q1+1)*d
    500:        movl    %esi, %edi         C remainder -> n2
    501:        leal    (%ebp,%esi), %edx
    502:
    503:        movd    %mm0, %esi
    504:
    505:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
    506:        sbbl    $0, %ebx           C q
    507:        cmpl    %eax, %ecx
    508:
    509:        movl    %ebx, (%ecx)
    510:        movl    %ecx, VAR_DST
    511:        jne     L(integer_top)
    512:
    513:
    514: L(integer_loop_done):
    515:
    516:
    517: C -----------------------------------------------------------------------------
    518: C
    519: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
    520: C q1_ff special case.  This makes the code a bit smaller and simpler, and
    521: C costs only 1 cycle (each).
    522:
    523: L(integer_two_left):
    524:        C eax   scratch
    525:        C ebx   scratch (nadj, q1)
    526:        C ecx   scratch (src, dst)
    527:        C edx   scratch
    528:        C esi   n10
    529:        C edi   n2
    530:        C ebp   divisor
    531:        C
    532:        C mm7   rshift
    533:
    534:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
    535:        movl    %edi, %eax         C n2
    536:        movl    PARAM_SRC, %ecx
    537:
    538:        leal    (%ebp,%esi), %ebx
    539:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
    540:        sbbl    $-1, %eax          C n2+n1
    541:
    542:        mull    VAR_INVERSE        C m*(n2+n1)
    543:
    544:        movd    (%ecx), %mm0       C src low limb
    545:
    546:        movl    VAR_DST_STOP, %ecx
    547:
    548:        C
    549:
    550:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara   551:  leal    1(%edi), %ebx      C n2+1
1.1     maekawa   552:  movl    %ebp, %eax         C d
    553:
    554:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
    555:
    556:        sbbl    $0, %ebx
    557:
    558:        mull    %ebx               C (q1+1)*d
    559:
    560:        psllq   $32, %mm0
    561:
    562:        psrlq   %mm7, %mm0
    563:
    564:        C
    565:
    566:        subl    %eax, %esi
    567:
    568:        C
    569:
    570:        sbbl    %edx, %edi         C n - (q1+1)*d
    571:        movl    %esi, %edi         C remainder -> n2
    572:        leal    (%ebp,%esi), %edx
    573:
    574:        movd    %mm0, %esi
    575:
    576:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
    577:        sbbl    $0, %ebx           C q
    578:
    579:        movl    %ebx, -4(%ecx)
    580:
    581:
    582: C -----------------------------------------------------------------------------
    583: L(integer_one_left):
    584:        C eax   scratch
    585:        C ebx   scratch (nadj, q1)
    586:        C ecx   dst
    587:        C edx   scratch
    588:        C esi   n10
    589:        C edi   n2
    590:        C ebp   divisor
    591:        C
    592:        C mm7   rshift
    593:
    594:        movl    VAR_DST_STOP, %ecx
    595:        cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
    596:        movl    %edi, %eax         C n2
    597:
    598:        leal    (%ebp,%esi), %ebx
    599:        cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
    600:        sbbl    $-1, %eax          C n2+n1
    601:
    602:        mull    VAR_INVERSE        C m*(n2+n1)
    603:
    604:        C
    605:
    606:        C
    607:
    608:        C
    609:
    610:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara   611:  leal    1(%edi), %ebx      C n2+1
1.1     maekawa   612:  movl    %ebp, %eax         C d
    613:
    614:        C
    615:
    616:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
    617:
    618:        sbbl    $0, %ebx           C q1 if q1+1 overflowed
    619:
    620:        mull    %ebx
    621:
    622:        C
    623:
    624:        C
    625:
    626:        C
    627:
    628:        subl    %eax, %esi
    629:
    630:        C
    631:
    632:        sbbl    %edx, %edi         C n - (q1+1)*d
    633:        movl    %esi, %edi         C remainder -> n2
    634:        leal    (%ebp,%esi), %edx
    635:
    636:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
    637:        sbbl    $0, %ebx           C q
    638:
    639:        movl    %ebx, -8(%ecx)
    640:        subl    $8, %ecx
    641:
    642:
    643:
    644: L(integer_none):
    645:        cmpl    $0, PARAM_XSIZE
    646:        jne     L(fraction_some)
    647:
    648:        movl    %edi, %eax
    649: L(fraction_done):
    650:        movl    VAR_NORM, %ecx
1.1.1.2 ! ohara   651: L(zero_done):
1.1     maekawa   652:  movl    SAVE_EBP, %ebp
    653:
    654:        movl    SAVE_EDI, %edi
    655:        movl    SAVE_ESI, %esi
    656:
    657:        movl    SAVE_EBX, %ebx
    658:        addl    $STACK_SPACE, %esp
    659:
C Un-normalize the remainder (undo the shll by VAR_NORM) before returning.
    660:        shrl    %cl, %eax
    661:        emms
    662:
    663:        ret
    664:
    665:
    666: C -----------------------------------------------------------------------------
    667: C
    668: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
    669: C of q*d is simply -d and the remainder n-q*d = n10+d
    670:
    671: L(q1_ff):
    672:        C eax   (divisor)
    673:        C ebx   (q1+1 == 0)
    674:        C ecx
    675:        C edx
    676:        C esi   n10
    677:        C edi   n2
    678:        C ebp   divisor
    679:
    680:        movl    VAR_DST, %ecx
    681:        movl    VAR_DST_STOP, %edx
    682:        subl    $4, %ecx
    683:
    684:        psrlq   %mm7, %mm0
    685:        leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
    686:        movl    %ecx, VAR_DST
    687:
    688:        movd    %mm0, %esi              C next n10
    689:
    690:        movl    $-1, (%ecx)
    691:        cmpl    %ecx, %edx
    692:        jne     L(integer_top)
    693:
    694:        jmp     L(integer_loop_done)
    695:
    696:
    697:
    698: C -----------------------------------------------------------------------------
    699: C
    700: C Being the fractional part, the "source" limbs are all zero, meaning
    701: C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
    702: C
    703: C The loop runs at 15 cycles.  The dependent chain is the same as the
    704: C general case above, but without the n2+n1 stage (due to n1==0), so 15
    705: C would seem to be the lower bound.
    706: C
    707: C A not entirely obvious simplification is that q1+1 never overflows a limb,
    708: C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
    709: C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
    710: C rnd() means rounding down to a multiple of d.
    711: C
    712: C      m*n2 + b*n2 <= m*(d-1) + b*(d-1)
    713: C                   = m*d + b*d - m - b
    714: C                   = floor((b(b-d)-1)/d)*d + b*d - m - b
    715: C                   = rnd(b(b-d)-1) + b*d - m - b
    716: C                   = rnd(b(b-d)-1 + b*d) - m - b
    717: C                   = rnd(b*b-1) - m - b
    718: C                   <= (b-2)*b
    719: C
    720: C Unchanged from the general case is that the final quotient limb q can be
    721: C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
    722: C equation 8.4 of the paper which simplifies as follows when n1==0 and
    723: C n0==0.
    724: C
    725: C      n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
    726: C
    727: C As before, the instruction groupings and empty comments show a naive
    728: C in-order view of the code, which is made a nonsense by out of order
    729: C execution.  There's 17 cycles shown, but it executes at 15.
    730: C
    731: C Rotating the store q and remainder->n2 instructions up to the top of the
    732: C loop gets the run time down from 16 to 15.
    733:
    734:        ALIGN(16)
    735: L(fraction_some):
    736:        C eax
    737:        C ebx
    738:        C ecx
    739:        C edx
    740:        C esi
    741:        C edi   carry
    742:        C ebp   divisor
    743:
    744:        movl    PARAM_DST, %esi
1.1.1.2 ! ohara   745:  movl    VAR_DST_STOP, %ecx      C &dst[xsize+2]
1.1     maekawa   746:  movl    %edi, %eax
    747:
1.1.1.2 ! ohara   748:  subl    $8, %ecx                C &dst[xsize]
1.1     maekawa   749:  jmp     L(fraction_entry)
    750:
    751:
    752:        ALIGN(16)
    753: L(fraction_top):
    754:        C eax   n2 carry, then scratch
    755:        C ebx   scratch (nadj, q1)
    756:        C ecx   dst, decrementing
    757:        C edx   scratch
    758:        C esi   dst stop point
    759:        C edi   (will be n2)
    760:        C ebp   divisor
    761:
    762:        movl    %ebx, (%ecx)    C previous q
    763:        movl    %eax, %edi      C remainder->n2
    764:
    765: L(fraction_entry):
    766:        mull    VAR_INVERSE     C m*n2
    767:
    768:        movl    %ebp, %eax      C d
    769:        subl    $4, %ecx        C dst
    770:        leal    1(%edi), %ebx
    771:
    772:        C
    773:
    774:        C
    775:
    776:        C
    777:
    778:        C
    779:
    780:        addl    %edx, %ebx      C 1 + high(n2<<32 + m*n2) = q1+1
    781:
    782:        mull    %ebx            C (q1+1)*d
    783:
    784:        C
    785:
    786:        C
    787:
    788:        C
    789:
    790:        negl    %eax            C low of n - (q1+1)*d
    791:
    792:        C
    793:
    794:        sbbl    %edx, %edi      C high of n - (q1+1)*d, caring only about carry
    795:        leal    (%ebp,%eax), %edx
    796:
    797:        cmovc(  %edx, %eax)     C n - q1*d if underflow from using q1+1
    798:        sbbl    $0, %ebx        C q
    799:        cmpl    %esi, %ecx
    800:
    801:        jne     L(fraction_top)
    802:
    803:
    804:        movl    %ebx, (%ecx)
    805:        jmp     L(fraction_done)
    806:
    807: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>