OpenXM_contrib/gmp/mpn/x86/p6/mmx/divrem_1.asm - annotate

Return to divrem_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / p6 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/p6/mmx/divrem_1.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
        !            26:
        !            27:
1.1       maekawa    28: C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
                     29: C                         mp_srcptr src, mp_size_t size,
                     30: C                         mp_limb_t divisor);
                     31: C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
                     32: C                          mp_srcptr src, mp_size_t size,
                     33: C                          mp_limb_t divisor, mp_limb_t carry);
1.1.1.2 ! ohara      34: C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
        !            35: C                                mp_srcptr src, mp_size_t size,
        !            36: C                                mp_limb_t divisor, mp_limb_t inverse,
        !            37: C                                unsigned shift);
1.1       maekawa    38: C
                     39: C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
1.1.1.2 ! ohara      40: C see that file for some comments.  It's possible what's here can be improved.
1.1       maekawa    41:
                     42:
                     43: dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
                     44: dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
                     45: dnl
                     46: dnl  The different speeds of the integer and fraction parts mean that using
                     47: dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
                     48: dnl  for the integer part and a bit lower for the fraction part.  (Or what's
                     49: dnl  really wanted is to speed up the integer part!)
                     50: dnl
                     51: dnl  The threshold is set to make the integer part right.  At 4 limbs the
                     52: dnl  div and mul are about the same there, but on the fractional part the
                     53: dnl  mul is much faster.
                     54:
                     55: deflit(MUL_THRESHOLD, 4)
                     56:
                     57: dnl  Stack frame: incoming parameters, register save slots and local variables.
1.1.1.2 ! ohara      58: defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
        !            59: defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
        !            60: defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
1.1       maekawa    61: defframe(PARAM_DIVISOR,20)
                     62: defframe(PARAM_SIZE,   16)
                     63: defframe(PARAM_SRC,    12)
                     64: defframe(PARAM_XSIZE,  8)
                     65: defframe(PARAM_DST,    4)
                     66: dnl  Save slots for the registers stored on entry and reloaded before ret.
                     67: defframe(SAVE_EBX,    -4)
                     68: defframe(SAVE_ESI,    -8)
                     69: defframe(SAVE_EDI,    -12)
                     70: defframe(SAVE_EBP,    -16)
                     71: dnl  Local variables used by the multiply-by-inverse code.
                     72: defframe(VAR_NORM,    -20)
                     73: defframe(VAR_INVERSE, -24)
                     74: defframe(VAR_SRC,     -28)
                     75: defframe(VAR_DST,     -32)
                     76: defframe(VAR_DST_STOP,-36)
                     77: dnl  Bytes subtracted from esp on entry; the slots above use offsets -4 to -36.
                     78: deflit(STACK_SPACE, 36)
                     79:
1.1.1.2 ! ohara      80:        TEXT
        !            81:        ALIGN(16)
        !            82: C Entry taking the inverse and shift as arguments; joins L(start_preinv).
        !            83: PROLOGUE(mpn_preinv_divrem_1)
        !            84: deflit(`FRAME',0)
        !            85:        movl    PARAM_XSIZE, %ecx       C xsize
        !            86:        subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
        !            87:
        !            88:        movl    %esi, SAVE_ESI
        !            89:        movl    PARAM_SRC, %esi         C src
        !            90:
        !            91:        movl    %ebx, SAVE_EBX
        !            92:        movl    PARAM_SIZE, %ebx        C size
        !            93:
        !            94:        movl    %ebp, SAVE_EBP
        !            95:        movl    PARAM_DIVISOR, %ebp     C divisor
        !            96:
        !            97:        movl    %edi, SAVE_EDI
        !            98:        movl    PARAM_DST, %edx         C dst
        !            99:
        !           100:        movl    -4(%esi,%ebx,4), %eax   C src high limb
        !           101:        xorl    %edi, %edi              C initial carry (if can't skip a div)
        !           102:
        !           103:        C
        !           104:
        !           105:        leal    8(%edx,%ecx,4), %edx    C &dst[xsize+2]
        !           106:        xor     %ecx, %ecx              C 0, kept as dst high limb if a div is skipped
        !           107:
        !           108:        movl    %edx, VAR_DST_STOP      C &dst[xsize+2]
        !           109:        cmpl    %ebp, %eax              C high cmp divisor
        !           110:
        !           111:        cmovc(  %eax, %edi)             C high is carry if high<divisor
        !           112:
        !           113:        cmovnc( %eax, %ecx)             C 0 if skip div, src high if not
        !           114:                                        C (the latter in case src==dst)
        !           115:
        !           116:        movl    %ecx, -12(%edx,%ebx,4)  C dst high limb
        !           117:
        !           118:        sbbl    $0, %ebx                C skip one division if high<divisor
        !           119:        movl    PARAM_PREINV_SHIFT, %ecx   C shift
        !           120:
        !           121:        leal    -8(%edx,%ebx,4), %edx   C &dst[xsize+size]
        !           122:        movl    $32, %eax
        !           123:
        !           124:        movl    %edx, VAR_DST           C &dst[xsize+size]
        !           125:
        !           126:        shll    %cl, %ebp               C d normalized
        !           127:        subl    %ecx, %eax              C 32-shift
        !           128:        movl    %ecx, VAR_NORM          C shift, reloaded at fraction_done for the final shrl
        !           129:
        !           130:        movd    %eax, %mm7              C rshift
        !           131:        movl    PARAM_PREINV_INVERSE, %eax   C inverse, wanted in eax at start_preinv
        !           132:        jmp     L(start_preinv)
        !           133:
        !           134: EPILOGUE()
        !           135:
        !           136:
        !           137:
1.1       maekawa   138:        ALIGN(16)
                    139: C Entry taking an initial carry; loads the L(start_1c) registers and jumps there.
                    140: PROLOGUE(mpn_divrem_1c)
                    141: deflit(`FRAME',0)
                    142:        movl    PARAM_CARRY, %edx       C carry
                    143:
                    144:        movl    PARAM_SIZE, %ecx        C size
                    145:        subl    $STACK_SPACE, %esp
                    146: deflit(`FRAME',STACK_SPACE)
                    147:
                    148:        movl    %ebx, SAVE_EBX
                    149:        movl    PARAM_XSIZE, %ebx       C xsize
                    150:
                    151:        movl    %edi, SAVE_EDI
                    152:        movl    PARAM_DST, %edi         C dst
                    153:
                    154:        movl    %ebp, SAVE_EBP
                    155:        movl    PARAM_DIVISOR, %ebp     C divisor
                    156:
                    157:        movl    %esi, SAVE_ESI
                    158:        movl    PARAM_SRC, %esi         C src
                    159:
                    160:        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
1.1.1.2 ! ohara     161:        jmp     L(start_1c)             C no skip-a-division test on this path
1.1       maekawa   162:
                    163: EPILOGUE()
                    164:
                    165:
                    166:        C offset 0x31, close enough to aligned
                    167: PROLOGUE(mpn_divrem_1)
                    168: deflit(`FRAME',0)
                    169: C Entry with zero initial carry; skips one division when src high limb < divisor.
                    170:        movl    PARAM_SIZE, %ecx        C size
                    171:        movl    $0, %edx                C initial carry (if can't skip a div)
                    172:        subl    $STACK_SPACE, %esp
                    173: deflit(`FRAME',STACK_SPACE)
                    174:
                    175:        movl    %ebp, SAVE_EBP
                    176:        movl    PARAM_DIVISOR, %ebp     C divisor
                    177:
                    178:        movl    %ebx, SAVE_EBX
                    179:        movl    PARAM_XSIZE, %ebx       C xsize
                    180:
                    181:        movl    %esi, SAVE_ESI
                    182:        movl    PARAM_SRC, %esi         C src
1.1.1.2 ! ohara     183:        orl     %ecx, %ecx              C size
1.1       maekawa   184:
                    185:        movl    %edi, SAVE_EDI
                    186:        movl    PARAM_DST, %edi         C dst
                    187:
                    188:        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
1.1.1.2 ! ohara     189:        jz      L(no_skip_div)          C if size==0
1.1       maekawa   190:
                    191:        movl    -4(%esi,%ecx,4), %eax   C src high limb
1.1.1.2 ! ohara     192:        xorl    %esi, %esi              C 0, esi used as scratch until the reload below
        !           193:        cmpl    %ebp, %eax              C high cmp divisor
1.1       maekawa   194:
1.1.1.2 ! ohara     195:        cmovc(  %eax, %edx)             C high is carry if high<divisor
        !           196:
        !           197:        cmovnc( %eax, %esi)             C 0 if skip div, src high if not
        !           198:                                        C (the latter in case src==dst)
        !           199:
        !           200:        movl    %esi, (%edi,%ecx,4)     C dst high limb
        !           201:
        !           202:        sbbl    $0, %ecx                C size-1 if high<divisor
        !           203:        movl    PARAM_SRC, %esi         C reload
1.1       maekawa   204: L(no_skip_div):
                    205:
                    206:
                    207: L(start_1c):
                    208:        C eax
                    209:        C ebx   xsize
                    210:        C ecx   size
                    211:        C edx   carry
                    212:        C esi   src
                    213:        C edi   &dst[xsize-1]
                    214:        C ebp   divisor
                    215: C Choose between plain divl and multiply-by-inverse according to size+xsize.
                    216:        leal    (%ebx,%ecx), %eax       C size+xsize
                    217:        cmpl    $MUL_THRESHOLD, %eax
                    218:        jae     L(mul_by_inverse)       C at or above the threshold
                    219:
                    220:        orl     %ecx, %ecx              C size
                    221:        jz      L(divide_no_integer)    C if size==0
                    222:
                    223: L(divide_integer):
                    224:        C eax   scratch (quotient)
                    225:        C ebx   xsize
                    226:        C ecx   counter
                    227:        C edx   scratch (remainder)
                    228:        C esi   src
                    229:        C edi   &dst[xsize-1]
                    230:        C ebp   divisor
                    231:
                    232:        movl    -4(%esi,%ecx,4), %eax   C src[counter-1], working from the high limb down
                    233:
                    234:        divl    %ebp                    C edx:eax / divisor, remainder left in edx
                    235:
                    236:        movl    %eax, (%edi,%ecx,4)     C quotient to dst[xsize+counter-1]
                    237:        decl    %ecx
                    238:        jnz     L(divide_integer)
                    239:
                    240:
                    241: L(divide_no_integer):
                    242:        movl    PARAM_DST, %edi         C dst
                    243:        orl     %ebx, %ebx              C xsize
                    244:        jnz     L(divide_fraction)      C if xsize!=0
                    245:
                    246: L(divide_done):
                    247:        movl    SAVE_ESI, %esi
                    248:
                    249:        movl    SAVE_EDI, %edi
                    250:
                    251:        movl    SAVE_EBX, %ebx
                    252:        movl    %edx, %eax              C remainder is the return value
                    253:
                    254:        movl    SAVE_EBP, %ebp
                    255:        addl    $STACK_SPACE, %esp
                    256:
                    257:        ret
                    258:
                    259:
                    260: L(divide_fraction):
                    261:        C eax   scratch (quotient)
                    262:        C ebx   counter
                    263:        C ecx
                    264:        C edx   scratch (remainder)
                    265:        C esi
                    266:        C edi   dst
                    267:        C ebp   divisor
                    268:
                    269:        movl    $0, %eax                C low dividend limb is zero for each fraction limb
                    270:
                    271:        divl    %ebp                    C edx:eax / divisor, remainder left in edx
                    272:
                    273:        movl    %eax, -4(%edi,%ebx,4)   C quotient to dst[counter-1]
                    274:        decl    %ebx
                    275:        jnz     L(divide_fraction)
                    276:
                    277:        jmp     L(divide_done)
                    278:
                    279:
                    280:
                    281: C -----------------------------------------------------------------------------
                    282:
                    283: L(mul_by_inverse):
                    284:        C eax
                    285:        C ebx   xsize
                    286:        C ecx   size
                    287:        C edx   carry
                    288:        C esi   src
                    289:        C edi   &dst[xsize-1]
                    290:        C ebp   divisor
                    291: C Set up dst pointers, normalize d and compute its inverse, then fall into start_preinv.
1.1.1.2 ! ohara     292:        leal    12(%edi), %ebx          C &dst[xsize+2], loop dst stop
1.1       maekawa   293:
                    294:        movl    %ebx, VAR_DST_STOP
                    295:        leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]
                    296:
                    297:        movl    %edi, VAR_DST
                    298:        movl    %ecx, %ebx              C size
                    299:
                    300:        bsrl    %ebp, %ecx              C 31-l
                    301:        movl    %edx, %edi              C carry
                    302:
                    303:        leal    1(%ecx), %eax           C 32-l
                    304:        xorl    $31, %ecx               C l
                    305:
                    306:        movl    %ecx, VAR_NORM          C l, reloaded at fraction_done for the final shrl
                    307:        movl    $-1, %edx
                    308:
                    309:        shll    %cl, %ebp               C d normalized
                    310:        movd    %eax, %mm7              C rshift = 32-l
                    311:
                    312:        movl    $-1, %eax               C low limb of the dividend b*(b-d)-1
                    313:        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1
                    314:
                    315:        divl    %ebp                    C floor (b*(b-d)-1) / d
                    316:
1.1.1.2 ! ohara     317: L(start_preinv):
        !           318:        C eax   inverse
        !           319:        C ebx   size
        !           320:        C ecx   shift
        !           321:        C edx
        !           322:        C esi   src
        !           323:        C edi   carry
        !           324:        C ebp   divisor
        !           325:        C
        !           326:        C mm7   rshift
        !           327: C Save the inverse, then dispatch on size being 0, 1, 2, or more.
1.1       maekawa   328:        movl    %eax, VAR_INVERSE
                    329:        orl     %ebx, %ebx              C size
                    330:        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
                    331:
                    332:        movl    %eax, VAR_SRC
                    333:        jz      L(start_zero)           C if size==0, flags still from the orl above
                    334:
                    335:        movl    8(%eax), %esi           C src high limb
                    336:        cmpl    $1, %ebx
                    337:        jz      L(start_one)            C if size==1
                    338:
                    339: L(start_two_or_more):
                    340:        movl    4(%eax), %edx           C src second highest limb
                    341:
                    342:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
                    343:
                    344:        shldl(  %cl, %edx, %esi)        C n10 = high,second << l
                    345:
                    346:        cmpl    $2, %ebx
                    347:        je      L(integer_two_left)     C if size==2, bypass the loop
                    348:        jmp     L(integer_top)
                    349:
                    350:
                    351: L(start_one):
                    352:        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
                    353:
                    354:        shll    %cl, %esi               C n10 = high << l
                    355:        jmp     L(integer_one_left)
                    356:
                    357:
                    358: L(start_zero):
1.1.1.2 ! ohara     359:        C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
        !           360:        C skipped a division.
        !           361:
1.1       maekawa   362:        shll    %cl, %edi               C n2 = carry << l
1.1.1.2 ! ohara     363:        movl    %edi, %eax              C return value for zero_done
        !           364:        cmpl    $0, PARAM_XSIZE
1.1       maekawa   365:
1.1.1.2 ! ohara     366:        je      L(zero_done)            C if xsize==0, ecx still holds the shift
1.1       maekawa   367:        jmp     L(fraction_some)
                    368:
                    369:
                    370:
                    371: C -----------------------------------------------------------------------------
                    372: C
                    373: C This loop runs at about 25 cycles, which is probably sub-optimal, and
                    374: C certainly more than the dependent chain would suggest.  A better loop, or
                    375: C a better rough analysis of what's possible, would be welcomed.
                    376: C
                    377: C In the current implementation, the following successively dependent
                    378: C micro-ops seem to exist.
                    379: C
                    380: C                     uops
                    381: C              n2+n1   1   (addl)
                    382: C              mul     5
                    383: C              q1+1    3   (addl/adcl)
                    384: C              mul     5
                    385: C              sub     3   (subl/sbbl)
                    386: C              addback 2   (cmov)
                    387: C                     ---
                    388: C                     19
                    389: C
                    390: C Lack of registers hinders explicit scheduling and it might be that the
                    391: C normal out of order execution isn't able to hide enough under the mul
                    392: C latencies.
                    393: C
                    394: C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
                    395: C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
                    396: C combination was tried for the addback (despite the fact it would lengthen
                    397: C the dependent chain) but found to be no faster.
                    398:
                    399:
                    400:        ALIGN(16)
                    401: L(integer_top):
                    402:        C eax   scratch
                    403:        C ebx   scratch (nadj, q1)
                    404:        C ecx   scratch (src, dst)
                    405:        C edx   scratch
                    406:        C esi   n10
                    407:        C edi   n2
                    408:        C ebp   d
                    409:        C
                    410:        C mm0   scratch (src qword)
                    411:        C mm7   rshift for normalization
                    412: C One quotient limb is stored per pass, until VAR_DST equals VAR_DST_STOP.
                    413:        movl    %esi, %eax         C n10
                    414:        movl    %ebp, %ebx         C d
                    415:
                    416:        sarl    $31, %eax          C -n1
                    417:        movl    VAR_SRC, %ecx      C src pointer
                    418:
                    419:        andl    %eax, %ebx         C -n1 & d
                    420:        negl    %eax               C n1
                    421:
                    422:        addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
                    423:        addl    %edi, %eax         C n2+n1
                    424:        movq    (%ecx), %mm0       C next src limb and the one below it
                    425:
                    426:        mull    VAR_INVERSE        C m*(n2+n1)
                    427:
                    428:        subl    $4, %ecx           C src pointer down one limb
                    429:
                    430:        movl    %ecx, VAR_SRC
                    431:
                    432:        C
                    433:
                    434:        C
                    435:
                    436:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
                    437:        movl    %ebp, %eax         C d
1.1.1.2 ! ohara     438:        leal    1(%edi), %ebx      C n2+1
1.1       maekawa   439:
                    440:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
                    441:        jz      L(q1_ff)           C q1+1 wrapped to 0, handled separately
                    442:
                    443:        mull    %ebx               C (q1+1)*d
                    444:
                    445:        movl    VAR_DST, %ecx      C dst pointer
                    446:        psrlq   %mm7, %mm0         C shift the src qword right by rshift
                    447:
                    448:        C
                    449:
                    450:        C
                    451:
                    452:        C
                    453:
                    454:        subl    %eax, %esi         C low limb of n - (q1+1)*d
                    455:        movl    VAR_DST_STOP, %eax   C dst stop
                    456:
                    457:        sbbl    %edx, %edi         C n - (q1+1)*d
                    458:        movl    %esi, %edi         C remainder -> n2
                    459:        leal    (%ebp,%esi), %edx   C remainder + d, for the addback
                    460:
                    461:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
                    462:        movd    %mm0, %esi         C next n10
                    463:
                    464:        sbbl    $0, %ebx           C q
                    465:        subl    $4, %ecx           C dst pointer down one limb
                    466:
                    467:        movl    %ebx, (%ecx)       C store q
                    468:        cmpl    %eax, %ecx         C dst pointer against dst stop
                    469:
                    470:        movl    %ecx, VAR_DST
                    471:        jne     L(integer_top)
                    472:
                    473:
                    474: L(integer_loop_done):
                    475:
                    476:
                    477: C -----------------------------------------------------------------------------
                    478: C
                    479: C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
                    480: C q1_ff special case.  This makes the code a bit smaller and simpler, and
                    481: C costs only 2 cycles (each).
                    482:
                    483: L(integer_two_left):
                    484:        C eax   scratch
                    485:        C ebx   scratch (nadj, q1)
                    486:        C ecx   scratch (src, dst)
                    487:        C edx   scratch
                    488:        C esi   n10
                    489:        C edi   n2
                    490:        C ebp   divisor
                    491:        C
                    492:        C mm7   rshift
                    493: C Second-last integer limb: only src low limb is left to fetch, read with a movd.
                    494:
                    495:        movl    %esi, %eax         C n10
                    496:        movl    %ebp, %ebx         C d
                    497:
                    498:        sarl    $31, %eax          C -n1
                    499:        movl    PARAM_SRC, %ecx    C src
                    500:
                    501:        andl    %eax, %ebx         C -n1 & d
                    502:        negl    %eax               C n1
                    503:
                    504:        addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
                    505:        addl    %edi, %eax         C n2+n1
                    506:
                    507:        mull    VAR_INVERSE        C m*(n2+n1)
                    508:
                    509:        movd    (%ecx), %mm0       C src low limb
                    510:
                    511:        movl    VAR_DST_STOP, %ecx   C &dst[xsize+2]
                    512:
                    513:        C
                    514:
                    515:        C
                    516:
                    517:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara     518:        leal    1(%edi), %ebx      C n2+1
1.1       maekawa   519:        movl    %ebp, %eax         C d
                    520:
                    521:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
                    522:
                    523:        sbbl    $0, %ebx           C q1 if q1+1 overflowed
                    524:
                    525:        mull    %ebx               C (q1+1)*d
                    526:
                    527:        psllq   $32, %mm0          C src low limb up into the high dword
                    528:
                    529:        psrlq   %mm7, %mm0         C then right by rshift
                    530:
                    531:        C
                    532:
                    533:        C
                    534:
                    535:        subl    %eax, %esi         C low limb of n - (q1+1)*d
                    536:
                    537:        sbbl    %edx, %edi         C n - (q1+1)*d
                    538:        movl    %esi, %edi         C remainder -> n2
                    539:        leal    (%ebp,%esi), %edx   C remainder + d, for the addback
                    540:
                    541:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
                    542:        movd    %mm0, %esi         C next n10
                    543:
                    544:        sbbl    $0, %ebx           C q
                    545:
                    546:        movl    %ebx, -4(%ecx)     C store q at dst[xsize+1]
                    547:
                    548:
                    549: C -----------------------------------------------------------------------------
                    550: L(integer_one_left):
                    551:        C eax   scratch
                    552:        C ebx   scratch (nadj, q1)
                    553:        C ecx   scratch (dst)
                    554:        C edx   scratch
                    555:        C esi   n10
                    556:        C edi   n2
                    557:        C ebp   divisor
                    558:        C
                    559:        C mm7   rshift
                    560: C Last integer limb, then either on to the fraction limbs or out through the exit.
                    561:
                    562:        movl    %esi, %eax         C n10
                    563:        movl    %ebp, %ebx         C d
                    564:
                    565:        sarl    $31, %eax          C -n1
                    566:        movl    VAR_DST_STOP, %ecx   C &dst[xsize+2]
                    567:
                    568:        andl    %eax, %ebx         C -n1 & d
                    569:        negl    %eax               C n1
                    570:
                    571:        addl    %esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
                    572:        addl    %edi, %eax         C n2+n1
                    573:
                    574:        mull    VAR_INVERSE        C m*(n2+n1)
                    575:
                    576:        C
                    577:
                    578:        C
                    579:
                    580:        C
                    581:
                    582:        addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
1.1.1.2 ! ohara     583:        leal    1(%edi), %ebx      C n2+1
1.1       maekawa   584:        movl    %ebp, %eax         C d
                    585:
                    586:        C
                    587:
                    588:        adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
                    589:
                    590:        sbbl    $0, %ebx           C q1 if q1+1 overflowed
                    591:
                    592:        mull    %ebx               C (q1+1)*d, or q1*d if the sbbl adjusted
                    593:
                    594:        C
                    595:
                    596:        C
                    597:
                    598:        C
                    599:
                    600:        C
                    601:
                    602:        subl    %eax, %esi         C low limb of n - (q1+1)*d
                    603:        movl    PARAM_XSIZE, %eax  C xsize
                    604:
                    605:        sbbl    %edx, %edi         C n - (q1+1)*d
                    606:        movl    %esi, %edi         C remainder -> n2
                    607:        leal    (%ebp,%esi), %edx   C remainder + d, for the addback
                    608:
                    609:        cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
                    610:
                    611:        sbbl    $0, %ebx           C q
                    612:
                    613:        movl    %ebx, -8(%ecx)     C store q at dst[xsize]
                    614:        subl    $8, %ecx           C &dst[xsize]
                    615:
                    616:
                    617:
                    618:        orl     %eax, %eax         C xsize
                    619:        jnz     L(fraction_some)   C if xsize!=0
                    620:
                    621:        movl    %edi, %eax         C remainder, still shifted left by l
                    622: L(fraction_done):
                    623:        movl    VAR_NORM, %ecx     C l
1.1.1.2 ! ohara     624: L(zero_done):
1.1       maekawa   625:        movl    SAVE_EBP, %ebp
                    626:
                    627:        movl    SAVE_EDI, %edi
                    628:
                    629:        movl    SAVE_ESI, %esi
                    630:
                    631:        movl    SAVE_EBX, %ebx
                    632:        addl    $STACK_SPACE, %esp
                    633:
                    634:        shrl    %cl, %eax          C shift the remainder back down for return
                    635:        emms                       C clear MMX state after the mm0/mm7 use
                    636:
                    637:        ret
                    638:
                    639:
                    640: C -----------------------------------------------------------------------------
                    641: C
                    642: C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
                    643: C of q*d is simply -d and the remainder n-q*d = n10+d
                    644:
                    645: L(q1_ff):
                    646:        C eax   (divisor)
                    647:        C ebx   (q1+1 == 0)
                    648:        C ecx   scratch (loaded with dst below)
                    649:        C edx   scratch (loaded with dst stop below)
                    650:        C esi   n10
                    651:        C edi   n2
                    652:        C ebp   divisor
                    653:
                    654:        movl    VAR_DST, %ecx           C ecx = current dst pointer
                    655:        movl    VAR_DST_STOP, %edx      C edx = stop pointer, compared with dst below
                    656:        subl    $4, %ecx                C step dst down by one 4-byte limb
                    657:
                    658:        movl    %ecx, VAR_DST           C write the stepped dst back
                    659:        psrlq   %mm7, %mm0              C shift mm0 right by mm7; its low dword is read as next n10 below
                    660:        leal    (%ebp,%esi), %edi       C n-q*d remainder = n10+d -> next n2
                    661:
                    662:        movl    $-1, (%ecx)             C store q = 0xFFFFFFFF at dst
                    663:        movd    %mm0, %esi              C next n10
                    664:
                    665:        cmpl    %ecx, %edx              C has dst reached the stop pointer
                    666:        jne     L(integer_top)          C no, rejoin the integer loop
                    667:
                    668:        jmp     L(integer_loop_done)    C yes, leave the integer loop
                    669:
                    670:
                    671:
                    672: C -----------------------------------------------------------------------------
                    673: C
                    674: C In the current implementation, the following successively dependent
                    675: C micro-ops seem to exist.
                    676: C
                    677: C                     uops
                    678: C              mul     5
                    679: C              q1+1    1   (addl)
                    680: C              mul     5
                    681: C              sub     3   (negl/sbbl)
                    682: C              addback 2   (cmov)
                    683: C                     ---
                    684: C                     16
                    685: C
                    686: C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
                    687: C the addback was found to be a touch slower.
                    688:
                    689:
                    690:        ALIGN(16)
                    691: L(fraction_some):         C set up registers for the fraction loop below
                    692:        C eax
                    693:        C ebx
                    694:        C ecx
                    695:        C edx
                    696:        C esi
                    697:        C edi   carry
                    698:        C ebp   divisor
                    699:
                    700:        movl    PARAM_DST, %esi         C esi = dst, the loop stop point
1.1.1.2 ! ohara     701:        movl    VAR_DST_STOP, %ecx      C &dst[xsize+2]
1.1       maekawa   702:        movl    %edi, %eax              C eax = n2, the carry in edi, for the first mull
                    703:
1.1.1.2 ! ohara     704:        subl    $8, %ecx                C &dst[xsize]
1.1       maekawa   705:
                    706:
                    707:        ALIGN(16)
                    708: L(fraction_top):          C one quotient limb per pass, stored downwards from ecx until ecx == esi
                    709:        C eax   n2, then scratch
                    710:        C ebx   scratch (nadj, q1)
                    711:        C ecx   dst, decrementing
                    712:        C edx   scratch
                    713:        C esi   dst stop point
                    714:        C edi   n2
                    715:        C ebp   divisor
                    716:
                    717:        mull    VAR_INVERSE     C m*n2
                    718:
                    719:        movl    %ebp, %eax      C d
                    720:        subl    $4, %ecx        C dst
                    721:        leal    1(%edi), %ebx   C n2+1
                    722:
                    723:        C
                    724:
                    725:        C
                    726:
                    727:        C
                    728:
                    729:        addl    %edx, %ebx      C 1 + high(n2<<32 + m*n2) = q1+1
                    730:
                    731:        mull    %ebx            C (q1+1)*d
                    732:
                    733:        C
                    734:
                    735:        C
                    736:
                    737:        C
                    738:
                    739:        C
                    740:
                    741:        negl    %eax            C low of n - (q1+1)*d
                    742:
                    743:        sbbl    %edx, %edi      C high of n - (q1+1)*d, caring only about carry
                    744:        leal    (%ebp,%eax), %edx       C candidate addback value, leal leaves the carry flag intact
                    745:
                    746:        cmovc(  %edx, %eax)     C n - q1*d if underflow from using q1+1
                    747:
                    748:        sbbl    $0, %ebx        C q
                    749:        movl    %eax, %edi      C remainder->n2
                    750:        cmpl    %esi, %ecx      C has dst reached the stop point
                    751:
                    752:        movl    %ebx, (%ecx)    C store q for this limb (ecx was stepped at the top of this pass)
                    753:        jne     L(fraction_top) C flags are from the cmpl, the movl store does not change them
                    754:
                    755:
                    756:        jmp     L(fraction_done)        C eax = final remainder
                    757:
                    758: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>