Annotation of OpenXM_contrib/gmp/mpn/x86/k6/sqr_basecase.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
2: dnl
3: dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
4: dnl product (measured on the speed difference between 17 and 33 limbs,
5: dnl which is roughly the Karatsuba recursing range).
6:
7:
8: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9: dnl
10: dnl This file is part of the GNU MP Library.
11: dnl
12: dnl The GNU MP Library is free software; you can redistribute it and/or
13: dnl modify it under the terms of the GNU Lesser General Public License as
14: dnl published by the Free Software Foundation; either version 2.1 of the
15: dnl License, or (at your option) any later version.
16: dnl
17: dnl The GNU MP Library is distributed in the hope that it will be useful,
18: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
19: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20: dnl Lesser General Public License for more details.
21: dnl
22: dnl You should have received a copy of the GNU Lesser General Public
23: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
24: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
25: dnl Suite 330, Boston, MA 02111-1307, USA.
26:
27:
28: include(`../config.m4')
 29:
 30:
 31: dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
 32: dnl code supports. This value is used only by the tune program to know
 33: dnl what it can go up to. (An attempt to compile with a bigger value will
 34: dnl trigger some m4_assert()s in the code, making the build fail.)
 35: dnl
 36: dnl The value is determined by requiring the displacements in the unrolled
 37: dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
 38: dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
 39:
 40: deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
 41:
 42:
 43: dnl Allow a value from the tune program to override config.m4.
 44:
 45: ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
 46: `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
 47:
 48:
 49: dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
 50: dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
 51: dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
 52: dnl
 53: dnl The first addmul is the biggest, and this takes the second least
 54: dnl significant limb and multiplies it by the third least significant and
 55: dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
 56: dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
dnl NOTE(review): m4_config_gmp_mparam() presumably fetches the CPU-specific
dnl threshold (e.g. from gmp-mparam.h) unless overridden above -- confirm
dnl against config.m4 in the build tree.
 57:
 58: m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
 59: deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
60:
61:
62: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
63: C
64: C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
65: C lot of function call overheads are avoided, especially when the given size
66: C is small.
67: C
68: C The code size might look a bit excessive, but not all of it is executed
69: C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
70: C clearly apply only to those sizes; mid sizes like 10x10 only need part of
71: C the unrolled addmul; and big sizes like 35x35 that do need all of it will
72: C at least be getting value for money, because 35x35 spends something like
73: C 5780 cycles here.
74: C
75: C Different values of UNROLL_COUNT give slightly different speeds, between
76: C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
77: C This isn't a big difference, but it's presumably some alignment effect
78: C which if understood could give a simple speedup.
79:
80: defframe(PARAM_SIZE,12)
 81: defframe(PARAM_SRC, 8)
 82: defframe(PARAM_DST, 4)
 83:
 84: .text
 85: ALIGN(32)
 86: PROLOGUE(mpn_sqr_basecase)
 87: deflit(`FRAME',0)
 88:
 89: movl PARAM_SIZE, %ecx
 90: movl PARAM_SRC, %eax
 91:
 92: cmpl $2, %ecx
 93: je L(two_limbs)
 94:
 95: movl PARAM_DST, %edx
 96: ja L(three_or_more)
 97:
 98:
 99: C -----------------------------------------------------------------------------
 100: C one limb only
 101: C eax src
 102: C ebx
 103: C ecx size
 104: C edx dst
 105:
 106: movl (%eax), %eax
 107: movl %edx, %ecx
 108:
 109: mull %eax
 110:
 111: movl %eax, (%ecx)
 112: movl %edx, 4(%ecx)
 113: ret
 114:
 115:
 116: C -----------------------------------------------------------------------------
 117: ALIGN(16)
 118: L(two_limbs):
 119: C eax src
 120: C ebx
 121: C ecx size
 122: C edx dst
 123:
 124: pushl %ebx
 125: movl %eax, %ebx C src
 126: deflit(`FRAME',4)
 127:
 128: movl (%ebx), %eax
 129: movl PARAM_DST, %ecx
 130:
 131: mull %eax C src[0]^2
 132:
 133: movl %eax, (%ecx)
 134: movl 4(%ebx), %eax
 135:
 136: movl %edx, 4(%ecx)
 137:
 138: mull %eax C src[1]^2
 139:
 140: movl %eax, 8(%ecx)
 141: movl (%ebx), %eax
 142:
 143: movl %edx, 12(%ecx)
 144: movl 4(%ebx), %edx
 145:
 146: mull %edx C src[0]*src[1]
 147:
C The doubling 2*src[0]*src[1] is done by adding the edx:eax product into
C dst twice (here and again below), rather than shifting it left first.
 148: addl %eax, 4(%ecx)
 149:
 150: adcl %edx, 8(%ecx)
 151: adcl $0, 12(%ecx)
 152:
 153: popl %ebx
 154: addl %eax, 4(%ecx)
 155:
 156: adcl %edx, 8(%ecx)
 157: adcl $0, 12(%ecx)
 158:
 159: ret
 160:
 161:
 162: C -----------------------------------------------------------------------------
 163: L(three_or_more):
 164: deflit(`FRAME',0)
 165: cmpl $4, %ecx
 166: jae L(four_or_more)
 167:
 168:
 169: C -----------------------------------------------------------------------------
 170: C three limbs
 171: C eax src
 172: C ecx size
 173: C edx dst
 174:
 175: pushl %ebx
 176: movl %eax, %ebx C src
 177:
 178: movl (%ebx), %eax
 179: movl %edx, %ecx C dst
 180:
 181: mull %eax C src[0] ^ 2
 182:
 183: movl %eax, (%ecx)
 184: movl 4(%ebx), %eax
 185:
 186: movl %edx, 4(%ecx)
 187: pushl %esi
 188:
 189: mull %eax C src[1] ^ 2
 190:
 191: movl %eax, 8(%ecx)
 192: movl 8(%ebx), %eax
 193:
 194: movl %edx, 12(%ecx)
 195: pushl %edi
 196:
 197: mull %eax C src[2] ^ 2
 198:
 199: movl %eax, 16(%ecx)
 200: movl (%ebx), %eax
 201:
 202: movl %edx, 20(%ecx)
 203: movl 4(%ebx), %edx
 204:
 205: mull %edx C src[0] * src[1]
 206:
 207: movl %eax, %esi
 208: movl (%ebx), %eax
 209:
 210: movl %edx, %edi
 211: movl 8(%ebx), %edx
 212:
 213: pushl %ebp
 214: xorl %ebp, %ebp
 215:
 216: mull %edx C src[0] * src[2]
 217:
 218: addl %eax, %edi
 219: movl 4(%ebx), %eax
 220:
 221: adcl %edx, %ebp
 222:
 223: movl 8(%ebx), %edx
 224:
 225: mull %edx C src[1] * src[2]
 226:
 227: addl %eax, %ebp
 228:
 229: adcl $0, %edx
 230:
 231:
 232: C eax will be dst[5]
 233: C ebx
 234: C ecx dst
 235: C edx dst[4]
 236: C esi dst[1]
 237: C edi dst[2]
 238: C ebp dst[3]
 239:
C Double the accumulated cross products: each register is added to itself
C with the carry chained upward, eax collecting the carry out of the top.
 240: xorl %eax, %eax
 241: addl %esi, %esi
 242: adcl %edi, %edi
 243: adcl %ebp, %ebp
 244: adcl %edx, %edx
 245: adcl $0, %eax
 246:
 247: addl %esi, 4(%ecx)
 248: adcl %edi, 8(%ecx)
 249: adcl %ebp, 12(%ecx)
 250:
 251: popl %ebp
 252: popl %edi
 253:
 254: adcl %edx, 16(%ecx)
 255:
 256: popl %esi
 257: popl %ebx
 258:
 259: adcl %eax, 20(%ecx)
 260: ASSERT(nc)
 261:
 262: ret
 263:
 264:
 265: C -----------------------------------------------------------------------------
 266:
 267: defframe(SAVE_EBX, -4)
 268: defframe(SAVE_ESI, -8)
 269: defframe(SAVE_EDI, -12)
 270: defframe(SAVE_EBP, -16)
 271: defframe(VAR_COUNTER,-20)
 272: defframe(VAR_JMP, -24)
 273: deflit(STACK_SPACE, 24)
 274:
 275: ALIGN(16)
 276: L(four_or_more):
 277:
 278: C eax src
 279: C ebx
 280: C ecx size
 281: C edx dst
 282: C esi
 283: C edi
 284: C ebp
 285:
 286: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 287: C
 288: C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
 289: C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
 290: C a 5780 cycle operation, which is not surprising since the loop here is 8
 291: C c/l and mpn_mul_1 is 6.25 c/l.
 292:
 293: subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
 294:
 295: movl %edi, SAVE_EDI
 296: leal 4(%edx), %edi
 297:
 298: movl %ebx, SAVE_EBX
 299: leal 4(%eax), %ebx
 300:
 301: movl %esi, SAVE_ESI
 302: xorl %esi, %esi
 303:
 304: movl %ebp, SAVE_EBP
 305:
 306: C eax
 307: C ebx src+4
 308: C ecx size
 309: C edx
 310: C esi
 311: C edi dst+4
 312: C ebp
 313:
 314: movl (%eax), %ebp C multiplier
 315: leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
 316:
 317:
 318: ALIGN(16)
 319: L(mul_1):
 320: C eax scratch
 321: C ebx src ptr
 322: C ecx counter
 323: C edx scratch
 324: C esi carry
 325: C edi dst ptr
 326: C ebp multiplier
 327:
 328: movl (%ebx), %eax
 329: addl $4, %ebx
 330:
 331: mull %ebp
 332:
 333: addl %esi, %eax
 334: movl $0, %esi
 335:
 336: adcl %edx, %esi
 337:
 338: movl %eax, (%edi)
 339: addl $4, %edi
 340:
C `loop' (dec %ecx / jnz) is deliberate here: it is not a penalty on K6,
C unlike on later Intel cores.
 341: loop L(mul_1)
 342:
 343:
 344: C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
 345: C
 346: C The last two addmuls, which are the bottom right corner of the product
 347: C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
 348: C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
 349: C cases that need to be done.
 350: C
 351: C The unrolled code is the same as mpn_addmul_1(), see that routine for some
 352: C comments.
 353: C
 354: C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
 355: C
 356: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
 357: C chunk each outer loop.
 358: C
 359: C K6 doesn't do any branch prediction on indirect jumps, which is good
 360: C actually because it's a different target each time. The unrolled addmul
 361: C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
 362: C the indirect jump is quickly recovered.
 363:
 364:
 365: dnl This value is also implicitly encoded in a shift and add.
 366: dnl
 367: deflit(CODE_BYTES_PER_LIMB, 15)
 368:
 369: dnl With the unmodified &src[size] and &dst[size] pointers, the
 370: dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
 371: dnl values up to 31. Above that an offset must be added to them.
 372: dnl
 373: deflit(OFFSET,
 374: ifelse(eval(UNROLL_COUNT>31),1,
 375: eval((UNROLL_COUNT-31)*4),
 376: 0))
 377:
 378: C eax
 379: C ebx &src[size]
 380: C ecx
 381: C edx
 382: C esi carry
 383: C edi &dst[size]
 384: C ebp
 385:
 386: movl PARAM_SIZE, %ecx
 387: movl %esi, (%edi)
 388:
 389: subl $4, %ecx
 390: jz L(corner)
 391:
 392: movl %ecx, %edx
 393: ifelse(OFFSET,0,,
 394: ` subl $OFFSET, %ebx')
 395:
 396: shll $4, %ecx
 397: ifelse(OFFSET,0,,
 398: ` subl $OFFSET, %edi')
 399:
 400: negl %ecx
 401:
 402: ifdef(`PIC',`
 403: call L(pic_calc)
 404: L(here):
 405: ',`
 406: leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
 407: ')
 408: negl %edx
 409:
 410:
 411: C The calculated jump mustn't be before the start of the available
 412: C code. This is the limitation UNROLL_COUNT puts on the src operand
 413: C size, but checked here using the jump address directly.
 414: C
 415: ASSERT(ae,`
 416: movl_text_address( L(unroll_inner_start), %eax)
 417: cmpl %eax, %ecx
 418: ')
 419:
 420:
 421: C -----------------------------------------------------------------------------
 422: ALIGN(16)
 423: L(unroll_outer_top):
 424: C eax
 425: C ebx &src[size], constant
 426: C ecx VAR_JMP
 427: C edx VAR_COUNTER, limbs, negative
 428: C esi high limb to store
 429: C edi dst ptr, high of last addmul
 430: C ebp
 431:
 432: movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
 433: movl %edx, VAR_COUNTER
 434:
 435: movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
 436:
 437: mull %ebp
 438:
 439: testb $1, %cl
 440:
 441: movl %edx, %esi C high carry
 442: movl %ecx, %edx C jump
 443:
 444: movl %eax, %ecx C low carry
 445: leal CODE_BYTES_PER_LIMB(%edx), %edx
 446:
 447: movl %edx, VAR_JMP
 448: leal 4(%edi), %edi
 449:
 450: C A branch-free version of this using some xors was found to be a
 451: C touch slower than just a conditional jump, despite the jump
 452: C switching between taken and not taken on every loop.
 453:
 454: ifelse(eval(UNROLL_COUNT%2),0,
 455: jz,jnz) L(unroll_noswap)
 456: movl %esi, %eax C high,low carry other way around
 457:
 458: movl %ecx, %esi
 459: movl %eax, %ecx
 460: L(unroll_noswap):
 461:
 462: jmp *%edx
 463:
 464:
 465: C Must be on an even address here so the low bit of the jump address
 466: C will indicate which way around ecx/esi should start.
 467: C
 468: C An attempt was made at padding here to get the end of the unrolled
 469: C code to come out on a good alignment, to save padding before
 470: C L(corner). This worked, but turned out to run slower than just an
 471: C ALIGN(2). The reason for this is not clear, it might be related
 472: C to the different speeds on different UNROLL_COUNTs noted above.
 473:
 474: ALIGN(2)
 475:
 476: L(unroll_inner_start):
 477: C eax scratch
 478: C ebx src
 479: C ecx carry low
 480: C edx scratch
 481: C esi carry high
 482: C edi dst
 483: C ebp multiplier
 484: C
 485: C 15 code bytes each limb
 486: C ecx/esi swapped on each chunk
 487:
C NOTE(review): each forloop chunk must assemble to exactly
C CODE_BYTES_PER_LIMB (15) bytes -- the computed jump above depends on it.
C Zdisp presumably forces a zero displacement byte to keep the size fixed;
C confirm against x86-defs.m4 before changing any instruction here.
 488: forloop(`i', UNROLL_COUNT, 1, `
 489: deflit(`disp_src', eval(-i*4 + OFFSET))
 490: deflit(`disp_dst', eval(disp_src - 4))
 491:
 492: m4_assert(`disp_src>=-128 && disp_src<128')
 493: m4_assert(`disp_dst>=-128 && disp_dst<128')
 494:
 495: ifelse(eval(i%2),0,`
 496: Zdisp( movl, disp_src,(%ebx), %eax)
 497: mull %ebp
 498: Zdisp( addl, %esi, disp_dst,(%edi))
 499: adcl %eax, %ecx
 500: movl %edx, %esi
 501: jadcl0( %esi)
 502: ',`
 503: dnl this one comes out last
 504: Zdisp( movl, disp_src,(%ebx), %eax)
 505: mull %ebp
 506: Zdisp( addl, %ecx, disp_dst,(%edi))
 507: adcl %eax, %esi
 508: movl %edx, %ecx
 509: jadcl0( %ecx)
 510: ')
 511: ')
 512: L(unroll_inner_end):
 513:
 514: addl %esi, -4+OFFSET(%edi)
 515:
 516: movl VAR_COUNTER, %edx
 517: jadcl0( %ecx)
 518:
 519: movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
 520: movl VAR_JMP, %ecx
 521:
 522: incl %edx
 523: jnz L(unroll_outer_top)
 524:
 525:
 526: ifelse(OFFSET,0,,`
 527: addl $OFFSET, %ebx
 528: addl $OFFSET, %edi
 529: ')
 530:
 531:
 532: C -----------------------------------------------------------------------------
 533: ALIGN(16)
 534: L(corner):
 535: C ebx &src[size]
 536: C edi &dst[2*size-5]
 537:
C The two corner addmuls: src[size-3]*src[size-2,size-1] and
C src[size-2]*src[size-1], done inline rather than via the unrolled code.
 538: movl -12(%ebx), %ebp
 539:
 540: movl -8(%ebx), %eax
 541: movl %eax, %ecx
 542:
 543: mull %ebp
 544:
 545: addl %eax, -4(%edi)
 546: adcl $0, %edx
 547:
 548: movl -4(%ebx), %eax
 549: movl %edx, %esi
 550: movl %eax, %ebx
 551:
 552: mull %ebp
 553:
 554: addl %esi, %eax
 555: adcl $0, %edx
 556:
 557: addl %eax, (%edi)
 558: adcl $0, %edx
 559:
 560: movl %edx, %esi
 561: movl %ebx, %eax
 562:
 563: mull %ecx
 564:
 565: addl %esi, %eax
 566: movl %eax, 4(%edi)
 567:
 568: adcl $0, %edx
 569:
 570: movl %edx, 8(%edi)
 571:
 572:
 573: C -----------------------------------------------------------------------------
 574: C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
 575: C The loop measures about 6 cycles/iteration, though it looks like it should
 576: C decode in 5.
 577:
 578: L(lshift_start):
 579: movl PARAM_SIZE, %ecx
 580:
 581: movl PARAM_DST, %edi
 582: subl $1, %ecx C size-1 and clear carry
 583:
 584: movl PARAM_SRC, %ebx
 585: movl %ecx, %edx
 586:
 587: xorl %eax, %eax C ready for adcl
 588:
 589:
 590: ALIGN(16)
 591: L(lshift):
 592: C eax
 593: C ebx src (for later use)
 594: C ecx counter, decrementing
 595: C edx size-1 (for later use)
 596: C esi
 597: C edi dst, incrementing
 598: C ebp
 599:
C rcll shifts the limb left one bit through the carry flag, so the carry
C chains across limbs; two limbs per iteration, size-1 iterations.
 600: rcll 4(%edi)
 601: rcll 8(%edi)
 602: leal 8(%edi), %edi
 603: loop L(lshift)
 604:
 605:
 606: adcl %eax, %eax
 607:
 608: movl %eax, 4(%edi) C dst most significant limb
 609: movl (%ebx), %eax C src[0]
 610:
 611: leal 4(%ebx,%edx,4), %ebx C &src[size]
 612: subl %edx, %ecx C -(size-1)
 613:
 614:
 615: C -----------------------------------------------------------------------------
 616: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
 617: C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
 618: C low limb of src[0]^2.
 619:
 620:
 621: mull %eax
 622:
 623: movl %eax, (%edi,%ecx,8) C dst[0]
 624:
 625:
 626: ALIGN(16)
 627: L(diag):
 628: C eax scratch
 629: C ebx &src[size]
 630: C ecx counter, negative
 631: C edx carry
 632: C esi scratch
 633: C edi dst[2*size-2]
 634: C ebp
 635:
 636: movl (%ebx,%ecx,4), %eax
 637: movl %edx, %esi
 638:
 639: mull %eax
 640:
 641: addl %esi, 4(%edi,%ecx,8)
 642: adcl %eax, 8(%edi,%ecx,8)
 643: adcl $0, %edx
 644:
 645: incl %ecx
 646: jnz L(diag)
 647:
 648:
 649: movl SAVE_EBX, %ebx
 650: movl SAVE_ESI, %esi
 651:
 652: addl %edx, 4(%edi) C dst most significant limb
 653:
 654: movl SAVE_EDI, %edi
 655: movl SAVE_EBP, %ebp
 656: addl $FRAME, %esp
 657: ret
 658:
 659:
 660:
 661: C -----------------------------------------------------------------------------
 662: ifdef(`PIC',`
 663: L(pic_calc):
 664: C See README.family about old gas bugs
 665: addl (%esp), %ecx
 666: addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
 667: addl %edx, %ecx
 668: ret
 669: ')
 670:
 671:
 672: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>