[BACK]Return to sqr_basecase.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/sqr_basecase.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
        !            26: C     product (measured on the speed difference between 17 and 33 limbs,
        !            27: C     which is roughly the Karatsuba recursing range).
        !            28:
        !            29:
        !            30: dnl  SQR_KARATSUBA_THRESHOLD_MAX is the maximum SQR_KARATSUBA_THRESHOLD this
1.1       maekawa    31: dnl  code supports.  This value is used only by the tune program to know
                     32: dnl  what it can go up to.  (An attempt to compile with a bigger value will
                     33: dnl  trigger some m4_assert()s in the code, making the build fail.)
                     34: dnl
                     35: dnl  The value is determined by requiring the displacements in the unrolled
                     36: dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
1.1.1.2 ! ohara      37: dnl  63, giving a maximum SQR_KARATSUBA_THRESHOLD of 66.
1.1       maekawa    38:
1.1.1.2 ! ohara      39: deflit(SQR_KARATSUBA_THRESHOLD_MAX, 66)
1.1       maekawa    40:
                     41:
                     42: dnl  Allow a value from the tune program to override config.m4.
                     43:
1.1.1.2 ! ohara      44: ifdef(`SQR_KARATSUBA_THRESHOLD_OVERRIDE',
        !            45: `define(`SQR_KARATSUBA_THRESHOLD',SQR_KARATSUBA_THRESHOLD_OVERRIDE)')
1.1       maekawa    46:
                     47:
                     48: dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
1.1.1.2 ! ohara      49: dnl  number required is determined by SQR_KARATSUBA_THRESHOLD, since
        !            50: dnl  mpn_sqr_basecase only needs to handle sizes < SQR_KARATSUBA_THRESHOLD.
1.1       maekawa    51: dnl
                     52: dnl  The first addmul is the biggest, and this takes the second least
                     53: dnl  significant limb and multiplies it by the third least significant and
1.1.1.2 ! ohara      54: dnl  up.  Hence for a maximum operand size of SQR_KARATSUBA_THRESHOLD-1
        !            55: dnl  limbs, UNROLL_COUNT needs to be SQR_KARATSUBA_THRESHOLD-3.
1.1       maekawa    56:
1.1.1.2 ! ohara      57: m4_config_gmp_mparam(`SQR_KARATSUBA_THRESHOLD')
        !            58: deflit(UNROLL_COUNT, eval(SQR_KARATSUBA_THRESHOLD-3))
1.1       maekawa    59:
                     60:
                     61: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
                     62: C
                     63: C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
                     64: C lot of function call overheads are avoided, especially when the given size
                     65: C is small.
                     66: C
                     67: C The code size might look a bit excessive, but not all of it is executed
                     68: C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
                     69: C clearly apply only to those sizes; mid sizes like 10x10 only need part of
                     70: C the unrolled addmul; and big sizes like 35x35 that do need all of it will
                     71: C at least be getting value for money, because 35x35 spends something like
                     72: C 5780 cycles here.
                     73: C
                     74: C Different values of UNROLL_COUNT give slightly different speeds, between
                     75: C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
                     76: C This isn't a big difference, but it's presumably some alignment effect
                     77: C which if understood could give a simple speedup.
                     78:
                     79: defframe(PARAM_SIZE,12)
                     80: defframe(PARAM_SRC, 8)
                     81: defframe(PARAM_DST, 4)
                     82:
1.1.1.2 ! ohara      83:        TEXT
1.1       maekawa    84:        ALIGN(32)
                     85: PROLOGUE(mpn_sqr_basecase)
                     86: deflit(`FRAME',0)
                     87:
                     88:        movl    PARAM_SIZE, %ecx
                     89:        movl    PARAM_SRC, %eax
                     90:
                     91:        cmpl    $2, %ecx
                     92:        je      L(two_limbs)
                     93:
                     94:        movl    PARAM_DST, %edx
                     95:        ja      L(three_or_more)
                     96:
                     97:
                     98: C -----------------------------------------------------------------------------
                     99: C one limb only
                    100:        C eax   src
                    101:        C ebx
                    102:        C ecx   size
                    103:        C edx   dst
                    104:
                    105:        movl    (%eax), %eax
                    106:        movl    %edx, %ecx
                    107:
                    108:        mull    %eax
                    109:
                    110:        movl    %eax, (%ecx)
                    111:        movl    %edx, 4(%ecx)
                    112:        ret
                    113:
                    114:
                    115: C -----------------------------------------------------------------------------
                    116:        ALIGN(16)
                    117: L(two_limbs):
                    118:        C eax   src
                    119:        C ebx
                    120:        C ecx   size
                    121:        C edx   dst
                    122:
                    123:        pushl   %ebx
                    124:        movl    %eax, %ebx      C src
                    125: deflit(`FRAME',4)
                    126:
                    127:        movl    (%ebx), %eax
                    128:        movl    PARAM_DST, %ecx
                    129:
                    130:        mull    %eax            C src[0]^2
                    131:
                    132:        movl    %eax, (%ecx)
                    133:        movl    4(%ebx), %eax
                    134:
                    135:        movl    %edx, 4(%ecx)
                    136:
                    137:        mull    %eax            C src[1]^2
                    138:
                    139:        movl    %eax, 8(%ecx)
                    140:        movl    (%ebx), %eax
                    141:
                    142:        movl    %edx, 12(%ecx)
                    143:        movl    4(%ebx), %edx
                    144:
                    145:        mull    %edx            C src[0]*src[1]
                    146:
                    147:        addl    %eax, 4(%ecx)
                    148:
                    149:        adcl    %edx, 8(%ecx)
                    150:        adcl    $0, 12(%ecx)
                    151:
                    152:        popl    %ebx
                    153:        addl    %eax, 4(%ecx)
                    154:
                    155:        adcl    %edx, 8(%ecx)
                    156:        adcl    $0, 12(%ecx)
                    157:
                    158:        ret
                    159:
                    160:
                    161: C -----------------------------------------------------------------------------
                    162: L(three_or_more):
                    163: deflit(`FRAME',0)
                    164:        cmpl    $4, %ecx
                    165:        jae     L(four_or_more)
                    166:
                    167:
                    168: C -----------------------------------------------------------------------------
                    169: C three limbs
                    170:        C eax   src
                    171:        C ecx   size
                    172:        C edx   dst
                    173:
                    174:        pushl   %ebx
                    175:        movl    %eax, %ebx      C src
                    176:
                    177:        movl    (%ebx), %eax
                    178:        movl    %edx, %ecx      C dst
                    179:
                    180:        mull    %eax            C src[0] ^ 2
                    181:
                    182:        movl    %eax, (%ecx)
                    183:        movl    4(%ebx), %eax
                    184:
                    185:        movl    %edx, 4(%ecx)
                    186:        pushl   %esi
                    187:
                    188:        mull    %eax            C src[1] ^ 2
                    189:
                    190:        movl    %eax, 8(%ecx)
                    191:        movl    8(%ebx), %eax
                    192:
                    193:        movl    %edx, 12(%ecx)
                    194:        pushl   %edi
                    195:
                    196:        mull    %eax            C src[2] ^ 2
                    197:
                    198:        movl    %eax, 16(%ecx)
                    199:        movl    (%ebx), %eax
                    200:
                    201:        movl    %edx, 20(%ecx)
                    202:        movl    4(%ebx), %edx
                    203:
                    204:        mull    %edx            C src[0] * src[1]
                    205:
                    206:        movl    %eax, %esi
                    207:        movl    (%ebx), %eax
                    208:
                    209:        movl    %edx, %edi
                    210:        movl    8(%ebx), %edx
                    211:
                    212:        pushl   %ebp
                    213:        xorl    %ebp, %ebp
                    214:
                    215:        mull    %edx            C src[0] * src[2]
                    216:
                    217:        addl    %eax, %edi
                    218:        movl    4(%ebx), %eax
                    219:
                    220:        adcl    %edx, %ebp
                    221:
                    222:        movl    8(%ebx), %edx
                    223:
                    224:        mull    %edx            C src[1] * src[2]
                    225:
                    226:        addl    %eax, %ebp
                    227:
                    228:        adcl    $0, %edx
                    229:
                    230:
                    231:        C eax   will be dst[5]
                    232:        C ebx
                    233:        C ecx   dst
                    234:        C edx   dst[4]
                    235:        C esi   dst[1]
                    236:        C edi   dst[2]
                    237:        C ebp   dst[3]
                    238:
                    239:        xorl    %eax, %eax
                    240:        addl    %esi, %esi
                    241:        adcl    %edi, %edi
                    242:        adcl    %ebp, %ebp
                    243:        adcl    %edx, %edx
                    244:        adcl    $0, %eax
                    245:
                    246:        addl    %esi, 4(%ecx)
                    247:        adcl    %edi, 8(%ecx)
                    248:        adcl    %ebp, 12(%ecx)
                    249:
                    250:        popl    %ebp
                    251:        popl    %edi
                    252:
                    253:        adcl    %edx, 16(%ecx)
                    254:
                    255:        popl    %esi
                    256:        popl    %ebx
                    257:
                    258:        adcl    %eax, 20(%ecx)
                    259:        ASSERT(nc)
                    260:
                    261:        ret
                    262:
                    263:
                    264: C -----------------------------------------------------------------------------
                    265:
                    266: defframe(SAVE_EBX,   -4)
                    267: defframe(SAVE_ESI,   -8)
                    268: defframe(SAVE_EDI,   -12)
                    269: defframe(SAVE_EBP,   -16)
                    270: defframe(VAR_COUNTER,-20)
                    271: defframe(VAR_JMP,    -24)
                    272: deflit(STACK_SPACE, 24)
                    273:
                    274:        ALIGN(16)
                    275: L(four_or_more):
                    276:
                    277:        C eax   src
                    278:        C ebx
                    279:        C ecx   size
                    280:        C edx   dst
                    281:        C esi
                    282:        C edi
                    283:        C ebp
                    284:
                    285: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
                    286: C
                    287: C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
                    288: C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
                    289: C a 5780 cycle operation, which is not surprising since the loop here is 8
                    290: C c/l and mpn_mul_1 is 6.25 c/l.
                    291:
                    292:        subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
                    293:
                    294:        movl    %edi, SAVE_EDI
                    295:        leal    4(%edx), %edi
                    296:
                    297:        movl    %ebx, SAVE_EBX
                    298:        leal    4(%eax), %ebx
                    299:
                    300:        movl    %esi, SAVE_ESI
                    301:        xorl    %esi, %esi
                    302:
                    303:        movl    %ebp, SAVE_EBP
                    304:
                    305:        C eax
                    306:        C ebx   src+4
                    307:        C ecx   size
                    308:        C edx
                    309:        C esi
                    310:        C edi   dst+4
                    311:        C ebp
                    312:
                    313:        movl    (%eax), %ebp    C multiplier
                    314:        leal    -1(%ecx), %ecx  C size-1, and pad to a 16 byte boundary
                    315:
                    316:
                    317:        ALIGN(16)
                    318: L(mul_1):
                    319:        C eax   scratch
                    320:        C ebx   src ptr
                    321:        C ecx   counter
                    322:        C edx   scratch
                    323:        C esi   carry
                    324:        C edi   dst ptr
                    325:        C ebp   multiplier
                    326:
                    327:        movl    (%ebx), %eax
                    328:        addl    $4, %ebx
                    329:
                    330:        mull    %ebp
                    331:
                    332:        addl    %esi, %eax
                    333:        movl    $0, %esi
                    334:
                    335:        adcl    %edx, %esi
                    336:
                    337:        movl    %eax, (%edi)
                    338:        addl    $4, %edi
                    339:
                    340:        loop    L(mul_1)
                    341:
                    342:
                    343: C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
                    344: C
                    345: C The last two addmuls, which are the bottom right corner of the product
                    346: C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
                    347: C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
                    348: C cases that need to be done.
                    349: C
                    350: C The unrolled code is the same as mpn_addmul_1(), see that routine for some
                    351: C comments.
                    352: C
                    353: C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
                    354: C
                    355: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
                    356: C chunk each outer loop.
                    357: C
                    358: C K6 doesn't do any branch prediction on indirect jumps, which is good
                    359: C actually because it's a different target each time.  The unrolled addmul
                    360: C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
                    361: C the indirect jump is quickly recovered.
                    362:
                    363:
                    364: dnl  This value is also implicitly encoded in a shift and add.
                    365: dnl
                    366: deflit(CODE_BYTES_PER_LIMB, 15)
                    367:
                    368: dnl  With the unmodified &src[size] and &dst[size] pointers, the
                    369: dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
                    370: dnl  values up to 31.  Above that an offset must be added to them.
                    371: dnl
                    372: deflit(OFFSET,
                    373: ifelse(eval(UNROLL_COUNT>31),1,
                    374: eval((UNROLL_COUNT-31)*4),
                    375: 0))
                    376:
                    377:        C eax
                    378:        C ebx   &src[size]
                    379:        C ecx
                    380:        C edx
                    381:        C esi   carry
                    382:        C edi   &dst[size]
                    383:        C ebp
                    384:
                    385:        movl    PARAM_SIZE, %ecx
                    386:        movl    %esi, (%edi)
                    387:
                    388:        subl    $4, %ecx
                    389:        jz      L(corner)
                    390:
                    391:        movl    %ecx, %edx
                    392: ifelse(OFFSET,0,,
                    393: `      subl    $OFFSET, %ebx')
                    394:
                    395:        shll    $4, %ecx
                    396: ifelse(OFFSET,0,,
                    397: `      subl    $OFFSET, %edi')
                    398:
                    399:        negl    %ecx
                    400:
                    401: ifdef(`PIC',`
                    402:        call    L(pic_calc)
                    403: L(here):
                    404: ',`
                    405:        leal    L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
                    406: ')
                    407:        negl    %edx
                    408:
                    409:
                    410:        C The calculated jump mustn't be before the start of the available
                    411:        C code.  This is the limitation UNROLL_COUNT puts on the src operand
                    412:        C size, but checked here using the jump address directly.
                    413:        C
                    414:        ASSERT(ae,`
                    415:        movl_text_address( L(unroll_inner_start), %eax)
                    416:        cmpl    %eax, %ecx
                    417:        ')
                    418:
                    419:
                    420: C -----------------------------------------------------------------------------
                    421:        ALIGN(16)
                    422: L(unroll_outer_top):
                    423:        C eax
                    424:        C ebx   &src[size], constant
                    425:        C ecx   VAR_JMP
                    426:        C edx   VAR_COUNTER, limbs, negative
                    427:        C esi   high limb to store
                    428:        C edi   dst ptr, high of last addmul
                    429:        C ebp
                    430:
                    431:        movl    -12+OFFSET(%ebx,%edx,4), %ebp   C multiplier
                    432:        movl    %edx, VAR_COUNTER
                    433:
                    434:        movl    -8+OFFSET(%ebx,%edx,4), %eax    C first limb of multiplicand
                    435:
                    436:        mull    %ebp
                    437:
                    438:        testb   $1, %cl
                    439:
                    440:        movl    %edx, %esi      C high carry
                    441:        movl    %ecx, %edx      C jump
                    442:
                    443:        movl    %eax, %ecx      C low carry
                    444:        leal    CODE_BYTES_PER_LIMB(%edx), %edx
                    445:
                    446:        movl    %edx, VAR_JMP
                    447:        leal    4(%edi), %edi
                    448:
                    449:        C A branch-free version of this using some xors was found to be a
                    450:        C touch slower than just a conditional jump, despite the jump
                    451:        C switching between taken and not taken on every loop.
                    452:
                    453: ifelse(eval(UNROLL_COUNT%2),0,
                    454:        jz,jnz) L(unroll_noswap)
                    455:        movl    %esi, %eax      C high,low carry other way around
                    456:
                    457:        movl    %ecx, %esi
                    458:        movl    %eax, %ecx
                    459: L(unroll_noswap):
                    460:
                    461:        jmp     *%edx
                    462:
                    463:
                    464:        C Must be on an even address here so the low bit of the jump address
                    465:        C will indicate which way around ecx/esi should start.
                    466:        C
                    467:        C An attempt was made at padding here to get the end of the unrolled
                    468:        C code to come out on a good alignment, to save padding before
                    469:        C L(corner).  This worked, but turned out to run slower than just an
                    470:        C ALIGN(2).  The reason for this is not clear, it might be related
                    471:        C to the different speeds on different UNROLL_COUNTs noted above.
                    472:
                    473:        ALIGN(2)
                    474:
                    475: L(unroll_inner_start):
                    476:        C eax   scratch
                    477:        C ebx   src
                    478:        C ecx   carry low
                    479:        C edx   scratch
                    480:        C esi   carry high
                    481:        C edi   dst
                    482:        C ebp   multiplier
                    483:        C
                    484:        C 15 code bytes each limb
                    485:        C ecx/esi swapped on each chunk
                    486:
                    487: forloop(`i', UNROLL_COUNT, 1, `
                    488:        deflit(`disp_src', eval(-i*4 + OFFSET))
                    489:        deflit(`disp_dst', eval(disp_src - 4))
                    490:
                    491:        m4_assert(`disp_src>=-128 && disp_src<128')
                    492:        m4_assert(`disp_dst>=-128 && disp_dst<128')
                    493:
                    494: ifelse(eval(i%2),0,`
                    495: Zdisp( movl,   disp_src,(%ebx), %eax)
                    496:        mull    %ebp
                    497: Zdisp( addl,   %esi, disp_dst,(%edi))
                    498:        adcl    %eax, %ecx
                    499:        movl    %edx, %esi
                    500:        jadcl0( %esi)
                    501: ',`
                    502:        dnl  this one comes out last
                    503: Zdisp( movl,   disp_src,(%ebx), %eax)
                    504:        mull    %ebp
                    505: Zdisp( addl,   %ecx, disp_dst,(%edi))
                    506:        adcl    %eax, %esi
                    507:        movl    %edx, %ecx
                    508:        jadcl0( %ecx)
                    509: ')
                    510: ')
                    511: L(unroll_inner_end):
                    512:
                    513:        addl    %esi, -4+OFFSET(%edi)
                    514:
                    515:        movl    VAR_COUNTER, %edx
                    516:        jadcl0( %ecx)
                    517:
                    518:        movl    %ecx, m4_empty_if_zero(OFFSET)(%edi)
                    519:        movl    VAR_JMP, %ecx
                    520:
                    521:        incl    %edx
                    522:        jnz     L(unroll_outer_top)
                    523:
                    524:
                    525: ifelse(OFFSET,0,,`
                    526:        addl    $OFFSET, %ebx
                    527:        addl    $OFFSET, %edi
                    528: ')
                    529:
                    530:
                    531: C -----------------------------------------------------------------------------
                    532:        ALIGN(16)
                    533: L(corner):
                    534:        C ebx   &src[size]
                    535:        C edi   &dst[2*size-5]
                    536:
                    537:        movl    -12(%ebx), %ebp
                    538:
                    539:        movl    -8(%ebx), %eax
                    540:        movl    %eax, %ecx
                    541:
                    542:        mull    %ebp
                    543:
                    544:        addl    %eax, -4(%edi)
                    545:        adcl    $0, %edx
                    546:
                    547:        movl    -4(%ebx), %eax
                    548:        movl    %edx, %esi
                    549:        movl    %eax, %ebx
                    550:
                    551:        mull    %ebp
                    552:
                    553:        addl    %esi, %eax
                    554:        adcl    $0, %edx
                    555:
                    556:        addl    %eax, (%edi)
                    557:        adcl    $0, %edx
                    558:
                    559:        movl    %edx, %esi
                    560:        movl    %ebx, %eax
                    561:
                    562:        mull    %ecx
                    563:
                    564:        addl    %esi, %eax
                    565:        movl    %eax, 4(%edi)
                    566:
                    567:        adcl    $0, %edx
                    568:
                    569:        movl    %edx, 8(%edi)
                    570:
                    571:
                    572: C -----------------------------------------------------------------------------
                    573: C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
                    574: C The loop measures about 6 cycles/iteration, though it looks like it should
                    575: C decode in 5.
                    576:
                    577: L(lshift_start):
                    578:        movl    PARAM_SIZE, %ecx
                    579:
                    580:        movl    PARAM_DST, %edi
                    581:        subl    $1, %ecx                C size-1 and clear carry
                    582:
                    583:        movl    PARAM_SRC, %ebx
                    584:        movl    %ecx, %edx
                    585:
                    586:        xorl    %eax, %eax              C ready for adcl
                    587:
                    588:
                    589:        ALIGN(16)
                    590: L(lshift):
                    591:        C eax
                    592:        C ebx   src (for later use)
                    593:        C ecx   counter, decrementing
                    594:        C edx   size-1 (for later use)
                    595:        C esi
                    596:        C edi   dst, incrementing
                    597:        C ebp
                    598:
                    599:        rcll    4(%edi)
                    600:        rcll    8(%edi)
                    601:        leal    8(%edi), %edi
                    602:        loop    L(lshift)
                    603:
                    604:
                    605:        adcl    %eax, %eax
                    606:
                    607:        movl    %eax, 4(%edi)           C dst most significant limb
                    608:        movl    (%ebx), %eax            C src[0]
                    609:
                    610:        leal    4(%ebx,%edx,4), %ebx    C &src[size]
                    611:        subl    %edx, %ecx              C -(size-1)
                    612:
                    613:
                    614: C -----------------------------------------------------------------------------
                    615: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
                    616: C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
                    617: C low limb of src[0]^2.
                    618:
                    619:
                    620:        mull    %eax
                    621:
                    622:        movl    %eax, (%edi,%ecx,8)     C dst[0]
                    623:
                    624:
                    625:        ALIGN(16)
                    626: L(diag):
                    627:        C eax   scratch
                    628:        C ebx   &src[size]
                    629:        C ecx   counter, negative
                    630:        C edx   carry
                    631:        C esi   scratch
                    632:        C edi   dst[2*size-2]
                    633:        C ebp
                    634:
                    635:        movl    (%ebx,%ecx,4), %eax
                    636:        movl    %edx, %esi
                    637:
                    638:        mull    %eax
                    639:
                    640:        addl    %esi, 4(%edi,%ecx,8)
                    641:        adcl    %eax, 8(%edi,%ecx,8)
                    642:        adcl    $0, %edx
                    643:
                    644:        incl    %ecx
                    645:        jnz     L(diag)
                    646:
                    647:
                    648:        movl    SAVE_EBX, %ebx
                    649:        movl    SAVE_ESI, %esi
                    650:
                    651:        addl    %edx, 4(%edi)           C dst most significant limb
                    652:
                    653:        movl    SAVE_EDI, %edi
                    654:        movl    SAVE_EBP, %ebp
                    655:        addl    $FRAME, %esp
                    656:        ret
                    657:
                    658:
                    659:
                    660: C -----------------------------------------------------------------------------
                    661: ifdef(`PIC',`
                    662: L(pic_calc):
1.1.1.2 ! ohara     663:         C See mpn/x86/README about old gas bugs
1.1       maekawa   664:        addl    (%esp), %ecx
                    665:        addl    $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
                    666:        addl    %edx, %ecx
                    667:        ret
                    668: ')
                    669:
                    670:
                    671: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>