OpenXM_contrib/gmp/mpn/x86/k6/sqr_basecase.asm - annotate

Return to sqr_basecase.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/sqr_basecase.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
        !             2: dnl
        !             3: dnl  K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
        !             4: dnl  product (measured on the speed difference between 17 and 33 limbs,
        !             5: dnl  which is roughly the Karatsuba recursing range).
        !             6:
        !             7:
        !             8: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             9: dnl
        !            10: dnl  This file is part of the GNU MP Library.
        !            11: dnl
        !            12: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            13: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            14: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            15: dnl  License, or (at your option) any later version.
        !            16: dnl
        !            17: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            18: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            19: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            20: dnl  Lesser General Public License for more details.
        !            21: dnl
        !            22: dnl  You should have received a copy of the GNU Lesser General Public
        !            23: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            24: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            25: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            26:
        !            27:
        !            28: include(`../config.m4')
        !            29:
        !            30:
        !            31: dnl  KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
        !            32: dnl  code supports.  This value is used only by the tune program to know
        !            33: dnl  what it can go up to.  (An attempt to compile with a bigger value will
        !            34: dnl  trigger some m4_assert()s in the code, making the build fail.)
        !            35: dnl
        !            36: dnl  The value is determined by requiring the displacements in the unrolled
        !            37: dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
        !            38: dnl  63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
        !            39:
        !            40: deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
        !            41: dnl  (66-3 = 63 chunks: with OFFSET = (63-31)*4 = 128 the chunk displacements span -128..124, the most the m4_assert range checks accept.)
        !            42:
        !            43: dnl  Allow a value from the tune program to override config.m4.
        !            44:
        !            45: ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
        !            46: `define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
        !            47:
        !            48:
        !            49: dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
        !            50: dnl  number required is determined by KARATSUBA_SQR_THRESHOLD, since
        !            51: dnl  mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
        !            52: dnl
        !            53: dnl  The first addmul is the biggest, and this takes the second least
        !            54: dnl  significant limb and multiplies it by the third least significant and
        !            55: dnl  up.  Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
        !            56: dnl  limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
        !            57: dnl  NOTE(review): m4_config_gmp_mparam is defined outside this file; presumably it makes sure KARATSUBA_SQR_THRESHOLD has a value -- confirm.
        !            58: m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
        !            59: deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
        !            60:
        !            61:
        !            62: C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
        !            63: C
        !            64: C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
        !            65: C lot of function call overheads are avoided, especially when the given size
        !            66: C is small.
        !            67: C
        !            68: C The code size might look a bit excessive, but not all of it is executed
        !            69: C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
        !            70: C clearly apply only to those sizes; mid sizes like 10x10 only need part of
        !            71: C the unrolled addmul; and big sizes like 35x35 that do need all of it will
        !            72: C at least be getting value for money, because 35x35 spends something like
        !            73: C 5780 cycles here.
        !            74: C
        !            75: C Different values of UNROLL_COUNT give slightly different speeds, between
        !            76: C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
        !            77: C This isn't a big difference, but it's presumably some alignment effect
        !            78: C which if understood could give a simple speedup.
        !            79:
        !            80: defframe(PARAM_SIZE,12)
        !            81: defframe(PARAM_SRC, 8)
        !            82: defframe(PARAM_DST, 4)
        !            83: C Entry point: dispatch on size to the 1, 2, 3 or 4-and-up limb code paths.
        !            84:        .text
        !            85:        ALIGN(32)
        !            86: PROLOGUE(mpn_sqr_basecase)
        !            87: deflit(`FRAME',0)
        !            88:
        !            89:        movl    PARAM_SIZE, %ecx
        !            90:        movl    PARAM_SRC, %eax
        !            91:
        !            92:        cmpl    $2, %ecx                C one compare feeds both branches below
        !            93:        je      L(two_limbs)
        !            94:
        !            95:        movl    PARAM_DST, %edx         C movl leaves the cmpl flags intact
        !            96:        ja      L(three_or_more)        C size > 2
        !            97:
        !            98:
        !            99: C -----------------------------------------------------------------------------
        !           100: C one limb only
        !           101:        C eax   src
        !           102:        C ebx
        !           103:        C ecx   size
        !           104:        C edx   dst
        !           105:
        !           106:        movl    (%eax), %eax            C src[0]
        !           107:        movl    %edx, %ecx              C keep dst in ecx, mull overwrites edx
        !           108:
        !           109:        mull    %eax                    C edx:eax = src[0]^2
        !           110:
        !           111:        movl    %eax, (%ecx)            C dst[0] = low limb
        !           112:        movl    %edx, 4(%ecx)           C dst[1] = high limb
        !           113:        ret
        !           114:
        !           115:
        !           116: C -----------------------------------------------------------------------------
        !           117:        ALIGN(16)
        !           118: L(two_limbs):
        !           119:        C eax   src
        !           120:        C ebx
        !           121:        C ecx   size
        !           122:        C edx   dst
        !           123:        C Stores the two squares first, then adds the cross product in twice.
        !           124:        pushl   %ebx                    C saved here, restored before the ret
        !           125:        movl    %eax, %ebx      C src
        !           126: deflit(`FRAME',4)
        !           127:
        !           128:        movl    (%ebx), %eax
        !           129:        movl    PARAM_DST, %ecx
        !           130:
        !           131:        mull    %eax            C src[0]^2
        !           132:
        !           133:        movl    %eax, (%ecx)            C dst[0]
        !           134:        movl    4(%ebx), %eax
        !           135:
        !           136:        movl    %edx, 4(%ecx)           C dst[1]
        !           137:
        !           138:        mull    %eax            C src[1]^2
        !           139:
        !           140:        movl    %eax, 8(%ecx)           C dst[2]
        !           141:        movl    (%ebx), %eax
        !           142:
        !           143:        movl    %edx, 12(%ecx)          C dst[3]
        !           144:        movl    4(%ebx), %edx
        !           145:
        !           146:        mull    %edx            C src[0]*src[1]
        !           147:
        !           148:        addl    %eax, 4(%ecx)           C first addition of the cross product
        !           149:
        !           150:        adcl    %edx, 8(%ecx)
        !           151:        adcl    $0, 12(%ecx)
        !           152:
        !           153:        popl    %ebx
        !           154:        addl    %eax, 4(%ecx)           C second addition, 2*src[0]*src[1] in total
        !           155:
        !           156:        adcl    %edx, 8(%ecx)
        !           157:        adcl    $0, 12(%ecx)
        !           158:
        !           159:        ret
        !           160:
        !           161:
        !           162: C -----------------------------------------------------------------------------
        !           163: L(three_or_more):
        !           164: deflit(`FRAME',0)
        !           165:        cmpl    $4, %ecx
        !           166:        jae     L(four_or_more)
        !           167:
        !           168:
        !           169: C -----------------------------------------------------------------------------
        !           170: C three limbs
        !           171:        C eax   src
        !           172:        C ecx   size
        !           173:        C edx   dst
        !           174:        C The three squares are stored to dst first, the register saves are interleaved with that work.
        !           175:        pushl   %ebx                    C saved here, restored before the ret
        !           176:        movl    %eax, %ebx      C src
        !           177:
        !           178:        movl    (%ebx), %eax
        !           179:        movl    %edx, %ecx      C dst
        !           180:
        !           181:        mull    %eax            C src[0] ^ 2
        !           182:
        !           183:        movl    %eax, (%ecx)            C dst[0]
        !           184:        movl    4(%ebx), %eax
        !           185:
        !           186:        movl    %edx, 4(%ecx)           C dst[1]
        !           187:        pushl   %esi
        !           188:
        !           189:        mull    %eax            C src[1] ^ 2
        !           190:
        !           191:        movl    %eax, 8(%ecx)           C dst[2]
        !           192:        movl    8(%ebx), %eax
        !           193:
        !           194:        movl    %edx, 12(%ecx)          C dst[3]
        !           195:        pushl   %edi
        !           196:
        !           197:        mull    %eax            C src[2] ^ 2
        !           198:
        !           199:        movl    %eax, 16(%ecx)          C dst[4]
        !           200:        movl    (%ebx), %eax
        !           201:
        !           202:        movl    %edx, 20(%ecx)          C dst[5]
        !           203:        movl    4(%ebx), %edx
        !           204:
        !           205:        mull    %edx            C src[0] * src[1]
        !           206:
        !           207:        movl    %eax, %esi              C esi = cross product limb for dst[1]
        !           208:        movl    (%ebx), %eax
        !           209:
        !           210:        movl    %edx, %edi              C edi = cross product limb for dst[2]
        !           211:        movl    8(%ebx), %edx
        !           212:
        !           213:        pushl   %ebp
        !           214:        xorl    %ebp, %ebp              C ebp starts the limb for dst[3] at zero
        !           215:
        !           216:        mull    %edx            C src[0] * src[2]
        !           217:
        !           218:        addl    %eax, %edi
        !           219:        movl    4(%ebx), %eax           C movl keeps the carry for the adcl
        !           220:
        !           221:        adcl    %edx, %ebp
        !           222:
        !           223:        movl    8(%ebx), %edx
        !           224:
        !           225:        mull    %edx            C src[1] * src[2]
        !           226:
        !           227:        addl    %eax, %ebp
        !           228:
        !           229:        adcl    $0, %edx                C edx = cross product limb for dst[4]
        !           230:
        !           231:
        !           232:        C eax   will be dst[5]
        !           233:        C ebx
        !           234:        C ecx   dst
        !           235:        C edx   dst[4]
        !           236:        C esi   dst[1]
        !           237:        C edi   dst[2]
        !           238:        C ebp   dst[3]
        !           239:        C Double the cross product sum held in esi,edi,ebp,edx; eax takes the bit shifted out.
        !           240:        xorl    %eax, %eax
        !           241:        addl    %esi, %esi
        !           242:        adcl    %edi, %edi
        !           243:        adcl    %ebp, %ebp
        !           244:        adcl    %edx, %edx
        !           245:        adcl    $0, %eax
        !           246:        C Add the doubled cross products onto the squares already stored in dst.
        !           247:        addl    %esi, 4(%ecx)
        !           248:        adcl    %edi, 8(%ecx)
        !           249:        adcl    %ebp, 12(%ecx)
        !           250:
        !           251:        popl    %ebp                    C the pops sit inside the adcl chain, so the carry must survive them
        !           252:        popl    %edi
        !           253:
        !           254:        adcl    %edx, 16(%ecx)
        !           255:
        !           256:        popl    %esi
        !           257:        popl    %ebx
        !           258:
        !           259:        adcl    %eax, 20(%ecx)
        !           260:        ASSERT(nc)
        !           261:
        !           262:        ret
        !           263:
        !           264:
        !           265: C -----------------------------------------------------------------------------
        !           266:
        !           267: defframe(SAVE_EBX,   -4)
        !           268: defframe(SAVE_ESI,   -8)
        !           269: defframe(SAVE_EDI,   -12)
        !           270: defframe(SAVE_EBP,   -16)
        !           271: defframe(VAR_COUNTER,-20)
        !           272: defframe(VAR_JMP,    -24)
        !           273: deflit(STACK_SPACE, 24)
        !           274:
        !           275:        ALIGN(16)
        !           276: L(four_or_more):
        !           277:
        !           278:        C eax   src
        !           279:        C ebx
        !           280:        C ecx   size
        !           281:        C edx   dst
        !           282:        C esi
        !           283:        C edi
        !           284:        C ebp
        !           285:
        !           286: C First multiply src[0]*src[1..size-1] and store at dst[1..size].
        !           287: C
        !           288: C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
        !           289: C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
        !           290: C a 5780 cycle operation, which is not surprising since the loop here is 8
        !           291: C c/l and mpn_mul_1 is 6.25 c/l.
        !           292:
        !           293:        subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
        !           294:        C The four SAVE_ slots are reloaded just before the final ret.
        !           295:        movl    %edi, SAVE_EDI
        !           296:        leal    4(%edx), %edi           C &dst[1]
        !           297:
        !           298:        movl    %ebx, SAVE_EBX
        !           299:        leal    4(%eax), %ebx           C &src[1]
        !           300:
        !           301:        movl    %esi, SAVE_ESI
        !           302:        xorl    %esi, %esi              C carry in is zero for the first limb
        !           303:
        !           304:        movl    %ebp, SAVE_EBP
        !           305:
        !           306:        C eax
        !           307:        C ebx   src+4
        !           308:        C ecx   size
        !           309:        C edx
        !           310:        C esi
        !           311:        C edi   dst+4
        !           312:        C ebp
        !           313:
        !           314:        movl    (%eax), %ebp    C multiplier
        !           315:        leal    -1(%ecx), %ecx  C size-1, and pad to a 16 byte boundary
        !           316:
        !           317:
        !           318:        ALIGN(16)
        !           319: L(mul_1):
        !           320:        C eax   scratch
        !           321:        C ebx   src ptr
        !           322:        C ecx   counter
        !           323:        C edx   scratch
        !           324:        C esi   carry
        !           325:        C edi   dst ptr
        !           326:        C ebp   multiplier
        !           327:
        !           328:        movl    (%ebx), %eax
        !           329:        addl    $4, %ebx
        !           330:
        !           331:        mull    %ebp                    C edx:eax = src limb * src[0]
        !           332:
        !           333:        addl    %esi, %eax              C add the carry limb from the previous pass
        !           334:        movl    $0, %esi                C movl rather than xorl, the addl carry is still needed
        !           335:
        !           336:        adcl    %edx, %esi              C next carry limb = high limb + carry flag
        !           337:
        !           338:        movl    %eax, (%edi)
        !           339:        addl    $4, %edi
        !           340:
        !           341:        loop    L(mul_1)
        !           342:
        !           343:
        !           344: C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
        !           345: C
        !           346: C The last two addmuls, which are the bottom right corner of the product
        !           347: C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
        !           348: C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
        !           349: C cases that need to be done.
        !           350: C
        !           351: C The unrolled code is the same as mpn_addmul_1(), see that routine for some
        !           352: C comments.
        !           353: C
        !           354: C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
        !           355: C
        !           356: C VAR_JMP is the computed jump into the unrolled code, stepped by one code
        !           357: C chunk each outer loop.
        !           358: C
        !           359: C K6 doesn't do any branch prediction on indirect jumps, which is good
        !           360: C actually because it's a different target each time.  The unrolled addmul
        !           361: C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
        !           362: C the indirect jump is quickly recovered.
        !           363:
        !           364:
        !           365: dnl  This value is also implicitly encoded in a shift and add.
        !           366: dnl
        !           367: deflit(CODE_BYTES_PER_LIMB, 15)
        !           368:
        !           369: dnl  With the unmodified &src[size] and &dst[size] pointers, the
        !           370: dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
        !           371: dnl  values up to 31.  Above that an offset must be added to them.
        !           372: dnl
        !           373: deflit(OFFSET,
        !           374: ifelse(eval(UNROLL_COUNT>31),1,
        !           375: eval((UNROLL_COUNT-31)*4),
        !           376: 0))
        !           377:
        !           378:        C eax
        !           379:        C ebx   &src[size]
        !           380:        C ecx
        !           381:        C edx
        !           382:        C esi   carry
        !           383:        C edi   &dst[size]
        !           384:        C ebp
        !           385:
        !           386:        movl    PARAM_SIZE, %ecx
        !           387:        movl    %esi, (%edi)            C dst[size] = carry limb left by the mul_1 loop
        !           388:
        !           389:        subl    $4, %ecx                C size-4 = number of unrolled addmul passes
        !           390:        jz      L(corner)               C size 4 has only the corner products left
        !           391:
        !           392:        movl    %ecx, %edx              C edx = size-4
        !           393: ifelse(OFFSET,0,,
        !           394: `      subl    $OFFSET, %ebx')
        !           395:
        !           396:        shll    $4, %ecx                C ecx = 16*(size-4)
        !           397: ifelse(OFFSET,0,,
        !           398: `      subl    $OFFSET, %edi')
        !           399:
        !           400:        negl    %ecx                    C ecx = -16*(size-4)
        !           401:        C Adding edx below gives -15*(size-4), the shift and add form of CODE_BYTES_PER_LIMB.
        !           402: ifdef(`PIC',`
        !           403:        call    L(pic_calc)
        !           404: L(here):
        !           405: ',`
        !           406:        leal    L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
        !           407: ')
        !           408:        negl    %edx                    C edx = -(size-4), the starting VAR_COUNTER
        !           409:
        !           410:
        !           411:        C The calculated jump mustn't be before the start of the available
        !           412:        C code.  This is the limitation UNROLL_COUNT puts on the src operand
        !           413:        C size, but checked here using the jump address directly.
        !           414:        C
        !           415:        ASSERT(ae,`
        !           416:        movl_text_address( L(unroll_inner_start), %eax)
        !           417:        cmpl    %eax, %ecx
        !           418:        ')
        !           419:
        !           420:
        !           421: C -----------------------------------------------------------------------------
        !           422:        ALIGN(16)
        !           423: L(unroll_outer_top):
        !           424:        C eax
        !           425:        C ebx   &src[size], constant
        !           426:        C ecx   VAR_JMP
        !           427:        C edx   VAR_COUNTER, limbs, negative
        !           428:        C esi   high limb to store
        !           429:        C edi   dst ptr, high of last addmul
        !           430:        C ebp
        !           431:
        !           432:        movl    -12+OFFSET(%ebx,%edx,4), %ebp   C multiplier
        !           433:        movl    %edx, VAR_COUNTER       C saved because mull overwrites edx
        !           434:
        !           435:        movl    -8+OFFSET(%ebx,%edx,4), %eax    C first limb of multiplicand
        !           436:
        !           437:        mull    %ebp                    C first product is formed here, before entering the chunks
        !           438:
        !           439:        testb   $1, %cl                 C low bit of the jump address, consumed by the jz/jnz below
        !           440:
        !           441:        movl    %edx, %esi      C high carry
        !           442:        movl    %ecx, %edx      C jump
        !           443:
        !           444:        movl    %eax, %ecx      C low carry
        !           445:        leal    CODE_BYTES_PER_LIMB(%edx), %edx C one chunk later each pass; leal keeps the testb flags
        !           446:
        !           447:        movl    %edx, VAR_JMP           C reloaded into ecx at the bottom of the outer loop
        !           448:        leal    4(%edi), %edi           C dst ptr moves up one limb per pass
        !           449:
        !           450:        C A branch-free version of this using some xors was found to be a
        !           451:        C touch slower than just a conditional jump, despite the jump
        !           452:        C switching between taken and not taken on every loop.
        !           453:
        !           454: ifelse(eval(UNROLL_COUNT%2),0,
        !           455:        jz,jnz) L(unroll_noswap)
        !           456:        movl    %esi, %eax      C high,low carry other way around
        !           457:
        !           458:        movl    %ecx, %esi
        !           459:        movl    %eax, %ecx
        !           460: L(unroll_noswap):
        !           461:
        !           462:        jmp     *%edx                   C enter the unrolled chunks part way in
        !           463:
        !           464:
        !           465:        C Must be on an even address here so the low bit of the jump address
        !           466:        C will indicate which way around ecx/esi should start.
        !           467:        C
        !           468:        C An attempt was made at padding here to get the end of the unrolled
        !           469:        C code to come out on a good alignment, to save padding before
        !           470:        C L(corner).  This worked, but turned out to run slower than just an
        !           471:        C ALIGN(2).  The reason for this is not clear, it might be related
        !           472:        C to the different speeds on different UNROLL_COUNTs noted above.
        !           473:
        !           474:        ALIGN(2)
        !           475:
        !           476: L(unroll_inner_start):
        !           477:        C eax   scratch
        !           478:        C ebx   src
        !           479:        C ecx   carry low
        !           480:        C edx   scratch
        !           481:        C esi   carry high
        !           482:        C edi   dst
        !           483:        C ebp   multiplier
        !           484:        C
        !           485:        C 15 code bytes each limb
        !           486:        C ecx/esi swapped on each chunk
        !           487:
        !           488: forloop(`i', UNROLL_COUNT, 1, `
        !           489:        deflit(`disp_src', eval(-i*4 + OFFSET))
        !           490:        deflit(`disp_dst', eval(disp_src - 4))
        !           491:
        !           492:        m4_assert(`disp_src>=-128 && disp_src<128')
        !           493:        m4_assert(`disp_dst>=-128 && disp_dst<128')
        !           494:
        !           495: ifelse(eval(i%2),0,`
        !           496: Zdisp( movl,   disp_src,(%ebx), %eax)
        !           497:        mull    %ebp
        !           498: Zdisp( addl,   %esi, disp_dst,(%edi))
        !           499:        adcl    %eax, %ecx
        !           500:        movl    %edx, %esi
        !           501:        jadcl0( %esi)
        !           502: ',`
        !           503:        dnl  this one comes out last
        !           504: Zdisp( movl,   disp_src,(%ebx), %eax)
        !           505:        mull    %ebp
        !           506: Zdisp( addl,   %ecx, disp_dst,(%edi))
        !           507:        adcl    %eax, %esi
        !           508:        movl    %edx, %ecx
        !           509:        jadcl0( %ecx)
        !           510: ')
        !           511: ')
        !           512: L(unroll_inner_end):
        !           513:        C After the final chunk esi holds the pending lower limb and ecx the limb above it.
        !           514:        addl    %esi, -4+OFFSET(%edi)
        !           515:
        !           516:        movl    VAR_COUNTER, %edx       C movl keeps the addl carry for jadcl0
        !           517:        jadcl0( %ecx)                   C NOTE(review): jadcl0 is defined elsewhere, presumably adds the carry flag into ecx
        !           518:
        !           519:        movl    %ecx, m4_empty_if_zero(OFFSET)(%edi)    C new top limb of this pass, stored not added
        !           520:        movl    VAR_JMP, %ecx           C jump address as stepped in the outer loop top
        !           521:
        !           522:        incl    %edx                    C VAR_COUNTER counts up towards zero
        !           523:        jnz     L(unroll_outer_top)
        !           524:
        !           525:
        !           526: ifelse(OFFSET,0,,`
        !           527:        addl    $OFFSET, %ebx
        !           528:        addl    $OFFSET, %edi
        !           529: ')
        !           530:
        !           531:
        !           532: C -----------------------------------------------------------------------------
        !           533:        ALIGN(16)
        !           534: L(corner):
        !           535:        C ebx   &src[size]
        !           536:        C edi   &dst[2*size-4]
        !           537:        C (edi is &dst[size] after mul_1 plus one limb for each of the size-4 outer passes)
        !           538:        movl    -12(%ebx), %ebp         C src[size-3]
        !           539:
        !           540:        movl    -8(%ebx), %eax          C src[size-2]
        !           541:        movl    %eax, %ecx              C kept for the last product
        !           542:
        !           543:        mull    %ebp                    C src[size-3] * src[size-2]
        !           544:
        !           545:        addl    %eax, -4(%edi)          C low limb onto dst[2*size-5]
        !           546:        adcl    $0, %edx
        !           547:
        !           548:        movl    -4(%ebx), %eax          C src[size-1]
        !           549:        movl    %edx, %esi              C carry limb
        !           550:        movl    %eax, %ebx              C ebx no longer points at src from here
        !           551:
        !           552:        mull    %ebp                    C src[size-3] * src[size-1]
        !           553:
        !           554:        addl    %esi, %eax
        !           555:        adcl    $0, %edx
        !           556:
        !           557:        addl    %eax, (%edi)            C onto dst[2*size-4]
        !           558:        adcl    $0, %edx
        !           559:
        !           560:        movl    %edx, %esi              C carry limb
        !           561:        movl    %ebx, %eax              C src[size-1] again
        !           562:
        !           563:        mull    %ecx                    C src[size-2] * src[size-1]
        !           564:
        !           565:        addl    %esi, %eax
        !           566:        movl    %eax, 4(%edi)           C dst[2*size-3], stored not added
        !           567:
        !           568:        adcl    $0, %edx                C movl above kept the addl carry
        !           569:
        !           570:        movl    %edx, 8(%edi)           C dst[2*size-2]
        !           571:
        !           572:
        !           573: C -----------------------------------------------------------------------------
        !           574: C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
        !           575: C The loop measures about 6 cycles/iteration, though it looks like it should
        !           576: C decode in 5.
        !           577:
        !           578: L(lshift_start):
        !           579:        movl    PARAM_SIZE, %ecx
        !           580:
        !           581:        movl    PARAM_DST, %edi
        !           582:        subl    $1, %ecx                C size-1 and clear carry
        !           583:
        !           584:        movl    PARAM_SRC, %ebx
        !           585:        movl    %ecx, %edx
        !           586:
        !           587:        xorl    %eax, %eax              C ready for adcl
        !           588:
        !           589:
        !           590:        ALIGN(16)
        !           591: L(lshift):
        !           592:        C eax
        !           593:        C ebx   src (for later use)
        !           594:        C ecx   counter, decrementing
        !           595:        C edx   size-1 (for later use)
        !           596:        C esi
        !           597:        C edi   dst, incrementing
        !           598:        C ebp
        !           599:        C Two limbs per pass; the carry flag links each rcll to the next, including across passes.
        !           600:        rcll    4(%edi)
        !           601:        rcll    8(%edi)
        !           602:        leal    8(%edi), %edi           C leal rather than addl, so the carry is kept
        !           603:        loop    L(lshift)               C relied on not to disturb the carry either
        !           604:
        !           605:
        !           606:        adcl    %eax, %eax              C eax was zero, so this is just the bit shifted out
        !           607:
        !           608:        movl    %eax, 4(%edi)           C dst most significant limb
        !           609:        movl    (%ebx), %eax            C src[0]
        !           610:
        !           611:        leal    4(%ebx,%edx,4), %ebx    C &src[size]
        !           612:        subl    %edx, %ecx              C -(size-1)
        !           613:
        !           614:
        !           615: C -----------------------------------------------------------------------------
        !           616: C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
        !           617: C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
        !           618: C low limb of src[0]^2.
        !           619:
        !           620:
        !           621:        mull    %eax                    C edx:eax = src[0]^2
        !           622:
        !           623:        movl    %eax, (%edi,%ecx,8)     C dst[0]
        !           624:
        !           625:
        !           626:        ALIGN(16)
        !           627: L(diag):
        !           628:        C eax   scratch
        !           629:        C ebx   &src[size]
        !           630:        C ecx   counter, negative
        !           631:        C edx   carry
        !           632:        C esi   scratch
        !           633:        C edi   dst[2*size-2]
        !           634:        C ebp
        !           635:        C With i = size+ecx, each pass adds the previous high limb to dst[2*i-1] and the low limb of src[i]^2 to dst[2*i].
        !           636:        movl    (%ebx,%ecx,4), %eax     C src[i]
        !           637:        movl    %edx, %esi              C high limb of the previous square, plus its carry
        !           638:
        !           639:        mull    %eax                    C edx:eax = src[i]^2
        !           640:
        !           641:        addl    %esi, 4(%edi,%ecx,8)
        !           642:        adcl    %eax, 8(%edi,%ecx,8)
        !           643:        adcl    $0, %edx
        !           644:
        !           645:        incl    %ecx
        !           646:        jnz     L(diag)
        !           647:
        !           648:
        !           649:        movl    SAVE_EBX, %ebx
        !           650:        movl    SAVE_ESI, %esi
        !           651:
        !           652:        addl    %edx, 4(%edi)           C dst most significant limb
        !           653:
        !           654:        movl    SAVE_EDI, %edi
        !           655:        movl    SAVE_EBP, %ebp
        !           656:        addl    $FRAME, %esp            C FRAME was last set to STACK_SPACE at L(four_or_more)
        !           657:        ret
        !           658:
        !           659:
        !           660:
        !           661: C -----------------------------------------------------------------------------
        !           662: ifdef(`PIC',`
        !           663: L(pic_calc):
        !           664:         C See README.family about old gas bugs
        !           665:        addl    (%esp), %ecx            C return address on the stack, which is L(here)
        !           666:        addl    $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
        !           667:        addl    %edx, %ecx              C same sum as the leal in the non-PIC case
        !           668:        ret
        !           669: ')
        !           670:
        !           671:
        !           672: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>