OpenXM_contrib/gmp/mpn/x86/k6/mul_basecase.asm - annotate

Return to mul_basecase.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_basecase.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
        !             2: dnl
        !             3: dnl  K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
        !             4: dnl      unrolling).
        !             5:
        !             6:
        !             7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             8: dnl
        !             9: dnl  This file is part of the GNU MP Library.
        !            10: dnl
        !            11: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            12: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            13: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            14: dnl  License, or (at your option) any later version.
        !            15: dnl
        !            16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            19: dnl  Lesser General Public License for more details.
        !            20: dnl
        !            21: dnl  You should have received a copy of the GNU Lesser General Public
        !            22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            24: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            25:
        !            26:
        !            27: include(`../config.m4')
        !            28:
        !            29:
        !            30: dnl  K6: UNROLL_COUNT cycles/product (approx)
        !            31: dnl           8           9.75
        !            32: dnl          16           9.3
        !            33: dnl          32           9.3
        !            34: dnl  Maximum possible with the current code is 32.
        !            35: dnl
        !            36: dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
        !            37: dnl  might explain it's good performance.
        !            38:
        !            39: deflit(UNROLL_COUNT, 16)
        !            40:
        !            41:
        !            42: C void mpn_mul_basecase (mp_ptr wp,
        !            43: C                        mp_srcptr xp, mp_size_t xsize,
        !            44: C                        mp_srcptr yp, mp_size_t ysize);
        !            45: C
        !            46: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
        !            47: C wp,xsize+ysize.
        !            48: C
        !            49: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
        !            50: C it's faster because it does most of the mpn_addmul_1() entry code only
        !            51: C once.  The saving is about 10-20% on typical sizes coming from the
        !            52: C Karatsuba multiply code.
        !            53: C
        !            54: C Future:
        !            55: C
        !            56: C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
        !            57: C setups and maybe 2 or 3 wasted cycles at the end.  Code saving would be
        !            58: C 256 bytes.
        !            59:
        !            60: ifdef(`PIC',`
        !            61: deflit(UNROLL_THRESHOLD, 8)
        !            62: ',`
        !            63: deflit(UNROLL_THRESHOLD, 8)
        !            64: ')
        !            65:
        !            66: defframe(PARAM_YSIZE,20)
        !            67: defframe(PARAM_YP,   16)
        !            68: defframe(PARAM_XSIZE,12)
        !            69: defframe(PARAM_XP,   8)
        !            70: defframe(PARAM_WP,   4)
        !            71:
        !            72:        .text
        !            73:        ALIGN(32)
        !            74: PROLOGUE(mpn_mul_basecase)
        !            75: deflit(`FRAME',0)
        !            76:
        !            77:        movl    PARAM_XSIZE, %ecx
        !            78:        movl    PARAM_YP, %eax
        !            79:
        !            80:        movl    PARAM_XP, %edx
        !            81:        movl    (%eax), %eax    C yp low limb
        !            82:
        !            83:        cmpl    $2, %ecx
        !            84:        ja      L(xsize_more_than_two_limbs)
        !            85:        je      L(two_by_something)
        !            86:
        !            87:
        !            88:        C one limb by one limb
        !            89:
        !            90:        movl    (%edx), %edx    C xp low limb
        !            91:        movl    PARAM_WP, %ecx
        !            92:
        !            93:        mull    %edx
        !            94:
        !            95:        movl    %eax, (%ecx)
        !            96:        movl    %edx, 4(%ecx)
        !            97:        ret
        !            98:
        !            99:
        !           100: C -----------------------------------------------------------------------------
        !           101: L(two_by_something):
        !           102:        decl    PARAM_YSIZE
        !           103:        pushl   %ebx
        !           104: deflit(`FRAME',4)
        !           105:
        !           106:        movl    PARAM_WP, %ebx
        !           107:        pushl   %esi
        !           108: deflit(`FRAME',8)
        !           109:
        !           110:        movl    %eax, %ecx      C yp low limb
        !           111:        movl    (%edx), %eax    C xp low limb
        !           112:
        !           113:        movl    %edx, %esi      C xp
        !           114:        jnz     L(two_by_two)
        !           115:
        !           116:
        !           117:        C two limbs by one limb
        !           118:
        !           119:        mull    %ecx
        !           120:
        !           121:        movl    %eax, (%ebx)
        !           122:        movl    4(%esi), %eax
        !           123:
        !           124:        movl    %edx, %esi      C carry
        !           125:
        !           126:        mull    %ecx
        !           127:
        !           128:        addl    %eax, %esi
        !           129:        movl    %esi, 4(%ebx)
        !           130:
        !           131:        adcl    $0, %edx
        !           132:
        !           133:        movl    %edx, 8(%ebx)
        !           134:        popl    %esi
        !           135:
        !           136:        popl    %ebx
        !           137:        ret
        !           138:
        !           139:
        !           140:
        !           141: C -----------------------------------------------------------------------------
        !           142:        ALIGN(16)
        !           143: L(two_by_two):
        !           144:        C eax   xp low limb
        !           145:        C ebx   wp
        !           146:        C ecx   yp low limb
        !           147:        C edx
        !           148:        C esi   xp
        !           149:        C edi
        !           150:        C ebp
        !           151: deflit(`FRAME',8)
        !           152:
        !           153:        mull    %ecx            C xp[0] * yp[0]
        !           154:
        !           155:        push    %edi
        !           156: deflit(`FRAME',12)
        !           157:        movl    %eax, (%ebx)
        !           158:
        !           159:        movl    4(%esi), %eax
        !           160:        movl    %edx, %edi      C carry, for wp[1]
        !           161:
        !           162:        mull    %ecx            C xp[1] * yp[0]
        !           163:
        !           164:        addl    %eax, %edi
        !           165:        movl    PARAM_YP, %ecx
        !           166:
        !           167:        adcl    $0, %edx
        !           168:
        !           169:        movl    %edi, 4(%ebx)
        !           170:        movl    4(%ecx), %ecx   C yp[1]
        !           171:
        !           172:        movl    4(%esi), %eax   C xp[1]
        !           173:        movl    %edx, %edi      C carry, for wp[2]
        !           174:
        !           175:        mull    %ecx            C xp[1] * yp[1]
        !           176:
        !           177:        addl    %eax, %edi
        !           178:
        !           179:        adcl    $0, %edx
        !           180:
        !           181:        movl    (%esi), %eax    C xp[0]
        !           182:        movl    %edx, %esi      C carry, for wp[3]
        !           183:
        !           184:        mull    %ecx            C xp[0] * yp[1]
        !           185:
        !           186:        addl    %eax, 4(%ebx)
        !           187:        adcl    %edx, %edi
        !           188:        adcl    $0, %esi
        !           189:
        !           190:        movl    %edi, 8(%ebx)
        !           191:        popl    %edi
        !           192:
        !           193:        movl    %esi, 12(%ebx)
        !           194:        popl    %esi
        !           195:
        !           196:        popl    %ebx
        !           197:        ret
        !           198:
        !           199:
        !           200: C -----------------------------------------------------------------------------
        !           201:        ALIGN(16)
        !           202: L(xsize_more_than_two_limbs):
        !           203:
        !           204: C The first limb of yp is processed with a simple mpn_mul_1 style loop
        !           205: C inline.  Unrolling this doesn't seem worthwhile since it's only run once
        !           206: C (whereas the addmul below is run ysize-1 many times).  A call to the
        !           207: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
        !           208: C popping, and doesn't seem likely to be worthwhile on the typical 10-20
        !           209: C limb operations the Karatsuba code calls here with.
        !           210:
        !           211:        C eax   yp[0]
        !           212:        C ebx
        !           213:        C ecx   xsize
        !           214:        C edx   xp
        !           215:        C esi
        !           216:        C edi
        !           217:        C ebp
        !           218: deflit(`FRAME',0)
        !           219:
        !           220:        pushl   %edi            defframe_pushl(SAVE_EDI)
        !           221:        pushl   %ebp            defframe_pushl(SAVE_EBP)
        !           222:
        !           223:        movl    PARAM_WP, %edi
        !           224:        pushl   %esi            defframe_pushl(SAVE_ESI)
        !           225:
        !           226:        movl    %eax, %ebp
        !           227:        pushl   %ebx            defframe_pushl(SAVE_EBX)
        !           228:
        !           229:        leal    (%edx,%ecx,4), %ebx     C xp end
        !           230:        xorl    %esi, %esi
        !           231:
        !           232:        leal    (%edi,%ecx,4), %edi     C wp end of mul1
        !           233:        negl    %ecx
        !           234:
        !           235:
        !           236: L(mul1):
        !           237:        C eax   scratch
        !           238:        C ebx   xp end
        !           239:        C ecx   counter, negative
        !           240:        C edx   scratch
        !           241:        C esi   carry
        !           242:        C edi   wp end of mul1
        !           243:        C ebp   multiplier
        !           244:
        !           245:        movl    (%ebx,%ecx,4), %eax
        !           246:
        !           247:        mull    %ebp
        !           248:
        !           249:        addl    %esi, %eax
        !           250:        movl    $0, %esi
        !           251:
        !           252:        adcl    %edx, %esi
        !           253:
        !           254:        movl    %eax, (%edi,%ecx,4)
        !           255:        incl    %ecx
        !           256:
        !           257:        jnz     L(mul1)
        !           258:
        !           259:
        !           260:        movl    PARAM_YSIZE, %edx
        !           261:        movl    %esi, (%edi)            C final carry
        !           262:
        !           263:        movl    PARAM_XSIZE, %ecx
        !           264:        decl    %edx
        !           265:
        !           266:        jnz     L(ysize_more_than_one_limb)
        !           267:
        !           268:        popl    %ebx
        !           269:        popl    %esi
        !           270:        popl    %ebp
        !           271:        popl    %edi
        !           272:        ret
        !           273:
        !           274:
        !           275: L(ysize_more_than_one_limb):
        !           276:        cmpl    $UNROLL_THRESHOLD, %ecx
        !           277:        movl    PARAM_YP, %eax
        !           278:
        !           279:        jae     L(unroll)
        !           280:
        !           281:
        !           282: C -----------------------------------------------------------------------------
        !           283: C Simple addmul loop.
        !           284: C
        !           285: C Using ebx and edi pointing at the ends of their respective locations saves
        !           286: C a couple of instructions in the outer loop.  The inner loop is still 11
        !           287: C cycles, the same as the simple loop in aorsmul_1.asm.
        !           288:
        !           289:        C eax   yp
        !           290:        C ebx   xp end
        !           291:        C ecx   xsize
        !           292:        C edx   ysize-1
        !           293:        C esi
        !           294:        C edi   wp end of mul1
        !           295:        C ebp
        !           296:
        !           297:        movl    4(%eax), %ebp           C multiplier
        !           298:        negl    %ecx
        !           299:
        !           300:        movl    %ecx, PARAM_XSIZE       C -xsize
        !           301:        xorl    %esi, %esi              C initial carry
        !           302:
        !           303:        leal    4(%eax,%edx,4), %eax    C yp end
        !           304:        negl    %edx
        !           305:
        !           306:        movl    %eax, PARAM_YP
        !           307:        movl    %edx, PARAM_YSIZE
        !           308:
        !           309:        jmp     L(simple_outer_entry)
        !           310:
        !           311:
        !           312:        C aligning here saves a couple of cycles
        !           313:        ALIGN(16)
        !           314: L(simple_outer_top):
        !           315:        C edx   ysize counter, negative
        !           316:
        !           317:        movl    PARAM_YP, %eax          C yp end
        !           318:        xorl    %esi, %esi              C carry
        !           319:
        !           320:        movl    PARAM_XSIZE, %ecx       C -xsize
        !           321:        movl    %edx, PARAM_YSIZE
        !           322:
        !           323:        movl    (%eax,%edx,4), %ebp     C yp limb multiplier
        !           324: L(simple_outer_entry):
        !           325:        addl    $4, %edi
        !           326:
        !           327:
        !           328: L(simple_inner):
        !           329:        C eax   scratch
        !           330:        C ebx   xp end
        !           331:        C ecx   counter, negative
        !           332:        C edx   scratch
        !           333:        C esi   carry
        !           334:        C edi   wp end of this addmul
        !           335:        C ebp   multiplier
        !           336:
        !           337:        movl    (%ebx,%ecx,4), %eax
        !           338:
        !           339:        mull    %ebp
        !           340:
        !           341:        addl    %esi, %eax
        !           342:        movl    $0, %esi
        !           343:
        !           344:        adcl    $0, %edx
        !           345:        addl    %eax, (%edi,%ecx,4)
        !           346:        adcl    %edx, %esi
        !           347:
        !           348:        incl    %ecx
        !           349:        jnz     L(simple_inner)
        !           350:
        !           351:
        !           352:        movl    PARAM_YSIZE, %edx
        !           353:        movl    %esi, (%edi)
        !           354:
        !           355:        incl    %edx
        !           356:        jnz     L(simple_outer_top)
        !           357:
        !           358:
        !           359:        popl    %ebx
        !           360:        popl    %esi
        !           361:        popl    %ebp
        !           362:        popl    %edi
        !           363:        ret
        !           364:
        !           365:
        !           366: C -----------------------------------------------------------------------------
        !           367: C Unrolled loop.
        !           368: C
        !           369: C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
        !           370: C some comments.
        !           371: C
        !           372: C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
        !           373: C 0, inclusive.
        !           374: C
        !           375: C VAR_JMP is the computed jump into the unrolled loop.
        !           376: C
        !           377: C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
        !           378: C is entered.
        !           379: C
        !           380: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
        !           381: C start of the unrolled loop.  This can't just be fetched through the xp
        !           382: C pointer because of the offset applied to it.
        !           383: C
        !           384: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
        !           385: C inclusive.
        !           386: C
        !           387: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
        !           388: C added to give the location of the next limb of yp, which is the multiplier
        !           389: C in the unrolled loop.
        !           390: C
        !           391: C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
        !           392: C to give the starting point in the destination for each unrolled loop (this
        !           393: C point is one limb upwards for each limb of yp processed).
        !           394: C
        !           395: C Having PARAM_YSIZE count negative to zero means it's not necessary to
        !           396: C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
        !           397: C the stack remain constant and on each loop an leal adjusts them with the
        !           398: C PARAM_YSIZE counter value.
        !           399:
        !           400:
        !           401: defframe(VAR_COUNTER,      -20)
        !           402: defframe(VAR_COUNTER_INIT, -24)
        !           403: defframe(VAR_JMP,          -28)
        !           404: defframe(VAR_XP_LOW,       -32)
        !           405: deflit(VAR_STACK_SPACE, 16)
        !           406:
        !           407: dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
        !           408: dnl  slower in this code, hence the defframe empty-if-zero feature is
        !           409: dnl  disabled.
        !           410: dnl
        !           411: dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
        !           412: dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
        !           413: dnl  anything isn't clear.
        !           414: dnl
        !           415: define(`defframe_empty_if_zero_disabled',1)
        !           416:
        !           417: L(unroll):
        !           418:        C eax   yp (not used)
        !           419:        C ebx   xp end (not used)
        !           420:        C ecx   xsize
        !           421:        C edx   ysize-1
        !           422:        C esi
        !           423:        C edi   wp end of mul1 (not used)
        !           424:        C ebp
        !           425: deflit(`FRAME', 16)
        !           426:
        !           427:        leal    -2(%ecx), %ebp  C one limb processed at start,
        !           428:        decl    %ecx            C and ebp is one less
        !           429:
        !           430:        shrl    $UNROLL_LOG2, %ebp
        !           431:        negl    %ecx
        !           432:
        !           433:        subl    $VAR_STACK_SPACE, %esp
        !           434: deflit(`FRAME', 16+VAR_STACK_SPACE)
        !           435:        andl    $UNROLL_MASK, %ecx
        !           436:
        !           437:        movl    %ecx, %esi
        !           438:        shll    $4, %ecx
        !           439:
        !           440:        movl    %ebp, VAR_COUNTER_INIT
        !           441:        negl    %esi
        !           442:
        !           443:        C 15 code bytes per limb
        !           444: ifdef(`PIC',`
        !           445:        call    L(pic_calc)
        !           446: L(unroll_here):
        !           447: ',`
        !           448:        leal    L(unroll_entry) (%ecx,%esi,1), %ecx
        !           449: ')
        !           450:
        !           451:        movl    PARAM_XP, %ebx
        !           452:        movl    %ebp, VAR_COUNTER
        !           453:
        !           454:        movl    PARAM_WP, %edi
        !           455:        movl    %ecx, VAR_JMP
        !           456:
        !           457:        movl    (%ebx), %eax
        !           458:        leal    4(%edi,%esi,4), %edi    C wp adjust for unrolling and mul1
        !           459:
        !           460:        leal    (%ebx,%esi,4), %ebx     C xp adjust for unrolling
        !           461:
        !           462:        movl    %eax, VAR_XP_LOW
        !           463:
        !           464:        movl    %ebx, PARAM_XP
        !           465:        movl    PARAM_YP, %ebx
        !           466:
        !           467:        leal    (%edi,%edx,4), %ecx     C wp adjust for ysize indexing
        !           468:        movl    4(%ebx), %ebp           C multiplier (yp second limb)
        !           469:
        !           470:        leal    4(%ebx,%edx,4), %ebx    C yp adjust for ysize indexing
        !           471:
        !           472:        movl    %ecx, PARAM_WP
        !           473:
        !           474:        leal    1(%esi), %ecx   C adjust parity for decl %ecx above
        !           475:
        !           476:        movl    %ebx, PARAM_YP
        !           477:        negl    %edx
        !           478:
        !           479:        movl    %edx, PARAM_YSIZE
        !           480:        jmp     L(unroll_outer_entry)
        !           481:
        !           482:
        !           483: ifdef(`PIC',`
        !           484: L(pic_calc):
        !           485:        C See README.family about old gas bugs
        !           486:        leal    (%ecx,%esi,1), %ecx
        !           487:        addl    $L(unroll_entry)-L(unroll_here), %ecx
        !           488:        addl    (%esp), %ecx
        !           489:        ret
        !           490: ')
        !           491:
        !           492:
        !           493: C -----------------------------------------------------------------------------
        !           494:        C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
        !           495:        C cost any extra space, since the inner unrolled loop below is
        !           496:        C aligned to 32.
        !           497:        ALIGN(32)
        !           498: L(unroll_outer_top):
        !           499:        C edx   ysize
        !           500:
        !           501:        movl    PARAM_YP, %eax
        !           502:        movl    %edx, PARAM_YSIZE       C incremented ysize counter
        !           503:
        !           504:        movl    PARAM_WP, %edi
        !           505:
        !           506:        movl    VAR_COUNTER_INIT, %ebx
        !           507:        movl    (%eax,%edx,4), %ebp     C next multiplier
        !           508:
        !           509:        movl    PARAM_XSIZE, %ecx
        !           510:        leal    (%edi,%edx,4), %edi     C adjust wp for where we are in yp
        !           511:
        !           512:        movl    VAR_XP_LOW, %eax
        !           513:        movl    %ebx, VAR_COUNTER
        !           514:
        !           515: L(unroll_outer_entry):
        !           516:        mull    %ebp
        !           517:
        !           518:        C using testb is a tiny bit faster than testl
        !           519:        testb   $1, %cl
        !           520:
        !           521:        movl    %eax, %ecx      C low carry
        !           522:        movl    VAR_JMP, %eax
        !           523:
        !           524:        movl    %edx, %esi      C high carry
        !           525:        movl    PARAM_XP, %ebx
        !           526:
        !           527:        jnz     L(unroll_noswap)
        !           528:        movl    %ecx, %esi      C high,low carry other way around
        !           529:
        !           530:        movl    %edx, %ecx
        !           531: L(unroll_noswap):
        !           532:
        !           533:        jmp     *%eax
        !           534:
        !           535:
        !           536:
        !           537: C -----------------------------------------------------------------------------
        !           538:        ALIGN(32)
        !           539: L(unroll_top):
        !           540:        C eax   scratch
        !           541:        C ebx   xp
        !           542:        C ecx   carry low
        !           543:        C edx   scratch
        !           544:        C esi   carry high
        !           545:        C edi   wp
        !           546:        C ebp   multiplier
        !           547:        C VAR_COUNTER  loop counter
        !           548:        C
        !           549:        C 15 code bytes each limb
        !           550:
        !           551:        leal    UNROLL_BYTES(%edi), %edi
        !           552:
        !           553: L(unroll_entry):
        !           554: deflit(CHUNK_COUNT,2)
        !           555: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           556:        deflit(`disp0', eval(i*CHUNK_COUNT*4))
        !           557:        deflit(`disp1', eval(disp0 + 4))
        !           558:        deflit(`disp2', eval(disp1 + 4))
        !           559:
        !           560:        movl    disp1(%ebx), %eax
        !           561:        mull    %ebp
        !           562: Zdisp( addl,   %ecx, disp0,(%edi))
        !           563:        adcl    %eax, %esi
        !           564:        movl    %edx, %ecx
        !           565:        jadcl0( %ecx)
        !           566:
        !           567:        movl    disp2(%ebx), %eax
        !           568:        mull    %ebp
        !           569:        addl    %esi, disp1(%edi)
        !           570:        adcl    %eax, %ecx
        !           571:        movl    %edx, %esi
        !           572:        jadcl0( %esi)
        !           573: ')
        !           574:
        !           575:        decl    VAR_COUNTER
        !           576:        leal    UNROLL_BYTES(%ebx), %ebx
        !           577:
        !           578:        jns     L(unroll_top)
        !           579:
        !           580:
        !           581:        movl    PARAM_YSIZE, %edx
        !           582:        addl    %ecx, UNROLL_BYTES(%edi)
        !           583:
        !           584:        adcl    $0, %esi
        !           585:
        !           586:        incl    %edx
        !           587:        movl    %esi, UNROLL_BYTES+4(%edi)
        !           588:
        !           589:        jnz     L(unroll_outer_top)
        !           590:
        !           591:
        !           592:        movl    SAVE_ESI, %esi
        !           593:        movl    SAVE_EBP, %ebp
        !           594:        movl    SAVE_EDI, %edi
        !           595:        movl    SAVE_EBX, %ebx
        !           596:
        !           597:        addl    $FRAME, %esp
        !           598:        ret
        !           599:
        !           600: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>