[BACK]Return to mul_basecase.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_basecase.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
        !             2: dnl
        !             3: dnl  K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
        !             4: dnl      limbs/loop unrolling).
        !             5:
        !             6:
        !             7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             8: dnl
        !             9: dnl  This file is part of the GNU MP Library.
        !            10: dnl
        !            11: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            12: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            13: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            14: dnl  License, or (at your option) any later version.
        !            15: dnl
        !            16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            19: dnl  Lesser General Public License for more details.
        !            20: dnl
        !            21: dnl  You should have received a copy of the GNU Lesser General Public
        !            22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            24: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            25:
        !            26:
        !            27: include(`../config.m4')
        !            28:
        !            29:
        !            30: dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
        !            31: dnl           8           4.67
        !            32: dnl          16           4.59
        !            33: dnl          32           4.42
        !            34: dnl  Maximum possible with the current code is 32.
        !            35: dnl
        !            36: dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
        !            37: dnl  done with a straight run through a block of code, no inner loop.  Using
        !            38: dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
        !            39:
        !            40: deflit(UNROLL_COUNT, 32)
        !            41:
        !            42:
        !            43: C void mpn_mul_basecase (mp_ptr wp,
        !            44: C                        mp_srcptr xp, mp_size_t xsize,
        !            45: C                        mp_srcptr yp, mp_size_t ysize);
        !            46: C
        !            47: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
        !            48: C wp,xsize+ysize.
        !            49: C
        !            50: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
        !            51: C it's faster because it does most of the mpn_addmul_1() startup
        !            52: C calculations only once.  The saving is 15-25% on typical sizes coming from
        !            53: C the Karatsuba multiply code.
        !            54:
        !            55: ifdef(`PIC',`
        !            56: deflit(UNROLL_THRESHOLD, 5)
        !            57: ',`
        !            58: deflit(UNROLL_THRESHOLD, 5)
        !            59: ')
        !            60: C Both builds currently get the same threshold.  An xsize at or above it takes the unrolled addmul code.
        !            61: defframe(PARAM_YSIZE,20)        C ysize, fifth argument in the prototype above
        !            62: defframe(PARAM_YP,   16)        C yp, fourth argument
        !            63: defframe(PARAM_XSIZE,12)        C xsize, third argument
        !            64: defframe(PARAM_XP,   8)         C xp, second argument
        !            65: defframe(PARAM_WP,   4)         C wp, first argument
        !            66:
        !            67:        .text
        !            68:        ALIGN(32)
        !            69: PROLOGUE(mpn_mul_basecase)
        !            70: deflit(`FRAME',0)
        !            71: C Entry: fetch xsize, xp and yp[0], then pick a code path from xsize.
        !            72:        movl    PARAM_XSIZE, %ecx       C ecx = xsize
        !            73:        movl    PARAM_YP, %eax          C eax = yp
        !            74:
        !            75:        movl    PARAM_XP, %edx          C edx = xp
        !            76:        movl    (%eax), %eax    C yp low limb
        !            77:
        !            78:        cmpl    $2, %ecx                C one compare feeds both branches below
        !            79:        ja      L(xsize_more_than_two)  C unsigned xsize > 2
        !            80:        je      L(two_by_something)     C xsize == 2
        !            81:
        !            82:
        !            83:        C one limb by one limb
        !            84:        C Reached when xsize is below 2.  ysize is not read on this path.
        !            85:        mull    (%edx)                  C edx:eax = xp[0] * yp[0]
        !            86:
        !            87:        movl    PARAM_WP, %ecx          C ecx = wp
        !            88:        movl    %eax, (%ecx)            C wp[0] = low limb
        !            89:        movl    %edx, 4(%ecx)           C wp[1] = high limb
        !            90:        ret
        !            91:
        !            92:
        !            93: C -----------------------------------------------------------------------------
        !            94: L(two_by_something):
        !            95: deflit(`FRAME',0)
        !            96:        decl    PARAM_YSIZE     C ysize-1 in place, the jnz below tests this result
        !            97:        pushl   %ebx            defframe_pushl(`SAVE_EBX')
        !            98:        movl    %eax, %ecx      C yp low limb
        !            99:
        !           100:        movl    PARAM_WP, %ebx  C ebx = wp
        !           101:        pushl   %esi            defframe_pushl(`SAVE_ESI')
        !           102:        movl    %edx, %esi      C xp
        !           103:
        !           104:        movl    (%edx), %eax    C xp low limb
        !           105:        jnz     L(two_by_two)   C ysize was not 1 - pushl and movl kept the decl flags
        !           106:
        !           107:
        !           108:        C two limbs by one limb
        !           109:
        !           110:        mull    %ecx            C xp[0] * yp[0]
        !           111:
        !           112:        movl    %eax, (%ebx)    C wp[0]
        !           113:        movl    4(%esi), %eax   C xp[1]
        !           114:        movl    %edx, %esi      C carry
        !           115:
        !           116:        mull    %ecx            C xp[1] * yp[0]
        !           117:
        !           118:        addl    %eax, %esi
        !           119:
        !           120:        movl    %esi, 4(%ebx)   C wp[1]
        !           121:        movl    SAVE_ESI, %esi
        !           122:
        !           123:        adcl    $0, %edx        C carry from the addl, the two movl leave flags alone
        !           124:
        !           125:        movl    %edx, 8(%ebx)   C wp[2]
        !           126:        movl    SAVE_EBX, %ebx
        !           127:        addl    $FRAME, %esp    C undo the two pushes
        !           128:
        !           129:        ret
        !           130:
        !           131:
        !           132:
        !           133: C -----------------------------------------------------------------------------
        !           134: C Could load yp earlier into another register.
        !           135:
        !           136:        ALIGN(16)
        !           137: L(two_by_two):
        !           138:        C eax   xp low limb
        !           139:        C ebx   wp
        !           140:        C ecx   yp low limb
        !           141:        C edx
        !           142:        C esi   xp
        !           143:        C edi
        !           144:        C ebp
        !           145:        C Four mull instructions form the product, stored to wp[0] through wp[3].
        !           146: dnl  FRAME carries on from previous
        !           147:
        !           148:        mull    %ecx            C xp[0] * yp[0]
        !           149:
        !           150:        push    %edi            defframe_pushl(`SAVE_EDI')
        !           151:        movl    %edx, %edi      C carry, for wp[1]
        !           152:
        !           153:        movl    %eax, (%ebx)    C wp[0] is final
        !           154:        movl    4(%esi), %eax   C xp[1]
        !           155:
        !           156:        mull    %ecx            C xp[1] * yp[0]
        !           157:
        !           158:        addl    %eax, %edi
        !           159:        movl    PARAM_YP, %ecx  C yp pointer, movl keeps the addl carry
        !           160:
        !           161:        adcl    $0, %edx        C carry from the addl into the high limb
        !           162:        movl    4(%ecx), %ecx   C yp[1]
        !           163:        movl    %edi, 4(%ebx)   C wp[1] partial, xp[0]*yp[1] is added later
        !           164:
        !           165:        movl    4(%esi), %eax   C xp[1]
        !           166:        movl    %edx, %edi      C carry, for wp[2]
        !           167:
        !           168:        mull    %ecx            C xp[1] * yp[1]
        !           169:
        !           170:        addl    %eax, %edi
        !           171:
        !           172:        adcl    $0, %edx
        !           173:        movl    (%esi), %eax    C xp[0]
        !           174:
        !           175:        movl    %edx, %esi      C carry, for wp[3]
        !           176:
        !           177:        mull    %ecx            C xp[0] * yp[1]
        !           178:
        !           179:        addl    %eax, 4(%ebx)   C wp[1] complete
        !           180:        adcl    %edx, %edi
        !           181:        movl    %edi, 8(%ebx)   C wp[2]
        !           182:
        !           183:        adcl    $0, %esi        C carry out of the wp[2] sum, kept across the movl
        !           184:        movl    SAVE_EDI, %edi
        !           185:        movl    %esi, 12(%ebx)  C wp[3]
        !           186:
        !           187:        movl    SAVE_ESI, %esi
        !           188:        movl    SAVE_EBX, %ebx
        !           189:        addl    $FRAME, %esp    C undo the three pushes
        !           190:
        !           191:        ret
        !           192:
        !           193:
        !           194: C -----------------------------------------------------------------------------
        !           195:        ALIGN(16)
        !           196: L(xsize_more_than_two):
        !           197:
        !           198: C The first limb of yp is processed with a simple mpn_mul_1 style loop
        !           199: C inline.  Unrolling this doesn't seem worthwhile since it's only run once
        !           200: C (whereas the addmul below is run ysize-1 many times).  A call to the
        !           201: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
        !           202: C popping, and doesn't seem likely to be worthwhile on the typical 13-26
        !           203: C limb operations the Karatsuba code calls here with.
        !           204:
        !           205:        C eax   yp[0]
        !           206:        C ebx
        !           207:        C ecx   xsize
        !           208:        C edx   xp
        !           209:        C esi
        !           210:        C edi
        !           211:        C ebp
        !           212:
        !           213: dnl  FRAME doesn't carry on from previous, no pushes yet here
        !           214: defframe(`SAVE_EBX',-4)
        !           215: defframe(`SAVE_ESI',-8)
        !           216: defframe(`SAVE_EDI',-12)
        !           217: defframe(`SAVE_EBP',-16)
        !           218: deflit(`FRAME',0)
        !           219:
        !           220:        subl    $16, %esp       C four register save slots
        !           221: deflit(`FRAME',16)
        !           222:
        !           223:        movl    %edi, SAVE_EDI
        !           224:        movl    PARAM_WP, %edi  C edi = wp
        !           225:
        !           226:        movl    %ebx, SAVE_EBX
        !           227:        movl    %ebp, SAVE_EBP
        !           228:        movl    %eax, %ebp      C ebp = yp[0], the mul1 multiplier
        !           229:
        !           230:        movl    %esi, SAVE_ESI
        !           231:        xorl    %ebx, %ebx      C carry starts at zero
        !           232:        leal    (%edx,%ecx,4), %esi     C xp end
        !           233:
        !           234:        leal    (%edi,%ecx,4), %edi     C wp end of mul1
        !           235:        negl    %ecx            C -xsize, counts up to zero
        !           236:
        !           237:
        !           238: L(mul1):
        !           239:        C eax   scratch
        !           240:        C ebx   carry
        !           241:        C ecx   counter, negative
        !           242:        C edx   scratch
        !           243:        C esi   xp end
        !           244:        C edi   wp end of mul1
        !           245:        C ebp   multiplier
        !           246:
        !           247:        movl    (%esi,%ecx,4), %eax     C next xp limb
        !           248:
        !           249:        mull    %ebp                    C edx:eax = xp limb * yp[0]
        !           250:
        !           251:        addl    %ebx, %eax              C add carry in
        !           252:        movl    %eax, (%edi,%ecx,4)     C store wp limb
        !           253:        movl    $0, %ebx                C movl not xorl, the addl carry must survive
        !           254:
        !           255:        adcl    %edx, %ebx              C next carry = high limb + carry flag
        !           256:        incl    %ecx
        !           257:        jnz     L(mul1)
        !           258:
        !           259:
        !           260:        movl    PARAM_YSIZE, %edx       C edx = ysize
        !           261:        movl    PARAM_XSIZE, %ecx       C ecx = xsize again
        !           262:
        !           263:        movl    %ebx, (%edi)            C final carry
        !           264:        decl    %edx                    C ysize-1
        !           265:
        !           266:        jnz     L(ysize_more_than_one)
        !           267:
        !           268:        C ysize was 1 so the product is complete, restore registers and return
        !           269:        movl    SAVE_EDI, %edi
        !           270:        movl    SAVE_EBX, %ebx
        !           271:
        !           272:        movl    SAVE_EBP, %ebp
        !           273:        movl    SAVE_ESI, %esi
        !           274:        addl    $FRAME, %esp
        !           275:
        !           276:        ret
        !           277:
        !           278:
        !           279: L(ysize_more_than_one):
        !           280:        cmpl    $UNROLL_THRESHOLD, %ecx
        !           281:        movl    PARAM_YP, %eax          C eax = yp, movl keeps the cmpl flags
        !           282:
        !           283:        jae     L(unroll)               C xsize >= UNROLL_THRESHOLD
        !           284:
        !           285:
        !           286: C -----------------------------------------------------------------------------
        !           287:        C simple addmul looping
        !           288:        C
        !           289:        C eax   yp
        !           290:        C ebx
        !           291:        C ecx   xsize
        !           292:        C edx   ysize-1
        !           293:        C esi   xp end
        !           294:        C edi   wp end of mul1
        !           295:        C ebp
        !           296:
        !           297:        leal    4(%eax,%edx,4), %ebp    C yp end
        !           298:        negl    %ecx                    C -xsize
        !           299:        negl    %edx                    C -(ysize-1)
        !           300:
        !           301:        movl    (%esi,%ecx,4), %eax     C xp low limb
        !           302:        movl    %edx, PARAM_YSIZE       C -(ysize-1)
        !           303:        incl    %ecx
        !           304:
        !           305:        xorl    %ebx, %ebx              C initial carry
        !           306:        movl    %ecx, PARAM_XSIZE       C -(xsize-1)
        !           307:        movl    %ebp, PARAM_YP          C yp end overwrites the yp parameter slot
        !           308:
        !           309:        movl    (%ebp,%edx,4), %ebp     C yp second lowest limb - multiplier
        !           310:        jmp     L(simple_outer_entry)
        !           311:
        !           312:
        !           313:        C this is offset 0x121 so close enough to aligned
        !           314: L(simple_outer_top):
        !           315:        C ebp   ysize counter, negative
        !           316:
        !           317:        movl    PARAM_YP, %edx
        !           318:        movl    PARAM_XSIZE, %ecx       C -(xsize-1)
        !           319:        xorl    %ebx, %ebx              C carry
        !           320:
        !           321:        movl    %ebp, PARAM_YSIZE
        !           322:        addl    $4, %edi                C next position in wp
        !           323:
        !           324:        movl    (%edx,%ebp,4), %ebp     C yp limb - multiplier
        !           325:        movl    -4(%esi,%ecx,4), %eax   C xp low limb
        !           326:
        !           327:
        !           328: L(simple_outer_entry):
        !           329:
        !           330: L(simple_inner):
        !           331:        C eax   xp limb
        !           332:        C ebx   carry limb
        !           333:        C ecx   loop counter (negative)
        !           334:        C edx   scratch
        !           335:        C esi   xp end
        !           336:        C edi   wp end
        !           337:        C ebp   multiplier
        !           338:
        !           339:        mull    %ebp                    C edx:eax = xp limb * multiplier
        !           340:
        !           341:        addl    %eax, %ebx              C low limb + carry in
        !           342:        adcl    $0, %edx
        !           343:
        !           344:        addl    %ebx, (%edi,%ecx,4)     C accumulate into wp
        !           345:        movl    (%esi,%ecx,4), %eax     C next xp limb, movl keeps the addl carry
        !           346:        adcl    $0, %edx                C carry from the wp add
        !           347:
        !           348:        incl    %ecx
        !           349:        movl    %edx, %ebx              C high limb is the next carry in
        !           350:        jnz     L(simple_inner)         C tests the incl result
        !           351:
        !           352:
        !           353:        mull    %ebp                    C last xp limb, fetched in the final loop pass
        !           354:
        !           355:        movl    PARAM_YSIZE, %ebp       C outer counter
        !           356:        addl    %eax, %ebx
        !           357:
        !           358:        adcl    $0, %edx
        !           359:        addl    %ebx, (%edi)
        !           360:
        !           361:        adcl    $0, %edx
        !           362:        incl    %ebp
        !           363:
        !           364:        movl    %edx, 4(%edi)           C new top limb of the partial product
        !           365:        jnz     L(simple_outer_top)     C tests the incl %ebp result
        !           366:
        !           367:
        !           368:        movl    SAVE_EBX, %ebx
        !           369:        movl    SAVE_ESI, %esi
        !           370:
        !           371:        movl    SAVE_EDI, %edi
        !           372:        movl    SAVE_EBP, %ebp
        !           373:        addl    $FRAME, %esp
        !           374:
        !           375:        ret
        !           376:
        !           377:
        !           378:
        !           379: C -----------------------------------------------------------------------------
        !           380: C
        !           381: C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
        !           382: C comments.
        !           383: C
        !           384: C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
        !           385: C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
        !           386: C to give an initial VAR_COUNTER at the top of the outer loop.
        !           387: C
        !           388: C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
        !           389: C up to -1, inclusive.
        !           390: C
        !           391: C VAR_JMP is the computed jump into the unrolled loop.
        !           392: C
        !           393: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
        !           394: C start of the unrolled loop.
        !           395: C
        !           396: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
        !           397: C inclusive.
        !           398: C
        !           399: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
        !           400: C added to give the location of the next limb of yp, which is the multiplier
        !           401: C in the unrolled loop.
        !           402: C
        !           403: C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
        !           404: C outer loop to take care of xp, wp and the inner loop counter.
        !           405:
        !           406: defframe(VAR_COUNTER,  -20)
        !           407: defframe(VAR_ADJUST,   -24)
        !           408: defframe(VAR_JMP,      -28)
        !           409: defframe(VAR_XP_LOW,   -32)
        !           410: deflit(VAR_EXTRA_SPACE, 16)
        !           411:
        !           412:
        !           413: L(unroll):
        !           414:        C eax   yp
        !           415:        C ebx
        !           416:        C ecx   xsize
        !           417:        C edx   ysize-1
        !           418:        C esi   xp end
        !           419:        C edi   wp end of mul1
        !           420:        C ebp
        !           421:
        !           422:        movl    PARAM_XP, %esi          C back to the start of xp
        !           423:        movl    4(%eax), %ebp           C multiplier (yp second limb)
        !           424:        leal    4(%eax,%edx,4), %eax    C yp adjust for ysize indexing
        !           425:
        !           426:        movl    PARAM_WP, %edi          C back to the start of wp
        !           427:        movl    %eax, PARAM_YP
        !           428:        negl    %edx                    C -(ysize-1)
        !           429:
        !           430:        movl    %edx, PARAM_YSIZE       C outer loop counter
        !           431:        leal    UNROLL_COUNT-2(%ecx), %ebx      C (xsize-1)+UNROLL_COUNT-1
        !           432:        decl    %ecx                            C xsize-1
        !           433:
        !           434:        movl    (%esi), %eax            C xp low limb
        !           435:        andl    $-UNROLL_MASK-1, %ebx   C NOTE(review): rounds down to a multiple of UNROLL_COUNT if UNROLL_MASK is UNROLL_COUNT-1 - confirm
        !           436:        negl    %ecx                    C -(xsize-1)
        !           437:
        !           438:        subl    $VAR_EXTRA_SPACE, %esp  C room for the four VAR_ slots
        !           439: deflit(`FRAME',16+VAR_EXTRA_SPACE)
        !           440:        negl    %ebx
        !           441:        andl    $UNROLL_MASK, %ecx      C limbs of unrolled code to skip at entry - presumably taken mod UNROLL_COUNT
        !           442:
        !           443:        movl    %ebx, VAR_ADJUST
        !           444:        movl    %ecx, %edx
        !           445:        shll    $4, %ecx                C 16*skip, edx is added below to make 17*skip
        !           446:
        !           447:        sarl    $UNROLL_LOG2, %ebx      C becomes the initial VAR_COUNTER value
        !           448:
        !           449:        C 17 code bytes per limb
        !           450: ifdef(`PIC',`
        !           451:        call    L(pic_calc)
        !           452: L(unroll_here):
        !           453: ',`
        !           454:        leal    L(unroll_entry) (%ecx,%edx,1), %ecx
        !           455: ')
        !           456:        negl    %edx                    C -skip, moves the pointers back below
        !           457:
        !           458:        movl    %eax, VAR_XP_LOW
        !           459:        movl    %ecx, VAR_JMP           C entry address, fetched again by each outer pass
        !           460:        leal    4(%edi,%edx,4), %edi    C wp and xp, adjust for unrolling,
        !           461:        leal    4(%esi,%edx,4), %esi    C  and start at second limb
        !           462:        jmp     L(unroll_outer_entry)
        !           463:
        !           464:
        !           465: ifdef(`PIC',`
        !           466: L(pic_calc):
        !           467:        C See README.family about old gas bugs
        !           468:        leal    (%ecx,%edx,1), %ecx     C 17*skip
        !           469:        addl    $L(unroll_entry)-L(unroll_here), %ecx
        !           470:        addl    (%esp), %ecx            C return address on the stack is unroll_here
        !           471:        ret
        !           472: ')
        !           473:
        !           474:
        !           475: C --------------------------------------------------------------------------
        !           476:        ALIGN(32)
        !           477: L(unroll_outer_top):
        !           478:        C ebp   ysize counter, negative
        !           479:
        !           480:        movl    VAR_ADJUST, %ebx
        !           481:        movl    PARAM_YP, %edx
        !           482:
        !           483:        movl    VAR_XP_LOW, %eax
        !           484:        movl    %ebp, PARAM_YSIZE       C store incremented ysize counter
        !           485:
        !           486:        leal    4(%edi,%ebx,4), %edi    C wp back by VAR_ADJUST limbs, then on one limb
        !           487:        leal    (%esi,%ebx,4), %esi     C xp back by VAR_ADJUST limbs
        !           488:        sarl    $UNROLL_LOG2, %ebx      C becomes the initial VAR_COUNTER value
        !           489:
        !           490:        movl    (%edx,%ebp,4), %ebp     C yp next multiplier
        !           491:        movl    VAR_JMP, %ecx
        !           492:
        !           493: L(unroll_outer_entry):
        !           494:        mull    %ebp                    C xp low limb * multiplier
        !           495:
        !           496:        testb   $1, %cl         C and clear carry bit
        !           497:        movl    %ebx, VAR_COUNTER
        !           498:        movl    $0, %ebx                C movl not xorl, keeps the testb flags
        !           499:
        !           500:        movl    $0, %ecx
        !           501:        cmovz(  %eax, %ecx)     C eax into low carry, zero into high carry limb
        !           502:        cmovnz( %eax, %ebx)     C odd entry address, eax goes in ebx instead
        !           503:
        !           504:        C Extra fetch of VAR_JMP is bad, but registers are tight
        !           505:        jmp     *VAR_JMP
        !           506:
        !           507:
        !           508: C -----------------------------------------------------------------------------
        !           509:        ALIGN(32)
        !           510: L(unroll_top):
        !           511:        C eax   xp limb
        !           512:        C ebx   carry high
        !           513:        C ecx   carry low
        !           514:        C edx   scratch
        !           515:        C esi   xp+8
        !           516:        C edi   wp
        !           517:        C ebp   yp multiplier limb
        !           518:        C
        !           519:        C VAR_COUNTER  loop counter, negative
        !           520:        C
        !           521:        C 17 bytes each limb
        !           522:
        !           523: L(unroll_entry):
        !           524:        C Each pass of the m4 loop below emits the code for two limbs.
        !           525: deflit(CHUNK_COUNT,2)
        !           526: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           527:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        !           528:        deflit(`disp1', eval(disp0 + 4))
        !           529:
        !           530: Zdisp( movl,   disp0,(%esi), %eax)
        !           531:        adcl    %edx, %ebx
        !           532:
        !           533:        mull    %ebp
        !           534:
        !           535: Zdisp( addl,   %ecx, disp0,(%edi))
        !           536:        movl    $0, %ecx
        !           537:
        !           538:        adcl    %eax, %ebx
        !           539:
        !           540:
        !           541:        movl    disp1(%esi), %eax
        !           542:        adcl    %edx, %ecx
        !           543:
        !           544:        mull    %ebp
        !           545:
        !           546:        addl    %ebx, disp1(%edi)
        !           547:        movl    $0, %ebx
        !           548:
        !           549:        adcl    %eax, %ecx
        !           550: ')
        !           551:
        !           552:
        !           553:        incl    VAR_COUNTER             C incl leaves the carry flag for the adcl chain
        !           554:        leal    UNROLL_BYTES(%esi), %esi        C leal changes no flags
        !           555:        leal    UNROLL_BYTES(%edi), %edi
        !           556:
        !           557:        jnz     L(unroll_top)           C tests the incl VAR_COUNTER result
        !           558:
        !           559:
        !           560:        C eax
        !           561:        C ebx   zero
        !           562:        C ecx   low
        !           563:        C edx   high
        !           564:        C esi
        !           565:        C edi   wp, pointing at second last limb
        !           566:        C ebp
        !           567:        C
        !           568:        C carry flag to be added to high
        !           569:
        !           570: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
        !           571: deflit(`disp1', eval(disp0-0 + 4))
        !           572:
        !           573:        movl    PARAM_YSIZE, %ebp       C outer counter
        !           574:        adcl    $0, %edx                C pending carry flag into high
        !           575:        addl    %ecx, disp0(%edi)       C low limb into wp
        !           576:
        !           577:        adcl    $0, %edx                C carry from the wp add
        !           578:        incl    %ebp
        !           579:
        !           580:        movl    %edx, disp1(%edi)       C new top limb
        !           581:        jnz     L(unroll_outer_top)     C tests the incl %ebp result
        !           582:
        !           583:
        !           584:        movl    SAVE_ESI, %esi
        !           585:        movl    SAVE_EBP, %ebp
        !           586:
        !           587:        movl    SAVE_EDI, %edi
        !           588:        movl    SAVE_EBX, %ebx
        !           589:        addl    $FRAME, %esp
        !           590:
        !           591:        ret
        !           592:
        !           593: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>