[BACK]Return to mul_basecase.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_basecase.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
                      2: dnl
                      3: dnl  K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
                      4: dnl      limbs/loop unrolling).
                      5:
                      6:
                      7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      8: dnl
                      9: dnl  This file is part of the GNU MP Library.
                     10: dnl
                     11: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     12: dnl  modify it under the terms of the GNU Lesser General Public License as
                     13: dnl  published by the Free Software Foundation; either version 2.1 of the
                     14: dnl  License, or (at your option) any later version.
                     15: dnl
                     16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     19: dnl  Lesser General Public License for more details.
                     20: dnl
                     21: dnl  You should have received a copy of the GNU Lesser General Public
                     22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     24: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     25:
                     26:
                     27: include(`../config.m4')
                     28:
                     29:
                     30: dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
                     31: dnl           8           4.67
                     32: dnl          16           4.59
                     33: dnl          32           4.42
                     34: dnl  Maximum possible with the current code is 32.
                     35: dnl
                     36: dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
                     37: dnl  done with a straight run through a block of code, no inner loop.  Using
                     38: dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
                     39:
                     40: deflit(UNROLL_COUNT, 32)
                     41:
                     42:
                     43: C void mpn_mul_basecase (mp_ptr wp,
                     44: C                        mp_srcptr xp, mp_size_t xsize,
                     45: C                        mp_srcptr yp, mp_size_t ysize);
                     46: C
                     47: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
                     48: C wp,xsize+ysize.
                     49: C
                     50: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
                     51: C it's faster because it does most of the mpn_addmul_1() startup
                     52: C calculations only once.  The saving is 15-25% on typical sizes coming from
                     53: C the Karatsuba multiply code.
                     54: C xsize >= UNROLL_THRESHOLD selects the unrolled addmul code.  PIC and non-PIC are both 5 here.
                     55: ifdef(`PIC',`
                     56: deflit(UNROLL_THRESHOLD, 5)
                     57: ',`
                     58: deflit(UNROLL_THRESHOLD, 5)
                     59: ')
                     60: C Parameter slots, in the order of the prototype above.  Presumably byte offsets from the entry esp, defframe is defined elsewhere.
                     61: defframe(PARAM_YSIZE,20)
                     62: defframe(PARAM_YP,   16)
                     63: defframe(PARAM_XSIZE,12)
                     64: defframe(PARAM_XP,   8)
                     65: defframe(PARAM_WP,   4)
                     66:
                     67:        .text
                     68:        ALIGN(32)
                     69: PROLOGUE(mpn_mul_basecase)
                     70: deflit(`FRAME',0)
                     71:        C Entry: fetch the operands and dispatch on xsize.
                     72:        movl    PARAM_XSIZE, %ecx
                     73:        movl    PARAM_YP, %eax
                     74:
                     75:        movl    PARAM_XP, %edx
                     76:        movl    (%eax), %eax    C yp low limb
                     77:
                     78:        cmpl    $2, %ecx
                     79:        ja      L(xsize_more_than_two)  C xsize > 2, unsigned compare
                     80:        je      L(two_by_something)     C xsize == 2
                     81:
                     82:        C NOTE(review): ysize is not examined on this path, presumably callers guarantee xsize >= ysize >= 1
                     83:        C one limb by one limb (xsize below 2 falls through to here)
                     84:
                     85:        mull    (%edx)          C edx:eax = xp[0] * yp[0]
                     86:
                     87:        movl    PARAM_WP, %ecx
                     88:        movl    %eax, (%ecx)    C wp[0] = low limb
                     89:        movl    %edx, 4(%ecx)   C wp[1] = high limb
                     90:        ret
                     91:
                     92:
                     93: C -----------------------------------------------------------------------------
                     94: L(two_by_something):
                     95: deflit(`FRAME',0)
                     96:        decl    PARAM_YSIZE     C ysize-1 in place, zero flag set if ysize is 1
                     97:        pushl   %ebx            defframe_pushl(`SAVE_EBX')
                     98:        movl    %eax, %ecx      C yp low limb
                     99:
                    100:        movl    PARAM_WP, %ebx
                    101:        pushl   %esi            defframe_pushl(`SAVE_ESI')
                    102:        movl    %edx, %esi      C xp
                    103:
                    104:        movl    (%edx), %eax    C xp low limb
                    105:        jnz     L(two_by_two)   C ysize not 1, flags still from the decl above
                    106:
                    107:
                    108:        C two limbs by one limb
                    109:
                    110:        mull    %ecx            C edx:eax = xp[0] * yp[0]
                    111:
                    112:        movl    %eax, (%ebx)    C wp[0]
                    113:        movl    4(%esi), %eax   C xp[1]
                    114:        movl    %edx, %esi      C carry
                    115:
                    116:        mull    %ecx            C edx:eax = xp[1] * yp[0]
                    117:
                    118:        addl    %eax, %esi      C low limb plus carry, sets the carry flag
                    119:
                    120:        movl    %esi, 4(%ebx)   C wp[1]
                    121:        movl    SAVE_ESI, %esi
                    122:
                    123:        adcl    $0, %edx        C carry flag from the addl, the movls in between leave it alone
                    124:
                    125:        movl    %edx, 8(%ebx)   C wp[2]
                    126:        movl    SAVE_EBX, %ebx
                    127:        addl    $FRAME, %esp    C release the pushed save slots
                    128:
                    129:        ret
                    130:
                    131:
                    132:
                    133: C -----------------------------------------------------------------------------
                    134: C Could load yp earlier into another register.
                    135:
                    136:        ALIGN(16)
                    137: L(two_by_two):
                    138:        C eax   xp low limb
                    139:        C ebx   wp
                    140:        C ecx   yp low limb
                    141:        C edx
                    142:        C esi   xp
                    143:        C edi
                    144:        C ebp
                    145:        C Two limbs by two limbs, the four limb product is stored at wp[0] to wp[3].
                    146: dnl  FRAME carries on from previous
                    147:
                    148:        mull    %ecx            C xp[0] * yp[0]
                    149:
                    150:        pushl   %edi            defframe_pushl(`SAVE_EDI')
                    151:        movl    %edx, %edi      C carry, for wp[1]
                    152:
                    153:        movl    %eax, (%ebx)    C wp[0]
                    154:        movl    4(%esi), %eax   C xp[1]
                    155:
                    156:        mull    %ecx            C xp[1] * yp[0]
                    157:
                    158:        addl    %eax, %edi      C low limb plus carry, sets the carry flag
                    159:        movl    PARAM_YP, %ecx
                    160:
                    161:        adcl    $0, %edx        C carry flag from the addl, movl leaves it alone
                    162:        movl    4(%ecx), %ecx   C yp[1]
                    163:        movl    %edi, 4(%ebx)   C wp[1], first part
                    164:
                    165:        movl    4(%esi), %eax   C xp[1]
                    166:        movl    %edx, %edi      C carry, for wp[2]
                    167:
                    168:        mull    %ecx            C xp[1] * yp[1]
                    169:
                    170:        addl    %eax, %edi      C wp[2] so far
                    171:
                    172:        adcl    $0, %edx
                    173:        movl    (%esi), %eax    C xp[0]
                    174:
                    175:        movl    %edx, %esi      C carry, for wp[3]
                    176:
                    177:        mull    %ecx            C xp[0] * yp[1]
                    178:
                    179:        addl    %eax, 4(%ebx)   C wp[1] += low limb
                    180:        adcl    %edx, %edi      C wp[2] += high limb + carry
                    181:        movl    %edi, 8(%ebx)   C wp[2]
                    182:
                    183:        adcl    $0, %esi        C carry flag from the adcl above into wp[3]
                    184:        movl    SAVE_EDI, %edi
                    185:        movl    %esi, 12(%ebx)  C wp[3]
                    186:
                    187:        movl    SAVE_ESI, %esi
                    188:        movl    SAVE_EBX, %ebx
                    189:        addl    $FRAME, %esp    C release the pushed save slots
                    190:
                    191:        ret
                    192:
                    193:
                    194: C -----------------------------------------------------------------------------
                    195:        ALIGN(16)
                    196: L(xsize_more_than_two):
                    197:
                    198: C The first limb of yp is processed with a simple mpn_mul_1 style loop
                    199: C inline.  Unrolling this doesn't seem worthwhile since it's only run once
                    200: C (whereas the addmul below is run ysize-1 many times).  A call to the
                    201: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
                    202: C popping, and doesn't seem likely to be worthwhile on the typical 13-26
                    203: C limb operations the Karatsuba code calls here with.
                    204:
                    205:        C eax   yp[0]
                    206:        C ebx
                    207:        C ecx   xsize
                    208:        C edx   xp
                    209:        C esi
                    210:        C edi
                    211:        C ebp
                    212:
                    213: dnl  FRAME doesn't carry on from previous, no pushes yet here
                    214: defframe(`SAVE_EBX',-4)
                    215: defframe(`SAVE_ESI',-8)
                    216: defframe(`SAVE_EDI',-12)
                    217: defframe(`SAVE_EBP',-16)
                    218: deflit(`FRAME',0)
                    219:
                    220:        subl    $16, %esp       C room for the four SAVE_ slots
                    221: deflit(`FRAME',16)
                    222:
                    223:        movl    %edi, SAVE_EDI
                    224:        movl    PARAM_WP, %edi
                    225:
                    226:        movl    %ebx, SAVE_EBX
                    227:        movl    %ebp, SAVE_EBP
                    228:        movl    %eax, %ebp      C multiplier, yp[0]
                    229:
                    230:        movl    %esi, SAVE_ESI
                    231:        xorl    %ebx, %ebx      C initial carry
                    232:        leal    (%edx,%ecx,4), %esi     C xp end
                    233:
                    234:        leal    (%edi,%ecx,4), %edi     C wp end of mul1
                    235:        negl    %ecx            C counter runs from -xsize up to zero
                    236:
                    237:
                    238: L(mul1):
                    239:        C eax   scratch
                    240:        C ebx   carry
                    241:        C ecx   counter, negative
                    242:        C edx   scratch
                    243:        C esi   xp end
                    244:        C edi   wp end of mul1
                    245:        C ebp   multiplier
                    246:
                    247:        movl    (%esi,%ecx,4), %eax     C next xp limb
                    248:
                    249:        mull    %ebp            C edx:eax = xp limb * yp[0]
                    250:
                    251:        addl    %ebx, %eax      C add the carry limb, sets the carry flag
                    252:        movl    %eax, (%edi,%ecx,4)     C store the wp limb
                    253:        movl    $0, %ebx        C movl rather than xorl, so the carry flag survives
                    254:
                    255:        adcl    %edx, %ebx      C new carry limb = high limb + carry flag
                    256:        incl    %ecx
                    257:        jnz     L(mul1)
                    258:
                    259:
                    260:        movl    PARAM_YSIZE, %edx
                    261:        movl    PARAM_XSIZE, %ecx
                    262:
                    263:        movl    %ebx, (%edi)            C final carry
                    264:        decl    %edx            C ysize-1, zero flag set if ysize is 1
                    265:
                    266:        jnz     L(ysize_more_than_one)
                    267:
                    268:        C ysize is 1, the product is complete: restore registers and return
                    269:        movl    SAVE_EDI, %edi
                    270:        movl    SAVE_EBX, %ebx
                    271:
                    272:        movl    SAVE_EBP, %ebp
                    273:        movl    SAVE_ESI, %esi
                    274:        addl    $FRAME, %esp
                    275:
                    276:        ret
                    277:
                    278:
                    279: L(ysize_more_than_one):
                    280:        cmpl    $UNROLL_THRESHOLD, %ecx  C xsize against the unrolling threshold
                    281:        movl    PARAM_YP, %eax
                    282:
                    283:        jae     L(unroll)       C xsize >= UNROLL_THRESHOLD, flags from the cmpl
                    284:
                    285:
                    286: C -----------------------------------------------------------------------------
                    287:        C simple addmul looping
                    288:        C
                    289:        C eax   yp
                    290:        C ebx
                    291:        C ecx   xsize
                    292:        C edx   ysize-1
                    293:        C esi   xp end
                    294:        C edi   wp end of mul1
                    295:        C ebp
                    296:
                    297:        leal    4(%eax,%edx,4), %ebp    C yp end
                    298:        negl    %ecx            C -xsize
                    299:        negl    %edx            C -(ysize-1)
                    300:
                    301:        movl    (%esi,%ecx,4), %eax     C xp low limb
                    302:        movl    %edx, PARAM_YSIZE       C -(ysize-1)
                    303:        incl    %ecx
                    304:
                    305:        xorl    %ebx, %ebx              C initial carry
                    306:        movl    %ecx, PARAM_XSIZE       C -(xsize-1)
                    307:        movl    %ebp, PARAM_YP          C yp end, indexed by the negative ysize counter
                    308:
                    309:        movl    (%ebp,%edx,4), %ebp     C yp second lowest limb - multiplier
                    310:        jmp     L(simple_outer_entry)
                    311:
                    312:
                    313:        C this is offset 0x121 so close enough to aligned
                    314: L(simple_outer_top):
                    315:        C ebp   ysize counter, negative
                    316:
                    317:        movl    PARAM_YP, %edx
                    318:        movl    PARAM_XSIZE, %ecx       C -(xsize-1)
                    319:        xorl    %ebx, %ebx              C carry
                    320:
                    321:        movl    %ebp, PARAM_YSIZE       C store the incremented ysize counter
                    322:        addl    $4, %edi                C next position in wp
                    323:
                    324:        movl    (%edx,%ebp,4), %ebp     C yp limb - multiplier
                    325:        movl    -4(%esi,%ecx,4), %eax   C xp low limb
                    326:
                    327:
                    328: L(simple_outer_entry):
                    329:
                    330: L(simple_inner):
                    331:        C eax   xp limb
                    332:        C ebx   carry limb
                    333:        C ecx   loop counter (negative)
                    334:        C edx   scratch
                    335:        C esi   xp end
                    336:        C edi   wp end
                    337:        C ebp   multiplier
                    338:
                    339:        mull    %ebp                    C edx:eax = xp limb * multiplier
                    340:
                    341:        addl    %eax, %ebx              C low limb plus carry limb
                    342:        adcl    $0, %edx
                    343:
                    344:        addl    %ebx, (%edi,%ecx,4)     C accumulate into wp
                    345:        movl    (%esi,%ecx,4), %eax     C next xp limb, movl leaves the carry flag alone
                    346:        adcl    $0, %edx                C carry out of the wp addition
                    347:
                    348:        incl    %ecx
                    349:        movl    %edx, %ebx              C new carry limb
                    350:        jnz     L(simple_inner)         C zero flag from the incl
                    351:
                    352:        C last xp limb of this row, handled outside the loop
                    353:        mull    %ebp
                    354:
                    355:        movl    PARAM_YSIZE, %ebp       C outer loop counter
                    356:        addl    %eax, %ebx
                    357:
                    358:        adcl    $0, %edx
                    359:        addl    %ebx, (%edi)
                    360:
                    361:        adcl    $0, %edx
                    362:        incl    %ebp                    C zero flag set when all yp limbs are done
                    363:
                    364:        movl    %edx, 4(%edi)           C new top limb of the result
                    365:        jnz     L(simple_outer_top)     C zero flag from the incl
                    366:
                    367:
                    368:        movl    SAVE_EBX, %ebx
                    369:        movl    SAVE_ESI, %esi
                    370:
                    371:        movl    SAVE_EDI, %edi
                    372:        movl    SAVE_EBP, %ebp
                    373:        addl    $FRAME, %esp
                    374:
                    375:        ret
                    376:
                    377:
                    378:
                    379: C -----------------------------------------------------------------------------
                    380: C
                    381: C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
                    382: C comments.
                    383: C
                    384: C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
                    385: C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
                    386: C to give an initial VAR_COUNTER at the top of the outer loop.
                    387: C
                    388: C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
                    389: C up to -1, inclusive.
                    390: C
                    391: C VAR_JMP is the computed jump into the unrolled loop.
                    392: C
                    393: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
                    394: C start of the unrolled loop.
                    395: C
                    396: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
                    397: C inclusive.
                    398: C
                    399: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
                    400: C added to give the location of the next limb of yp, which is the multiplier
                    401: C in the unrolled loop.
                    402: C
                    403: C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
                    404: C outer loop to take care of xp, wp and the inner loop counter.
                    405:
                    406: defframe(VAR_COUNTER,  -20)
                    407: defframe(VAR_ADJUST,   -24)
                    408: defframe(VAR_JMP,      -28)
                    409: defframe(VAR_XP_LOW,   -32)
                    410: deflit(VAR_EXTRA_SPACE, 16)
                    411:
                    412:        C Setup for the unrolled addmul code, run once before the first outer iteration.
                    413: L(unroll):
                    414:        C eax   yp
                    415:        C ebx
                    416:        C ecx   xsize
                    417:        C edx   ysize-1
                    418:        C esi   xp end
                    419:        C edi   wp end of mul1
                    420:        C ebp
                    421:
                    422:        movl    PARAM_XP, %esi          C back to xp start
                    423:        movl    4(%eax), %ebp           C multiplier (yp second limb)
                    424:        leal    4(%eax,%edx,4), %eax    C yp adjust for ysize indexing
                    425:
                    426:        movl    PARAM_WP, %edi          C back to wp start
                    427:        movl    %eax, PARAM_YP
                    428:        negl    %edx                    C -(ysize-1)
                    429:
                    430:        movl    %edx, PARAM_YSIZE       C outer loop counter
                    431:        leal    UNROLL_COUNT-2(%ecx), %ebx      C (xsize-1)+UNROLL_COUNT-1
                    432:        decl    %ecx                            C xsize-1
                    433:
                    434:        movl    (%esi), %eax            C xp low limb
                    435:        andl    $-UNROLL_MASK-1, %ebx   C round down to a multiple of UNROLL_COUNT, presuming UNROLL_MASK is UNROLL_COUNT-1 (defined elsewhere)
                    436:        negl    %ecx                    C -(xsize-1)
                    437:
                    438:        subl    $VAR_EXTRA_SPACE, %esp  C room for the four VAR_ slots
                    439: deflit(`FRAME',16+VAR_EXTRA_SPACE)
                    440:        negl    %ebx
                    441:        andl    $UNROLL_MASK, %ecx      C limb offset of the entry point within the unrolled code
                    442:
                    443:        movl    %ebx, VAR_ADJUST
                    444:        movl    %ecx, %edx
                    445:        shll    $4, %ecx                C 16*offset, one more offset is added below to make 17*offset
                    446:
                    447:        sarl    $UNROLL_LOG2, %ebx      C initial VAR_COUNTER, stored at unroll_outer_entry
                    448:
                    449:        C 17 code bytes per limb
                    450: ifdef(`PIC',`
                    451:        call    L(pic_calc)
                    452: L(unroll_here):
                    453: ',`
                    454:        leal    L(unroll_entry) (%ecx,%edx,1), %ecx
                    455: ')
                    456:        negl    %edx                    C -offset
                    457:
                    458:        movl    %eax, VAR_XP_LOW
                    459:        movl    %ecx, VAR_JMP           C computed entry address
                    460:        leal    4(%edi,%edx,4), %edi    C wp and xp, adjust for unrolling,
                    461:        leal    4(%esi,%edx,4), %esi    C  and start at second limb
                    462:        jmp     L(unroll_outer_entry)
                    463:
                    464:
                    465: ifdef(`PIC',`
                    466: L(pic_calc):
                    467:        C See README.family about old gas bugs
                    468:        leal    (%ecx,%edx,1), %ecx
                    469:        addl    $L(unroll_entry)-L(unroll_here), %ecx
                    470:        addl    (%esp), %ecx            C plus the return address, which is unroll_here
                    471:        ret
                    472: ')
                    473:
                    474:
                    475: C --------------------------------------------------------------------------
                    476:        ALIGN(32)
                    477: L(unroll_outer_top):
                    478:        C ebp   ysize counter, negative
                    479:
                    480:        movl    VAR_ADJUST, %ebx
                    481:        movl    PARAM_YP, %edx
                    482:
                    483:        movl    VAR_XP_LOW, %eax
                    484:        movl    %ebp, PARAM_YSIZE       C store incremented ysize counter
                    485:
                    486:        leal    4(%edi,%ebx,4), %edi    C wind wp back by the inner loop increments, then on one limb
                    487:        leal    (%esi,%ebx,4), %esi     C wind xp back by the inner loop increments
                    488:        sarl    $UNROLL_LOG2, %ebx      C initial VAR_COUNTER
                    489:
                    490:        movl    (%edx,%ebp,4), %ebp     C yp next multiplier
                    491:        movl    VAR_JMP, %ecx
                    492:
                    493: L(unroll_outer_entry):
                    494:        mull    %ebp            C edx:eax = xp low limb * multiplier
                    495:
                    496:        testb   $1, %cl         C and clear carry bit
                    497:        movl    %ebx, VAR_COUNTER
                    498:        movl    $0, %ebx        C movl rather than xorl, keeps the flags from the testb
                    499:
                    500:        movl    $0, %ecx
                    501:        cmovz(  %eax, %ecx)     C eax into low carry, zero into high carry limb
                    502:        cmovnz( %eax, %ebx)     C odd entry address, the carry registers swap roles
                    503:
                    504:        C Extra fetch of VAR_JMP is bad, but registers are tight
                    505:        jmp     *VAR_JMP
                    506:
                    507:
                    508: C -----------------------------------------------------------------------------
                    509:        ALIGN(32)
                    510: L(unroll_top):
                    511:        C eax   xp limb
                    512:        C ebx   carry high
                    513:        C ecx   carry low
                    514:        C edx   scratch
                    515:        C esi   xp+8
                    516:        C edi   wp
                    517:        C ebp   yp multiplier limb
                    518:        C
                    519:        C VAR_COUNTER  loop counter, negative
                    520:        C
                    521:        C 17 bytes each limb, which the computed jump address in VAR_JMP relies on
                    522:
                    523: L(unroll_entry):
                    524:        C NOTE(review): Zdisp presumably keeps a displacement byte when disp0 is zero so every limb stays 17 bytes, it is defined elsewhere
                    525: deflit(CHUNK_COUNT,2)
                    526: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                    527:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
                    528:        deflit(`disp1', eval(disp0 + 4))
                    529:
                    530: Zdisp( movl,   disp0,(%esi), %eax)
                    531:        adcl    %edx, %ebx
                    532:
                    533:        mull    %ebp
                    534:
                    535: Zdisp( addl,   %ecx, disp0,(%edi))
                    536:        movl    $0, %ecx
                    537:
                    538:        adcl    %eax, %ebx
                    539:
                    540:
                    541:        movl    disp1(%esi), %eax
                    542:        adcl    %edx, %ecx
                    543:
                    544:        mull    %ebp
                    545:
                    546:        addl    %ebx, disp1(%edi)
                    547:        movl    $0, %ebx
                    548:
                    549:        adcl    %eax, %ecx
                    550: ')
                    551:
                    552:
                    553:        incl    VAR_COUNTER             C incl leaves the carry flag alone
                    554:        leal    UNROLL_BYTES(%esi), %esi        C leal does not touch the flags
                    555:        leal    UNROLL_BYTES(%edi), %edi
                    556:
                    557:        jnz     L(unroll_top)           C zero flag from the incl
                    558:
                    559:
                    560:        C eax
                    561:        C ebx   zero
                    562:        C ecx   low
                    563:        C edx   high
                    564:        C esi
                    565:        C edi   wp, pointing at second last limb
                    566:        C ebp
                    567:        C
                    568:        C carry flag to be added to high
                    569:
                    570: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
                    571: deflit(`disp1', eval(disp0-0 + 4))
                    572:
                    573:        movl    PARAM_YSIZE, %ebp       C outer loop counter
                    574:        adcl    $0, %edx                C pending carry flag into high
                    575:        addl    %ecx, disp0(%edi)       C low into wp
                    576:
                    577:        adcl    $0, %edx
                    578:        incl    %ebp                    C zero flag set when all yp limbs are done
                    579:
                    580:        movl    %edx, disp1(%edi)       C high becomes the new top limb
                    581:        jnz     L(unroll_outer_top)     C zero flag from the incl
                    582:
                    583:
                    584:        movl    SAVE_ESI, %esi
                    585:        movl    SAVE_EBP, %ebp
                    586:
                    587:        movl    SAVE_EDI, %edi
                    588:        movl    SAVE_EBX, %ebx
                    589:        addl    $FRAME, %esp
                    590:
                    591:        ret
                    592:
                    593: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>