[BACK]Return to mul_basecase.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_basecase.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
        !            26: C     unrolling).
        !            27:
        !            28:
        !            29:
1.1       maekawa    30: dnl  K6: UNROLL_COUNT cycles/product (approx)
                     31: dnl           8           9.75
                     32: dnl          16           9.3
                     33: dnl          32           9.3
                     34: dnl  Maximum possible with the current code is 32.
                     35: dnl
                     36: dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
                     37: dnl  might explain it's good performance.
                     38:
                     39: deflit(UNROLL_COUNT, 16)
                     40:
                     41:
                     42: C void mpn_mul_basecase (mp_ptr wp,
                     43: C                        mp_srcptr xp, mp_size_t xsize,
                     44: C                        mp_srcptr yp, mp_size_t ysize);
                     45: C
                     46: C Calculate xp,xsize multiplied by yp,ysize, storing the result in
                     47: C wp,xsize+ysize.
                     48: C
                     49: C This routine is essentially the same as mpn/generic/mul_basecase.c, but
                     50: C it's faster because it does most of the mpn_addmul_1() entry code only
                     51: C once.  The saving is about 10-20% on typical sizes coming from the
                     52: C Karatsuba multiply code.
                     53: C
                     54: C Future:
                     55: C
                     56: C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
                     57: C setups and maybe 2 or 3 wasted cycles at the end.  Code saving would be
                     58: C 256 bytes.
                     59:
                     60: ifdef(`PIC',`
                     61: deflit(UNROLL_THRESHOLD, 8)
                     62: ',`
                     63: deflit(UNROLL_THRESHOLD, 8)
                     64: ')
                     65:
                     66: defframe(PARAM_YSIZE,20)
                     67: defframe(PARAM_YP,   16)
                     68: defframe(PARAM_XSIZE,12)
                     69: defframe(PARAM_XP,   8)
                     70: defframe(PARAM_WP,   4)
                     71:
1.1.1.2 ! ohara      72:        TEXT
1.1       maekawa    73:        ALIGN(32)
                     74: PROLOGUE(mpn_mul_basecase)
                     75: deflit(`FRAME',0)
                     76:
                     77:        movl    PARAM_XSIZE, %ecx
                     78:        movl    PARAM_YP, %eax
                     79:
                     80:        movl    PARAM_XP, %edx
                     81:        movl    (%eax), %eax    C yp low limb
                     82:
                     83:        cmpl    $2, %ecx
                     84:        ja      L(xsize_more_than_two_limbs)
                     85:        je      L(two_by_something)
                     86:
                     87:
                     88:        C one limb by one limb
                     89:
                     90:        movl    (%edx), %edx    C xp low limb
                     91:        movl    PARAM_WP, %ecx
                     92:
                     93:        mull    %edx
                     94:
                     95:        movl    %eax, (%ecx)
                     96:        movl    %edx, 4(%ecx)
                     97:        ret
                     98:
                     99:
                    100: C -----------------------------------------------------------------------------
                    101: L(two_by_something):
                    102:        decl    PARAM_YSIZE
                    103:        pushl   %ebx
                    104: deflit(`FRAME',4)
                    105:
                    106:        movl    PARAM_WP, %ebx
                    107:        pushl   %esi
                    108: deflit(`FRAME',8)
                    109:
                    110:        movl    %eax, %ecx      C yp low limb
                    111:        movl    (%edx), %eax    C xp low limb
                    112:
                    113:        movl    %edx, %esi      C xp
                    114:        jnz     L(two_by_two)
                    115:
                    116:
                    117:        C two limbs by one limb
                    118:
                    119:        mull    %ecx
                    120:
                    121:        movl    %eax, (%ebx)
                    122:        movl    4(%esi), %eax
                    123:
                    124:        movl    %edx, %esi      C carry
                    125:
                    126:        mull    %ecx
                    127:
                    128:        addl    %eax, %esi
                    129:        movl    %esi, 4(%ebx)
                    130:
                    131:        adcl    $0, %edx
                    132:
                    133:        movl    %edx, 8(%ebx)
                    134:        popl    %esi
                    135:
                    136:        popl    %ebx
                    137:        ret
                    138:
                    139:
                    140:
                    141: C -----------------------------------------------------------------------------
                    142:        ALIGN(16)
                    143: L(two_by_two):
                    144:        C eax   xp low limb
                    145:        C ebx   wp
                    146:        C ecx   yp low limb
                    147:        C edx
                    148:        C esi   xp
                    149:        C edi
                    150:        C ebp
                    151: deflit(`FRAME',8)
                    152:
                    153:        mull    %ecx            C xp[0] * yp[0]
                    154:
                    155:        push    %edi
                    156: deflit(`FRAME',12)
                    157:        movl    %eax, (%ebx)
                    158:
                    159:        movl    4(%esi), %eax
                    160:        movl    %edx, %edi      C carry, for wp[1]
                    161:
                    162:        mull    %ecx            C xp[1] * yp[0]
                    163:
                    164:        addl    %eax, %edi
                    165:        movl    PARAM_YP, %ecx
                    166:
                    167:        adcl    $0, %edx
                    168:
                    169:        movl    %edi, 4(%ebx)
                    170:        movl    4(%ecx), %ecx   C yp[1]
                    171:
                    172:        movl    4(%esi), %eax   C xp[1]
                    173:        movl    %edx, %edi      C carry, for wp[2]
                    174:
                    175:        mull    %ecx            C xp[1] * yp[1]
                    176:
                    177:        addl    %eax, %edi
                    178:
                    179:        adcl    $0, %edx
                    180:
                    181:        movl    (%esi), %eax    C xp[0]
                    182:        movl    %edx, %esi      C carry, for wp[3]
                    183:
                    184:        mull    %ecx            C xp[0] * yp[1]
                    185:
                    186:        addl    %eax, 4(%ebx)
                    187:        adcl    %edx, %edi
                    188:        adcl    $0, %esi
                    189:
                    190:        movl    %edi, 8(%ebx)
                    191:        popl    %edi
                    192:
                    193:        movl    %esi, 12(%ebx)
                    194:        popl    %esi
                    195:
                    196:        popl    %ebx
                    197:        ret
                    198:
                    199:
                    200: C -----------------------------------------------------------------------------
                    201:        ALIGN(16)
                    202: L(xsize_more_than_two_limbs):
                    203:
                    204: C The first limb of yp is processed with a simple mpn_mul_1 style loop
                    205: C inline.  Unrolling this doesn't seem worthwhile since it's only run once
                    206: C (whereas the addmul below is run ysize-1 many times).  A call to the
                    207: C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
                    208: C popping, and doesn't seem likely to be worthwhile on the typical 10-20
                    209: C limb operations the Karatsuba code calls here with.
                    210:
                    211:        C eax   yp[0]
                    212:        C ebx
                    213:        C ecx   xsize
                    214:        C edx   xp
                    215:        C esi
                    216:        C edi
                    217:        C ebp
                    218: deflit(`FRAME',0)
                    219:
                    220:        pushl   %edi            defframe_pushl(SAVE_EDI)
                    221:        pushl   %ebp            defframe_pushl(SAVE_EBP)
                    222:
                    223:        movl    PARAM_WP, %edi
                    224:        pushl   %esi            defframe_pushl(SAVE_ESI)
                    225:
                    226:        movl    %eax, %ebp
                    227:        pushl   %ebx            defframe_pushl(SAVE_EBX)
                    228:
                    229:        leal    (%edx,%ecx,4), %ebx     C xp end
                    230:        xorl    %esi, %esi
                    231:
                    232:        leal    (%edi,%ecx,4), %edi     C wp end of mul1
                    233:        negl    %ecx
                    234:
                    235:
                    236: L(mul1):
                    237:        C eax   scratch
                    238:        C ebx   xp end
                    239:        C ecx   counter, negative
                    240:        C edx   scratch
                    241:        C esi   carry
                    242:        C edi   wp end of mul1
                    243:        C ebp   multiplier
                    244:
                    245:        movl    (%ebx,%ecx,4), %eax
                    246:
                    247:        mull    %ebp
                    248:
                    249:        addl    %esi, %eax
                    250:        movl    $0, %esi
                    251:
                    252:        adcl    %edx, %esi
                    253:
                    254:        movl    %eax, (%edi,%ecx,4)
                    255:        incl    %ecx
                    256:
                    257:        jnz     L(mul1)
                    258:
                    259:
                    260:        movl    PARAM_YSIZE, %edx
                    261:        movl    %esi, (%edi)            C final carry
                    262:
                    263:        movl    PARAM_XSIZE, %ecx
                    264:        decl    %edx
                    265:
                    266:        jnz     L(ysize_more_than_one_limb)
                    267:
                    268:        popl    %ebx
                    269:        popl    %esi
                    270:        popl    %ebp
                    271:        popl    %edi
                    272:        ret
                    273:
                    274:
                    275: L(ysize_more_than_one_limb):
                    276:        cmpl    $UNROLL_THRESHOLD, %ecx
                    277:        movl    PARAM_YP, %eax
                    278:
                    279:        jae     L(unroll)
                    280:
                    281:
                    282: C -----------------------------------------------------------------------------
                    283: C Simple addmul loop.
                    284: C
                    285: C Using ebx and edi pointing at the ends of their respective locations saves
                    286: C a couple of instructions in the outer loop.  The inner loop is still 11
                    287: C cycles, the same as the simple loop in aorsmul_1.asm.
                    288:
                    289:        C eax   yp
                    290:        C ebx   xp end
                    291:        C ecx   xsize
                    292:        C edx   ysize-1
                    293:        C esi
                    294:        C edi   wp end of mul1
                    295:        C ebp
                    296:
                    297:        movl    4(%eax), %ebp           C multiplier
                    298:        negl    %ecx
                    299:
                    300:        movl    %ecx, PARAM_XSIZE       C -xsize
                    301:        xorl    %esi, %esi              C initial carry
                    302:
                    303:        leal    4(%eax,%edx,4), %eax    C yp end
                    304:        negl    %edx
                    305:
                    306:        movl    %eax, PARAM_YP
                    307:        movl    %edx, PARAM_YSIZE
                    308:
                    309:        jmp     L(simple_outer_entry)
                    310:
                    311:
                    312:        C aligning here saves a couple of cycles
                    313:        ALIGN(16)
                    314: L(simple_outer_top):
                    315:        C edx   ysize counter, negative
                    316:
                    317:        movl    PARAM_YP, %eax          C yp end
                    318:        xorl    %esi, %esi              C carry
                    319:
                    320:        movl    PARAM_XSIZE, %ecx       C -xsize
                    321:        movl    %edx, PARAM_YSIZE
                    322:
                    323:        movl    (%eax,%edx,4), %ebp     C yp limb multiplier
                    324: L(simple_outer_entry):
                    325:        addl    $4, %edi
                    326:
                    327:
                    328: L(simple_inner):
                    329:        C eax   scratch
                    330:        C ebx   xp end
                    331:        C ecx   counter, negative
                    332:        C edx   scratch
                    333:        C esi   carry
                    334:        C edi   wp end of this addmul
                    335:        C ebp   multiplier
                    336:
                    337:        movl    (%ebx,%ecx,4), %eax
                    338:
                    339:        mull    %ebp
                    340:
                    341:        addl    %esi, %eax
                    342:        movl    $0, %esi
                    343:
                    344:        adcl    $0, %edx
                    345:        addl    %eax, (%edi,%ecx,4)
                    346:        adcl    %edx, %esi
                    347:
                    348:        incl    %ecx
                    349:        jnz     L(simple_inner)
                    350:
                    351:
                    352:        movl    PARAM_YSIZE, %edx
                    353:        movl    %esi, (%edi)
                    354:
                    355:        incl    %edx
                    356:        jnz     L(simple_outer_top)
                    357:
                    358:
                    359:        popl    %ebx
                    360:        popl    %esi
                    361:        popl    %ebp
                    362:        popl    %edi
                    363:        ret
                    364:
                    365:
                    366: C -----------------------------------------------------------------------------
                    367: C Unrolled loop.
                    368: C
                    369: C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
                    370: C some comments.
                    371: C
                    372: C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
                    373: C 0, inclusive.
                    374: C
                    375: C VAR_JMP is the computed jump into the unrolled loop.
                    376: C
                    377: C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
                    378: C is entered.
                    379: C
                    380: C VAR_XP_LOW is the least significant limb of xp, which is needed at the
                    381: C start of the unrolled loop.  This can't just be fetched through the xp
                    382: C pointer because of the offset applied to it.
                    383: C
                    384: C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
                    385: C inclusive.
                    386: C
                    387: C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
                    388: C added to give the location of the next limb of yp, which is the multiplier
                    389: C in the unrolled loop.
                    390: C
                    391: C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
                    392: C to give the starting point in the destination for each unrolled loop (this
                    393: C point is one limb upwards for each limb of yp processed).
                    394: C
                    395: C Having PARAM_YSIZE count negative to zero means it's not necessary to
                    396: C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
                    397: C the stack remain constant and on each loop an leal adjusts them with the
                    398: C PARAM_YSIZE counter value.
                    399:
                    400:
                    401: defframe(VAR_COUNTER,      -20)
                    402: defframe(VAR_COUNTER_INIT, -24)
                    403: defframe(VAR_JMP,          -28)
                    404: defframe(VAR_XP_LOW,       -32)
                    405: deflit(VAR_STACK_SPACE, 16)
                    406:
                    407: dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
                    408: dnl  slower in this code, hence the defframe empty-if-zero feature is
                    409: dnl  disabled.
                    410: dnl
                    411: dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
                    412: dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
                    413: dnl  anything isn't clear.
                    414: dnl
                    415: define(`defframe_empty_if_zero_disabled',1)
                    416:
                    417: L(unroll):
                    418:        C eax   yp (not used)
                    419:        C ebx   xp end (not used)
                    420:        C ecx   xsize
                    421:        C edx   ysize-1
                    422:        C esi
                    423:        C edi   wp end of mul1 (not used)
                    424:        C ebp
                    425: deflit(`FRAME', 16)
                    426:
                    427:        leal    -2(%ecx), %ebp  C one limb processed at start,
                    428:        decl    %ecx            C and ebp is one less
                    429:
                    430:        shrl    $UNROLL_LOG2, %ebp
                    431:        negl    %ecx
                    432:
                    433:        subl    $VAR_STACK_SPACE, %esp
                    434: deflit(`FRAME', 16+VAR_STACK_SPACE)
                    435:        andl    $UNROLL_MASK, %ecx
                    436:
                    437:        movl    %ecx, %esi
                    438:        shll    $4, %ecx
                    439:
                    440:        movl    %ebp, VAR_COUNTER_INIT
                    441:        negl    %esi
                    442:
                    443:        C 15 code bytes per limb
                    444: ifdef(`PIC',`
                    445:        call    L(pic_calc)
                    446: L(unroll_here):
                    447: ',`
                    448:        leal    L(unroll_entry) (%ecx,%esi,1), %ecx
                    449: ')
                    450:
                    451:        movl    PARAM_XP, %ebx
                    452:        movl    %ebp, VAR_COUNTER
                    453:
                    454:        movl    PARAM_WP, %edi
                    455:        movl    %ecx, VAR_JMP
                    456:
                    457:        movl    (%ebx), %eax
                    458:        leal    4(%edi,%esi,4), %edi    C wp adjust for unrolling and mul1
                    459:
                    460:        leal    (%ebx,%esi,4), %ebx     C xp adjust for unrolling
                    461:
                    462:        movl    %eax, VAR_XP_LOW
                    463:
                    464:        movl    %ebx, PARAM_XP
                    465:        movl    PARAM_YP, %ebx
                    466:
                    467:        leal    (%edi,%edx,4), %ecx     C wp adjust for ysize indexing
                    468:        movl    4(%ebx), %ebp           C multiplier (yp second limb)
                    469:
                    470:        leal    4(%ebx,%edx,4), %ebx    C yp adjust for ysize indexing
                    471:
                    472:        movl    %ecx, PARAM_WP
                    473:
                    474:        leal    1(%esi), %ecx   C adjust parity for decl %ecx above
                    475:
                    476:        movl    %ebx, PARAM_YP
                    477:        negl    %edx
                    478:
                    479:        movl    %edx, PARAM_YSIZE
                    480:        jmp     L(unroll_outer_entry)
                    481:
                    482:
                    483: ifdef(`PIC',`
                    484: L(pic_calc):
1.1.1.2 ! ohara     485:        C See mpn/x86/README about old gas bugs
1.1       maekawa   486:        leal    (%ecx,%esi,1), %ecx
                    487:        addl    $L(unroll_entry)-L(unroll_here), %ecx
                    488:        addl    (%esp), %ecx
                    489:        ret
                    490: ')
                    491:
                    492:
                    493: C -----------------------------------------------------------------------------
                    494:        C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
                    495:        C cost any extra space, since the inner unrolled loop below is
                    496:        C aligned to 32.
                    497:        ALIGN(32)
                    498: L(unroll_outer_top):
                    499:        C edx   ysize
                    500:
                    501:        movl    PARAM_YP, %eax
                    502:        movl    %edx, PARAM_YSIZE       C incremented ysize counter
                    503:
                    504:        movl    PARAM_WP, %edi
                    505:
                    506:        movl    VAR_COUNTER_INIT, %ebx
                    507:        movl    (%eax,%edx,4), %ebp     C next multiplier
                    508:
                    509:        movl    PARAM_XSIZE, %ecx
                    510:        leal    (%edi,%edx,4), %edi     C adjust wp for where we are in yp
                    511:
                    512:        movl    VAR_XP_LOW, %eax
                    513:        movl    %ebx, VAR_COUNTER
                    514:
                    515: L(unroll_outer_entry):
                    516:        mull    %ebp
                    517:
                    518:        C using testb is a tiny bit faster than testl
                    519:        testb   $1, %cl
                    520:
                    521:        movl    %eax, %ecx      C low carry
                    522:        movl    VAR_JMP, %eax
                    523:
                    524:        movl    %edx, %esi      C high carry
                    525:        movl    PARAM_XP, %ebx
                    526:
                    527:        jnz     L(unroll_noswap)
                    528:        movl    %ecx, %esi      C high,low carry other way around
                    529:
                    530:        movl    %edx, %ecx
                    531: L(unroll_noswap):
                    532:
                    533:        jmp     *%eax
                    534:
                    535:
                    536:
                    537: C -----------------------------------------------------------------------------
                    538:        ALIGN(32)
                    539: L(unroll_top):
                    540:        C eax   scratch
                    541:        C ebx   xp
                    542:        C ecx   carry low
                    543:        C edx   scratch
                    544:        C esi   carry high
                    545:        C edi   wp
                    546:        C ebp   multiplier
                    547:        C VAR_COUNTER  loop counter
                    548:        C
                    549:        C 15 code bytes each limb
                    550:
                    551:        leal    UNROLL_BYTES(%edi), %edi
                    552:
                    553: L(unroll_entry):
                    554: deflit(CHUNK_COUNT,2)
                    555: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                    556:        deflit(`disp0', eval(i*CHUNK_COUNT*4))
                    557:        deflit(`disp1', eval(disp0 + 4))
                    558:        deflit(`disp2', eval(disp1 + 4))
                    559:
                    560:        movl    disp1(%ebx), %eax
                    561:        mull    %ebp
                    562: Zdisp( addl,   %ecx, disp0,(%edi))
                    563:        adcl    %eax, %esi
                    564:        movl    %edx, %ecx
                    565:        jadcl0( %ecx)
                    566:
                    567:        movl    disp2(%ebx), %eax
                    568:        mull    %ebp
                    569:        addl    %esi, disp1(%edi)
                    570:        adcl    %eax, %ecx
                    571:        movl    %edx, %esi
                    572:        jadcl0( %esi)
                    573: ')
                    574:
                    575:        decl    VAR_COUNTER
                    576:        leal    UNROLL_BYTES(%ebx), %ebx
                    577:
                    578:        jns     L(unroll_top)
                    579:
                    580:
                    581:        movl    PARAM_YSIZE, %edx
                    582:        addl    %ecx, UNROLL_BYTES(%edi)
                    583:
                    584:        adcl    $0, %esi
                    585:
                    586:        incl    %edx
                    587:        movl    %esi, UNROLL_BYTES+4(%edi)
                    588:
                    589:        jnz     L(unroll_outer_top)
                    590:
                    591:
                    592:        movl    SAVE_ESI, %esi
                    593:        movl    SAVE_EBP, %ebp
                    594:        movl    SAVE_EDI, %edi
                    595:        movl    SAVE_EBX, %ebx
                    596:
                    597:        addl    $FRAME, %esp
                    598:        ret
                    599:
                    600: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>