OpenXM_contrib/gmp/mpn/x86/k7/aorsmul_1.asm - annotate

Return to aorsmul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7
Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aorsmul_1.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K7: 3.9 cycles/limb.
        !            26: C
        !            27: C Future: It should be possible to avoid the separate mul after the
        !            28: C unrolled loop by moving the movl/adcl to the top.
        !            29:
        !            30:
        !            31:
1.1       maekawa    32: dnl  K7: UNROLL_COUNT  cycles/limb
                     33: dnl           4            4.42
                     34: dnl           8            4.16
                     35: dnl          16            3.9
                     36: dnl          32            3.9
                     37: dnl          64            3.87
                     38: dnl  Maximum possible with the current code is 64.
                     39:
                     40: deflit(UNROLL_COUNT, 16)
                     41:
                     42:
                     43: ifdef(`OPERATION_addmul_1',`
                     44:        define(M4_inst,        addl)
                     45:        define(M4_function_1,  mpn_addmul_1)
                     46:        define(M4_function_1c, mpn_addmul_1c)
                     47:        define(M4_description, add it to)
                     48:        define(M4_desc_retval, carry)
                     49: ',`ifdef(`OPERATION_submul_1',`
                     50:        define(M4_inst,        subl)
                     51:        define(M4_function_1,  mpn_submul_1)
                     52:        define(M4_function_1c, mpn_submul_1c)
                     53:        define(M4_description, subtract it from)
                     54:        define(M4_desc_retval, borrow)
                     55: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
                     56: ')')')
                     57:
                     58: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
                     59:
                     60:
                     61: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     62: C                            mp_limb_t mult);
                     63: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
                     64: C                             mp_limb_t mult, mp_limb_t carry);
                     65: C
                     66: C Calculate src,size multiplied by mult and M4_description dst,size.
                     67: C Return the M4_desc_retval limb from the top of the result.
                     68:
                     69: ifdef(`PIC',`
                     70: deflit(UNROLL_THRESHOLD, 9)
                     71: ',`
                     72: deflit(UNROLL_THRESHOLD, 6)
                     73: ')
                     74:
                     75: defframe(PARAM_CARRY,     20)
                     76: defframe(PARAM_MULTIPLIER,16)
                     77: defframe(PARAM_SIZE,      12)
                     78: defframe(PARAM_SRC,       8)
                     79: defframe(PARAM_DST,       4)
                     80: deflit(`FRAME',0)
                     81:
                     82: defframe(SAVE_EBX, -4)
                     83: defframe(SAVE_ESI, -8)
                     84: defframe(SAVE_EDI, -12)
                     85: defframe(SAVE_EBP, -16)
                     86: deflit(SAVE_SIZE, 16)
                     87:
1.1.1.2 ! ohara      88:        TEXT
1.1       maekawa    89:        ALIGN(32)
                     90: PROLOGUE(M4_function_1)
                     91:        movl    PARAM_SIZE, %edx        C edx = size
                     92:        movl    PARAM_SRC, %eax         C eax = src
                     93:        xorl    %ecx, %ecx              C ecx = 0, this entry point has no carry-in
                     94:
                     95:        decl    %edx                    C edx = size-1, ZF set when size is 1
1.1.1.2 ! ohara      96:        jnz     L(start_1)              C size != 1: general code at L(start_1), which expects eax, ecx, edx as set here
1.1       maekawa    97:        C size == 1: a single mul, done without saving any registers
                     98:        movl    (%eax), %eax            C eax = src[0]
                     99:        movl    PARAM_DST, %ecx         C ecx = dst
                    100:
                    101:        mull    PARAM_MULTIPLIER        C edx:eax = src[0] * mult
                    102:
                    103:        M4_inst %eax, (%ecx)            C addl or subl of the low product limb into dst[0]
                    104:        adcl    $0, %edx                C fold the carry or borrow into the high limb
                    105:        movl    %edx, %eax              C return the high limb
                    106:
                    107:        ret
                    108: EPILOGUE()
                    109:
                    110:        ALIGN(16)
                    111: PROLOGUE(M4_function_1c)
                    112:        movl    PARAM_SIZE, %edx        C edx = size
                    113:        movl    PARAM_SRC, %eax         C eax = src
                    114:
                    115:        decl    %edx                    C edx = size-1, ZF set when size is 1
                    116:        jnz     L(more_than_one_limb)   C size != 1: load the carry-in and use the general code
                    117:        C size == 1: a single mul, done without saving any registers
                    118:        movl    (%eax), %eax            C eax = src[0]
                    119:        movl    PARAM_DST, %ecx         C ecx = dst
                    120:
                    121:        mull    PARAM_MULTIPLIER        C edx:eax = src[0] * mult
                    122:
                    123:        addl    PARAM_CARRY, %eax       C add the carry-in to the low product limb
                    124:
                    125:        adcl    $0, %edx                C carry from that addition goes into the high limb
                    126:        M4_inst %eax, (%ecx)            C addl or subl of the low limb into dst[0]
                    127:
                    128:        adcl    $0, %edx                C fold the carry or borrow from M4_inst into the high limb
                    129:        movl    %edx, %eax              C return the high limb
                    130:
                    131:        ret
                    132:
                    133:
                    134:        C offset 0x44 so close enough to aligned
                    135: L(more_than_one_limb):
                    136:        movl    PARAM_CARRY, %ecx       C carry-in for the M4_function_1c entry point
                    137: L(start_1):
                    138:        C eax   src
                    139:        C ecx   initial carry
                    140:        C edx   size-1
                    141:        subl    $SAVE_SIZE, %esp        C room for the four saved registers
                    142: deflit(`FRAME',16)
                    143:        C NOTE(review): FRAME presumably tells the defframe macros about the esp adjustment - confirm in x86-defs.m4
                    144:        movl    %ebx, SAVE_EBX
                    145:        movl    %esi, SAVE_ESI
                    146:        movl    %edx, %ebx      C size-1
                    147:
                    148:        movl    PARAM_SRC, %esi
                    149:        movl    %ebp, SAVE_EBP
                    150:        cmpl    $UNROLL_THRESHOLD, %edx C flags are used by the ja below, the movl in between leave them alone
                    151:
                    152:        movl    PARAM_MULTIPLIER, %ebp
                    153:        movl    %edi, SAVE_EDI
                    154:
                    155:        movl    (%esi), %eax    C src low limb
                    156:        movl    PARAM_DST, %edi
                    157:        ja      L(unroll)               C size-1 above UNROLL_THRESHOLD (unsigned): use the unrolled loop
                    158:
                    159:
                    160:        C simple loop
                    161:
                    162:        leal    4(%esi,%ebx,4), %esi    C point one limb past last
                    163:        leal    (%edi,%ebx,4), %edi     C point at last limb
                    164:        negl    %ebx                    C ebx = -(size-1), counts up to zero
                    165:
                    166:        C The movl to load the next source limb is done well ahead of the
                    167:        C mul.  This is necessary for full speed, and leads to one limb
                    168:        C handled separately at the end.
                    169:
                    170: L(simple):
                    171:        C eax   src limb
                    172:        C ebx   loop counter
                    173:        C ecx   carry limb
                    174:        C edx   scratch
                    175:        C esi   src
                    176:        C edi   dst
                    177:        C ebp   multiplier
                    178:
                    179:        mull    %ebp                    C edx:eax = src limb * multiplier
                    180:
                    181:        addl    %eax, %ecx              C low product limb plus incoming carry limb
                    182:        adcl    $0, %edx
                    183:
                    184:        M4_inst %ecx, (%edi,%ebx,4)     C addl or subl into the current dst limb
                    185:        movl    (%esi,%ebx,4), %eax     C next src limb, flags untouched
                    186:        adcl    $0, %edx                C carry or borrow from the M4_inst
                    187:
                    188:        incl    %ebx
                    189:        movl    %edx, %ecx              C high limb becomes the next carry limb
                    190:        jnz     L(simple)               C ZF comes from the incl, movl does not change flags
                    191:
                    192:        C last limb, its src value was already loaded into eax by the loop
                    193:        mull    %ebp
                    194:
                    195:        movl    SAVE_EBX, %ebx
                    196:        movl    SAVE_ESI, %esi
                    197:        movl    SAVE_EBP, %ebp
                    198:
                    199:        addl    %eax, %ecx
                    200:        adcl    $0, %edx
                    201:
                    202:        M4_inst %ecx, (%edi)
                    203:        adcl    $0, %edx
                    204:        movl    SAVE_EDI, %edi
                    205:
                    206:        addl    $SAVE_SIZE, %esp        C release the register save area
                    207:        movl    %edx, %eax              C return the final carry or borrow limb
                    208:        ret
                    209:
                    210:
                    211:
                    212: C -----------------------------------------------------------------------------
                    213:        ALIGN(16)
                    214: L(unroll):
                    215:        C eax   src low limb
                    216:        C ebx   size-1
                    217:        C ecx   carry
                    218:        C edx   size-1
                    219:        C esi   src
                    220:        C edi   dst
                    221:        C ebp   multiplier
                    222:        C Sets up the loop counter and a computed jump into the unrolled loop, and multiplies the first limb.
                    223: dnl  these variables reuse the stack slots of parameters no longer needed (size is in ebx and edx, multiplier in ebp)
                    224: define(VAR_COUNTER,`PARAM_SIZE')
                    225: define(VAR_JUMP,   `PARAM_MULTIPLIER')
                    226:
                    227:        subl    $2, %ebx        C (size-2)-1
                    228:        decl    %edx            C size-2
                    229:
                    230:        shrl    $UNROLL_LOG2, %ebx      C NOTE(review): UNROLL_LOG2 is defined outside this file, presumably log2 of UNROLL_COUNT
                    231:        negl    %edx                    C -(size-2)
                    232:
                    233:        movl    %ebx, VAR_COUNTER
                    234:        andl    $UNROLL_MASK, %edx      C n, NOTE(review): UNROLL_MASK is defined outside this file, presumably UNROLL_COUNT-1
                    235:
                    236:        movl    %edx, %ebx              C ebx = n
                    237:        shll    $4, %edx                C edx = 16*n, so edx+ebx = 17*n, the code is 17 bytes per limb
                    238:
                    239: ifdef(`PIC',`
                    240:        call    L(pic_calc)             C leaves edx = address of L(entry) plus 17*n
                    241: L(here):
                    242: ',`
                    243:        leal    L(entry) (%edx,%ebx,1), %edx
                    244: ')
                    245:        negl    %ebx                    C ebx = -n, used to offset the src and dst pointers
                    246:        movl    %edx, VAR_JUMP
                    247:
                    248:        mull    %ebp                    C edx:eax = src low limb * multiplier
                    249:
                    250:        addl    %eax, %ecx      C initial carry, becomes low carry
                    251:        adcl    $0, %edx
                    252:        testb   $1, %bl                 C n odd or even, ZF is kept for the cmovnz below
                    253:
                    254:        movl    4(%esi), %eax   C src second limb
                    255:        leal    ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
                    256:        leal    ifelse(UNROLL_BYTES,256,128)   (%edi,%ebx,4), %edi
                    257:        C the two leal step the pointers back by n limbs, with a 128 byte bias when UNROLL_BYTES is 256
                    258:        movl    %edx, %ebx      C high carry
                    259:        cmovnz( %ecx, %ebx)     C high,low carry other way around
                    260:        cmovnz( %edx, %ecx)
                    261:
                    262:        jmp     *VAR_JUMP               C enter the unrolled loop at L(entry) plus 17*n
                    263:
                    264:
                    265: ifdef(`PIC',`
                    266: L(pic_calc):
1.1.1.2 ! ohara     267:        C See mpn/x86/README about old gas bugs
1.1       maekawa   268:        leal    (%edx,%ebx,1), %edx     C 17*n
                    269:        addl    $L(entry)-L(here), %edx
                    270:        addl    (%esp), %edx            C the return address on the stack is the address of L(here)
                    271:        ret
                    272: ')
                    273:
                    274:
                    275: C -----------------------------------------------------------------------------
                    276: C This code uses a "two carry limbs" scheme.  At the top of the loop the
                    277: C carries are ecx=lo, ebx=hi, then they swap for each limb processed.  For
                    278: C the computed jump an odd size means they start one way around, an even
                    279: C size the other.  Either way one limb is handled separately at the start of
                    280: C the loop.
                    281: C
                    282: C The positioning of the movl to load the next source limb is important.
                    283: C Moving it after the adcl with a view to avoiding a separate mul at the end
                    284: C of the loop slows the code down.
                    285:
                    286:        ALIGN(32)
                    287: L(top):
                    288:        C eax   src limb
                    289:        C ebx   carry high
                    290:        C ecx   carry low
                    291:        C edx   scratch
                    292:        C esi   src+8
                    293:        C edi   dst
                    294:        C ebp   multiplier
                    295:        C
                    296:        C VAR_COUNTER  loop counter
                    297:        C
                    298:        C 17 bytes each limb, matching the 16*n+n offset computed at L(unroll)
                    299:        C movl $0 clears a carry register because, unlike xorl, it leaves the carry flag from M4_inst intact for the following adcl
                    300: L(entry):
                    301: deflit(CHUNK_COUNT,2)
                    302: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                    303:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
                    304:        deflit(`disp1', eval(disp0 + 4))
                    305:
                    306:        mull    %ebp
                    307:
                    308: Zdisp( M4_inst,%ecx, disp0,(%edi))
                    309:        movl    $0, %ecx
                    310:
                    311:        adcl    %eax, %ebx
                    312:
                    313: Zdisp( movl,   disp0,(%esi), %eax)
                    314:        adcl    %edx, %ecx
                    315:
                    316:
                    317:        mull    %ebp
                    318:
                    319:        M4_inst %ebx, disp1(%edi)
                    320:        movl    $0, %ebx
                    321:
                    322:        adcl    %eax, %ecx
                    323:
                    324:        movl    disp1(%esi), %eax
                    325:        adcl    %edx, %ebx
                    326: ')
                    327:
                    328:        decl    VAR_COUNTER             C SF is set once the counter goes below zero
                    329:        leal    UNROLL_BYTES(%esi), %esi        C leal rather than addl, so the flags from decl reach the jns
                    330:        leal    UNROLL_BYTES(%edi), %edi
                    331:
                    332:        jns     L(top)
                    333:
                    334:        C The last limb is handled here, its src value was loaded by the final pass of the loop.
                    335:        C eax   src limb
                    336:        C ebx   carry high
                    337:        C ecx   carry low
                    338:        C edx
                    339:        C esi
                    340:        C edi   dst (points at second last limb)
                    341:        C ebp   multiplier
                    342: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
                    343: deflit(`disp1', eval(disp0-0 + 4))
                    344:
                    345:        mull    %ebp                    C product for the last limb
                    346:
                    347:        M4_inst %ecx, disp0(%edi)       C pending low carry into the second last dst limb
                    348:        movl    SAVE_EBP, %ebp
                    349:
                    350:        adcl    %ebx, %eax              C low product limb + high carry + carry or borrow from the M4_inst
                    351:        movl    SAVE_EBX, %ebx
                    352:        movl    SAVE_ESI, %esi
                    353:
                    354:        adcl    $0, %edx
                    355:        M4_inst %eax, disp1(%edi)       C last dst limb
                    356:        movl    SAVE_EDI, %edi
                    357:
                    358:        adcl    $0, %edx                C final carry or borrow
                    359:        addl    $SAVE_SIZE, %esp        C release the register save area
                    360:
                    361:        movl    %edx, %eax              C return the final carry or borrow limb
                    362:        ret
                    363:
                    364: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>