OpenXM_contrib/gmp/mpn/x86/p6/aorsmul_1.asm - annotate

Return to aorsmul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / p6
Annotation of OpenXM_contrib/gmp/mpn/x86/p6/aorsmul_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
        !             2: dnl
        !             3: dnl  P6: 6.35 cycles/limb (at 16 limbs/loop).
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: dnl  P6 UNROLL_COUNT cycles/limb
        !            30: dnl          8           6.7
        !            31: dnl         16           6.35
        !            32: dnl         32           6.3
        !            33: dnl         64           6.3
        !            34: dnl  Maximum possible with the current code is 64.
        !            35:
        !            36: deflit(UNROLL_COUNT, 16)       C limbs handled per pass of the unrolled loop
        !            37: dnl  Pick the add or subtract flavour.  One of OPERATION_addmul_1 or
        !            38: dnl  OPERATION_submul_1 must be defined, otherwise m4_error is invoked.
        !            39: ifdef(`OPERATION_addmul_1', `
        !            40:        define(M4_inst,        addl)
        !            41:        define(M4_function_1,  mpn_addmul_1)
        !            42:        define(M4_function_1c, mpn_addmul_1c)
        !            43:        define(M4_description, add it to)
        !            44:        define(M4_desc_retval, carry)
        !            45: ',`ifdef(`OPERATION_submul_1', `
        !            46:        define(M4_inst,        subl)
        !            47:        define(M4_function_1,  mpn_submul_1)
        !            48:        define(M4_function_1c, mpn_submul_1c)
        !            49:        define(M4_description, subtract it from)
        !            50:        define(M4_desc_retval, borrow)
        !            51: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
        !            52: ')')')
        !            53: dnl  NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; presumably it lists the entry points this file can provide -- confirm.
        !            54: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
        !            55:
        !            56:
        !            57: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            58: C                            mp_limb_t mult);
        !            59: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            60: C                             mp_limb_t mult, mp_limb_t carry);
        !            61: C
        !            62: C Calculate src,size multiplied by mult and M4_description dst,size.
        !            63: C Return the M4_desc_retval limb from the top of the result.
        !            64: C
        !            65: C This code is pretty much the same as the K6 code.  The unrolled loop is
        !            66: C the same, but there are just a few scheduling tweaks in the setups and
        !            67: C the simple loop.
        !            68: C
        !            69: C A number of variations have been tried for the unrolled loop, with one or
        !            70: C two carries, and with loads scheduled earlier, but nothing faster than 6
        !            71: C cycles/limb has been found.
        !            72: C Sizes of UNROLL_THRESHOLD limbs or more take the unrolled loop; the PIC and non-PIC settings below are currently identical.
        !            73: ifdef(`PIC',`
        !            74: deflit(UNROLL_THRESHOLD, 5)
        !            75: ',`
        !            76: deflit(UNROLL_THRESHOLD, 5)
        !            77: ')
        !            78: C NOTE(review): the defframe offsets look like 32-bit stack arguments above the return address, presumably FRAME-adjusted %esp displacements -- confirm in the m4 definitions.
        !            79: defframe(PARAM_CARRY,     20)
        !            80: defframe(PARAM_MULTIPLIER,16)
        !            81: defframe(PARAM_SIZE,      12)
        !            82: defframe(PARAM_SRC,       8)
        !            83: defframe(PARAM_DST,       4)
        !            84:
        !            85:        .text
        !            86:        ALIGN(32)
        !            87: C Entry point taking an explicit initial carry limb (PARAM_CARRY); it shares all remaining code with M4_function_1.
        !            88: PROLOGUE(M4_function_1c)
        !            89:        pushl   %ebx            C save %ebx; it holds the running carry limb from here on
        !            90: deflit(`FRAME',4)
        !            91:        movl    PARAM_CARRY, %ebx       C caller-supplied initial carry
        !            92:        jmp     LF(M4_function_1,start_nc)      C join M4_function_1 after its carry clear
        !            93: EPILOGUE()
        !            94:
        !            95: PROLOGUE(M4_function_1)
        !            96:        pushl   %ebx            C save %ebx (restored before each ret); it holds the carry limb
        !            97: deflit(`FRAME',4)
        !            98:        xorl    %ebx, %ebx      C initial carry
        !            99: C Shared path: M4_function_1c jumps here with its carry argument already in %ebx.
        !           100: L(start_nc):
        !           101:        movl    PARAM_SIZE, %ecx
        !           102:        pushl   %esi
        !           103: deflit(`FRAME',8)
        !           104:
        !           105:        movl    PARAM_SRC, %esi
        !           106:        pushl   %edi
        !           107: deflit(`FRAME',12)
        !           108:
        !           109:        movl    PARAM_DST, %edi
        !           110:        pushl   %ebp
        !           111: deflit(`FRAME',16)
        !           112:        cmpl    $UNROLL_THRESHOLD, %ecx
        !           113:
        !           114:        movl    PARAM_MULTIPLIER, %ebp  C mov does not disturb the flags from cmpl
        !           115:        jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD takes the unrolled loop
        !           116:
        !           117:
        !           118:        C simple loop: one limb per iteration, used when size < UNROLL_THRESHOLD
        !           119:        C this is offset 0x22, so close enough to aligned
        !           120: L(simple):
        !           121:        C eax   scratch
        !           122:        C ebx   carry
        !           123:        C ecx   counter
        !           124:        C edx   scratch
        !           125:        C esi   src
        !           126:        C edi   dst
        !           127:        C ebp   multiplier
        !           128:
        !           129:        movl    (%esi), %eax    C next src limb
        !           130:        addl    $4, %edi        C advance dst early; the limb is addressed as -4(%edi) below
        !           131:
        !           132:        mull    %ebp            C edx:eax = src limb * multiplier
        !           133:
        !           134:        addl    %ebx, %eax      C add the incoming carry limb to the low half
        !           135:        adcl    $0, %edx        C and propagate into the high half
        !           136:
        !           137:        M4_inst %eax, -4(%edi)  C add to or subtract from the dst limb, setting CF
        !           138:        movl    %edx, %ebx      C high half becomes the next carry limb (mov keeps CF)
        !           139:
        !           140:        adcl    $0, %ebx        C plus the carry or borrow from the dst update
        !           141:        decl    %ecx            C sets ZF for the jnz below
        !           142:
        !           143:        leal    4(%esi), %esi   C advance src; lea leaves the flags from decl intact
        !           144:        jnz     L(simple)
        !           145:
        !           146:
        !           147:        popl    %ebp
        !           148:        popl    %edi
        !           149:
        !           150:        popl    %esi
        !           151:        movl    %ebx, %eax      C return the final carry limb
        !           152:
        !           153:        popl    %ebx
        !           154:        ret
        !           155:
        !           156:
        !           157:
        !           158: C------------------------------------------------------------------------------
        !           159: C VAR_JUMP holds the computed jump temporarily because there's not enough
        !           160: C registers when doing the mul for the initial two carry limbs.
        !           161: C
        !           162: C The add/adc for the initial carry in %ebx is necessary only for the
        !           163: C mpn_add/submul_1c entry points.  Duplicating the startup code to
        !           164: C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
        !           165: C idea.
        !           166:
        !           167: dnl  overlapping with parameters already fetched
        !           168: define(VAR_COUNTER,`PARAM_SIZE')
        !           169: define(VAR_JUMP,   `PARAM_DST')
        !           170: C NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in this file; presumably derived from UNROLL_COUNT by the included m4 definitions -- confirm.
        !           171:        C this is offset 0x43, so close enough to aligned
        !           172: L(unroll):
        !           173:        C eax
        !           174:        C ebx   initial carry
        !           175:        C ecx   size
        !           176:        C edx
        !           177:        C esi   src
        !           178:        C edi   dst
        !           179:        C ebp   multiplier (loaded by the entry code)
        !           180: C Below, k = -(size-1) & UNROLL_MASK is the number of limb slots skipped in the first pass of the unrolled loop.
        !           181:        movl    %ecx, %edx
        !           182:        decl    %ecx            C size-1
        !           183:
        !           184:        subl    $2, %edx        C size-2
        !           185:        negl    %ecx            C -(size-1)
        !           186:
        !           187:        shrl    $UNROLL_LOG2, %edx      C (size-2) >> UNROLL_LOG2
        !           188:        andl    $UNROLL_MASK, %ecx      C k
        !           189:
        !           190:        movl    %edx, VAR_COUNTER       C loop counter, kept in the PARAM_SIZE slot
        !           191:        movl    %ecx, %edx
        !           192:
        !           193:        C 15 code bytes per limb
        !           194: ifdef(`PIC',`
        !           195:        call    L(pic_calc)     C returns L(entry) + 15*k in %edx and -k in %ecx
        !           196: L(here):
        !           197: ',`
        !           198:        shll    $4, %edx        C 16*k
        !           199:        negl    %ecx            C -k
        !           200:
        !           201:        leal    L(entry) (%edx,%ecx,1), %edx    C L(entry) + 15*k
        !           202: ')
        !           203:        movl    (%esi), %eax            C src low limb
        !           204:
        !           205:        movl    %edx, VAR_JUMP          C park the jump target; mull overwrites %edx
        !           206:        leal    ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi      C src + 4 - 4*k, plus a 128 bias when UNROLL_BYTES is 256
        !           207:
        !           208:        mull    %ebp                    C edx:eax = low limb * multiplier
        !           209:
        !           210:        addl    %ebx, %eax      C initial carry (from _1c)
        !           211:        adcl    $0, %edx
        !           212:
        !           213:        movl    %edx, %ebx      C high carry
        !           214:        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi        C dst - 4*k, with the same bias
        !           215:
        !           216:        movl    VAR_JUMP, %edx
        !           217:        testl   $1, %ecx        C parity of k decides which register the first slot applies to dst
        !           218:        movl    %eax, %ecx      C low carry
        !           219:
        !           220:        cmovnz( %ebx, %ecx)     C high,low carry other way around
        !           221:        cmovnz( %eax, %ebx)
        !           222:
        !           223:        jmp     *%edx           C enter the unrolled code k limb slots past L(entry)
        !           224:
        !           225: C PIC helper called from L(unroll) with k = -(size-1) & UNROLL_MASK in %ecx and %edx; returns L(entry) + 15*k in %edx and -k in %ecx.
        !           226: ifdef(`PIC',`
        !           227: L(pic_calc):
        !           228:        shll    $4, %edx        C 16*k
        !           229:        negl    %ecx            C -k
        !           230:
        !           231:        C See README.family about old gas bugs
        !           232:        leal    (%edx,%ecx,1), %edx     C 15*k
        !           233:        addl    $L(entry)-L(here), %edx         C distance from the return address to L(entry)
        !           234:
        !           235:        addl    (%esp), %edx    C the return address on the stack is L(here)
        !           236:
        !           237:        ret
        !           238: ')
        !           239:
        !           240:
        !           241: C -----------------------------------------------------------
        !           242:        ALIGN(32)
        !           243: L(top):
        !           244: deflit(`FRAME',16)
        !           245:        C eax   scratch
        !           246:        C ebx   carry hi
        !           247:        C ecx   carry lo
        !           248:        C edx   scratch
        !           249:        C esi   src
        !           250:        C edi   dst
        !           251:        C ebp   multiplier
        !           252:        C
        !           253:        C VAR_COUNTER   loop counter
        !           254:        C
        !           255:        C 15 code bytes per limb
        !           256:        C The setup code jumps to L(entry) + 15*k, so this per-limb size must not change; ebx and ecx swap hi/lo roles after every limb.
        !           257:        addl    $UNROLL_BYTES, %edi     C advance dst; src is advanced at the bottom of the loop
        !           258: C The first pass enters at or after L(entry), skipping the addl above; the setup code has already offset %edi and %esi to match.
        !           259: L(entry):
        !           260: deflit(CHUNK_COUNT,2)
        !           261: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           262:        deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
        !           263:        deflit(`disp1', eval(disp0 + 4))
        !           264:
        !           265: Zdisp( movl,   disp0,(%esi), %eax)
        !           266:        mull    %ebp            C edx:eax = src limb * multiplier
        !           267: Zdisp( M4_inst,%ecx, disp0,(%edi))     C apply the pending limb in ecx to dst and set CF
        !           268:        adcl    %eax, %ebx      C ebx becomes the pending limb for the next dst position
        !           269:        movl    %edx, %ecx
        !           270:        adcl    $0, %ecx        C ecx = high half plus carry
        !           271:
        !           272:        movl    disp1(%esi), %eax       C second limb: same steps with ebx and ecx exchanged
        !           273:        mull    %ebp
        !           274:        M4_inst %ebx, disp1(%edi)
        !           275:        adcl    %eax, %ecx
        !           276:        movl    %edx, %ebx
        !           277:        adcl    $0, %ebx
        !           278: ')
        !           279:
        !           280:        decl    VAR_COUNTER             C count down the remaining passes; sets SF for the jns
        !           281:        leal    UNROLL_BYTES(%esi), %esi        C advance src; lea leaves the flags unchanged
        !           282:
        !           283:        jns     L(top)
        !           284:
        !           285:
        !           286: deflit(`disp0',        eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
        !           287:
        !           288:        M4_inst %ecx, disp0(%edi)       C apply the final pending limb, one position past the last pass
        !           289:        movl    %ebx, %eax              C return value starts from the high carry limb
        !           290:
        !           291:        popl    %ebp
        !           292:        popl    %edi
        !           293:
        !           294:        popl    %esi
        !           295:        popl    %ebx
        !           296:        adcl    $0, %eax                C add CF from the M4_inst above; mov and pop leave it intact
        !           297:
        !           298:        ret
        !           299:
        !           300: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>