[BACK]Return to mul_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_mul_1 -- mpn by limb multiply.
        !             2: dnl
        !             3: dnl  K7: 3.4 cycles/limb (at 16 limbs/loop).
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: dnl  K7: UNROLL_COUNT cycles/limb
        !            30: dnl           8           3.9
        !            31: dnl          16           3.4
        !            32: dnl          32           3.4
        !            33: dnl          64           3.35
        !            34: dnl  Maximum possible with the current code is 64.
        !            35:
        !            36: deflit(UNROLL_COUNT, 16)
        !            37:
        !            38:
        !            39: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            40: C                      mp_limb_t multiplier);
        !            41: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            42: C                       mp_limb_t multiplier, mp_limb_t carry);
        !            43: C
        !            44: C Multiply src,size by multiplier and store the result in dst,size.
        !            45: C Return the carry limb from the top of the result.
        !            46: C
        !            47: C mpn_mul_1c() accepts an initial carry for the calculation; it's added into
        !            48: C the low limb of the destination.
        !            49: C
        !            50: C Variations on the unrolled loop have been tried, with the current
        !            51: C registers or with the counter on the stack to free up ecx.  The current
        !            52: C code is the fastest found.
        !            53: C
        !            54: C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
        !            55: C from the unrolled loop actually slows it down to 5.0 cycles/limb.  Code
        !            56: C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
        !            57: C without having to change the computed jump.  There's obviously something
        !            58: C fishy going on, perhaps with what execution units the mul needs.
        !            59:
        !            60: defframe(PARAM_CARRY,     20)     C fifth argument, read only by mpn_mul_1c
        !            61: defframe(PARAM_MULTIPLIER,16)
        !            62: defframe(PARAM_SIZE,      12)
        !            63: defframe(PARAM_SRC,       8)
        !            64: defframe(PARAM_DST,       4)
        !            65: C Slots for the registers saved on entry and restored before each ret.
        !            66: defframe(SAVE_EBP, -4)
        !            67: defframe(SAVE_EDI, -8)
        !            68: defframe(SAVE_ESI, -12)
        !            69: defframe(SAVE_EBX, -16)
        !            70: deflit(STACK_SPACE, 16)           C bytes subtracted from esp to hold the four SAVE_ slots
        !            71: C Sizes below UNROLL_THRESHOLD use the simple loop, the rest use the unrolled loop.
        !            72: dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
        !            73: ifdef(`PIC',`
        !            74: deflit(UNROLL_THRESHOLD, 7)
        !            75: ',`
        !            76: deflit(UNROLL_THRESHOLD, 5)
        !            77: ')
        !            78:
        !            79:        .text
        !            80:        ALIGN(32)
        !            81: PROLOGUE(mpn_mul_1c)              C entry point taking an explicit initial carry
        !            82: deflit(`FRAME',0)
        !            83:        movl    PARAM_CARRY, %edx       C edx = initial carry supplied by the caller
        !            84:        jmp     LF(mpn_mul_1,start_nc)  C join mpn_mul_1 just past its own carry zeroing
        !            85: EPILOGUE()
        !            86:
        !            87:
        !            88: PROLOGUE(mpn_mul_1)               C entry point with a zero initial carry
        !            89: deflit(`FRAME',0)
        !            90:        xorl    %edx, %edx      C initial carry
        !            91: L(start_nc):                      C mpn_mul_1c joins here with its carry already in edx
        !            92:        movl    PARAM_SIZE, %ecx
        !            93:        subl    $STACK_SPACE, %esp
        !            94: deflit(`FRAME', STACK_SPACE)
        !            95:
        !            96:        movl    %edi, SAVE_EDI
        !            97:        movl    %ebx, SAVE_EBX
        !            98:        movl    %edx, %ebx      C ebx = running carry from here on
        !            99:
        !           100:        movl    %esi, SAVE_ESI
        !           101:        movl    PARAM_SRC, %esi
        !           102:        cmpl    $UNROLL_THRESHOLD, %ecx C flags are used by the jae below, the movl between leave them intact
        !           103:
        !           104:        movl    PARAM_DST, %edi
        !           105:        movl    %ebp, SAVE_EBP
        !           106:        jae     L(unroll)       C unsigned size >= UNROLL_THRESHOLD
        !           107:        C Small size: point esi and edi just past the operands and count ecx up from -size to 0.
        !           108:        leal    (%esi,%ecx,4), %esi
        !           109:        leal    (%edi,%ecx,4), %edi
        !           110:        negl    %ecx
        !           111:
        !           112:        movl    PARAM_MULTIPLIER, %ebp
        !           113:
        !           114: L(simple):
        !           115:        C eax   scratch
        !           116:        C ebx   carry
        !           117:        C ecx   counter (negative)
        !           118:        C edx   scratch
        !           119:        C esi   src
        !           120:        C edi   dst
        !           121:        C ebp   multiplier
        !           122:
        !           123:        movl    (%esi,%ecx,4), %eax
        !           124:
        !           125:        mull    %ebp            C edx:eax = src limb * multiplier
        !           126:
        !           127:        addl    %ebx, %eax      C low word plus incoming carry, sets CF
        !           128:        movl    %eax, (%edi,%ecx,4)
        !           129:        movl    $0, %ebx        C movl rather than xorl so CF from the addl survives
        !           130:
        !           131:        adcl    %edx, %ebx      C next carry = high word + CF
        !           132:        incl    %ecx
        !           133:        jnz     L(simple)
        !           134:        C Return the final carry in eax and restore the registers saved on entry.
        !           135:        movl    %ebx, %eax
        !           136:        movl    SAVE_EBX, %ebx
        !           137:        movl    SAVE_ESI, %esi
        !           138:
        !           139:        movl    SAVE_EDI, %edi
        !           140:        movl    SAVE_EBP, %ebp
        !           141:        addl    $STACK_SPACE, %esp
        !           142:
        !           143:        ret
        !           144:
        !           145:
        !           146: C -----------------------------------------------------------------------------
        !           147: C The mov to load the next source limb is done well ahead of the mul, this
        !           148: C is necessary for full speed.  It leads to one limb handled separately
        !           149: C after the loop.
        !           150: C
        !           151: C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
        !           152: C to avoid having an 0x80 displacement in the code for the last limb in the
        !           153: C unrolled loop.  This is for a fair comparison between 16 and 32 unrolling.
        !           154:
        !           155: ifelse(eval(UNROLL_COUNT >= 32),1,`
        !           156: deflit(SRC_OFFSET,4)
        !           157: ',`
        !           158: deflit(SRC_OFFSET,)     C left empty on purpose: it is pasted in front of an operand as no displacement and read as SRC_OFFSET-0 inside eval
        !           159: ')
        !           160:
        !           161:        C this is offset 0x62, so close enough to aligned
        !           162: L(unroll):
        !           163:        C eax
        !           164:        C ebx   initial carry
        !           165:        C ecx   size
        !           166:        C edx
        !           167:        C esi   src
        !           168:        C edi   dst
        !           169:        C ebp
        !           170: deflit(`FRAME', STACK_SPACE)
        !           171:        C Set up a computed jump into the middle of the unrolled loop, so the first pass handles the odd limbs.
        !           172:        leal    -1(%ecx), %edx  C one limb handled at end
        !           173:        leal    -2(%ecx), %ecx  C and ecx is one less than edx
        !           174:        movl    %ebp, SAVE_EBP  C NOTE(review): ebp was already saved before the jae, this repeat looks redundant; confirm (may be kept for layout) before removing
        !           175:
        !           176:        negl    %edx            C edx = -(size-1)
        !           177:        shrl    $UNROLL_LOG2, %ecx      C unrolled loop counter
        !           178:        movl    (%esi), %eax            C src low limb
        !           179:
        !           180:        andl    $UNROLL_MASK, %edx      C edx = skip, the number of leading loop slots to jump over (assumes UNROLL_MASK is UNROLL_COUNT-1 -- TODO confirm)
        !           181:        movl    PARAM_DST, %edi C NOTE(review): edi already holds dst from before the jae; confirm whether this reload is deliberate
        !           182:
        !           183:        movl    %edx, %ebp      C ebp = skip
        !           184:        shll    $4, %edx        C edx = 16*skip
        !           185:
        !           186:        C 17 code bytes per limb, so the jump target is L(entry) + 16*skip + skip
        !           187: ifdef(`PIC',`
        !           188:        call    L(add_eip_to_edx)       C helper forms the same target from its return address
        !           189: L(here):
        !           190: ',`
        !           191:        leal    L(entry) (%edx,%ebp), %edx      C edx = entry + 17*skip
        !           192: ')
        !           193:        negl    %ebp            C ebp = -skip
        !           194:        C Bias esi and edi back by skip limbs, so at the jump target the store hits dst limb 0 and the load fetches src limb 1.
        !           195:        leal    ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
        !           196:        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
        !           197:        movl    PARAM_MULTIPLIER, %ebp
        !           198:
        !           199:        jmp     *%edx
        !           200:
        !           201:
        !           202: ifdef(`PIC',`
        !           203: L(add_eip_to_edx):                C PIC helper: turns edx into the absolute jump target inside the unrolled loop
        !           204:        C See README.family about old gas bugs
        !           205:        leal    (%edx,%ebp), %edx       C caller passes edx = 16*skip and ebp = skip so this is 17*skip
        !           206:        addl    $L(entry)-L(here), %edx C plus the distance from the return point to the loop entry
        !           207:        addl    (%esp), %edx    C plus the return address which is the address of the here label
        !           208:        ret
        !           209: ')
        !           210:
        !           211:
        !           212: C ----------------------------------------------------------------------------
        !           213:        ALIGN(32)
        !           214: L(top):
        !           215:        C eax   next src limb
        !           216:        C ebx   carry
        !           217:        C ecx   counter
        !           218:        C edx   scratch
        !           219:        C esi   src+4
        !           220:        C edi   dst
        !           221:        C ebp   multiplier
        !           222:        C
        !           223:        C 17 code bytes per limb processed
        !           224:
        !           225: L(entry):
        !           226: forloop(i, 0, UNROLL_COUNT-1, `
        !           227:        deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
        !           228:        deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
        !           229:        C One slot: multiply the limb already held in eax giving edx:eax
        !           230:        mull    %ebp
        !           231:        C Add the low word to the carry then fetch the next src limb and store this result limb
        !           232:        addl    %eax, %ebx
        !           233: Zdisp( movl,   disp_src,(%esi), %eax)
        !           234: Zdisp( movl,   %ebx, disp_dst,(%edi))
        !           235:        C The movl instructions keep CF so the adcl folds the addl carry into the high word
        !           236:        movl    $0, %ebx
        !           237:        adcl    %edx, %ebx
        !           238: ')
        !           239:
        !           240:        decl    %ecx            C sets the sign flag tested by jns, the two leal below leave flags alone
        !           241:
        !           242:        leal    UNROLL_BYTES(%esi), %esi
        !           243:        leal    UNROLL_BYTES(%edi), %edi
        !           244:        jns     L(top)
        !           245:
        !           246:        C Last limb: its source value was loaded into eax by the final slot of the loop.
        !           247: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
        !           248:
        !           249:        mull    %ebp
        !           250:
        !           251:        addl    %eax, %ebx
        !           252:        movl    $0, %eax        C clear eax without touching CF from the addl
        !           253:        movl    SAVE_ESI, %esi
        !           254:
        !           255:        movl    %ebx, disp0(%edi)
        !           256:        movl    SAVE_EBX, %ebx
        !           257:        movl    SAVE_EDI, %edi
        !           258:
        !           259:        adcl    %edx, %eax      C return value = high word + CF
        !           260:        movl    SAVE_EBP, %ebp
        !           261:        addl    $STACK_SPACE, %esp
        !           262:
        !           263:        ret
        !           264:
        !           265: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>