[BACK]Return to aors_n.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aors_n.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
        !             2: dnl
        !             3: dnl  K7: 1.64 cycles/limb (at 16 limb/loop).
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: dnl  K7: UNROLL_COUNT cycles/limb
        !            30: dnl           8           1.9
        !            31: dnl          16           1.64
        !            32: dnl          32           1.7
        !            33: dnl          64           2.0
        !            34: dnl  Maximum possible with the current code is 64.
        !            35:
        !            36: deflit(UNROLL_COUNT, 16)
        !            37: dnl  One of OPERATION_add_n or OPERATION_sub_n must be defined.  It selects
        !            38: dnl  adcl or sbbl and the function names and description word used below.
        !            39: ifdef(`OPERATION_add_n', `
        !            40:        define(M4_inst,        adcl)
        !            41:        define(M4_function_n,  mpn_add_n)
        !            42:        define(M4_function_nc, mpn_add_nc)
        !            43:        define(M4_description, add)
        !            44: ',`ifdef(`OPERATION_sub_n', `
        !            45:        define(M4_inst,        sbbl)
        !            46:        define(M4_function_n,  mpn_sub_n)
        !            47:        define(M4_function_nc, mpn_sub_nc)
        !            48:        define(M4_description, subtract)
        !            49: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
        !            50: ')')')
        !            51:
        !            52: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
        !            53:
        !            54:
        !            55: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
        !            56: C                         mp_size_t size);
        !            57: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
        !            58: C                         mp_size_t size, mp_limb_t carry);
        !            59: C
        !            60: C Calculate src1,size M4_description src2,size, and store the result in
        !            61: C dst,size.  The return value is the carry bit from the top of the result (1
        !            62: C or 0).
        !            63: C
        !            64: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
        !            65: C the calculation.  Note values other than 1 or 0 here will lead to garbage
        !            66: C results.
        !            67: C
        !            68: C This code runs at 1.64 cycles/limb, which is probably the best possible
        !            69: C with plain integer operations.  Each limb is 2 loads and 1 store, and in
        !            70: C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
        !            71: C c/l.
        !            72: dnl  Sizes below UNROLL_THRESHOLD limbs use the simple loop, larger ones unroll.
        !            73: dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
        !            74: ifdef(`PIC',`
        !            75: deflit(UNROLL_THRESHOLD, 8)
        !            76: ',`
        !            77: deflit(UNROLL_THRESHOLD, 8)
        !            78: ')
        !            79: C Stack slots: PARAM_ are the arguments (PARAM_CARRY is read only by the _nc entry).
        !            80: defframe(PARAM_CARRY,20)
        !            81: defframe(PARAM_SIZE, 16)
        !            82: defframe(PARAM_SRC2, 12)
        !            83: defframe(PARAM_SRC1, 8)
        !            84: defframe(PARAM_DST,  4)
        !            85: C SAVE_ hold the registers this code saves and restores; four slots = STACK_SPACE bytes.
        !            86: defframe(SAVE_EBP, -4)
        !            87: defframe(SAVE_ESI, -8)
        !            88: defframe(SAVE_EBX, -12)
        !            89: defframe(SAVE_EDI, -16)
        !            90: deflit(STACK_SPACE, 16)
        !            91: C M4_function_nc: carry-in entry point.  It shares all of its code with M4_function_n.
        !            92:        .text
        !            93:        ALIGN(32)
        !            94: deflit(`FRAME',0)
        !            95:
        !            96: PROLOGUE(M4_function_nc)
        !            97:        movl    PARAM_CARRY, %eax               C eax = carry-in (documented as 0 or 1)
        !            98:        jmp     LF(M4_function_n,start)         C join M4_function_n after its xorl
        !            99: EPILOGUE()
        !           100:
        !           101: PROLOGUE(M4_function_n)
        !           102:        C M4_function_nc enters at L(start) with its carry argument already in eax.
        !           103:        xorl    %eax, %eax      C carry-in = 0 for this entry point
        !           104: L(start):
        !           105:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
        !           106:        subl    $STACK_SPACE, %esp
        !           107: deflit(`FRAME',STACK_SPACE)
        !           108:
        !           109:        movl    %edi, SAVE_EDI
        !           110:        movl    %ebx, SAVE_EBX
        !           111:        cmpl    $UNROLL_THRESHOLD, %ecx         C tested by the jae below; movl keeps the flags
        !           112:
        !           113:        movl    PARAM_SRC2, %edx
        !           114:        movl    PARAM_SRC1, %ebx
        !           115:        jae     L(unroll)
        !           116:        C Small size: point at the ends of the operands and index upwards from -size.
        !           117:        movl    PARAM_DST, %edi
        !           118:        leal    (%ebx,%ecx,4), %ebx
        !           119:        leal    (%edx,%ecx,4), %edx
        !           120:
        !           121:        leal    (%edi,%ecx,4), %edi
        !           122:        negl    %ecx
        !           123:        shrl    %eax                    C move the carry-in (bit 0) into the carry flag
        !           124:        C NOTE(review): size 0 is not handled here; presumably callers pass size >= 1.
        !           125:        C This loop is in a single 16 byte code block already, so no
        !           126:        C alignment necessary.
        !           127: L(simple):
        !           128:        C eax   scratch
        !           129:        C ebx   src1
        !           130:        C ecx   counter
        !           131:        C edx   src2
        !           132:        C esi
        !           133:        C edi   dst
        !           134:        C ebp
        !           135:
        !           136:        movl    (%ebx,%ecx,4), %eax
        !           137:        M4_inst (%edx,%ecx,4), %eax
        !           138:        movl    %eax, (%edi,%ecx,4)
        !           139:        incl    %ecx                    C incl leaves the carry flag alone
        !           140:        jnz     L(simple)
        !           141:
        !           142:        movl    $0, %eax                C movl rather than xorl so the carry flag survives
        !           143:        movl    SAVE_EDI, %edi
        !           144:
        !           145:        movl    SAVE_EBX, %ebx
        !           146:        setc    %al                     C return value = carry out of the top limb
        !           147:        addl    $STACK_SPACE, %esp
        !           148:
        !           149:        ret
        !           150:
        !           151:
        !           152: C -----------------------------------------------------------------------------
        !           153:        C This is at 0x55, close enough to aligned.
        !           154: L(unroll):
        !           155: deflit(`FRAME',STACK_SPACE)
        !           156:        movl    %ebp, SAVE_EBP
        !           157:        andl    $-2, %ecx               C size low bit masked out
        !           158:        andl    $1, PARAM_SIZE          C size low bit kept
        !           159:
        !           160:        movl    %ecx, %edi
        !           161:        decl    %ecx
        !           162:        movl    PARAM_DST, %ebp
        !           163:
        !           164:        shrl    $UNROLL_LOG2, %ecx      C ecx = passes through the unrolled code, minus one
        !           165:        negl    %edi
        !           166:        movl    %esi, SAVE_ESI
        !           167:
        !           168:        andl    $UNROLL_MASK, %edi      C edi = limbs to skip in the first pass
        !           169:
        !           170: ifdef(`PIC',`
        !           171:        call    L(pic_calc)
        !           172: L(here):
        !           173: ',`
        !           174:        leal    L(entry) (%edi,%edi,8), %esi    C 9 bytes per
        !           175: ')
        !           176:        negl    %edi
        !           177:        shrl    %eax                    C carry-in to the carry flag; nothing below alters it
        !           178:        C Step the pointers back by the skipped limbs (plus 128 when UNROLL_BYTES is 256).
        !           179:        leal    ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
        !           180:        leal    ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
        !           181:        leal    ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
        !           182:
        !           183:        jmp     *%esi                   C enter the unrolled code part way through
        !           184:
        !           185:
        !           186: ifdef(`PIC',`
        !           187: L(pic_calc):
        !           188:        C See README.family about old gas bugs
        !           189:        leal    (%edi,%edi,8), %esi
        !           190:        addl    $L(entry)-L(here), %esi
        !           191:        addl    (%esp), %esi
        !           192:        ret
        !           193: ')
        !           194:
        !           195:
        !           196: C -----------------------------------------------------------------------------
        !           197:        ALIGN(32)
        !           198: L(top):
        !           199:        C eax   zero
        !           200:        C ebx   src1
        !           201:        C ecx   counter
        !           202:        C edx   src2
        !           203:        C esi   scratch (was computed jump)
        !           204:        C edi   dst
        !           205:        C ebp   scratch
        !           206:        C The first pass enters at L(entry), so edx ends up one step behind ebx and edi.
        !           207:        leal    UNROLL_BYTES(%edx), %edx
        !           208:
        !           209: L(entry):
        !           210: deflit(CHUNK_COUNT, 2)
        !           211: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           212:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        !           213:        deflit(`disp1', eval(disp0 + 4))
        !           214:
        !           215: Zdisp( movl,   disp0,(%ebx), %esi)
        !           216:        movl    disp1(%ebx), %ebp
        !           217: Zdisp( M4_inst,disp0,(%edx), %esi)
        !           218: Zdisp( movl,   %esi, disp0,(%edi))
        !           219:        M4_inst disp1(%edx), %ebp
        !           220:        movl    %ebp, disp1(%edi)
        !           221: ')
        !           222:
        !           223:        decl    %ecx                    C sets the sign flag for jns, keeps the carry flag
        !           224:        leal    UNROLL_BYTES(%ebx), %ebx
        !           225:        leal    UNROLL_BYTES(%edi), %edi
        !           226:        jns     L(top)
        !           227:
        !           228:
        !           229:        mov     PARAM_SIZE, %esi        C esi = size & 1, as stored at L(unroll)
        !           230:        movl    SAVE_EBP, %ebp
        !           231:        movl    $0, %eax                C movl rather than xorl so the carry flag survives
        !           232:
        !           233:        decl    %esi
        !           234:        js      L(even)
        !           235:        C Odd size: one more limb.  Undo the 128 byte pointer bias used when UNROLL_BYTES is 256.
        !           236:        movl    ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
        !           237:        M4_inst eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
        !           238:        movl    %ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
        !           239: L(even):
        !           240:
        !           241:        movl    SAVE_EDI, %edi
        !           242:        movl    SAVE_EBX, %ebx
        !           243:        setc    %al                     C return value = carry out of the top limb
        !           244:
        !           245:        movl    SAVE_ESI, %esi
        !           246:        addl    $STACK_SPACE, %esp
        !           247:
        !           248:        ret
        !           249:
        !           250: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>