[BACK]Return to aors_n.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aors_n.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
                      2: dnl
                      3: dnl  K7: 1.64 cycles/limb (at 16 limb/loop).
                      4:
                      5:
                      6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      7: dnl
                      8: dnl  This file is part of the GNU MP Library.
                      9: dnl
                     10: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     11: dnl  modify it under the terms of the GNU Lesser General Public License as
                     12: dnl  published by the Free Software Foundation; either version 2.1 of the
                     13: dnl  License, or (at your option) any later version.
                     14: dnl
                     15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     18: dnl  Lesser General Public License for more details.
                     19: dnl
                     20: dnl  You should have received a copy of the GNU Lesser General Public
                     21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     23: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     24:
                     25:
                     26: include(`../config.m4')
                     27:
                     28:
                     29: dnl  K7: UNROLL_COUNT cycles/limb
                     30: dnl           8           1.9
                     31: dnl          16           1.64
                     32: dnl          32           1.7
                     33: dnl          64           2.0
                     34: dnl  Maximum possible with the current code is 64.
                     35: dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled loop.
                     36: deflit(UNROLL_COUNT, 16)
                     37:
                     38: dnl  Pick the instruction and names for add or subtract from whichever OPERATION_ symbol is defined; neither is an m4 error.
                     39: ifdef(`OPERATION_add_n', `
                     40:        define(M4_inst,        adcl)
                     41:        define(M4_function_n,  mpn_add_n)
                     42:        define(M4_function_nc, mpn_add_nc)
                     43:        define(M4_description, add)
                     44: ',`ifdef(`OPERATION_sub_n', `
                     45:        define(M4_inst,        sbbl)
                     46:        define(M4_function_n,  mpn_sub_n)
                     47:        define(M4_function_nc, mpn_sub_nc)
                     48:        define(M4_description, subtract)
                     49: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
                     50: ')')')
                     51: dnl  NOTE(review): presumably declares every entry point this file can be built as -- confirm against config.m4.
                     52: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
                     53:
                     54:
                     55: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
                     56: C                         mp_size_t size);
                     57: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
                     58: C                         mp_size_t size, mp_limb_t carry);
                     59: C
                     60: C Calculate src1,size M4_description src2,size, and store the result in
                     61: C dst,size.  The return value is the carry bit from the top of the result (1
                     62: C or 0).
                     63: C
                     64: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
                     65: C the calculation.  Note values other than 1 or 0 here will lead to garbage
                     66: C results.
                     67: C
                     68: C This code runs at 1.64 cycles/limb, which is probably the best possible
                     69: C with plain integer operations.  Each limb is 2 loads and 1 store, and in
                     70: C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
                     71: C c/l.
                     72: dnl  Sizes below UNROLL_THRESHOLD take the simple loop.  The PIC and non-PIC values are set separately but are both 8 here.
                     73: dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
                     74: ifdef(`PIC',`
                     75: deflit(UNROLL_THRESHOLD, 8)
                     76: ',`
                     77: deflit(UNROLL_THRESHOLD, 8)
                     78: ')
                     79: dnl  Stack slots of the incoming arguments.  PARAM_CARRY is read only by the _nc entry point.
                     80: defframe(PARAM_CARRY,20)
                     81: defframe(PARAM_SIZE, 16)
                     82: defframe(PARAM_SRC2, 12)
                     83: defframe(PARAM_SRC1, 8)
                     84: defframe(PARAM_DST,  4)
                     85: dnl  Slots for the four registers the code saves and restores.  STACK_SPACE is the 16 bytes subtracted from esp to hold them.
                     86: defframe(SAVE_EBP, -4)
                     87: defframe(SAVE_ESI, -8)
                     88: defframe(SAVE_EBX, -12)
                     89: defframe(SAVE_EDI, -16)
                     90: deflit(STACK_SPACE, 16)
                     91:
                     92:        .text
                     93:        ALIGN(32)
                     94: deflit(`FRAME',0)
                     95:        C Carry-in entry point: load the carry argument and join M4_function_n at its start label.
                     96: PROLOGUE(M4_function_nc)
                     97:        movl    PARAM_CARRY, %eax       C eax = initial carry (1 or 0)
                     98:        jmp     LF(M4_function_n,start) C shares the whole body with M4_function_n
                     99: EPILOGUE()
                    100:
                    101: PROLOGUE(M4_function_n)
                    102:        C Plain entry: no carry-in.  M4_function_nc joins at L(start) with eax = carry.
                    103:        xorl    %eax, %eax      C carry
                    104: L(start):
                    105:        movl    PARAM_SIZE, %ecx
                    106:        subl    $STACK_SPACE, %esp
                    107: deflit(`FRAME',STACK_SPACE)
                    108:
                    109:        movl    %edi, SAVE_EDI
                    110:        movl    %ebx, SAVE_EBX
                    111:        cmpl    $UNROLL_THRESHOLD, %ecx         C tested by the jae below, movl leaves flags alone
                    112:
                    113:        movl    PARAM_SRC2, %edx
                    114:        movl    PARAM_SRC1, %ebx
                    115:        jae     L(unroll)               C size >= UNROLL_THRESHOLD
                    116:
                    117:        movl    PARAM_DST, %edi
                    118:        leal    (%ebx,%ecx,4), %ebx     C src1 + 4*size
                    119:        leal    (%edx,%ecx,4), %edx     C src2 + 4*size
                    120:
                    121:        leal    (%edi,%ecx,4), %edi     C dst + 4*size
                    122:        negl    %ecx                    C counter runs from -size up to 0
                    123:        shrl    %eax                    C carry-in bit into the carry flag
                    124:
                    125:        C This loop is in a single 16 byte code block already, so no
                    126:        C alignment necessary.
                    127: L(simple):
                    128:        C eax   scratch
                    129:        C ebx   src1
                    130:        C ecx   counter
                    131:        C edx   src2
                    132:        C esi
                    133:        C edi   dst
                    134:        C ebp
                    135:        C The counter is tested only after a limb has been done, so size must be nonzero.
                    136:        movl    (%ebx,%ecx,4), %eax
                    137:        M4_inst (%edx,%ecx,4), %eax
                    138:        movl    %eax, (%edi,%ecx,4)
                    139:        incl    %ecx                    C incl keeps the carry flag for the next limb
                    140:        jnz     L(simple)
                    141:
                    142:        movl    $0, %eax                C movl rather than xorl, the carry flag must survive
                    143:        movl    SAVE_EDI, %edi
                    144:
                    145:        movl    SAVE_EBX, %ebx
                    146:        setc    %al                     C return the final carry (1 or 0)
                    147:        addl    $STACK_SPACE, %esp
                    148:
                    149:        ret
                    150:
                    151:
                    152: C -----------------------------------------------------------------------------
                    153:        C This is at 0x55, close enough to aligned.
                    154: L(unroll):
                    155: deflit(`FRAME',STACK_SPACE)
                    156:        movl    %ebp, SAVE_EBP
                    157:        andl    $-2, %ecx               C size low bit masked out
                    158:        andl    $1, PARAM_SIZE          C size low bit kept
                    159:
                    160:        movl    %ecx, %edi
                    161:        decl    %ecx
                    162:        movl    PARAM_DST, %ebp
                    163:
                    164:        shrl    $UNROLL_LOG2, %ecx      C loop counter = (even size - 1) >> UNROLL_LOG2
                    165:        negl    %edi
                    166:        movl    %esi, SAVE_ESI
                    167:
                    168:        andl    $UNROLL_MASK, %edi      C (-even size) & UNROLL_MASK, scaled by 9 below for the entry offset
                    169:
                    170: ifdef(`PIC',`
                    171:        call    L(pic_calc)
                    172: L(here):
                    173: ',`
                    174:        leal    L(entry) (%edi,%edi,8), %esi    C 9 bytes per
                    175: ')
                    176:        negl    %edi
                    177:        shrl    %eax                    C carry-in bit into the carry flag
                    178:        C Pull the pointers back by 4*edi bytes to match the entry point.  The 128 bias applies only when UNROLL_BYTES is 256.
                    179:        leal    ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
                    180:        leal    ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
                    181:        leal    ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
                    182:
                    183:        jmp     *%esi                   C enter the unrolled code at the computed address
                    184:
                    185:
                    186: ifdef(`PIC',`
                    187: L(pic_calc):
                    188:        C See README.family about old gas bugs
                    189:        leal    (%edi,%edi,8), %esi
                    190:        addl    $L(entry)-L(here), %esi
                    191:        addl    (%esp), %esi
                    192:        ret
                    193: ')
                    194:
                    195:
                    196: C -----------------------------------------------------------------------------
                    197:        ALIGN(32)
                    198: L(top):
                    199:        C eax   zero
                    200:        C ebx   src1
                    201:        C ecx   counter
                    202:        C edx   src2
                    203:        C esi   scratch (was computed jump)
                    204:        C edi   dst
                    205:        C ebp   scratch
                    206:
                    207:        leal    UNROLL_BYTES(%edx), %edx
                    208:
                    209: L(entry):
                    210: deflit(CHUNK_COUNT, 2)
                    211: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                    212:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
                    213:        deflit(`disp1', eval(disp0 + 4))
                    214:
                    215: Zdisp( movl,   disp0,(%ebx), %esi)
                    216:        movl    disp1(%ebx), %ebp
                    217: Zdisp( M4_inst,disp0,(%edx), %esi)
                    218: Zdisp( movl,   %esi, disp0,(%edi))
                    219:        M4_inst disp1(%edx), %ebp
                    220:        movl    %ebp, disp1(%edi)
                    221: ')
                    222:
                    223:        decl    %ecx                    C sets the sign flag for jns, carry flag untouched
                    224:        leal    UNROLL_BYTES(%ebx), %ebx
                    225:        leal    UNROLL_BYTES(%edi), %edi
                    226:        jns     L(top)
                    227:        C Here ebx and edi are past the even limbs but still hold the bias applied before L(entry),
                    228:        C and edx is one UNROLL_BYTES step behind them since it is advanced at L(top).
                    229:        movl    PARAM_SIZE, %esi        C 1 if the size was odd, else 0
                    230:        movl    SAVE_EBP, %ebp
                    231:        movl    $0, %eax                C movl rather than xorl, the carry flag must survive
                    232:
                    233:        decl    %esi
                    234:        js      L(even)
                    235:        C Final odd limb.  The displacements cancel the 128 bias used when UNROLL_BYTES is 256.
                    236:        movl    ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
                    237:        M4_inst eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
                    238:        movl    %ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
                    239: L(even):
                    240:
                    241:        movl    SAVE_EDI, %edi
                    242:        movl    SAVE_EBX, %ebx
                    243:        setc    %al                     C return the final carry (1 or 0)
                    244:
                    245:        movl    SAVE_ESI, %esi
                    246:        addl    $STACK_SPACE, %esp
                    247:
                    248:        ret
                    249:
                    250: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>