[BACK]Return to aors_n.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aors_n.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
                      2: dnl
                      3: dnl  K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
                      4:
                      5:
                      6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      7: dnl
                      8: dnl  This file is part of the GNU MP Library.
                      9: dnl
                     10: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     11: dnl  modify it under the terms of the GNU Lesser General Public License as
                     12: dnl  published by the Free Software Foundation; either version 2.1 of the
                     13: dnl  License, or (at your option) any later version.
                     14: dnl
                     15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     18: dnl  Lesser General Public License for more details.
                     19: dnl
                     20: dnl  You should have received a copy of the GNU Lesser General Public
                     21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     23: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     24:
                     25:
                     26: include(`../config.m4')
                     27:
                     28: dnl  Pick the add-with-carry or subtract-with-borrow instruction and the entry point names.
                     29: ifdef(`OPERATION_add_n', `
                     30:        define(M4_inst,        adcl)
                     31:        define(M4_function_n,  mpn_add_n)
                     32:        define(M4_function_nc, mpn_add_nc)
                     33:        define(M4_description, add)
                     34: ',`ifdef(`OPERATION_sub_n', `
                     35:        define(M4_inst,        sbbl)
                     36:        define(M4_function_n,  mpn_sub_n)
                     37:        define(M4_function_nc, mpn_sub_nc)
                     38:        define(M4_description, subtract)
                     39: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
                     40: ')')')
                     41: dnl  NOTE(review): presumably lists the entry points this file can provide; confirm against the MULFUNC_PROLOGUE definition.
                     42: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
                     43:
                     44:
                     45: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
                     46: C                          mp_size_t size);
                     47: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
                     48: C                            mp_size_t size, mp_limb_t carry);
                     49: C
                     50: C Calculate src1,size M4_description src2,size, and store the result in
                     51: C dst,size.  The return value is the carry bit from the top of the result
                     52: C (1 or 0).
                     53: C
                     54: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
                     55: C the calculation.  Note values other than 1 or 0 here will lead to garbage
                     56: C results.
                     57: C
                     58: C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
                     59: C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
                     60: C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
                     61: dnl  Argument offsets from %esp: with FRAME at 0 the first argument, dst, is at 4(%esp).
                     62: define(PARAM_CARRY, `FRAME+20(%esp)')
                     63: define(PARAM_SIZE,  `FRAME+16(%esp)')
                     64: define(PARAM_SRC2,  `FRAME+12(%esp)')
                     65: define(PARAM_SRC1,  `FRAME+8(%esp)')
                     66: define(PARAM_DST,   `FRAME+4(%esp)')
                     67: deflit(`FRAME',0)
                     68: dnl  NOTE(review): FRAME_pushl() presumably adds 4 to FRAME per push so the offsets stay valid; confirm.
                     69: dnl  minimum 5 because the unrolled code can't handle less
                     70: deflit(UNROLL_THRESHOLD, 5)
                     71: C M4_function_nc: load the caller-supplied carry and join M4_function_n at L(start).
                     72:        .text
                     73:        ALIGN(32)
                     74:
                     75: PROLOGUE(M4_function_nc)
                     76:        movl    PARAM_CARRY, %eax       C eax = carry-in, expected to be 0 or 1
                     77:        jmp     LF(M4_function_n,start) C enter after the xorl that clears eax
                     78: EPILOGUE()
                     79: C M4_function_n: entry point without an initial carry.  Sizes below
                     80: C UNROLL_THRESHOLD use the one-limb loop, larger sizes the unrolled code.
                     81: PROLOGUE(M4_function_n)
                     82:        xorl    %eax, %eax              C plain _n entry: no carry-in
                     83: L(start):                              C the _nc entry joins here with its carry in eax
                     84:        movl    PARAM_SIZE, %ecx
                     85:        pushl   %ebx
                     86: FRAME_pushl()
                     87:
                     88:        movl    PARAM_SRC1, %ebx
                     89:        pushl   %edi
                     90: FRAME_pushl()
                     91:
                     92:        movl    PARAM_SRC2, %edx
                     93:        cmpl    $UNROLL_THRESHOLD, %ecx
                     94:
                     95:        movl    PARAM_DST, %edi         C movl does not disturb the cmpl flags
                     96:        jae     L(unroll)               C unsigned size >= UNROLL_THRESHOLD
                     97:
                     98:
                     99:        shrl    %eax            C initial carry flag
                    100:
                    101:        C offset 0x21 here, close enough to aligned
                    102: L(simple):
                    103:        C eax   scratch
                    104:        C ebx   src1
                    105:        C ecx   counter
                    106:        C edx   src2
                    107:        C esi
                    108:        C edi   dst
                    109:        C ebp
                    110:        C
                    111:        C The store to (%edi) could be done with a stosl; it'd be smaller
                    112:        C code, but there's no speed gain and a cld would have to be added
                    113:        C (per mpn/x86/README.family).
                    114:        C leal and loop leave the carry flag alone, so it chains between limbs.
                    115:        movl    (%ebx), %eax
                    116:        leal    4(%ebx), %ebx
                    117:
                    118:        M4_inst (%edx), %eax
                    119:
                    120:        movl    %eax, (%edi)
                    121:        leal    4(%edi), %edi
                    122:
                    123:        leal    4(%edx), %edx
                    124:        loop    L(simple)
                    125:
                    126:
                    127:        movl    $0, %eax                C movl rather than xorl, the carry flag must survive
                    128:        popl    %edi
                    129:
                    130:        setc    %al                     C return value is the final carry
                    131:
                    132:        popl    %ebx
                    133:        ret
                    134:
                    135:
                    136: C -----------------------------------------------------------------------------
                    137: L(unroll):
                    138:        C eax   carry
                    139:        C ebx   src1
                    140:        C ecx   counter
                    141:        C edx   src2
                    142:        C esi
                    143:        C edi   dst
                    144:        C ebp
                    145:
                    146:        cmpl    %edi, %ebx              C src1 == dst ?
                    147:        pushl   %esi
                    148:        C NOTE(review): no FRAME_pushl() after this push; harmless as no PARAM_ is read later.
                    149:        je      L(inplace)
                    150:
                    151: ifdef(`OPERATION_add_n',`
                    152:        cmpl    %edi, %edx              C src2 == dst ?
                    153:
                    154:        je      L(inplace_reverse)
                    155: ')
                    156:        C General case, dst is a separate operand.
                    157:        movl    %ecx, %esi
                    158:
                    159:        andl    $-4, %ecx               C ecx = limbs handled by the unrolled loop
                    160:        andl    $3, %esi                C esi = leftover limbs, 0 to 3
                    161:        C Step the pointers past the unrolled part and index back with negative ecx.
                    162:        leal    (%ebx,%ecx,4), %ebx
                    163:        leal    (%edx,%ecx,4), %edx
                    164:        leal    (%edi,%ecx,4), %edi
                    165:
                    166:        negl    %ecx
                    167:        shrl    %eax                    C initial carry flag, set last as negl alters flags
                    168:
                    169:        ALIGN(32)
                    170: L(normal_top):
                    171:        C eax   scratch
                    172:        C ebx   src1
                    173:        C ecx   counter, limbs, negative
                    174:        C edx   src2
                    175:        C esi   leftover limbs, 0 to 3
                    176:        C edi   dst
                    177:        C ebp
                    178:        C leal 5 plus the loop decrement steps ecx by 4; the -20 offsets undo the 5.
                    179:        movl    (%ebx,%ecx,4), %eax
                    180:        leal    5(%ecx), %ecx
                    181:        M4_inst -20(%edx,%ecx,4), %eax
                    182:        movl    %eax, -20(%edi,%ecx,4)
                    183:
                    184:        movl    4-20(%ebx,%ecx,4), %eax
                    185:        M4_inst 4-20(%edx,%ecx,4), %eax
                    186:        movl    %eax, 4-20(%edi,%ecx,4)
                    187:
                    188:        movl    8-20(%ebx,%ecx,4), %eax
                    189:        M4_inst 8-20(%edx,%ecx,4), %eax
                    190:        movl    %eax, 8-20(%edi,%ecx,4)
                    191:
                    192:        movl    12-20(%ebx,%ecx,4), %eax
                    193:        M4_inst 12-20(%edx,%ecx,4), %eax
                    194:        movl    %eax, 12-20(%edi,%ecx,4)
                    195:
                    196:        loop    L(normal_top)
                    197:
                    198:        C ecx is now 0 and the pointers address the first leftover limb.
                    199:        decl    %esi                    C decl keeps the carry flag
                    200:        jz      L(normal_finish_one)    C exactly one leftover limb
                    201:        js      L(normal_done)          C no leftover limbs
                    202:
                    203:        C two or three more limbs
                    204:
                    205:        movl    (%ebx), %eax
                    206:        M4_inst (%edx), %eax
                    207:        movl    %eax, (%edi)
                    208:
                    209:        movl    4(%ebx), %eax
                    210:        M4_inst 4(%edx), %eax
                    211:        decl    %esi
                    212:        movl    %eax, 4(%edi)
                    213:
                    214:        jz      L(normal_done)          C there were only two
                    215:        movl    $2, %ecx                C index of the third leftover limb
                    216:        C ecx is the limb index here: 0 straight from the loop, 2 from above.
                    217: L(normal_finish_one):
                    218:        movl    (%ebx,%ecx,4), %eax
                    219:        M4_inst (%edx,%ecx,4), %eax
                    220:        movl    %eax, (%edi,%ecx,4)
                    221:
                    222: L(normal_done):
                    223:        popl    %esi
                    224:        popl    %edi
                    225:
                    226:        movl    $0, %eax                C flag-preserving zero before setc
                    227:        popl    %ebx
                    228:
                    229:        setc    %al                     C return value is the final carry
                    230:
                    231:        ret
                    232:
                    233:
                    234: C -----------------------------------------------------------------------------
                    235:
                    236: ifdef(`OPERATION_add_n',`
                    237: L(inplace_reverse):
                    238:        C dst==src2
                    239:
                    240:        movl    %ebx, %edx              C src1 becomes the src operand
                    241: ')
                    242:
                    243: L(inplace):
                    244:        C eax   initial carry
                    245:        C ebx
                    246:        C ecx   size
                    247:        C edx   src
                    248:        C esi
                    249:        C edi   dst
                    250:        C ebp
                    251:        C The loop covers size-1 rounded down to a multiple of 4; esi = (size-1) mod 4.
                    252:        leal    -1(%ecx), %esi
                    253:        decl    %ecx
                    254:
                    255:        andl    $-4, %ecx
                    256:        andl    $3, %esi
                    257:
                    258:        movl    (%edx), %ebx            C src low limb
                    259:        leal    (%edx,%ecx,4), %edx
                    260:
                    261:        leal    (%edi,%ecx,4), %edi
                    262:        negl    %ecx
                    263:
                    264:        shrl    %eax                    C initial carry flag, set last as negl alters flags
                    265:
                    266:
                    267:        ALIGN(32)
                    268: L(inplace_top):
                    269:        C eax   scratch
                    270:        C ebx   next src limb
                    271:        C ecx   counter, limbs, negative
                    272:        C edx   src
                    273:        C esi   leftover limbs less one, 0 to 3
                    274:        C edi   dst
                    275:        C ebp
                    276:        C leal 5 plus the loop decrement steps ecx by 4; the -20 offsets undo the 5.
                    277:        M4_inst %ebx, (%edi,%ecx,4)
                    278:
                    279:        movl    4(%edx,%ecx,4), %eax
                    280:        leal    5(%ecx), %ecx
                    281:
                    282:        M4_inst %eax, 4-20(%edi,%ecx,4)
                    283:
                    284:        movl    8-20(%edx,%ecx,4), %eax
                    285:        movl    12-20(%edx,%ecx,4), %ebx
                    286:
                    287:        M4_inst %eax, 8-20(%edi,%ecx,4)
                    288:        M4_inst %ebx, 12-20(%edi,%ecx,4)
                    289:
                    290:        movl    16-20(%edx,%ecx,4), %ebx        C preload for the next pass or the tail
                    291:        loop    L(inplace_top)
                    292:
                    293:
                    294:        C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
                    295:
                    296:        M4_inst %ebx, (%edi)            C first leftover limb, src already in ebx
                    297:
                    298:        decl    %esi                    C decl keeps the carry flag
                    299:        jz      L(inplace_finish_one)   C two leftover limbs, one still to do
                    300:        js      L(inplace_done)         C the single leftover limb is done
                    301:
                    302:        C two or three more limbs
                    303:
                    304:        movl    4(%edx), %eax
                    305:        movl    8(%edx), %ebx
                    306:        M4_inst %eax, 4(%edi)
                    307:        M4_inst %ebx, 8(%edi)
                    308:
                    309:        decl    %esi
                    310:        movl    $2, %ecx                C index used by the finish_one code below
                    311:
                    312:        jz      L(normal_done)          C same epilogue code as L(inplace_done)
                    313:        C ecx is the limb index here: 0 straight from the loop, 2 from above.
                    314: L(inplace_finish_one):
                    315:        movl    4(%edx,%ecx,4), %eax
                    316:        M4_inst %eax, 4(%edi,%ecx,4)
                    317:
                    318: L(inplace_done):
                    319:        popl    %esi
                    320:        popl    %edi
                    321:
                    322:        movl    $0, %eax                C flag-preserving zero before setc
                    323:        popl    %ebx
                    324:
                    325:        setc    %al                     C return value is the final carry
                    326:
                    327:        ret
                    328:
                    329: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>