[BACK]Return to aors_n.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aors_n.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
        !             2: dnl
        !             3: dnl  K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: ifdef(`OPERATION_add_n', `
        !            30:        define(M4_inst,        adcl)
        !            31:        define(M4_function_n,  mpn_add_n)
        !            32:        define(M4_function_nc, mpn_add_nc)
        !            33:        define(M4_description, add)
        !            34: ',`ifdef(`OPERATION_sub_n', `
        !            35:        define(M4_inst,        sbbl)
        !            36:        define(M4_function_n,  mpn_sub_n)
        !            37:        define(M4_function_nc, mpn_sub_nc)
        !            38:        define(M4_description, subtract)
        !            39: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
        !            40: ')')')        C M4_inst is adcl or sbbl from here on, per whichever OPERATION_ symbol is defined
        !            41:
        !            42: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
        !            43:
        !            44:
        !            45: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
        !            46: C                          mp_size_t size);
        !            47: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
        !            48: C                            mp_size_t size, mp_limb_t carry);
        !            49: C
        !            50: C Calculate src1,size M4_description src2,size, and store the result in
        !            51: C dst,size.  The return value is the carry bit from the top of the result
        !            52: C (1 or 0).
        !            53: C
        !            54: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
        !            55: C the calculation.  Note values other than 1 or 0 here will lead to garbage
        !            56: C results.
        !            57: C
        !            58: C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
        !            59: C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
        !            60: C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
        !            61:
        !            62: define(PARAM_CARRY, `FRAME+20(%esp)')        C read by the _nc entry point only
        !            63: define(PARAM_SIZE,  `FRAME+16(%esp)')
        !            64: define(PARAM_SRC2,  `FRAME+12(%esp)')
        !            65: define(PARAM_SRC1,  `FRAME+8(%esp)')
        !            66: define(PARAM_DST,   `FRAME+4(%esp)')
        !            67: deflit(`FRAME',0)
        !            68:
        !            69: dnl  minimum 5 because the unrolled code can't handle less (the in-place loop count is (size-1)&-4, which must be nonzero)
        !            70: deflit(UNROLL_THRESHOLD, 5)
        !            71:
        !            72:        .text
        !            73:        ALIGN(32)
        !            74:
        !            75: PROLOGUE(M4_function_nc)
        !            76:        movl    PARAM_CARRY, %eax        C initial carry, caller must pass 0 or 1
        !            77:        jmp     LF(M4_function_n,start)  C join the shared code with the carry in eax
        !            78: EPILOGUE()
        !            79:
        !            80:
        !            81: PROLOGUE(M4_function_n)
        !            82:        xorl    %eax, %eax              C plain _n entry: no initial carry
        !            83: L(start):                              C the _nc entry jumps here with its carry in eax
        !            84:        movl    PARAM_SIZE, %ecx
        !            85:        pushl   %ebx
        !            86: FRAME_pushl()
        !            87:
        !            88:        movl    PARAM_SRC1, %ebx
        !            89:        pushl   %edi
        !            90: FRAME_pushl()
        !            91:
        !            92:        movl    PARAM_SRC2, %edx
        !            93:        cmpl    $UNROLL_THRESHOLD, %ecx
        !            94:
        !            95:        movl    PARAM_DST, %edi
        !            96:        jae     L(unroll)               C flags are still those of the cmpl, movl leaves them alone
        !            97:
        !            98:
        !            99:        shrl    %eax            C initial carry flag
        !           100:
        !           101:        C offset 0x21 here, close enough to aligned
        !           102: L(simple):
        !           103:        C eax   scratch
        !           104:        C ebx   src1
        !           105:        C ecx   counter
        !           106:        C edx   src2
        !           107:        C esi
        !           108:        C edi   dst
        !           109:        C ebp
        !           110:        C
        !           111:        C The store to (%edi) could be done with a stosl; it'd be smaller
        !           112:        C code, but there's no speed gain and a cld would have to be added
        !           113:        C (per mpn/x86/README.family).
        !           114:
        !           115:        movl    (%ebx), %eax
        !           116:        leal    4(%ebx), %ebx
        !           117:
        !           118:        M4_inst (%edx), %eax
        !           119:
        !           120:        movl    %eax, (%edi)
        !           121:        leal    4(%edi), %edi
        !           122:
        !           123:        leal    4(%edx), %edx
        !           124:        loop    L(simple)               C leal and loop do not touch the carry flag
        !           125:
        !           126:
        !           127:        movl    $0, %eax                C movl rather than xorl, so the carry survives for setc
        !           128:        popl    %edi
        !           129:
        !           130:        setc    %al                     C return value is the final carry, 0 or 1
        !           131:
        !           132:        popl    %ebx
        !           133:        ret
        !           134:
        !           135:
        !           136: C -----------------------------------------------------------------------------
        !           137: L(unroll):
        !           138:        C eax   carry
        !           139:        C ebx   src1
        !           140:        C ecx   counter
        !           141:        C edx   src2
        !           142:        C esi
        !           143:        C edi   dst
        !           144:        C ebp
        !           145:
        !           146:        cmpl    %edi, %ebx              C dst == src1 ?
        !           147:        pushl   %esi                    C no FRAME_pushl here, no PARAM_ is read after this point
        !           148:
        !           149:        je      L(inplace)
        !           150:
        !           151: ifdef(`OPERATION_add_n',`
        !           152:        cmpl    %edi, %edx
        !           153:
        !           154:        je      L(inplace_reverse)
        !           155: ')
        !           156:
        !           157:        movl    %ecx, %esi
        !           158:
        !           159:        andl    $-4, %ecx               C limbs handled by the 4-limb loop
        !           160:        andl    $3, %esi                C leftover limbs, 0 to 3
        !           161:
        !           162:        leal    (%ebx,%ecx,4), %ebx     C point all three at the end of the unrolled part
        !           163:        leal    (%edx,%ecx,4), %edx
        !           164:        leal    (%edi,%ecx,4), %edi
        !           165:
        !           166:        negl    %ecx                    C negative index, counts up to zero
        !           167:        shrl    %eax                    C initial carry flag, set after negl which changes carry
        !           168:
        !           169:        ALIGN(32)
        !           170: L(normal_top):
        !           171:        C eax   scratch
        !           172:        C ebx   src1
        !           173:        C ecx   counter, limbs, negative
        !           174:        C edx   src2
        !           175:        C esi
        !           176:        C edi   dst
        !           177:        C ebp
        !           178:
        !           179:        movl    (%ebx,%ecx,4), %eax
        !           180:        leal    5(%ecx), %ecx           C +5 here and -1 from the loop below, net 4 limbs per pass
        !           181:        M4_inst -20(%edx,%ecx,4), %eax  C the -20 displacements cancel the +5 limbs
        !           182:        movl    %eax, -20(%edi,%ecx,4)
        !           183:
        !           184:        movl    4-20(%ebx,%ecx,4), %eax
        !           185:        M4_inst 4-20(%edx,%ecx,4), %eax
        !           186:        movl    %eax, 4-20(%edi,%ecx,4)
        !           187:
        !           188:        movl    8-20(%ebx,%ecx,4), %eax
        !           189:        M4_inst 8-20(%edx,%ecx,4), %eax
        !           190:        movl    %eax, 8-20(%edi,%ecx,4)
        !           191:
        !           192:        movl    12-20(%ebx,%ecx,4), %eax
        !           193:        M4_inst 12-20(%edx,%ecx,4), %eax
        !           194:        movl    %eax, 12-20(%edi,%ecx,4)
        !           195:
        !           196:        loop    L(normal_top)
        !           197:
        !           198:
        !           199:        decl    %esi                    C decl leaves the carry flag intact
        !           200:        jz      L(normal_finish_one)    C exactly one limb left, ecx is 0
        !           201:        js      L(normal_done)          C no limbs left
        !           202:
        !           203:        C two or three more limbs
        !           204:
        !           205:        movl    (%ebx), %eax
        !           206:        M4_inst (%edx), %eax
        !           207:        movl    %eax, (%edi)
        !           208:
        !           209:        movl    4(%ebx), %eax
        !           210:        M4_inst 4(%edx), %eax
        !           211:        decl    %esi
        !           212:        movl    %eax, 4(%edi)
        !           213:
        !           214:        jz      L(normal_done)          C it was exactly two limbs
        !           215:        movl    $2, %ecx                C index of the third leftover limb
        !           216:
        !           217: L(normal_finish_one):
        !           218:        movl    (%ebx,%ecx,4), %eax     C ecx is 0 from the loop or 2 from just above
        !           219:        M4_inst (%edx,%ecx,4), %eax
        !           220:        movl    %eax, (%edi,%ecx,4)
        !           221:
        !           222: L(normal_done):
        !           223:        popl    %esi
        !           224:        popl    %edi
        !           225:
        !           226:        movl    $0, %eax                C movl rather than xorl, so the carry survives for setc
        !           227:        popl    %ebx
        !           228:
        !           229:        setc    %al                     C return value is the final carry, 0 or 1
        !           230:
        !           231:        ret
        !           232:
        !           233:
        !           234: C -----------------------------------------------------------------------------
        !           235:
        !           236: ifdef(`OPERATION_add_n',`
        !           237: L(inplace_reverse):
        !           238:        C dst==src2, addition commutes so src1 becomes the in-place source
        !           239:
        !           240:        movl    %ebx, %edx
        !           241: ')
        !           242:
        !           243: L(inplace):
        !           244:        C eax   initial carry
        !           245:        C ebx
        !           246:        C ecx   size
        !           247:        C edx   src
        !           248:        C esi
        !           249:        C edi   dst
        !           250:        C ebp
        !           251:
        !           252:        leal    -1(%ecx), %esi          C size-1, one limb is always kept for after the loop
        !           253:        decl    %ecx
        !           254:
        !           255:        andl    $-4, %ecx               C limbs handled by the 4-limb loop
        !           256:        andl    $3, %esi                C 0 to 3, meaning 1 to 4 limbs after the loop
        !           257:
        !           258:        movl    (%edx), %ebx            C src low limb
        !           259:        leal    (%edx,%ecx,4), %edx
        !           260:
        !           261:        leal    (%edi,%ecx,4), %edi
        !           262:        negl    %ecx                    C negative index, counts up to zero
        !           263:
        !           264:        shrl    %eax                    C initial carry flag, set after negl which changes carry
        !           265:
        !           266:
        !           267:        ALIGN(32)
        !           268: L(inplace_top):
        !           269:        C eax   scratch
        !           270:        C ebx   next src limb
        !           271:        C ecx   counter, limbs, negative
        !           272:        C edx   src
        !           273:        C esi
        !           274:        C edi   dst
        !           275:        C ebp
        !           276:
        !           277:        M4_inst %ebx, (%edi,%ecx,4)
        !           278:
        !           279:        movl    4(%edx,%ecx,4), %eax
        !           280:        leal    5(%ecx), %ecx           C +5 here and -1 from the loop below, net 4 limbs per pass
        !           281:
        !           282:        M4_inst %eax, 4-20(%edi,%ecx,4)
        !           283:
        !           284:        movl    8-20(%edx,%ecx,4), %eax
        !           285:        movl    12-20(%edx,%ecx,4), %ebx
        !           286:
        !           287:        M4_inst %eax, 8-20(%edi,%ecx,4)
        !           288:        M4_inst %ebx, 12-20(%edi,%ecx,4)
        !           289:
        !           290:        movl    16-20(%edx,%ecx,4), %ebx        C preload the first limb of the next group
        !           291:        loop    L(inplace_top)
        !           292:
        !           293:
        !           294:        C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
        !           295:
        !           296:        M4_inst %ebx, (%edi)
        !           297:
        !           298:        decl    %esi                    C decl leaves the carry flag intact
        !           299:        jz      L(inplace_finish_one)   C one more limb, ecx is 0
        !           300:        js      L(inplace_done)         C nothing more
        !           301:
        !           302:        C two or three more limbs
        !           303:
        !           304:        movl    4(%edx), %eax
        !           305:        movl    8(%edx), %ebx
        !           306:        M4_inst %eax, 4(%edi)
        !           307:        M4_inst %ebx, 8(%edi)
        !           308:
        !           309:        decl    %esi
        !           310:        movl    $2, %ecx                C index for the last leftover limb, movl keeps the flags
        !           311:
        !           312:        jz      L(normal_done)          C same pop and setc sequence as L(inplace_done)
        !           313:
        !           314: L(inplace_finish_one):
        !           315:        movl    4(%edx,%ecx,4), %eax    C ecx is 0 from the loop or 2 from just above
        !           316:        M4_inst %eax, 4(%edi,%ecx,4)
        !           317:
        !           318: L(inplace_done):
        !           319:        popl    %esi
        !           320:        popl    %edi
        !           321:
        !           322:        movl    $0, %eax                C movl rather than xorl, so the carry survives for setc
        !           323:        popl    %ebx
        !           324:
        !           325:        setc    %al                     C return value is the final carry, 0 or 1
        !           326:
        !           327:        ret
        !           328:
        !           329: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>