[BACK]Return to aorsmul_1.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aorsmul_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
        !             2: dnl
        !             3: dnl  K7: 3.9 cycles/limb.
        !             4: dnl
        !             5: dnl  Future: It should be possible to avoid the separate mul after the
        !             6: dnl  unrolled loop by moving the movl/adcl to the top.
        !             7:
        !             8:
        !             9: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !            10: dnl
        !            11: dnl  This file is part of the GNU MP Library.
        !            12: dnl
        !            13: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            14: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            15: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            16: dnl  License, or (at your option) any later version.
        !            17: dnl
        !            18: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            19: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            20: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            21: dnl  Lesser General Public License for more details.
        !            22: dnl
        !            23: dnl  You should have received a copy of the GNU Lesser General Public
        !            24: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            25: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            26: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            27:
        !            28:
        !            29: include(`../config.m4')
        !            30:
        !            31:
        !            32: dnl  K7: UNROLL_COUNT  cycles/limb
        !            33: dnl           4            4.42
        !            34: dnl           8            4.16
        !            35: dnl          16            3.9
        !            36: dnl          32            3.9
        !            37: dnl          64            3.87
        !            38: dnl  Maximum possible with the current code is 64.
        !            39: dnl  NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES used further down are not defined in this file; presumably derived from UNROLL_COUNT by the included m4 definitions -- TODO confirm.
        !            40: deflit(UNROLL_COUNT, 16)
        !            41:
        !            42:
        !            43: ifdef(`OPERATION_addmul_1',`
        !            44:        define(M4_inst,        addl)
        !            45:        define(M4_function_1,  mpn_addmul_1)
        !            46:        define(M4_function_1c, mpn_addmul_1c)
        !            47:        define(M4_description, add it to)
        !            48:        define(M4_desc_retval, carry)
        !            49: ',`ifdef(`OPERATION_submul_1',`
        !            50:        define(M4_inst,        subl)
        !            51:        define(M4_function_1,  mpn_submul_1)
        !            52:        define(M4_function_1c, mpn_submul_1c)
        !            53:        define(M4_description, subtract it from)
        !            54:        define(M4_desc_retval, borrow)
        !            55: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
        !            56: ')')')
        !            57:
        !            58: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
        !            59:
        !            60:
        !            61: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            62: C                            mp_limb_t mult);
        !            63: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            64: C                             mp_limb_t mult, mp_limb_t carry);
        !            65: C
        !            66: C Calculate src,size multiplied by mult and M4_description dst,size.
        !            67: C Return the M4_desc_retval limb from the top of the result.
        !            68: dnl  The unrolled code is used when size-1 is above UNROLL_THRESHOLD (unsigned compare); the PIC build uses the higher threshold.
        !            69: ifdef(`PIC',`
        !            70: deflit(UNROLL_THRESHOLD, 9)
        !            71: ',`
        !            72: deflit(UNROLL_THRESHOLD, 6)
        !            73: ')
        !            74: dnl  Stack arguments, offsets ascending in the order of the C prototype: dst, src, size, mult, carry.  NOTE(review): defframe is defined outside this file -- confirm how it combines these offsets with FRAME.
        !            75: defframe(PARAM_CARRY,     20)
        !            76: defframe(PARAM_MULTIPLIER,16)
        !            77: defframe(PARAM_SIZE,      12)
        !            78: defframe(PARAM_SRC,       8)
        !            79: defframe(PARAM_DST,       4)
        !            80: deflit(`FRAME',0)
        !            81: dnl  Save slots for ebx, esi, edi and ebp; the code lowers esp by SAVE_SIZE and sets FRAME to 16 before storing to them.
        !            82: defframe(SAVE_EBX, -4)
        !            83: defframe(SAVE_ESI, -8)
        !            84: defframe(SAVE_EDI, -12)
        !            85: defframe(SAVE_EBP, -16)
        !            86: deflit(SAVE_SIZE, 16)
        !            87: C M4_function_1: entry point with no carry-in argument.  A single limb is handled inline, anything else continues at start_1 inside M4_function_1c.
        !            88:        .text
        !            89:        ALIGN(32)
        !            90: PROLOGUE(M4_function_1)
        !            91:        movl    PARAM_SIZE, %edx
        !            92:        movl    PARAM_SRC, %eax
        !            93:        xorl    %ecx, %ecx      C zero initial carry, as expected at start_1
        !            94:
        !            95:        decl    %edx            C edx = size-1
        !            96:        jnz     LF(M4_function_1c,start_1)      C size-1 nonzero, use the shared code
        !            97:        C size-1 is zero: single limb, no registers need saving
        !            98:        movl    (%eax), %eax    C the src limb
        !            99:        movl    PARAM_DST, %ecx
        !           100:
        !           101:        mull    PARAM_MULTIPLIER        C edx:eax = src limb times multiplier
        !           102:
        !           103:        M4_inst %eax, (%ecx)    C apply the low product limb to dst
        !           104:        adcl    $0, %edx        C carry or borrow from the dst update goes into the high limb
        !           105:        movl    %edx, %eax      C return value
        !           106:
        !           107:        ret
        !           108: EPILOGUE()
        !           109: C M4_function_1c: same operation with an initial carry limb taken from PARAM_CARRY.  This part handles the single limb case.
        !           110:        ALIGN(16)
        !           111: PROLOGUE(M4_function_1c)
        !           112:        movl    PARAM_SIZE, %edx
        !           113:        movl    PARAM_SRC, %eax
        !           114:
        !           115:        decl    %edx            C edx = size-1
        !           116:        jnz     L(more_than_one_limb)
        !           117:        C size-1 is zero: single limb, no registers need saving
        !           118:        movl    (%eax), %eax    C the src limb
        !           119:        movl    PARAM_DST, %ecx
        !           120:
        !           121:        mull    PARAM_MULTIPLIER        C edx:eax = src limb times multiplier
        !           122:
        !           123:        addl    PARAM_CARRY, %eax       C add the carry-in to the low product limb
        !           124:
        !           125:        adcl    $0, %edx        C carry from adding the carry-in
        !           126:        M4_inst %eax, (%ecx)    C apply the low limb to dst
        !           127:
        !           128:        adcl    $0, %edx        C carry or borrow from the dst update
        !           129:        movl    %edx, %eax      C return value
        !           130:
        !           131:        ret
        !           132:
        !           133: C Shared code for size-1 nonzero: save registers, load the operands, then choose the simple or the unrolled loop.
        !           134:        C offset 0x44 so close enough to aligned
        !           135: L(more_than_one_limb):
        !           136:        movl    PARAM_CARRY, %ecx
        !           137: L(start_1):     C also entered from M4_function_1, with ecx zero
        !           138:        C eax   src
        !           139:        C ecx   initial carry
        !           140:        C edx   size-1
        !           141:        subl    $SAVE_SIZE, %esp        C room for the four saved registers
        !           142: deflit(`FRAME',16)
        !           143:
        !           144:        movl    %ebx, SAVE_EBX
        !           145:        movl    %esi, SAVE_ESI
        !           146:        movl    %edx, %ebx      C size-1
        !           147:
        !           148:        movl    PARAM_SRC, %esi
        !           149:        movl    %ebp, SAVE_EBP
        !           150:        cmpl    $UNROLL_THRESHOLD, %edx C flags used by the ja below, the movl instructions in between leave them unchanged
        !           151:
        !           152:        movl    PARAM_MULTIPLIER, %ebp
        !           153:        movl    %edi, SAVE_EDI
        !           154:
        !           155:        movl    (%esi), %eax    C src low limb
        !           156:        movl    PARAM_DST, %edi
        !           157:        ja      L(unroll)       C size-1 above UNROLL_THRESHOLD, unsigned
        !           158:
        !           159:
        !           160:        C simple loop, used when size-1 is not above UNROLL_THRESHOLD
        !           161:
        !           162:        leal    4(%esi,%ebx,4), %esi    C point one limb past last
        !           163:        leal    (%edi,%ebx,4), %edi     C point at last limb
        !           164:        negl    %ebx            C ebx = -(size-1), counted up to zero
        !           165:
        !           166:        C The movl to load the next source limb is done well ahead of the
        !           167:        C mul.  This is necessary for full speed, and leads to one limb
        !           168:        C handled separately at the end.
        !           169:
        !           170: L(simple):
        !           171:        C eax   src limb
        !           172:        C ebx   loop counter
        !           173:        C ecx   carry limb
        !           174:        C edx   scratch
        !           175:        C esi   src
        !           176:        C edi   dst
        !           177:        C ebp   multiplier
        !           178:
        !           179:        mull    %ebp            C edx:eax = src limb times multiplier
        !           180:
        !           181:        addl    %eax, %ecx      C low product limb plus incoming carry
        !           182:        adcl    $0, %edx
        !           183:
        !           184:        M4_inst %ecx, (%edi,%ebx,4)
        !           185:        movl    (%esi,%ebx,4), %eax     C next src limb; movl keeps the flags for the adcl
        !           186:        adcl    $0, %edx        C carry or borrow from the dst update
        !           187:
        !           188:        incl    %ebx
        !           189:        movl    %edx, %ecx      C high limb becomes the next carry
        !           190:        jnz     L(simple)       C ZF from the incl, the movl does not touch flags
        !           191:
        !           192:        C Final limb: its src limb is already in eax and edi points at its dst.
        !           193:        mull    %ebp
        !           194:
        !           195:        movl    SAVE_EBX, %ebx
        !           196:        movl    SAVE_ESI, %esi
        !           197:        movl    SAVE_EBP, %ebp
        !           198:
        !           199:        addl    %eax, %ecx
        !           200:        adcl    $0, %edx
        !           201:
        !           202:        M4_inst %ecx, (%edi)
        !           203:        adcl    $0, %edx
        !           204:        movl    SAVE_EDI, %edi
        !           205:
        !           206:        addl    $SAVE_SIZE, %esp
        !           207:        movl    %edx, %eax      C return value
        !           208:        ret
        !           209:
        !           210:
        !           211:
        !           212: C -----------------------------------------------------------------------------
        !           213:        ALIGN(16)
        !           214: L(unroll):
        !           215:        C eax   src low limb
        !           216:        C ebx   size-1
        !           217:        C ecx   carry
        !           218:        C edx   size-1
        !           219:        C esi   src
        !           220:        C edi   dst
        !           221:        C ebp   multiplier
        !           222:        C Set up the loop counter and a computed jump into the unrolled code, and multiply the first limb here.
        !           223: dnl  overlapping with parameters no longer needed
        !           224: define(VAR_COUNTER,`PARAM_SIZE')
        !           225: define(VAR_JUMP,   `PARAM_MULTIPLIER')
        !           226:
        !           227:        subl    $2, %ebx        C (size-2)-1
        !           228:        decl    %edx            C size-2
        !           229:
        !           230:        shrl    $UNROLL_LOG2, %ebx      C ebx = (size-3) >> UNROLL_LOG2, stored as the loop counter
        !           231:        negl    %edx
        !           232:
        !           233:        movl    %ebx, VAR_COUNTER
        !           234:        andl    $UNROLL_MASK, %edx      C edx = n = -(size-2) masked with UNROLL_MASK
        !           235:
        !           236:        movl    %edx, %ebx
        !           237:        shll    $4, %edx        C 16*n; adding ebx afterwards makes 17*n, matching 17 code bytes per limb
        !           238:
        !           239: ifdef(`PIC',`
        !           240:        call    L(pic_calc)
        !           241: L(here):
        !           242: ',`
        !           243:        leal    L(entry) (%edx,%ebx,1), %edx
        !           244: ')
        !           245:        negl    %ebx            C ebx = -n, used to bias esi and edi below
        !           246:        movl    %edx, VAR_JUMP
        !           247:
        !           248:        mull    %ebp
        !           249:
        !           250:        addl    %eax, %ecx      C initial carry, becomes low carry
        !           251:        adcl    $0, %edx
        !           252:        testb   $1, %bl         C flags for the cmovnz below; the movl and leal in between leave them unchanged
        !           253:
        !           254:        movl    4(%esi), %eax   C src second limb
        !           255:        leal    ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
        !           256:        leal    ifelse(UNROLL_BYTES,256,128)   (%edi,%ebx,4), %edi
        !           257:        C now esi = src+8-4*n and edi = dst-4*n, each plus 128 when UNROLL_BYTES is 256
        !           258:        movl    %edx, %ebx      C high carry
        !           259:        cmovnz( %ecx, %ebx)     C high,low carry other way around
        !           260:        cmovnz( %edx, %ecx)
        !           261:
        !           262:        jmp     *VAR_JUMP       C enter the unrolled code at the computed address
        !           263:
        !           264: C PIC helper, called with edx = 16*n and ebx = n: leaves the absolute jump target in edx, built from the return address of the call.
        !           265: ifdef(`PIC',`
        !           266: L(pic_calc):
        !           267:        C See README.family about old gas bugs
        !           268:        leal    (%edx,%ebx,1), %edx     C edx = 17*n
        !           269:        addl    $L(entry)-L(here), %edx C distance from the return point to the loop entry
        !           270:        addl    (%esp), %edx    C plus the return address, which is the address of the here label
        !           271:        ret
        !           272: ')
        !           273:
        !           274:
        !           275: C -----------------------------------------------------------------------------
        !           276: C This code uses a "two carry limbs" scheme.  At the top of the loop the
        !           277: C carries are ecx=lo, ebx=hi, then they swap for each limb processed.  For
        !           278: C the computed jump an odd size means they start one way around, an even
        !           279: C size the other.  Either way one limb is handled separately at the start of
        !           280: C the loop.
        !           281: C
        !           282: C The positioning of the movl to load the next source limb is important.
        !           283: C Moving it after the adcl with a view to avoiding a separate mul at the end
        !           284: C of the loop slows the code down.
        !           285:
        !           286:        ALIGN(32)
        !           287: L(top):
        !           288:        C eax   src limb
        !           289:        C ebx   carry high
        !           290:        C ecx   carry low
        !           291:        C edx   scratch
        !           292:        C esi   src+8
        !           293:        C edi   dst
        !           294:        C ebp   multiplier
        !           295:        C
        !           296:        C VAR_COUNTER  loop counter
        !           297:        C
        !           298:        C 17 bytes each limb
        !           299:        C L(entry) is the base address for the computed jump: the target is L(entry) plus 17 bytes per limb skipped.
        !           300: L(entry):
        !           301: deflit(CHUNK_COUNT,2)
        !           302: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           303:        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        !           304:        deflit(`disp1', eval(disp0 + 4))
        !           305:
        !           306:        mull    %ebp
        !           307:
        !           308: Zdisp( M4_inst,%ecx, disp0,(%edi))
        !           309:        movl    $0, %ecx        C movl rather than xorl so the carry flag survives for the adcl
        !           310:
        !           311:        adcl    %eax, %ebx      C ebx is now the low carry for the next limb
        !           312:
        !           313: Zdisp( movl,   disp0,(%esi), %eax)
        !           314:        adcl    %edx, %ecx      C ecx is now the high carry
        !           315:
        !           316:
        !           317:        mull    %ebp
        !           318:
        !           319:        M4_inst %ebx, disp1(%edi)
        !           320:        movl    $0, %ebx        C again movl to keep the carry flag
        !           321:
        !           322:        adcl    %eax, %ecx      C carries are back the original way around
        !           323:
        !           324:        movl    disp1(%esi), %eax
        !           325:        adcl    %edx, %ebx
        !           326: ')
        !           327:
        !           328:        decl    VAR_COUNTER
        !           329:        leal    UNROLL_BYTES(%esi), %esi
        !           330:        leal    UNROLL_BYTES(%edi), %edi
        !           331:
        !           332:        jns     L(top)          C sign flag from the decl, the leal instructions leave flags unchanged
        !           333:
        !           334:        C After the loop: the last src limb is in eax and two dst limbs remain to be updated.
        !           335:        C eax   src limb
        !           336:        C ebx   carry high
        !           337:        C ecx   carry low
        !           338:        C edx
        !           339:        C esi
        !           340:        C edi   dst (points at second last limb)
        !           341:        C ebp   multiplier
        !           342: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
        !           343: deflit(`disp1', eval(disp0-0 + 4))
        !           344:        C The -0 keeps the eval well formed when disp0 expands to nothing.
        !           345:        mull    %ebp
        !           346:
        !           347:        M4_inst %ecx, disp0(%edi)
        !           348:        movl    SAVE_EBP, %ebp  C restores are interleaved with the arithmetic; movl leaves the flags unchanged
        !           349:
        !           350:        adcl    %ebx, %eax      C last low product limb plus the high carry
        !           351:        movl    SAVE_EBX, %ebx
        !           352:        movl    SAVE_ESI, %esi
        !           353:
        !           354:        adcl    $0, %edx
        !           355:        M4_inst %eax, disp1(%edi)
        !           356:        movl    SAVE_EDI, %edi
        !           357:
        !           358:        adcl    $0, %edx        C carry or borrow from the final dst update
        !           359:        addl    $SAVE_SIZE, %esp
        !           360:
        !           361:        movl    %edx, %eax      C return value
        !           362:        ret
        !           363:
        !           364: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>