OpenXM_contrib/gmp/mpn/x86/k6/mul_1.asm - annotate

Return to mul_1.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_1.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
        !             2: dnl
        !             3: dnl  K6: 6.25 cycles/limb.
        !             4:
        !             5:
        !             6: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             7: dnl
        !             8: dnl  This file is part of the GNU MP Library.
        !             9: dnl
        !            10: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            11: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            12: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            13: dnl  License, or (at your option) any later version.
        !            14: dnl
        !            15: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            16: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            17: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            18: dnl  Lesser General Public License for more details.
        !            19: dnl
        !            20: dnl  You should have received a copy of the GNU Lesser General Public
        !            21: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            22: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            23: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            24:
        !            25:
        !            26: include(`../config.m4')
        !            27:
        !            28:
        !            29: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            30: C                      mp_limb_t multiplier);
        !            31: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
        !            32: C                       mp_limb_t multiplier, mp_limb_t carry);
        !            33: C
        !            34: C Multiply src,size by multiplier and store the result in dst,size.
        !            35: C Return the carry limb from the top of the result.
        !            36: C
        !            37: C mpn_mul_1c() accepts an initial carry for the calculation; it is added
        !            38: C into the low limb of the result.
        !            39: C Stack frame offsets of the arguments, in prototype order (dst first, carry last).
        !            40: defframe(PARAM_CARRY,     20)
        !            41: defframe(PARAM_MULTIPLIER,16)
        !            42: defframe(PARAM_SIZE,      12)
        !            43: defframe(PARAM_SRC,       8)
        !            44: defframe(PARAM_DST,       4)
        !            45:
        !            46: dnl  minimum 5 because the unrolled code can't handle less
        !            47: deflit(UNROLL_THRESHOLD, 5)
        !            48: C Code section; the first entry point below is placed after an ALIGN(32).
        !            49:        .text
        !            50:        ALIGN(32)
        !            51: C mpn_mul_1c -- entry point taking an initial carry, shares the mpn_mul_1 code.
        !            52: PROLOGUE(mpn_mul_1c)
        !            53:        pushl   %esi                    C same first push as mpn_mul_1 makes
        !            54: deflit(`FRAME',4)
        !            55:        movl    PARAM_CARRY, %esi       C esi = initial carry argument
        !            56:        jmp     LF(mpn_mul_1,start_nc)  C continue in mpn_mul_1 past its carry clear
        !            57: EPILOGUE()
        !            58: C mpn_mul_1 -- multiply src,size by multiplier, store the low limbs at
        !            59: C dst,size and return the carry limb.  The running carry starts at zero.
        !            60: PROLOGUE(mpn_mul_1)
        !            61:        push    %esi                    C esi will hold the running carry limb
        !            62: deflit(`FRAME',4)
        !            63:        xorl    %esi, %esi      C initial carry
        !            64: C mpn_mul_1c jumps in here with its initial carry already in esi
        !            65: L(start_nc):
        !            66:        mov     PARAM_SIZE, %ecx        C ecx = size
        !            67:        push    %ebx
        !            68: FRAME_pushl()
        !            69:
        !            70:        movl    PARAM_SRC, %ebx         C ebx = src
        !            71:        push    %edi
        !            72: FRAME_pushl()
        !            73:
        !            74:        movl    PARAM_DST, %edi         C edi = dst
        !            75:        pushl   %ebp
        !            76: FRAME_pushl()
        !            77:
        !            78:        cmpl    $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae below
        !            79:        movl    PARAM_MULTIPLIER, %ebp  C ebp = multiplier (mov leaves flags alone)
        !            80:
        !            81:        jae     L(unroll)               C size at or above the threshold, unsigned
        !            82:
        !            83:
        !            84:        C code offset 0x22 here, close enough to aligned
        !            85: L(simple):
        !            86:        C eax   scratch
        !            87:        C ebx   src
        !            88:        C ecx   counter
        !            89:        C edx   scratch
        !            90:        C esi   carry
        !            91:        C edi   dst
        !            92:        C ebp   multiplier
        !            93:        C
        !            94:        C this loop 8 cycles/limb
        !            95:        C NOTE(review): size 0 would wrap ecx in the loop below; presumably callers pass size >= 1
        !            96:        movl    (%ebx), %eax            C eax = next src limb
        !            97:        addl    $4, %ebx
        !            98:
        !            99:        mull    %ebp                    C edx:eax = limb * multiplier
        !           100:
        !           101:        addl    %esi, %eax              C low half plus incoming carry
        !           102:        movl    $0, %esi                C zero esi, mov keeps the carry flag intact
        !           103:
        !           104:        adcl    %edx, %esi              C next carry = high half + carry flag
        !           105:
        !           106:        movl    %eax, (%edi)            C store result limb
        !           107:        addl    $4, %edi
        !           108:
        !           109:        loop    L(simple)               C decrement ecx, repeat while nonzero
        !           110:
        !           111:
        !           112:        popl    %ebp
        !           113:
        !           114:        popl    %edi
        !           115:        popl    %ebx
        !           116:
        !           117:        movl    %esi, %eax              C return the final carry limb
        !           118:        popl    %esi
        !           119:
        !           120:        ret
        !           121:
        !           122:
        !           123: C -----------------------------------------------------------------------------
        !           124: C The code for each limb is 6 cycles, with instruction decoding being the
        !           125: C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
        !           126: C cycles/limb in total.
        !           127: C
        !           128: C The secret ingredient to get 6.25 is to start the loop with the mul and
        !           129: C have the load/store pair at the end.  Rotating the load/store to the top
        !           130: C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
        !           131: C
        !           132: C The whole unrolled loop fits nicely in exactly 80 bytes.
        !           133:
        !           134:
        !           135:        ALIGN(16)       C already aligned to 16 here actually
        !           136: L(unroll):
        !           137:        movl    (%ebx), %eax            C eax = src[0] for the first mul of the loop
        !           138:        leal    -16(%ebx,%ecx,4), %ebx  C ebx = &src[size-4]
        !           139:
        !           140:        leal    -16(%edi,%ecx,4), %edi  C edi = &dst[size-4]
        !           141:        subl    $4, %ecx
        !           142:
        !           143:        negl    %ecx                    C ecx = -(size-4), a negative index counting up
        !           144:
        !           145:
        !           146:        ALIGN(16)       C one byte nop for this alignment
        !           147: L(top):
        !           148:        C eax   scratch
        !           149:        C ebx   &src[size-4]
        !           150:        C ecx   counter
        !           151:        C edx   scratch
        !           152:        C esi   carry
        !           153:        C edi   &dst[size-4]
        !           154:        C ebp   multiplier
        !           155:        C first limb of the pass, eax was loaded before entry or by the previous pass
        !           156:        mull    %ebp                    C edx:eax = limb * multiplier
        !           157:
        !           158:        addl    %esi, %eax              C low half plus incoming carry
        !           159:        movl    $0, %esi                C zero esi, mov keeps the carry flag intact
        !           160:
        !           161:        adcl    %edx, %esi              C next carry = high half + carry flag
        !           162:
        !           163:        movl    %eax, (%edi,%ecx,4)     C store result limb
        !           164:        movl    4(%ebx,%ecx,4), %eax    C fetch the following src limb
        !           165:
        !           166:        C second limb of the pass
        !           167:        mull    %ebp
        !           168:
        !           169:        addl    %esi, %eax
        !           170:        movl    $0, %esi
        !           171:
        !           172:        adcl    %edx, %esi
        !           173:
        !           174:        movl    %eax, 4(%edi,%ecx,4)
        !           175:        movl    8(%ebx,%ecx,4), %eax
        !           176:
        !           177:        C third limb of the pass
        !           178:        mull    %ebp
        !           179:
        !           180:        addl    %esi, %eax
        !           181:        movl    $0, %esi
        !           182:
        !           183:        adcl    %edx, %esi
        !           184:
        !           185:        movl    %eax, 8(%edi,%ecx,4)
        !           186:        movl    12(%ebx,%ecx,4), %eax
        !           187:
        !           188:        C fourth limb of the pass
        !           189:        mull    %ebp
        !           190:
        !           191:        addl    %esi, %eax
        !           192:        movl    $0, %esi
        !           193:
        !           194:        adcl    %edx, %esi
        !           195:
        !           196:        movl    %eax, 12(%edi,%ecx,4)
        !           197:        movl    16(%ebx,%ecx,4), %eax   C limb for the next pass or for the finish code
        !           198:
        !           199:
        !           200:        addl    $4, %ecx
        !           201:        js      L(top)                  C repeat while the index is still negative
        !           202:
        !           203:
        !           204:
        !           205:        C eax   next src limb
        !           206:        C ebx   &src[size-4]
        !           207:        C ecx   0 to 3 representing respectively 4 to 1 further limbs
        !           208:        C edx
        !           209:        C esi   carry
        !           210:        C edi   &dst[size-4]
        !           211:
        !           212:        testb   $2, %cl                 C ecx 2 or 3: fewer than three limbs remain
        !           213:        jnz     L(finish_not_two)       C so skip the two-limb step
        !           214:
        !           215:        mull    %ebp
        !           216:
        !           217:        addl    %esi, %eax
        !           218:        movl    $0, %esi
        !           219:
        !           220:        adcl    %edx, %esi
        !           221:
        !           222:        movl    %eax, (%edi,%ecx,4)
        !           223:        movl    4(%ebx,%ecx,4), %eax
        !           224:
        !           225:
        !           226:        mull    %ebp
        !           227:
        !           228:        addl    %esi, %eax
        !           229:        movl    $0, %esi
        !           230:
        !           231:        adcl    %edx, %esi
        !           232:
        !           233:        movl    %eax, 4(%edi,%ecx,4)
        !           234:        movl    8(%ebx,%ecx,4), %eax
        !           235:
        !           236:        addl    $2, %ecx                C account for the two limbs just done
        !           237: L(finish_not_two):
        !           238:        C ecx is 2 or 3 here, on either path
        !           239:
        !           240:        testb   $1, %cl                 C ecx 3: only the last limb remains
        !           241:        jnz     L(finish_not_one)
        !           242:
        !           243:        mull    %ebp
        !           244:
        !           245:        addl    %esi, %eax
        !           246:        movl    $0, %esi
        !           247:
        !           248:        adcl    %edx, %esi
        !           249:
        !           250:        movl    %eax, 8(%edi)           C dst[size-2]
        !           251:        movl    12(%ebx), %eax          C src[size-1]
        !           252: L(finish_not_one):
        !           253:        C last limb: its high half plus the carry flag becomes the return value
        !           254:
        !           255:        mull    %ebp
        !           256:
        !           257:        addl    %esi, %eax
        !           258:        popl    %ebp                    C multiplier is finished with, pop leaves flags for the adcl
        !           259:
        !           260:        adcl    $0, %edx                C edx = carry limb out of the top
        !           261:
        !           262:        movl    %eax, 12(%edi)          C dst[size-1]
        !           263:        popl    %edi
        !           264:
        !           265:        popl    %ebx
        !           266:        movl    %edx, %eax              C return value
        !           267:
        !           268:        popl    %esi
        !           269:
        !           270:        ret
        !           271:
        !           272: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>