OpenXM_contrib/gmp/mpn/x86/k7/mmx/copyi.asm - annotate

Return to copyi.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/copyi.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
        !             2: dnl
        !             3: dnl     alignment dst/src, A=0mod8 N=4mod8
        !             4: dnl        A/A   A/N   N/A   N/N
        !             5: dnl  K7    0.75  1.0   1.0   0.75
        !             6:
        !             7:
        !             8: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             9: dnl
        !            10: dnl  This file is part of the GNU MP Library.
        !            11: dnl
        !            12: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            13: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            14: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            15: dnl  License, or (at your option) any later version.
        !            16: dnl
        !            17: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            18: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            19: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            20: dnl  Lesser General Public License for more details.
        !            21: dnl
        !            22: dnl  You should have received a copy of the GNU Lesser General Public
        !            23: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            24: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            25: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            26:
        !            27:
        !            28: include(`../config.m4')
        !            29:
        !            30:
        !            31: C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
        !            32: C
        !            33: C Copy src,size to dst,size.
        !            34: C
        !            35: C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
        !            36: C 1.33 c/l.
        !            37: C
        !            38: C The K7 can do two loads, or two stores, or a load and a store, in one
        !            39: C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
        !            40: C however nothing under 0.7 c/l is known.
        !            41: C
        !            42: C If both source and destination are unaligned then one limb is processed at
        !            43: C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
        !            44: C used unaligned it would be 1.5 c/l.
        !            45:
        !            46: defframe(PARAM_SIZE,12)
        !            47: defframe(PARAM_SRC, 8)
        !            48: defframe(PARAM_DST, 4)
        !            49:
        !            50: dnl  parameter space reused
        !            51: define(SAVE_EBX,`PARAM_SIZE')
        !            52:
        !            53: dnl  minimum 5: the unrolled code may copy 1 limb to align and then always copies 4 more
        !            54: deflit(UNROLL_THRESHOLD, 5)
        !            55:
        !            56:        .text
        !            57:        ALIGN(32)
        !            58: PROLOGUE(mpn_copyi)
        !            59: deflit(`FRAME',0)
        !            60:        C sizes below UNROLL_THRESHOLD use the one limb loop, others go to L(unroll)
        !            61:        movl    PARAM_SIZE, %ecx
        !            62:        movl    %ebx, SAVE_EBX          C size is in ecx now, so its stack slot holds ebx
        !            63:
        !            64:        movl    PARAM_SRC, %eax
        !            65:        movl    PARAM_DST, %edx
        !            66:
        !            67:        cmpl    $UNROLL_THRESHOLD, %ecx
        !            68:        jae     L(unroll)               C unsigned test, size >= threshold
        !            69:
        !            70:        orl     %ecx, %ecx              C set flags from size
        !            71:        jz      L(simple_done)          C size zero, nothing to copy
        !            72:
        !            73: L(simple):
        !            74:        C eax   src, incrementing
        !            75:        C ebx   scratch
        !            76:        C ecx   counter
        !            77:        C edx   dst, incrementing
        !            78:        C
        !            79:        C this loop is 2 cycles/limb
        !            80:
        !            81:        movl    (%eax), %ebx
        !            82:        movl    %ebx, (%edx)
        !            83:        decl    %ecx                    C zero flag from here is tested by the jnz
        !            84:        leal    4(%eax), %eax           C leal advances the pointers without changing flags
        !            85:        leal    4(%edx), %edx
        !            86:        jnz     L(simple)
        !            87:
        !            88: L(simple_done):
        !            89:        movl    SAVE_EBX, %ebx
        !            90:        ret
        !            91:
        !            92:
        !            93: L(unroll):
        !            94:        movl    %eax, %ebx              C copy of src for the alignment test
        !            95:        leal    -12(%eax,%ecx,4), %eax  C src end - 12
        !            96:        subl    $3, %ecx                C size-3
        !            97:
        !            98:        andl    %edx, %ebx              C bit 2 survives only if set in both src and dst
        !            99:        leal    (%edx,%ecx,4), %edx     C dst end - 12
        !           100:        negl    %ecx                    C -(size-3), counts upwards
        !           101:
        !           102:        testl   $4, %ebx   C testl to pad code closer to 16 bytes for L(top)
        !           103:        jz      L(aligned)
        !           104:
        !           105:        C both src and dst unaligned, process one limb to align them
        !           106:        movl    (%eax,%ecx,4), %ebx     C this address is the start of src
        !           107:        movl    %ebx, (%edx,%ecx,4)     C and this the start of dst
        !           108:        incl    %ecx
        !           109: L(aligned):
        !           110:
        !           111:
        !           112:        ALIGN(16)
        !           113: L(top):
        !           114:        C eax   src end - 12
        !           115:        C ebx   (not used in this loop)
        !           116:        C ecx   counter, negative, limbs
        !           117:        C edx   dst end - 12
        !           118:
        !           119:        movq    (%eax,%ecx,4), %mm0
        !           120:        movq    8(%eax,%ecx,4), %mm1
        !           121:        addl    $4, %ecx                C carries out once the counter reaches 0 to 3
        !           122:        movq    %mm0, -16(%edx,%ecx,4)  C -16 compensates for the addl above
        !           123:        movq    %mm1, -16+8(%edx,%ecx,4)
        !           124:        ja      L(top)          C jump no carry and not zero, flags are from the addl
        !           125:
        !           126:
        !           127:        C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
        !           128:
        !           129:        testb   $2, %cl                 C bit 1 set means only 1 or 0 limbs remain
        !           130:        jnz     L(finish_not_two)
        !           131:
        !           132:        movq    (%eax,%ecx,4), %mm0     C copy a pair, 3 or 2 limbs remained
        !           133:        movq    %mm0, (%edx,%ecx,4)
        !           134: L(finish_not_two):
        !           135:
        !           136:        testb   $1, %cl                 C bit 0 set means no odd limb is left
        !           137:        jnz     L(done)
        !           138:
        !           139:        movl    8(%eax), %ebx           C last limb, at src end - 4
        !           140:        movl    %ebx, 8(%edx)
        !           141:
        !           142: L(done):
        !           143:        movl    SAVE_EBX, %ebx
        !           144:        emms                            C clear MMX state before returning
        !           145:        ret
        !           146:
        !           147: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>