OpenXM_contrib/gmp/mpn/x86/p6/copyd.asm - annotate

Return to copyd.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / p6
Annotation of OpenXM_contrib/gmp/mpn/x86/p6/copyd.asm, Revision 1.1

1.1     ! ohara       1: dnl  Intel P6 mpn_copyd -- copy limb vector backwards.
        !             2:
        !             3: dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C P6: 1.75 cycles/limb, or 0.75 if no overlap
        !            26:
        !            27:
        !            28: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
        !            29: C
        !            30: C An explicit loop is used because a decrementing rep movsl is a bit slow at
        !            31: C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
        !            32: C code here stands a chance of being faster if the branches predict well.
        !            33: C
        !            34: C The slightly strange loop form seems necessary for the claimed speed.
        !            35: C Maybe load/store ordering affects it.
        !            36: C
        !            37: C The source and destination are checked to see if they're actually
        !            38: C overlapping, since it might be possible to use an incrementing rep movsl
        !            39: C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
        !            40: C version.)
        !            41: C
        !            42: C Enhancements:
        !            43: C
        !            44: C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
        !            45: C one store each cycle.  Unrolling the loop below would approach 1.0, but
        !            46: C it'd be good to know why something like store/load/subl + store/load/jnz
        !            47: C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
        !            48: C cycles, but doesn't run that way.
        !            49:
        !            50: defframe(PARAM_SIZE,12)
        !            51: defframe(PARAM_SRC, 8)
        !            52: defframe(PARAM_DST, 4)
        !            53:
        !            54: dnl  re-using parameter space: mpn_copyd loads each parameter before saving a register into its slot
        !            55: define(SAVE_ESI,`PARAM_SIZE')
        !            56: define(SAVE_EDI,`PARAM_SRC')
        !            57:
        !            58:        TEXT
        !            59:        ALIGN(16)
        !            60:
        !            61: PROLOGUE(mpn_copyd)
        !            62: deflit(`FRAME',0)
        !            63:
        !            64:        movl    PARAM_SIZE, %ecx
        !            65:
        !            66:        movl    %esi, SAVE_ESI
        !            67:        movl    PARAM_SRC, %esi
        !            68:
        !            69:        movl    %edi, SAVE_EDI
        !            70:        movl    PARAM_DST, %edi
        !            71:
        !            72:        subl    $1, %ecx
        !            73:        jb      L(zero)
        !            74:
        !            75:        movl    (%esi,%ecx,4), %eax             C src[size-1]
        !            76:        jz      L(one)
        !            77:
        !            78:        movl    -4(%esi,%ecx,4), %edx           C src[size-2]
        !            79:        subl    $2, %ecx
        !            80:        jbe     L(done_loop)                    C 2 or 3 limbs only
        !            81:
        !            82:
        !            83:        C The usual overlap is
        !            84:        C
        !            85:        C     high                   low
        !            86:        C     +------------------+
        !            87:        C     |               dst|
        !            88:        C     +------------------+
        !            89:        C           +------------------+
        !            90:        C           |               src|
        !            91:        C           +------------------+
        !            92:        C
        !            93:        C We can use an incrementing copy in the following circumstances.
        !            94:        C
        !            95:        C     src+4*size<=dst, since then the regions are disjoint
        !            96:        C
        !            97:        C     src==dst, clearly (though this shouldn't occur normally)
        !            98:        C
        !            99:        C     src>dst, since in that case it's a requirement of the
        !           100:        C              parameters that src>=dst+size*4, and hence the
        !           101:        C              regions are disjoint
        !           102:        C
        !           103:
        !           104:        leal    12(%esi,%ecx,4), %edx   C src+4*size, since ecx is size-3
        !           105:        cmpl    %edi, %esi
        !           106:        jae     L(use_movsl)            C src >= dst
        !           107:
        !           108:        cmpl    %edi, %edx
        !           109:        movl    4(%esi,%ecx,4), %edx    C src[size-2] again
        !           110:        jbe     L(use_movsl)            C src+4*size <= dst
        !           111:
        !           112:
        !           113: L(top):
        !           114:        C eax   prev high limb
        !           115:        C ebx
        !           116:        C ecx   counter, size-3 down to 0 or -1, inclusive, by 2s
        !           117:        C edx   prev low limb
        !           118:        C esi   src
        !           119:        C edi   dst
        !           120:        C ebp
        !           121:
        !           122:        movl    %eax, 8(%edi,%ecx,4)
        !           123:        movl    (%esi,%ecx,4), %eax
        !           124:
        !           125:        movl    %edx, 4(%edi,%ecx,4)
        !           126:        movl    -4(%esi,%ecx,4), %edx
        !           127:
        !           128:        subl    $2, %ecx
        !           129:        jnbe    L(top)
        !           130:
        !           131:
        !           132: L(done_loop):
        !           133:        movl    %eax, 8(%edi,%ecx,4)
        !           134:        movl    %edx, 4(%edi,%ecx,4)
        !           135:
        !           136:        C copy low limb (needed if size was odd, but will already have been
        !           137:        C done in the loop if size was even)
        !           138:        movl    (%esi), %eax
        !           139: L(one):
        !           140:        movl    %eax, (%edi)
        !           141:        movl    SAVE_EDI, %edi
        !           142:        movl    SAVE_ESI, %esi
        !           143:
        !           144:        ret
        !           145:
        !           146:
        !           147: L(use_movsl):
        !           148:        C eax
        !           149:        C ebx
        !           150:        C ecx   size-3
        !           151:        C edx
        !           152:        C esi   src
        !           153:        C edi   dst
        !           154:        C ebp
        !           155:
        !           156:        addl    $3, %ecx
        !           157:
        !           158:        cld             C better safe than sorry, see mpn/x86/README
        !           159:
        !           160:        rep
        !           161:        movsl
        !           162:
        !           163: L(zero):
        !           164:        movl    SAVE_ESI, %esi
        !           165:        movl    SAVE_EDI, %edi
        !           166:
        !           167:        ret
        !           168:
        !           169: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>