[BACK]Return to copyi.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium

Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/copyi.asm, Revision 1.1

1.1     ! ohara       1: dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
        !             2:
        !             3: dnl  Copyright 1996, 2001, 2002 Free Software Foundation, Inc.
        !             4: dnl
        !             5: dnl  This file is part of the GNU MP Library.
        !             6: dnl
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             8: dnl  modify it under the terms of the GNU Lesser General Public License as
        !             9: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            10: dnl  License, or (at your option) any later version.
        !            11: dnl
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            15: dnl  Lesser General Public License for more details.
        !            16: dnl
        !            17: dnl  You should have received a copy of the GNU Lesser General Public
        !            18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            20: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24:
        !            25: C P5: 1.25 cycles/limb
        !            26:
        !            27:
        !            28: C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
        !            29: C
        !            30: C Destination prefetching is done to avoid repeated write-throughs on lines
        !            31: C not already in L1.
        !            32: C
        !            33: C At least one of the src or dst pointer needs to be incremented rather than
        !            34: C using indexing, so that there's somewhere to put the loop control without
        !            35: C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
        !            36: C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
        !            37: C subtracts in the finishup code.
        !            38: C
        !            39: C The block of finishup code is almost as big as the main loop itself, which
        !            40: C is unfortunate, but it's faster that way than with say rep movsl, by about
        !            41: C 10 cycles for instance on P55.
        !            42: C
        !            43: C There's nothing to be gained from MMX on P55, since it can do only one
        !            44: C movq load (or store) per cycle, so the throughput would be the same as the
        !            45: C code here (and even then only if src and dst have the same alignment mod
        !            46: C 8).
        !            47:
        !            48: defframe(PARAM_SIZE,12)
        !            49: defframe(PARAM_SRC, 8)
        !            50: defframe(PARAM_DST, 4)
        !            51:
        !            52: PROLOGUE(mpn_copyi)
        !            53: deflit(`FRAME',0)
        !            54: C Copy size limbs from src to dst, in order of increasing address.
        !            55:        movl    PARAM_SIZE, %ecx        C ecx = size
        !            56:        movl    PARAM_DST, %edx         C edx = dst
        !            57:
        !            58:        pushl   %ebx    FRAME_pushl()
        !            59:        pushl   %esi    FRAME_pushl()
        !            60:
        !            61:        leal    (%edx,%ecx,4), %edx     C &dst[size], one past the end of dst
        !            62:        xorl    $-1, %ecx               C -size-1
        !            63:
        !            64:        movl    PARAM_SRC, %esi         C esi = src
        !            65:        addl    $8, %ecx                C -size+7
        !            66:
        !            67:        jns     L(end)                  C size <= 7, no full block of 8 limbs
        !            68:
        !            69:        movl    -28(%edx,%ecx,4), %eax  C fetch destination cache line, dst[0]
        !            70:        nop
        !            71:
        !            72: L(top):
        !            73:        C eax   scratch
        !            74:        C ebx   scratch
        !            75:        C ecx   counter, limbs, negative (7 minus limbs remaining)
        !            76:        C edx   &dst[size], one past the end of dst
        !            77:        C esi   src, incrementing
        !            78:        C edi
        !            79:        C ebp
        !            80:
        !            81:        movl    (%edx,%ecx,4), %eax     C fetch destination cache line (last limb of this block, value discarded)
        !            82:        addl    $8, %ecx                C sets the sign flag tested by js at the loop end
        !            83:
        !            84:        movl    (%esi), %eax            C read words pairwise
        !            85:        movl    4(%esi), %ebx
        !            86:        movl    %eax, -60(%edx,%ecx,4)  C store words pairwise
        !            87:        movl    %ebx, -56(%edx,%ecx,4)
        !            88:
        !            89:        movl    8(%esi), %eax
        !            90:        movl    12(%esi), %ebx
        !            91:        movl    %eax, -52(%edx,%ecx,4)
        !            92:        movl    %ebx, -48(%edx,%ecx,4)
        !            93:
        !            94:        movl    16(%esi), %eax
        !            95:        movl    20(%esi), %ebx
        !            96:        movl    %eax, -44(%edx,%ecx,4)
        !            97:        movl    %ebx, -40(%edx,%ecx,4)
        !            98:
        !            99:        movl    24(%esi), %eax
        !           100:        movl    28(%esi), %ebx
        !           101:        movl    %eax, -36(%edx,%ecx,4)
        !           102:        movl    %ebx, -32(%edx,%ecx,4)
        !           103:
        !           104:        leal    32(%esi), %esi          C advance src by 8 limbs, leal leaves the flags alone
        !           105:        js      L(top)                  C loop while at least 8 limbs remain
        !           106:
        !           107:
        !           108: L(end):
        !           109:        C ecx   0 to 7, representing respectively 7 to 0 limbs remaining
        !           110:        C esi   src, next location to read
        !           111:        C edx   &dst[size], one past the end of dst
        !           112:
        !           113:        subl    $4, %ecx                C ecx = 3 - remaining
        !           114:        jns     L(no4)                  C fewer than 4 limbs remaining
        !           115:
        !           116:        movl    (%esi), %eax
        !           117:        movl    4(%esi), %ebx
        !           118:        movl    %eax, -12(%edx,%ecx,4)
        !           119:        movl    %ebx, -8(%edx,%ecx,4)
        !           120:
        !           121:        movl    8(%esi), %eax
        !           122:        movl    12(%esi), %ebx
        !           123:        movl    %eax, -4(%edx,%ecx,4)
        !           124:        movl    %ebx, (%edx,%ecx,4)
        !           125:
        !           126:        addl    $16, %esi               C advance src by 4 limbs
        !           127:        addl    $4, %ecx                C ecx = 3 - remaining again, now that 4 limbs are copied
        !           128: L(no4):
        !           129:
        !           130:        subl    $2, %ecx                C ecx = 1 - remaining
        !           131:        jns     L(no2)                  C fewer than 2 limbs remaining
        !           132:
        !           133:        movl    (%esi), %eax
        !           134:        movl    4(%esi), %ebx
        !           135:        movl    %eax, -4(%edx,%ecx,4)
        !           136:        movl    %ebx, (%edx,%ecx,4)
        !           137:
        !           138:        addl    $8, %esi                C advance src by 2 limbs
        !           139:        addl    $2, %ecx                C ecx = 1 - remaining, must come last as it sets the zero flag for jnz
        !           140: L(no2):
        !           141:
        !           142:        jnz     L(done)                 C zero flag is set only when exactly 1 limb remains
        !           143:
        !           144:        movl    (%esi), %eax
        !           145:        movl    %eax, -4(%edx,%ecx,4)   C risk of cache bank clash here
        !           146:
        !           147: L(done):
        !           148:        popl    %esi                    C restore the registers pushed on entry
        !           149:        popl    %ebx
        !           150:
        !           151:        ret
        !           152:
        !           153: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>