[BACK]Return to copyd.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / k62mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyd.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
                      2:
1.1.1.2 ! ohara       3: dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24:
1.1.1.2 ! ohara      25: C K6-2: 1.0 cycles/limb
1.1       maekawa    26:
                     27:
                     28: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
                     29: C
1.1.1.2 ! ohara      30: C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
        !            31: C cycle startup time, which amounts for instance to a 2x speedup at 15
        !            32: C limbs.
1.1       maekawa    33: C
1.1.1.2 ! ohara      34: C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
        !            35: C processing one limb separately to make it aligned.  This and a final odd
        !            36: C limb are handled in a branch-free fashion, ending up re-copying if the
        !            37: C special case isn't needed.
        !            38: C
        !            39: C Alternatives:
        !            40: C
        !            41: C There used to be a big unrolled version of this, running at 0.56 c/l if
        !            42: C the destination was aligned, but that seemed rather excessive for the
        !            43: C relative importance of copyd.
        !            44: C
        !            45: C If the destination alignment is ignored and just left to run at 1.17 c/l
        !            46: C some code size and a fixed few cycles can be saved.  Considering how few
        !            47: C uses copyd finds, perhaps that should be favoured.  The current code has
        !            48: C the attraction of being no slower than a basic rep movsl though.
1.1       maekawa    49:
                     50: defframe(PARAM_SIZE,12)
                     51: defframe(PARAM_SRC, 8)
                     52: defframe(PARAM_DST, 4)
                     53:
1.1.1.2 ! ohara      54: dnl  re-using parameter space: SAVE_EBX is the PARAM_SIZE stack slot, which is free once size has been loaded into ecx
        !            55: define(SAVE_EBX,`PARAM_SIZE')
1.1       maekawa    56:
1.1.1.2 ! ohara      57:        TEXT
        !            58:        ALIGN(16)
1.1       maekawa    59:
1.1.1.2 ! ohara      60: PROLOGUE(mpn_copyd)
        !            61: deflit(`FRAME',0)
1.1       maekawa    62:
1.1.1.2 ! ohara      63:        movl    PARAM_SIZE, %ecx        C ecx = size
        !            64:        movl    %ebx, SAVE_EBX          C save ebx in the PARAM_SIZE slot, size is already in ecx
1.1       maekawa    65:
1.1.1.2 ! ohara      66:        movl    PARAM_SRC, %eax         C eax = src
        !            67:        movl    PARAM_DST, %edx         C edx = dst
1.1       maekawa    68:
1.1.1.2 ! ohara      69:        subl    $1, %ecx                C ecx = size-1, carry set if size was 0; better code alignment than decl
        !            70:        jb      L(zero)                 C size==0, nothing to copy
1.1       maekawa    71:
1.1.1.2 ! ohara      72:        jz      L(one_more)             C size==1, only the low limb to copy
        !            73:        leal    4(%edx,%ecx,4), %ebx    C ebx = dst + 4*size, address just past the high limb
1.1       maekawa    74:
1.1.1.2 ! ohara      75: Zdisp( movd,   0,(%eax,%ecx,4), %mm0)  C high limb
        !            76: Zdisp( movd,   %mm0, 0,(%edx,%ecx,4))  C Zdisp for good code alignment
1.1       maekawa    77:
1.1.1.2 ! ohara      78:        cmpl    $1, %ecx                C size==2 ?
        !            79:        je      L(one_more)             C yes: high limb is done, low limb is copied at one_more
1.1       maekawa    80:
1.1.1.2 ! ohara      81:        shrl    $2, %ebx                C move address bit 2 of dst+4*size down to bit 0
        !            82:        andl    $1, %ebx                C 1 if dst[size-2] unaligned
1.1       maekawa    83:
1.1.1.2 ! ohara      84:        subl    %ebx, %ecx              C start one limb lower if unaligned, so the first movq store is at 0mod8 (given limb-aligned dst)
        !            85:        nop                             C code alignment
1.1       maekawa    86:
                     87: L(top):
1.1.1.2 ! ohara      88:        C eax   src
1.1       maekawa    89:        C ebx   (not referenced inside the loop)
1.1.1.2 ! ohara      90:        C ecx   counter, index of the upper limb of the pair copied next
        !            91:        C edx   dst
1.1       maekawa    92:
1.1.1.2 ! ohara      93:        movq    -4(%eax,%ecx,4), %mm0   C load src[ecx-1] and src[ecx]
        !            94:        subl    $2, %ecx                C step down two limbs, sets the flags tested by ja
1.1       maekawa    95:
1.1.1.2 ! ohara      96:        movq    %mm0, 4(%edx,%ecx,4)    C store the pair at the same two indices in dst
        !            97:        ja      L(top)                  C repeat while the subl left the counter above zero with no borrow
1.1       maekawa    98:
                     99:
1.1.1.2 ! ohara     100: L(one_more):
        !           101:         movd    (%eax), %mm0           C low limb src[0]
        !           102:         movd    %mm0, (%edx)           C dst[0], a harmless re-copy when the loop already stored it
1.1       maekawa   103:
1.1.1.2 ! ohara     104:        movl    SAVE_EBX, %ebx          C restore ebx from the slot written on entry
        !           105:        emms_or_femms                   C macro defined outside this file; by its name an emms or femms ending MMX use
        !           106: L(zero):
1.1       maekawa   107:        ret                             C the size==0 path lands here with ebx unchanged and no MMX instruction executed
                    108:
                    109: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>