[BACK]Return to copyd.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / k62mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyd.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
        !             2: dnl
        !             3: dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
        !             4: dnl  alignment.
        !             5:
        !             6:
        !             7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             8: dnl
        !             9: dnl  This file is part of the GNU MP Library.
        !            10: dnl
        !            11: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            12: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            13: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            14: dnl  License, or (at your option) any later version.
        !            15: dnl
        !            16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            19: dnl  Lesser General Public License for more details.
        !            20: dnl
        !            21: dnl  You should have received a copy of the GNU Lesser General Public
        !            22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            24: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            25:
        !            26:
        !            27: include(`../config.m4')
        !            28:
        !            29:
        !            30: dnl  K6-2 aligned:
        !            31: dnl  UNROLL_COUNT cycles/limb
        !            32: dnl        8          0.75
        !            33: dnl       16          0.625
        !            34: dnl       32          0.5625
        !            35: dnl       64          0.53
        !            36: dnl  Maximum possible with the current code is 64, the minimum is 2.  The UNROLL_BYTES 256 special case further down (esi and edi biased by 128) is presumably what serves 64 -- confirm against the UNROLL_BYTES definition.
        !            37:
        !            38: deflit(UNROLL_COUNT, 32)
        !            39:
        !            40:
        !            41: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
        !            42: C
        !            43: C Copy src,size to dst,size, processing limbs from high to low addresses.
        !            44: C Sizes of UNROLL_COUNT limbs or less, and src and dst of different alignments mod 8, are copied by rep movsl alone; anything else goes through the unrolled movq loop first.
        !            45: C The comments in copyi.asm apply here too.
        !            46:
        !            47:
        !            48: defframe(PARAM_SIZE,12)
        !            49: defframe(PARAM_SRC, 8)
        !            50: defframe(PARAM_DST, 4)
        !            51: deflit(`FRAME',0)
        !            52:
        !            53:        .text
        !            54:        ALIGN(32)
        !            55:
        !            56: PROLOGUE(mpn_copyd)
        !            57:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
        !            58:        movl    %esi, %eax              C eax = caller esi, put back before each ret
        !            59:
        !            60:        movl    PARAM_SRC, %esi
        !            61:        movl    %edi, %edx              C edx = caller edi, put back before each ret
        !            62:
        !            63:        std                             C movsl now steps esi and edi downwards
        !            64:
        !            65:        movl    PARAM_DST, %edi
        !            66:        cmpl    $UNROLL_COUNT, %ecx     C flags are consumed by the ja below, the leals in between leave them alone
        !            67:
        !            68:        leal    -4(%esi,%ecx,4), %esi   C esi = address of the highest src limb
        !            69:
        !            70:        leal    -4(%edi,%ecx,4), %edi   C edi = address of the highest dst limb
        !            71:        ja      L(unroll)               C taken when size > UNROLL_COUNT
        !            72:
        !            73: L(simple):
        !            74:        rep
        !            75:        movsl                           C copy ecx limbs, high to low
        !            76:
        !            77:        cld                             C clear the direction flag again before returning
        !            78:
        !            79:        movl    %eax, %esi
        !            80:        movl    %edx, %edi
        !            81:
        !            82:        ret                             C no movq has run on any path reaching here, so no femms
        !            83:
        !            84:
        !            85: L(unroll):
        !            86:        C if src and dst are different alignments mod8, then use rep movs
        !            87:        C if the highest limbs of src and dst are both at 0mod8 then copy one limb first, so that every movq address below is 0mod8
        !            88:
        !            89:        pushl   %ebx
        !            90:        leal    (%esi,%edi), %ebx       C bit 2 of the sum is set when esi and edi differ mod 8, assuming limb pointers are 4 byte aligned
        !            91:
        !            92:        testb   $4, %bl
        !            93:        popl    %ebx                    C popl does not disturb the testb flags
        !            94:
        !            95:        jnz     L(simple)               C ecx is still the full size here
        !            96:        testl   $4, %esi
        !            97:
        !            98:        leal    -UNROLL_COUNT(%ecx), %ecx       C bias the counter so the loop can end on its sign, testl flags untouched
        !            99:        jnz     L(already_aligned)      C esi at 4mod8 puts -4(%esi) at 0mod8
        !           100:
        !           101:        movsl                           C one limb, esi and edi each drop by 4
        !           102:
        !           103:        decl    %ecx
        !           104: L(already_aligned):
        !           105:
        !           106:
        !           107: ifelse(UNROLL_BYTES,256,`
        !           108:        subl    $128, %esi
        !           109:        subl    $128, %edi
        !           110: ')
        !           111:
        !           112:        C offset 0x3D here, but gets full speed without further alignment
        !           113: L(top):
        !           114:        C eax   saved esi
        !           115:        C ebx
        !           116:        C ecx   counter, limbs remaining minus UNROLL_COUNT
        !           117:        C edx   saved edi
        !           118:        C esi   src, decrementing
        !           119:        C edi   dst, decrementing
        !           120:        C ebp
        !           121:        C
        !           122:        C `disp' is never 0, so don't need to force 0(%esi).
        !           123:
        !           124: deflit(CHUNK_COUNT, 2)
        !           125: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           126:        deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
        !           127:        movq    disp(%esi), %mm0
        !           128:        movq    %mm0, disp(%edi)
        !           129: ')
        !           130:
        !           131:        leal    -UNROLL_BYTES(%esi), %esi
        !           132:        subl    $UNROLL_COUNT, %ecx     C sets the sign flag tested by jns, the leal after it keeps flags
        !           133:
        !           134:        leal    -UNROLL_BYTES(%edi), %edi
        !           135:        jns     L(top)
        !           136:
        !           137:
        !           138:        C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
        !           139:        C UNROLL_COUNT-1 limbs remaining
        !           140:
        !           141:        testb   $eval(UNROLL_COUNT/2), %cl
        !           142:
        !           143:        leal    UNROLL_COUNT(%ecx), %ecx        C ecx = limbs remaining, testb flags kept for the jz
        !           144:        jz      L(not_half)
        !           145:
        !           146:
        !           147:        C at an unroll count of 32 this block of code is 16 cycles faster than
        !           148:        C the rep movs, less 3 or 4 to test whether to do it
        !           149:
        !           150: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
        !           151:        deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
        !           152:        movq    disp(%esi), %mm0
        !           153:        movq    %mm0, disp(%edi)
        !           154: ')
        !           155:
        !           156:        subl    $eval(UNROLL_BYTES/2), %esi
        !           157:        subl    $eval(UNROLL_BYTES/2), %edi
        !           158:
        !           159:        subl    $eval(UNROLL_COUNT/2), %ecx
        !           160: L(not_half):
        !           161:
        !           162:
        !           163: ifelse(UNROLL_BYTES,256,`
        !           164:        addl    $128, %esi
        !           165:        addl    $128, %edi
        !           166: ')
        !           167:
        !           168:        rep
        !           169:        movsl                           C copy whatever limbs are still counted in ecx
        !           170:
        !           171:        cld                             C clear the direction flag again before returning
        !           172:
        !           173:        movl    %eax, %esi
        !           174:        movl    %edx, %edi
        !           175:
        !           176:        femms                           C mm0 was used by the movq copies above
        !           177:        ret
        !           178:
        !           179: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>