[BACK]Return to copyi.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / k62mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyi.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
        !             2: dnl
        !             3: dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
        !             4: dnl  alignment.
        !             5:
        !             6:
        !             7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
        !             8: dnl
        !             9: dnl  This file is part of the GNU MP Library.
        !            10: dnl
        !            11: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            12: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            13: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            14: dnl  License, or (at your option) any later version.
        !            15: dnl
        !            16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            19: dnl  Lesser General Public License for more details.
        !            20: dnl
        !            21: dnl  You should have received a copy of the GNU Lesser General Public
        !            22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            24: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            25:
        !            26:
        !            27: include(`../config.m4')
        !            28:
        !            29:
        !            30: dnl  K6-2 aligned:
        !            31: dnl  UNROLL_COUNT cycles/limb
        !            32: dnl        8          0.75
        !            33: dnl       16          0.625
        !            34: dnl       32          0.5625
        !            35: dnl       64          0.53
        !            36: dnl  Maximum possible with the current code is 64, the minimum is 2.
        !            37:
        !              : dnl  UNROLL_COUNT is the number of limbs moved per pass of the unrolled MMX
        !              : dnl  loop below, and also the size above which that loop is attempted.
        !            38: deflit(UNROLL_COUNT, 32)
        !            39:
        !            40:
        !            41: C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
        !            42: C
        !            43: C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
        !            44: C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
        !            45: C used instead.
        !            46: C
        !            47: C         mod8
        !            48: C      src  dst
        !            49: C       0    0    both aligned, use mmx
        !            50: C       0    4    unaligned, use rep movs
        !            51: C       4    0    unaligned, use rep movs
        !            52: C       4    4    do one movs, then both aligned, use mmx
        !            53: C
        !            54: C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
        !            55: C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
        !            56: C
        !            57: C A pattern of two movq loads and two movq stores (or four and four) was
        !            58: C tried, but found to be the same speed as just one of each.
        !            59: C
        !            60: C Note that this code only suits K6-2 and K6-3.  Plain K6 does only one mmx
        !            61: C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
        !            62: C movs.
        !            63: C
        !            64: C Enhancement:
        !            65: C
        !            66: C Addressing modes like disp(%esi,%ecx,4) aren't currently used.  They'd
        !            67: C make it possible to avoid incrementing %esi and %edi in the loop and hence
        !            68: C get loop overhead down to 1 cycle.  Care would be needed to avoid bad
        !            69: C cache line crossings since the "movq"s would then be 5 code bytes rather
        !            70: C than 4.
        !            71:
        !            72:
        !              : C Argument slots for the prototype above: dst, src, size at 4, 8, 12.
        !              : C Presumably these are offsets from %esp at entry, with FRAME tracking
        !              : C pushes -- confirm against the defframe definition.
        !            73: defframe(PARAM_SIZE,12)
        !            74: defframe(PARAM_SRC, 8)
        !            75: defframe(PARAM_DST, 4)
        !            76: deflit(`FRAME',0)
        !            77:
        !            78:        .text
        !            79:        ALIGN(32)
        !            80:
        !            81: PROLOGUE(mpn_copyi)
        !              :        C %esi and %edi are parked in %eax and %edx instead of on the
        !              :        C stack, and are put back before each ret.
        !            82:        movl    PARAM_SIZE, %ecx
        !            83:        movl    %esi, %eax
        !            84:
        !            85:        movl    PARAM_SRC, %esi
        !            86:        movl    %edi, %edx
        !            87:
        !              :        C clear the direction flag so movsl steps upwards
        !            88:        cld
        !            89:
        !            90:        movl    PARAM_DST, %edi
        !            91:        cmpl    $UNROLL_COUNT, %ecx
        !            92:
        !              :        C unsigned compare: only sizes above UNROLL_COUNT try the MMX
        !              :        C path, anything smaller falls through to the plain string copy
        !            93:        ja      L(unroll)
        !            94:
        !            95: L(simple):
        !              :        C Copy %ecx limbs with rep movsl.  Also reached from L(unroll)
        !              :        C when src and dst differ mod 8; %ecx still holds the full size
        !              :        C at that point.  No MMX instruction has run on this path, so
        !              :        C there is no femms before the ret.
        !            96:        rep
        !            97:        movsl
        !            98:
        !            99:        movl    %eax, %esi
        !           100:        movl    %edx, %edi
        !           101:
        !           102:        ret
        !           103:
        !           104:
        !           105: L(unroll):
        !           106:        C if src and dst are different alignments mod8, then use rep movs
        !           107:        C if src and dst are both 4mod8 then process one limb to get 0mod8
        !              :        C
        !              :        C Bit 2 of src+dst is set exactly when the two pointers differ
        !              :        C mod 8, assuming both are multiples of 4.  %ebx is only borrowed
        !              :        C for that sum; popl does not alter the flags set by testb, so
        !              :        C the jnz still sees the result of the test.
        !           108:
        !           109:        pushl   %ebx
        !           110:        leal    (%esi,%edi), %ebx
        !           111:
        !           112:        testb   $4, %bl
        !           113:        popl    %ebx
        !           114:
        !           115:        jnz     L(simple)
        !           116:        testl   $4, %esi
        !           117:
        !              :        C Bias the counter by -UNROLL_COUNT so the main loop can exit on
        !              :        C the sign flag.  leal is used because it leaves the flags from
        !              :        C testl in place for the jz.
        !           118:        leal    -UNROLL_COUNT(%ecx), %ecx
        !           119:        jz      L(already_aligned)
        !           120:
        !              :        C both pointers are 4mod8: move one limb to reach 0mod8.  The
        !              :        C size was above UNROLL_COUNT, so the biased counter is still
        !              :        C non-negative after the decrement.
        !           121:        decl    %ecx
        !           122:
        !           123:        movsl
        !           124: L(already_aligned):
        !           125:
        !           126:
        !              :        C At 256 bytes per pass the pointers are advanced by 128 and the
        !              :        C displacements below are reduced by 128, giving a range of -128
        !              :        C to +120 (presumably so each still fits a signed byte --
        !              :        C confirm).  UNROLL_BYTES is not defined in this file; presumably
        !              :        C it is derived from UNROLL_COUNT by the included m4 definitions.
        !           127: ifelse(UNROLL_BYTES,256,`
        !           128:        addl    $128, %esi
        !           129:        addl    $128, %edi
        !           130: ')
        !           131:
        !           132:        C this is offset 0x34, no alignment needed
        !           133: L(top):
        !           134:        C eax   saved esi
        !           135:        C ebx
        !           136:        C ecx   counter, limbs
        !           137:        C edx   saved edi
        !           138:        C esi   src, incrementing
        !           139:        C edi   dst, incrementing
        !           140:        C ebp
        !           141:        C
        !              :        C mm0 carries each CHUNK_COUNT-limb piece from src to dst.
        !              :        C
        !           142:        C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
        !           143:        C 0(%edi) keeps code aligned to 16 byte boundaries.
        !           144:
        !           145: deflit(CHUNK_COUNT, 2)
        !           146: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        !           147:        deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        !           148: Zdisp( movq,   disp,(%esi), %mm0)
        !           149: Zdisp( movq,   %mm0, disp,(%edi))
        !           150: ')
        !           151:
        !              :        C Advance both pointers by one pass.  subl sets the sign flag
        !              :        C tested by jns; %edi is advanced with leal so that flag
        !              :        C survives until the branch.
        !           152:        addl    $UNROLL_BYTES, %esi
        !           153:        subl    $UNROLL_COUNT, %ecx
        !           154:
        !           155:        leal    UNROLL_BYTES(%edi), %edi
        !           156:        jns     L(top)
        !           157:
        !           158:
        !           159:        C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
        !           160:        C UNROLL_COUNT-1 limbs remaining
        !              :        C
        !              :        C Assuming UNROLL_COUNT is a power of 2, the low bits of %ecx
        !              :        C equal the remaining count, so the UNROLL_COUNT/2 bit of %cl is
        !              :        C set exactly when at least half a pass is still to be copied.
        !              :        C leal then removes the bias without changing the flags for jz.
        !           161:
        !           162:        testb   $eval(UNROLL_COUNT/2), %cl
        !           163:
        !           164:        leal    UNROLL_COUNT(%ecx), %ecx
        !           165:        jz      L(not_half)
        !           166:
        !           167:        C at an unroll count of 32 this block of code is 16 cycles faster than
        !           168:        C the rep movs, less 3 or 4 to test whether to do it
        !              :        C
        !              :        C copies UNROLL_COUNT/2 limbs with movq, then steps the pointers
        !              :        C and the counter past them
        !           169:
        !           170: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
        !           171:        deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        !           172:        movq    disp(%esi), %mm0
        !           173:        movq    %mm0, disp(%edi)
        !           174: ')
        !           175:        addl    $eval(UNROLL_BYTES/2), %esi
        !           176:        addl    $eval(UNROLL_BYTES/2), %edi
        !           177:
        !           178:        subl    $eval(UNROLL_COUNT/2), %ecx
        !           179: L(not_half):
        !           180:
        !           181:
        !              :        C undo the 128 byte pointer bias applied before L(top)
        !           182: ifelse(UNROLL_BYTES,256,`
        !           183:        subl    $128, %esi
        !           184:        subl    $128, %edi
        !           185: ')
        !           186:
        !              :        C %ecx is now the true count of limbs left, below UNROLL_COUNT/2;
        !              :        C finish them with the string copy
        !           187:        rep
        !           188:        movsl
        !           189:
        !           190:        movl    %eax, %esi
        !           191:        movl    %edx, %edi
        !           192:
        !              :        C MMX registers were used on this path, so clear the MMX state
        !              :        C with femms before returning
        !           193:        femms
        !           194:        ret
        !           195:
        !           196: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>