OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyd.asm - annotate

Return to copyd.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / k62mmx
Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyd.asm, Revision 1.1.1.1

1.1       maekawa     1: dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
                      2: dnl
                      3: dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
                      4: dnl  alignment.
                      5:
                      6:
                      7: dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      8: dnl
                      9: dnl  This file is part of the GNU MP Library.
                     10: dnl
                     11: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     12: dnl  modify it under the terms of the GNU Lesser General Public License as
                     13: dnl  published by the Free Software Foundation; either version 2.1 of the
                     14: dnl  License, or (at your option) any later version.
                     15: dnl
                     16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     19: dnl  Lesser General Public License for more details.
                     20: dnl
                     21: dnl  You should have received a copy of the GNU Lesser General Public
                     22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     24: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     25:
                     26:
                     27: include(`../config.m4')
                     28:
                     29:
                     30: dnl  K6-2 aligned:
                     31: dnl  UNROLL_COUNT cycles/limb
                     32: dnl        8          0.75
                     33: dnl       16          0.625
                     34: dnl       32          0.5625
                     35: dnl       64          0.53
                     36: dnl  Maximum possible with the current code is 64, the minimum is 2.
                     37: dnl  UNROLL_COUNT limbs are copied by each pass of the unrolled movq loop.
                     38: deflit(UNROLL_COUNT, 32)
                     39: dnl  UNROLL_BYTES (used below) is not defined in this file; presumably the included m4 derives it from UNROLL_COUNT -- TODO confirm.
                     40:
                     41: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
                     42: C
                     43: C Copy src,size to dst,size, processing limbs from high to low addresses.
                     44: C
                     45: C The comments in copyi.asm apply here too.
                     46: C Sizes up to UNROLL_COUNT, or src and dst of differing alignment mod 8, go
                     47: C through rep movsl; anything else uses the unrolled MMX movq loop.
                     48: defframe(PARAM_SIZE,12)
                     49: defframe(PARAM_SRC, 8)
                     50: defframe(PARAM_DST, 4)
                     51: deflit(`FRAME',0)
                     52: C The three PARAMs are only read at function entry, before anything is pushed.
                     53:        .text
                     54:        ALIGN(32)
                     55:
                     56: PROLOGUE(mpn_copyd)
                     57:        movl    PARAM_SIZE, %ecx
                     58:        movl    %esi, %eax
                     59:        C eax and edx keep the callers esi and edi until just before each ret
                     60:        movl    PARAM_SRC, %esi
                     61:        movl    %edi, %edx
                     62:        C direction flag set, so every movsl below steps esi and edi downwards
                     63:        std
                     64:
                     65:        movl    PARAM_DST, %edi
                     66:        cmpl    $UNROLL_COUNT, %ecx
                     67:        C aim esi and edi at the highest limb; leal leaves the cmpl flags for ja
                     68:        leal    -4(%esi,%ecx,4), %esi
                     69:
                     70:        leal    -4(%edi,%ecx,4), %edi
                     71:        ja      L(unroll)
                     72:        C fall through when size <= UNROLL_COUNT (unsigned compare)
                     73: L(simple):
                     74:        rep
                     75:        movsl
                     76:
                     77:        cld
                     78:        C give the caller back its esi and edi
                     79:        movl    %eax, %esi
                     80:        movl    %edx, %edi
                     81:        C no femms needed here, no movq has been executed on the way to this ret
                     82:        ret
                     83:
                     84:
                     85: L(unroll):
                     86:        C if src and dst are different alignments mod8, then use rep movs
                     87:        C otherwise, if esi-4 and edi-4 are not 0mod8, copy one limb so they are
                     88:        C bit 2 of esi+edi is set when esi and edi differ in bit 2 (assuming 4 byte aligned limbs)
                     89:        pushl   %ebx
                     90:        leal    (%esi,%edi), %ebx
                     91:        C ebx is scratch only; popl restores it without touching the testb flags
                     92:        testb   $4, %bl
                     93:        popl    %ebx
                     94:
                     95:        jnz     L(simple)
                     96:        testl   $4, %esi
                     97:        C ecx = size - UNROLL_COUNT; leal leaves the testl flags for jnz
                     98:        leal    -UNROLL_COUNT(%ecx), %ecx
                     99:        jnz     L(already_aligned)
                     100:        C bit 2 of esi clear: move the top limb alone, which steps esi and edi down 4
                     101:        movsl
                     102:
                     103:        decl    %ecx
                     104: L(already_aligned):
                     105:        C for 256 byte passes the pointers are biased by 128 and disp gets +128,
                     106:        C presumably so that every disp fits a signed byte -- TODO confirm
                     107: ifelse(UNROLL_BYTES,256,`
                     108:        subl    $128, %esi
                     109:        subl    $128, %edi
                     110: ')
                     111:
                     112:        C offset 0x3D here, but gets full speed without further alignment
                     113: L(top):
                     114:        C eax   saved esi
                     115:        C ebx
                     116:        C ecx   counter, limbs still to copy minus UNROLL_COUNT
                     117:        C edx   saved edi
                     118:        C esi   src, decrementing
                     119:        C edi   dst, decrementing
                     120:        C ebp
                     121:        C
                     122:        C `disp' is never 0, so don't need to force 0(%esi).
                     123:        C each movq pair copies CHUNK_COUNT limbs (8 bytes), highest addresses first
                     124: deflit(CHUNK_COUNT, 2)
                     125: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
                     126:        deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
                     127:        movq    disp(%esi), %mm0
                     128:        movq    %mm0, disp(%edi)
                     129: ')
                     130:        C step down one pass; the second leal does not disturb the subl flags for jns
                     131:        leal    -UNROLL_BYTES(%esi), %esi
                     132:        subl    $UNROLL_COUNT, %ecx
                     133:
                     134:        leal    -UNROLL_BYTES(%edi), %edi
                     135:        jns     L(top)
                     136:
                     137:
                     138:        C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
                     139:        C UNROLL_COUNT-1 limbs remaining
                     140:        C bit UNROLL_COUNT/2 of cl set means half a pass or more remains (power of 2 UNROLL_COUNT assumed)
                     141:        testb   $eval(UNROLL_COUNT/2), %cl
                     142:        C ecx = limbs remaining, 0 to UNROLL_COUNT-1; leal leaves the testb flags for jz
                     143:        leal    UNROLL_COUNT(%ecx), %ecx
                     144:        jz      L(not_half)
                     145:
                     146:
                     147:        C at an unroll count of 32 this block of code is 16 cycles faster than
                     148:        C the rep movs, less 3 or 4 to test whether to do it
                     149:
                     150: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
                     151:        deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
                     152:        movq    disp(%esi), %mm0
                     153:        movq    %mm0, disp(%edi)
                     154: ')
                     155:        C account for the half pass just copied
                     156:        subl    $eval(UNROLL_BYTES/2), %esi
                     157:        subl    $eval(UNROLL_BYTES/2), %edi
                     158:
                     159:        subl    $eval(UNROLL_COUNT/2), %ecx
                     160: L(not_half):
                     161:
                     162:        C remove the 128 bias again where it was applied
                     163: ifelse(UNROLL_BYTES,256,`
                     164:        addl    $128, %esi
                     165:        addl    $128, %edi
                     166: ')
                     167:        C the last ecx limbs, fewer than UNROLL_COUNT/2, go one at a time
                     168:        rep
                     169:        movsl
                     170:
                     171:        cld
                     172:        C give the caller back its esi and edi
                     173:        movl    %eax, %esi
                     174:        movl    %edx, %edi
                     175:        C mm0 was used above, so clear the MMX state before returning
                     176:        femms
                     177:        ret
                     178:
                     179: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>