[BACK]Return to copyd.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / k62mmx

Diff for /OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm between version 1.1.1.1 and 1.1.1.2

version 1.1.1.1, 2000/09/09 14:12:42 version 1.1.1.2, 2003/08/25 16:06:28
Line 1 
Line 1 
 dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.  dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
 dnl  
 dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data  
 dnl  alignment.  
   
   dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
 dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.  
 dnl  dnl
 dnl  This file is part of the GNU MP Library.  dnl  This file is part of the GNU MP Library.
 dnl  dnl
Line 23  dnl  License along with the GNU MP Library; see the fi
Line 19  dnl  License along with the GNU MP Library; see the fi
 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -  dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
 dnl  Suite 330, Boston, MA 02111-1307, USA.  dnl  Suite 330, Boston, MA 02111-1307, USA.
   
   
 include(`../config.m4')  include(`../config.m4')
   
   
 dnl  K6-2 aligned:  C K6-2: 1.0 cycles/limb
 dnl  UNROLL_COUNT cycles/limb  
 dnl        8          0.75  
 dnl       16          0.625  
 dnl       32          0.5625  
 dnl       64          0.53  
 dnl  Maximum possible with the current code is 64, the minimum is 2.  
   
 deflit(UNROLL_COUNT, 32)  
   
   
 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);  C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
 C  C
 C Copy src,size to dst,size, processing limbs from high to low addresses.  C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
   C cycle startup time, which amounts for instance to a 2x speedup at 15
   C limbs.
 C  C
 C The comments in copyi.asm apply here too.  C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
   C processing one limb separately to make it aligned.  This and a final odd
   C limb are handled in a branch-free fashion, ending up re-copying if the
   C special case isn't needed.
   C
   C Alternatives:
   C
   C There used to be a big unrolled version of this, running at 0.56 c/l if
   C the destination was aligned, but that seemed rather excessive for the
   C relative importance of copyd.
   C
   C If the destination alignment is ignored and just left to run at 1.17 c/l
   C some code size and a fixed few cycles can be saved.  Considering how few
   C uses copyd finds perhaps that should be favoured.  The current code has
   C the attraction of being no slower than a basic rep movsl though.
   
   
 defframe(PARAM_SIZE,12)  defframe(PARAM_SIZE,12)
 defframe(PARAM_SRC, 8)  defframe(PARAM_SRC, 8)
 defframe(PARAM_DST, 4)  defframe(PARAM_DST, 4)
 deflit(`FRAME',0)  
   
         .text  dnl  re-using parameter space
         ALIGN(32)  define(SAVE_EBX,`PARAM_SIZE')
   
           TEXT
           ALIGN(16)
   
 PROLOGUE(mpn_copyd)  PROLOGUE(mpn_copyd)
   deflit(`FRAME',0)
   
         movl    PARAM_SIZE, %ecx          movl    PARAM_SIZE, %ecx
         movl    %esi, %eax          movl    %ebx, SAVE_EBX
   
         movl    PARAM_SRC, %esi          movl    PARAM_SRC, %eax
         movl    %edi, %edx          movl    PARAM_DST, %edx
   
         std          subl    $1, %ecx                C better code alignment than decl
           jb      L(zero)
   
         movl    PARAM_DST, %edi          jz      L(one_more)
         cmpl    $UNROLL_COUNT, %ecx          leal    4(%edx,%ecx,4), %ebx
   
         leal    -4(%esi,%ecx,4), %esi  Zdisp(  movd,   0,(%eax,%ecx,4), %mm0)  C high limb
   Zdisp(  movd,   %mm0, 0,(%edx,%ecx,4))  C Zdisp for good code alignment
   
         leal    -4(%edi,%ecx,4), %edi          cmpl    $1, %ecx
         ja      L(unroll)          je      L(one_more)
   
 L(simple):          shrl    $2, %ebx
         rep          andl    $1, %ebx                C 1 if dst[size-2] unaligned
         movsl  
   
         cld          subl    %ebx, %ecx
           nop                             C code alignment
   
         movl    %eax, %esi  
         movl    %edx, %edi  
   
         ret  
   
   
 L(unroll):  
         C if src and dst are different alignments mod8, then use rep movs  
         C if src and dst are both 4mod8 then process one limb to get 0mod8  
   
         pushl   %ebx  
         leal    (%esi,%edi), %ebx  
   
         testb   $4, %bl  
         popl    %ebx  
   
         jnz     L(simple)  
         testl   $4, %esi  
   
         leal    -UNROLL_COUNT(%ecx), %ecx  
         jnz     L(already_aligned)  
   
         movsl  
   
         decl    %ecx  
 L(already_aligned):  
   
   
 ifelse(UNROLL_BYTES,256,`  
         subl    $128, %esi  
         subl    $128, %edi  
 ')  
   
         C offset 0x3D here, but gets full speed without further alignment  
 L(top):  L(top):
         C eax   saved esi          C eax   src
         C ebx          C ebx
         C ecx   counter, limbs          C ecx   counter
         C edx   saved edi          C edx   dst
         C esi   src, incrementing  
         C edi   dst, incrementing  
         C ebp  
         C  
         C `disp' is never 0, so don't need to force 0(%esi).  
   
 deflit(CHUNK_COUNT, 2)          movq    -4(%eax,%ecx,4), %mm0
 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `          subl    $2, %ecx
         deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))  
         movq    disp(%esi), %mm0  
         movq    %mm0, disp(%edi)  
 ')  
   
         leal    -UNROLL_BYTES(%esi), %esi          movq    %mm0, 4(%edx,%ecx,4)
         subl    $UNROLL_COUNT, %ecx          ja      L(top)
   
         leal    -UNROLL_BYTES(%edi), %edi  
         jns     L(top)  
   
   L(one_more):
           movd    (%eax), %mm0
           movd    %mm0, (%edx)
   
        C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to          movl    SAVE_EBX, %ebx
         C UNROLL_COUNT-1 limbs remaining          emms_or_femms
   L(zero):
         testb   $eval(UNROLL_COUNT/2), %cl  
   
         leal    UNROLL_COUNT(%ecx), %ecx  
         jz      L(not_half)  
   
   
         C at an unroll count of 32 this block of code is 16 cycles faster than  
         C the rep movs, less 3 or 4 to test whether to do it  
   
 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `  
         deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))  
         movq    disp(%esi), %mm0  
         movq    %mm0, disp(%edi)  
 ')  
   
         subl    $eval(UNROLL_BYTES/2), %esi  
         subl    $eval(UNROLL_BYTES/2), %edi  
   
         subl    $eval(UNROLL_COUNT/2), %ecx  
 L(not_half):  
   
   
 ifelse(UNROLL_BYTES,256,`  
         addl    $128, %esi  
         addl    $128, %edi  
 ')  
   
         rep  
         movsl  
   
         cld  
   
         movl    %eax, %esi  
         movl    %edx, %edi  
   
         femms  
         ret          ret
   
 EPILOGUE()  EPILOGUE()

Legend:
Removed from v.1.1.1.1  
changed lines
  Added in v.1.1.1.2

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>