dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.

dnl  K6-2: 1.0 cycles/limb.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')


C         K6-2: 1.0 cycles/limb


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy src,size to dst,size, processing limbs from high to low addresses.
C
C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
C cycle startup time, which amounts for instance to a 2x speedup at 15
C limbs.
C
C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
C processing one limb separately to make it aligned.  This and a final odd
C limb are handled in a branch-free fashion, ending up re-copying if the
C special case isn't needed.
C
C Alternatives:
C
C There used to be a big unrolled version of this, running at 0.56 c/l if
C the destination was aligned, but that seemed rather excessive for the
C relative importance of copyd.
C
C If the destination alignment is ignored and just left to run at 1.17 c/l
C some code size and a fixed few cycles can be saved.  Considering how few
C uses copyd finds perhaps that should be favoured.  The current code has
C the attraction of being no slower than a basic rep movsl though.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  re-using parameter space
define(SAVE_EBX,`PARAM_SIZE')

	TEXT
	ALIGN(16)

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C ebx is callee-saved, stash it in the
					C now-unneeded size parameter slot

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	subl	$1, %ecx		C better code alignment than decl
	jb	L(zero)			C size==0, nothing to do

	jz	L(one_more)		C size==1, just the single low limb
	leal	4(%edx,%ecx,4), %ebx	C &dst[size]

Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment

	shrl	$2, %ebx
	andl	$1, %ebx		C 1 if dst[size-2] unaligned

	subl	%ebx, %ecx		C skip one limb if so; it gets re-done
					C by the movq loop or L(one_more)
	nop				C code alignment


L(top):
	C eax	src
	C ebx
	C ecx	counter, limbs, decrementing
	C edx	dst

	movq	-4(%eax,%ecx,4), %mm0	C src[ecx-1] and src[ecx]
	subl	$2, %ecx

	movq	%mm0, 4(%edx,%ecx,4)	C dst at the same (pre-decrement) offsets
	ja	L(top)			C until 0 or 1 limbs remain


L(one_more):
	C copy (or branch-freely re-copy) the low limb
	movd	(%eax), %mm0
	movd	%mm0, (%edx)

	movl	SAVE_EBX, %ebx
	emms_or_femms			C leave the FPU clean for the caller
L(zero):
	ret

EPILOGUE()