===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.1 -r1.1.1.2
--- OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm	2000/09/09 14:12:42	1.1.1.1
+++ OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm	2003/08/25 16:06:28	1.1.1.2
@@ -1,10 +1,6 @@
 dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
-dnl
-dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
-dnl  alignment.
-
-
-dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -23,157 +19,91 @@ dnl  License along with the GNU MP Library; see the fi
 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
 dnl  Suite 330, Boston, MA 02111-1307, USA.
 
-
 include(`../config.m4')
 
-dnl  K6-2 aligned:
-dnl  UNROLL_COUNT  cycles/limb
-dnl        8          0.75
-dnl       16          0.625
-dnl       32          0.5625
-dnl       64          0.53
-dnl  Maximum possible with the current code is 64, the minimum is 2.
+C K6-2: 1.0 cycles/limb
 
-deflit(UNROLL_COUNT, 32)
-
 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
 C
-C Copy src,size to dst,size, processing limbs from high to low addresses.
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
 C
-C The comments in copyi.asm apply here too.
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned.  This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved.  Considering how few
+C uses copyd finds perhaps that should be favoured.  The current code has
+C the attraction of being no slower than a basic rep movsl though.
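
For readers without the full GMP sources at hand, here is a minimal C sketch of
what mpn_copyd is specified to compute (the typedefs and the name copyd_ref
are illustrative assumptions, not GMP code): copy limbs from the highest
address downwards, which is what makes an overlapping copy safe when dst sits
at a higher address than src.  The break-even claim in the new comment block
follows from the figures quoted there: at 15 limbs, roughly 30 cycles of
rep movsl startup plus 15 cycles of copying is about twice the cost of the
startup-free loop, hence the "2x speedup at 15 limbs".

typedef unsigned long    mp_limb_t;   /* assumption: 32-bit limbs, as on x86 */
typedef mp_limb_t       *mp_ptr;
typedef const mp_limb_t *mp_srcptr;
typedef long             mp_size_t;

/* Reference behaviour: copy `size' limbs from src to dst, highest limb
   first.  Every variant in this file's history (the rep movsl fallback,
   the removed unrolled MMX loop, the new two-limb loop) computes exactly
   this; they differ only in the startup and per-limb costs quoted above.  */
static void
copyd_ref (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t i;
  for (i = size - 1; i >= 0; i--)
    dst[i] = src[i];
}
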
-
 defframe(PARAM_SIZE,12)
 defframe(PARAM_SRC, 8)
 defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
 
-	.text
-	ALIGN(32)
+dnl  re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+	TEXT
+	ALIGN(16)
+
 PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
 	movl	PARAM_SIZE, %ecx
-	movl	%esi, %eax
+	movl	%ebx, SAVE_EBX
 
-	movl	PARAM_SRC, %esi
-	movl	%edi, %edx
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
 
-	std
+	subl	$1, %ecx		C better code alignment than decl
+	jb	L(zero)
 
-	movl	PARAM_DST, %edi
-	cmpl	$UNROLL_COUNT, %ecx
+	jz	L(one_more)
+	leal	4(%edx,%ecx,4), %ebx
 
-	leal	-4(%esi,%ecx,4), %esi
+Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
+Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment
 
-	leal	-4(%edi,%ecx,4), %edi
-	ja	L(unroll)
+	cmpl	$1, %ecx
+	je	L(one_more)
 
-L(simple):
-	rep
-	movsl
+	shrl	$2, %ebx
+	andl	$1, %ebx		C 1 if dst[size-2] unaligned
 
-	cld
+	subl	%ebx, %ecx
+	nop				C code alignment
 
-	movl	%eax, %esi
-	movl	%edx, %edi
-
-	ret
-
-
-L(unroll):
-	C if src and dst are different alignments mod8, then use rep movs
-	C if src and dst are both 4mod8 then process one limb to get 0mod8
-
-	pushl	%ebx
-	leal	(%esi,%edi), %ebx
-
-	testb	$4, %bl
-	popl	%ebx
-
-	jnz	L(simple)
-	testl	$4, %esi
-
-	leal	-UNROLL_COUNT(%ecx), %ecx
-	jnz	L(already_aligned)
-
-	movsl
-
-	decl	%ecx
-L(already_aligned):
-
-
-ifelse(UNROLL_BYTES,256,`
-	subl	$128, %esi
-	subl	$128, %edi
-')
-
-	C offset 0x3D here, but gets full speed without further alignment
 L(top):
-	C eax	saved esi
-	C ebx
-	C ecx	counter, limbs
-	C edx	saved edi
-	C esi	src, incrementing
-	C edi	dst, incrementing
-	C ebp
-	C
-	C `disp' is never 0, so don't need to force 0(%esi).
+	C eax	src
+	C ebx
+	C ecx	counter
+	C edx	dst
 
-deflit(CHUNK_COUNT, 2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
-	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
-	movq	disp(%esi), %mm0
-	movq	%mm0, disp(%edi)
-')
+	movq	-4(%eax,%ecx,4), %mm0
+	subl	$2, %ecx
 
-	leal	-UNROLL_BYTES(%esi), %esi
-	subl	$UNROLL_COUNT, %ecx
+	movq	%mm0, 4(%edx,%ecx,4)
+	ja	L(top)
 
-	leal	-UNROLL_BYTES(%edi), %edi
-	jns	L(top)
+L(one_more):
+	movd	(%eax), %mm0
+	movd	%mm0, (%edx)
 
-	C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to
-	C UNROLL_COUNT-1 limbs remaining
-
-	testb	$eval(UNROLL_COUNT/2), %cl
-
-	leal	UNROLL_COUNT(%ecx), %ecx
-	jz	L(not_half)
-
-
-	C at an unroll count of 32 this block of code is 16 cycles faster than
-	C the rep movs, less 3 or 4 to test whether to do it
-
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
-	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
-	movq	disp(%esi), %mm0
-	movq	%mm0, disp(%edi)
-')
-
-	subl	$eval(UNROLL_BYTES/2), %esi
-	subl	$eval(UNROLL_BYTES/2), %edi
-
-	subl	$eval(UNROLL_COUNT/2), %ecx
-L(not_half):
-
-
-ifelse(UNROLL_BYTES,256,`
-	addl	$128, %esi
-	addl	$128, %edi
-')
-
-	rep
-	movsl
-
-	cld
-
-	movl	%eax, %esi
-	movl	%edx, %edi
-
-	femms
+	movl	SAVE_EBX, %ebx
+	emms_or_femms
+L(zero):
 	ret
 
 EPILOGUE()
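
To make the structure of the replacement routine easier to follow, here is a C
restatement of its control flow (hypothetical helper name and typedefs, not
GMP code; ordinary limb stores stand in for the 8-byte movq stores whose
alignment the asm is arranging).  As in the asm, the high limb is copied up
front, one index is dropped when that is what it takes to put the two-limb
stores on 8-byte boundaries, the main loop then moves two limbs per iteration
downwards, and the low limb is stored last; whenever a special case was not
actually needed, a limb simply gets copied twice, which is the branch-free
re-copying the comments describe.

#include <stdint.h>

typedef unsigned long    mp_limb_t;   /* assumption: 32-bit limbs, as on x86 */
typedef mp_limb_t       *mp_ptr;
typedef const mp_limb_t *mp_srcptr;
typedef long             mp_size_t;

/* Control-flow sketch of the new loop (illustrative only, not GMP code).
   Each two-limb copy corresponds to one 8-byte movq in the asm, which is
   why the destination of those copies is kept 8-byte aligned.  */
static void
copyd_sketch (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t i = size - 1;

  if (i < 0)                      /* size == 0: nothing to do (jb L(zero)) */
    return;

  if (i > 0)                      /* size >= 2 */
    {
      dst[i] = src[i];            /* high limb (the Zdisp movd pair) */

      if (i > 1)                  /* size >= 3: run the two-limb loop */
        {
          /* Drop one index if dst[i-1] is only 4-byte aligned, so every
             two-limb store lands on an 8-byte boundary (the shrl/andl/subl
             in the asm).  When no drop is needed, the first pair re-copies
             the high limb, harmlessly.  */
          i -= ((uintptr_t) &dst[i - 1] >> 2) & 1;

          do                      /* L(top): two limbs per iteration */
            {
              mp_limb_t lo = src[i - 1], hi = src[i];
              dst[i - 1] = lo;
              dst[i] = hi;
              i -= 2;
            }
          while (i > 0);
        }
    }

  dst[0] = src[0];                /* L(one_more): low limb, maybe re-copied */
}

This is only a reading aid for the asm above, not a replacement for it.  The
temporaries lo and hi mirror the load-then-store through %mm0, which is what
keeps each two-limb step correct when the operands overlap with dst above src.
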