===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.1 -r1.1.1.2
--- OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm	2000/09/09 14:12:42	1.1.1.1
+++ OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/Attic/copyd.asm	2003/08/25 16:06:28	1.1.1.2
@@ -1,10 +1,6 @@
 dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
-dnl
-dnl  K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
-dnl  alignment.
-
-
-dnl  Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
 dnl
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -23,157 +19,91 @@ dnl  License along with the GNU MP Library; see the fi
 dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
 dnl  Suite 330, Boston, MA 02111-1307, USA.
 
-
 include(`../config.m4')
 
-dnl  K6-2 aligned:
-dnl  UNROLL_COUNT  cycles/limb
-dnl        8          0.75
-dnl       16          0.625
-dnl       32          0.5625
-dnl       64          0.53
-dnl  Maximum possible with the current code is 64, the minimum is 2.
+C K6-2: 1.0 cycles/limb
 
-deflit(UNROLL_COUNT, 32)
-
 C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
 C
-C Copy src,size to dst,size, processing limbs from high to low addresses.
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
 C
-C The comments in copyi.asm apply here too.
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned.  This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved.  Considering how few
+C uses copyd finds perhaps that should be favoured.  The current code has
+C the attraction of being no slower than a basic rep movsl though.
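
For readers without the full GMP sources at hand, here is a minimal C sketch of
what mpn_copyd is specified to compute (the typedefs and the name copyd_ref
are illustrative assumptions, not GMP code): copy limbs from the highest
address downwards, which is what makes an overlapping copy safe when dst sits
at a higher address than src.  The break-even claim in the new comment block
follows from the figures quoted there: at 15 limbs, roughly 30 cycles of
rep movsl startup plus 15 cycles of copying is about twice the cost of the
startup-free loop, hence the "2x speedup at 15 limbs".

typedef unsigned long    mp_limb_t;   /* assumption: 32-bit limbs, as on x86 */
typedef mp_limb_t       *mp_ptr;
typedef const mp_limb_t *mp_srcptr;
typedef long             mp_size_t;

/* Reference behaviour: copy `size' limbs from src to dst, highest limb
   first.  Every variant in this file's history (the rep movsl fallback,
   the removed unrolled MMX loop, the new two-limb loop) computes exactly
   this; they differ only in the startup and per-limb costs quoted above.  */
static void
copyd_ref (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t i;
  for (i = size - 1; i >= 0; i--)
    dst[i] = src[i];
}
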
-
 defframe(PARAM_SIZE,12)
 defframe(PARAM_SRC, 8)
 defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
 
-	.text
-	ALIGN(32)
+dnl  re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+	TEXT
+	ALIGN(16)
+
 PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
 	movl	PARAM_SIZE, %ecx
-	movl	%esi, %eax
+	movl	%ebx, SAVE_EBX
 
-	movl	PARAM_SRC, %esi
-	movl	%edi, %edx
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
 
-	std
+	subl	$1, %ecx		C better code alignment than decl
+	jb	L(zero)
 
-	movl	PARAM_DST, %edi
-	cmpl	$UNROLL_COUNT, %ecx
+	jz	L(one_more)
+	leal	4(%edx,%ecx,4), %ebx
 
-	leal	-4(%esi,%ecx,4), %esi
+Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
+Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment
 
-	leal	-4(%edi,%ecx,4), %edi
-	ja	L(unroll)
+	cmpl	$1, %ecx
+	je	L(one_more)
 
-L(simple):
-	rep
-	movsl
+	shrl	$2, %ebx
+	andl	$1, %ebx		C 1 if dst[size-2] unaligned
 
-	cld
+	subl	%ebx, %ecx
+	nop				C code alignment
 
-	movl	%eax, %esi
-	movl	%edx, %edi
-
-	ret
-
-
-L(unroll):
-	C if src and dst are different alignments mod8, then use rep movs
-	C if src and dst are both 4mod8 then process one limb to get 0mod8
-
-	pushl	%ebx
-	leal	(%esi,%edi), %ebx
-
-	testb	$4, %bl
-	popl	%ebx
-
-	jnz	L(simple)
-	testl	$4, %esi
-
-	leal	-UNROLL_COUNT(%ecx), %ecx
-	jnz	L(already_aligned)
-
-	movsl
-
-	decl	%ecx
-L(already_aligned):
-
-
-ifelse(UNROLL_BYTES,256,`
-	subl	$128, %esi
-	subl	$128, %edi
-')
-
-	C offset 0x3D here, but gets full speed without further alignment
 L(top):
-	C eax	saved esi
-	C ebx
-	C ecx	counter, limbs
-	C edx	saved edi
-	C esi	src, incrementing
-	C edi	dst, incrementing
-	C ebp
-	C
-	C `disp' is never 0, so don't need to force 0(%esi).
+	C eax	src
+	C ebx
+	C ecx	counter
+	C edx	dst
 
-deflit(CHUNK_COUNT, 2)
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
-	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
-	movq	disp(%esi), %mm0
-	movq	%mm0, disp(%edi)
-')
+	movq	-4(%eax,%ecx,4), %mm0
+	subl	$2, %ecx
 
-	leal	-UNROLL_BYTES(%esi), %esi
-	subl	$UNROLL_COUNT, %ecx
+	movq	%mm0, 4(%edx,%ecx,4)
+	ja	L(top)
 
-	leal	-UNROLL_BYTES(%edi), %edi
-	jns	L(top)
+L(one_more):
+	movd	(%eax), %mm0
+	movd	%mm0, (%edx)
 
-	C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to
-	C UNROLL_COUNT-1 limbs remaining
-
-	testb	$eval(UNROLL_COUNT/2), %cl
-
-	leal	UNROLL_COUNT(%ecx), %ecx
-	jz	L(not_half)
-
-
-	C at an unroll count of 32 this block of code is 16 cycles faster than
-	C the rep movs, less 3 or 4 to test whether to do it
-
-forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
-	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
-	movq	disp(%esi), %mm0
-	movq	%mm0, disp(%edi)
-')
-
-	subl	$eval(UNROLL_BYTES/2), %esi
-	subl	$eval(UNROLL_BYTES/2), %edi
-
-	subl	$eval(UNROLL_COUNT/2), %ecx
-L(not_half):
-
-
-ifelse(UNROLL_BYTES,256,`
-	addl	$128, %esi
-	addl	$128, %edi
-')
-
-	rep
-	movsl
-
-	cld
-
-	movl	%eax, %esi
-	movl	%edx, %edi
-
-	femms
+	movl	SAVE_EBX, %ebx
+	emms_or_femms
+L(zero):
 	ret
 
 EPILOGUE()
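
To make the structure of the replacement routine easier to follow, here is a C
restatement of its control flow (hypothetical helper name and typedefs, not
GMP code; ordinary limb stores stand in for the 8-byte movq stores whose
alignment the asm is arranging).  As in the asm, the high limb is copied up
front, one index is dropped when that is what it takes to put the two-limb
stores on 8-byte boundaries, the main loop then moves two limbs per iteration
downwards, and the low limb is stored last; whenever a special case was not
actually needed, a limb simply gets copied twice, which is the branch-free
re-copying the comments describe.

#include <stdint.h>

typedef unsigned long    mp_limb_t;   /* assumption: 32-bit limbs, as on x86 */
typedef mp_limb_t       *mp_ptr;
typedef const mp_limb_t *mp_srcptr;
typedef long             mp_size_t;

/* Control-flow sketch of the new loop (illustrative only, not GMP code).
   Each two-limb copy corresponds to one 8-byte movq in the asm, which is
   why the destination of those copies is kept 8-byte aligned.  */
static void
copyd_sketch (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t i = size - 1;

  if (i < 0)                      /* size == 0: nothing to do (jb L(zero)) */
    return;

  if (i > 0)                      /* size >= 2 */
    {
      dst[i] = src[i];            /* high limb (the Zdisp movd pair) */

      if (i > 1)                  /* size >= 3: run the two-limb loop */
        {
          /* Drop one index if dst[i-1] is only 4-byte aligned, so every
             two-limb store lands on an 8-byte boundary (the shrl/andl/subl
             in the asm).  When no drop is needed, the first pair re-copies
             the high limb, harmlessly.  */
          i -= ((uintptr_t) &dst[i - 1] >> 2) & 1;

          do                      /* L(top): two limbs per iteration */
            {
              mp_limb_t lo = src[i - 1], hi = src[i];
              dst[i - 1] = lo;
              dst[i] = hi;
              i -= 2;
            }
          while (i > 0);
        }
    }

  dst[0] = src[0];                /* L(one_more): low limb, maybe re-copied */
}

This is only a reading aid for the asm above, not a replacement for it.  The
temporaries lo and hi mirror the load-then-store through %mm0, which is what
keeps each two-limb step correct when the operands overlap with dst above src.
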