dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.

dnl  K6-2: 1.0 cycles/limb.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
dnl  Suite 330, Boston, MA 02111-1307, USA.
include(`../config.m4')


C         K6-2: 1.0 cycles/limb


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy src,size to dst,size, processing limbs from high to low addresses.
C
C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
C cycle startup time, which amounts for instance to a 2x speedup at 15
C limbs.
C
C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
C processing one limb separately to make it aligned.  This and a final odd
C limb are handled in a branch-free fashion, ending up re-copying if the
C special case isn't needed.
C
C Alternatives:
C
C There used to be a big unrolled version of this, running at 0.56 c/l if
C the destination was aligned, but that seemed rather excessive for the
C relative importance of copyd.
C
C If the destination alignment is ignored and just left to run at 1.17 c/l
C some code size and a fixed few cycles can be saved.  Considering how few
C uses copyd finds perhaps that should be favoured.  The current code has
C the attraction of being no slower than a basic rep movsl though.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  re-using parameter space
define(SAVE_EBX,`PARAM_SIZE')

	TEXT
	ALIGN(16)

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C ebx is callee-saved, stash it in the
					C now-unneeded size parameter slot

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	subl	$1, %ecx		C better code alignment than decl
	jb	L(zero)			C size==0, nothing to do

	jz	L(one_more)		C size==1, just the single low limb
	leal	4(%edx,%ecx,4), %ebx	C &dst[size]

Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment

	shrl	$2, %ebx
	andl	$1, %ebx		C 1 if dst[size-2] unaligned

	subl	%ebx, %ecx		C skip one limb if so; it gets re-done
					C by the movq loop or L(one_more)
	nop				C code alignment


L(top):
	C eax	src
	C ebx
	C ecx	counter, limbs, decrementing
	C edx	dst

	movq	-4(%eax,%ecx,4), %mm0	C src[ecx-1] and src[ecx]
	subl	$2, %ecx

	movq	%mm0, 4(%edx,%ecx,4)	C dst at the same (pre-decrement) offsets
	ja	L(top)			C until 0 or 1 limbs remain


L(one_more):
	C copy (or branch-freely re-copy) the low limb
	movd	(%eax), %mm0
	movd	%mm0, (%edx)

	movl	SAVE_EBX, %ebx
	emms_or_femms			C leave the FPU clean for the caller
L(zero):
	ret

EPILOGUE()