Annotation of OpenXM_contrib/gmp/mpn/x86/k6/k62mmx/copyd.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
2:
1.1.1.2 ! ohara 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K6-2: 1.0 cycles/limb
1.1 maekawa 26:
27:
28: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
29: C
1.1.1.2 ! ohara 30: C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
! 31: C cycle startup time, which amounts for instance to a 2x speedup at 15
! 32: C limbs.
1.1 maekawa 33: C
1.1.1.2 ! ohara 34: C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
! 35: C processing one limb separately to make it aligned. This and a final odd
! 36: C limb are handled in a branch-free fashion, ending up re-copying if the
! 37: C special case isn't needed.
! 38: C
! 39: C Alternatives:
! 40: C
! 41: C There used to be a big unrolled version of this, running at 0.56 c/l if
! 42: C the destination was aligned, but that seemed rather excessive for the
! 43: C relative importance of copyd.
! 44: C
! 45: C If the destination alignment is ignored and just left to run at 1.17 c/l
! 46: C some code size and a fixed few cycles can be saved. Considering how few
! 47: C uses copyd finds, perhaps that should be favoured. The current code has
! 48: C the attraction of being no slower than a basic rep movsl though.
1.1 maekawa 49:
C Argument slots, named after the C prototype
C mpn_copyd (dst, src, size): dst at 4, src at 8, size at 12.
C NOTE(review): defframe comes from the included m4 files; the numbers are
C presumably byte offsets from the stack pointer at entry -- confirm there.
50: defframe(PARAM_SIZE,12)
51: defframe(PARAM_SRC, 8)
52: defframe(PARAM_DST, 4)
53:
C SAVE_EBX is an alias for the size argument slot, so no extra stack space
C is needed to preserve ebx.  The function body reads the size into ecx
C before it stores ebx into this slot.
1.1.1.2 ! ohara 54: dnl re-using parameter space
! 55: define(SAVE_EBX,`PARAM_SIZE')
1.1 maekawa 56:
C Code section, entry point aligned to 16 bytes.
1.1.1.2 ! ohara 57: TEXT
! 58: ALIGN(16)
1.1 maekawa 59:
C mpn_copyd: copy size limbs (4 bytes each) from src to dst, working from
C the highest limb down to limb 0.
C
C Register use:
C   eax  src pointer
C   edx  dst pointer
C   ecx  limb index / loop counter, starts at size-1
C   ebx  scratch for the destination alignment test, saved in SAVE_EBX
C   mm0  data being moved
C
C Paths:
C   size == 0   return immediately, nothing stored
C   size == 1   copy limb 0 only
C   size == 2   copy the high limb, then limb 0
C   size >= 3   copy the high limb, run the 2-limb movq loop, then copy
C               limb 0 (which the loop may already have written)
1.1.1.2 ! ohara 60: PROLOGUE(mpn_copyd)
! 61: deflit(`FRAME',0)
1.1 maekawa 62:
C Load size first: SAVE_EBX is the same slot as PARAM_SIZE, so the store of
C ebx on the next line overwrites the size argument.
1.1.1.2 ! ohara 63: movl PARAM_SIZE, %ecx
! 64: movl %ebx, SAVE_EBX
1.1 maekawa 65:
1.1.1.2 ! ohara 66: movl PARAM_SRC, %eax
! 67: movl PARAM_DST, %edx
1.1 maekawa 68:
C ecx = size-1.  A borrow means size was 0; ebx has not been modified and
C no MMX register has been touched yet, so that case can go straight to
C the ret.
1.1.1.2 ! ohara 69: subl $1, %ecx C better code alignment than decl
! 70: jb L(zero)
1.1 maekawa 71:
C The zero flag from the subl is still valid here: size == 1 needs only
C the limb 0 copy.  leal does not alter flags; it sets
C ebx = dst + 4*size, the address just past the high limb.
1.1.1.2 ! ohara 72: jz L(one_more)
! 73: leal 4(%edx,%ecx,4), %ebx
1.1 maekawa 74:
C Copy the high limb src[size-1] to dst[size-1] on its own.
1.1.1.2 ! ohara 75: Zdisp( movd, 0,(%eax,%ecx,4), %mm0) C high limb
! 76: Zdisp( movd, %mm0, 0,(%edx,%ecx,4)) C Zdisp for good code alignment
1.1 maekawa 77:
C size == 2: the high limb is done and limb 0 is all that remains.
1.1.1.2 ! ohara 78: cmpl $1, %ecx
! 79: je L(one_more)
1.1 maekawa 80:
C ebx = bit 2 of dst + 4*size.  That address differs from &dst[size-2] by
C 8 bytes, so the bit is 1 exactly when an 8-byte store at dst[size-2]
C would not be 8-byte aligned.
1.1.1.2 ! ohara 81: shrl $2, %ebx
! 82: andl $1, %ebx C 1 if dst[size-2] unaligned
1.1 maekawa 83:
C In the unaligned case start the loop one limb lower, so that the movq
C stores fall on 8-byte boundaries.  The high limb skipped this way has
C already been copied above; in the aligned case it is simply written a
C second time by the first loop iteration.
1.1.1.2 ! ohara 84: subl %ebx, %ecx
! 85: nop C code alignment
1.1 maekawa 86:
87: L(top):
1.1.1.2 ! ohara 88: C eax src
1.1 maekawa 89: C ebx
1.1.1.2 ! ohara 90: C ecx counter
! 91: C edx dst
1.1 maekawa 92:
C Each pass moves limbs ecx-1 and ecx as one 8-byte quantity.  The store
C displacement is +4 rather than -4 because ecx has been reduced by 2 in
C between, so load and store address the same pair of limb indices.
C movq does not change the flags, so ja tests the result of the subl: the
C loop continues while ecx is still above zero, and stops when ecx reaches
C 0 or wraps from 1.
1.1.1.2 ! ohara 93: movq -4(%eax,%ecx,4), %mm0
! 94: subl $2, %ecx
1.1 maekawa 95:
1.1.1.2 ! ohara 96: movq %mm0, 4(%edx,%ecx,4)
! 97: ja L(top)
1.1 maekawa 98:
99:
C Limb 0 is always copied here.  When the loop exited by wrapping from
C ecx == 1 its last pass already stored limbs 0 and 1, so this store is a
C harmless repeat; doing it unconditionally avoids a branch.
1.1.1.2 ! ohara 100: L(one_more):
! 101: movd (%eax), %mm0
! 102: movd %mm0, (%edx)
1.1 maekawa 103:
C Restore the caller ebx from the re-used parameter slot.
C NOTE(review): emms_or_femms is a macro defined outside this file;
C presumably it emits emms or femms to leave MMX state -- confirm.
1.1.1.2 ! ohara 104: movl SAVE_EBX, %ebx
! 105: emms_or_femms
! 106: L(zero):
1.1 maekawa 107: ret
108:
109: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>