Annotation of OpenXM_contrib/gmp/mpn/x86/p6/copyd.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel P6 mpn_copyd -- copy limb vector backwards.
! 2:
! 3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C P6: 1.75 cycles/limb, or 0.75 if no overlap
! 26:
! 27:
! 28: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
! 29: C
! 30: C An explicit loop is used because a decrementing rep movsl is a bit slow at
! 31: C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
! 32: C code here stands a chance of being faster if the branches predict well.
! 33: C
! 34: C The slightly strange loop form seems necessary for the claimed speed.
! 35: C Maybe load/store ordering affects it.
! 36: C
! 37: C The source and destination are checked to see if they're actually
! 38: C overlapping, since it might be possible to use an incrementing rep movsl
! 39: C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
! 40: C version.)
! 41: C
! 42: C Enhancements:
! 43: C
! 44: C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
! 45: C one store each cycle. Unrolling the loop below would approach 1.0, but
! 46: C it'd be good to know why something like store/load/subl + store/load/jnz
! 47: C doesn't already run at 1.0 c/l. It looks like it should decode in 2
! 48: C cycles, but doesn't run that way.
! 49:
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  Re-using parameter space: each parameter slot is read into a register
dnl  before the same slot is overwritten with a saved register (PARAM_SIZE
dnl  is fetched before SAVE_ESI is stored, PARAM_SRC before SAVE_EDI), so
dnl  nothing is pushed on the stack.
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EDI,`PARAM_SRC')

	TEXT
	ALIGN(16)

C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy size limbs (4 bytes each) from src to dst, working from the high
C limb down to the low limb so that an overlap with dst above src is safe.
C
C In:      PARAM_DST, PARAM_SRC, PARAM_SIZE (via the defframe names above)
C Out:     nothing is returned in a register
C Saved:   esi, edi (in SAVE_ESI, SAVE_EDI, restored on every exit path)
C Changed: eax, ecx, edx, flags; the direction flag is cleared by cld on
C          the rep movsl path only
C
C Paths:
C   size==0       straight to L(zero), nothing stored
C   size==1       L(one), a single limb store
C   size==2,3     L(done_loop) directly, no overlap test is made
C   size>=4       incrementing rep movsl if src>=dst or src+4*size<=dst,
C                 otherwise the decrementing two-limbs-per-pass L(top) loop

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	subl	$1, %ecx
	jb	L(zero)			C borrow means size==0

	C The movl loads below leave the flags from the preceding subl intact.
	movl	(%esi,%ecx,4), %eax	C src[size-1]
	jz	L(one)			C size==1

	movl	-4(%esi,%ecx,4), %edx	C src[size-2]
	subl	$2, %ecx		C ecx = size-3
	jbe	L(done_loop)		C 2 or 3 limbs only


	C The usual overlap is
	C
	C     high                   low
	C     +------------------+
	C     |               dst|
	C     +------------------+
	C           +------------------+
	C           |               src|
	C           +------------------+
	C
	C We can use an incrementing copy in the following circumstances.
	C
	C     src+4*size<=dst, since then the regions are disjoint
	C
	C     src==dst, clearly (though this shouldn't occur normally)
	C
	C     src>dst, since in that case it's a requirement of the
	C              parameters that src>=dst+size*4, and hence the
	C              regions are disjoint
	C
	C ecx is size-3 here (at least 1), so src+4*size is 12(%esi,%ecx,4).
	C The address must be formed from src: one formed from dst would always
	C be above dst and the second test below could never succeed.
	C
	C NOTE(review): this assumes src+4*size does not wrap past the top of
	C the 32-bit address space -- confirm that is acceptable for callers.

	leal	12(%esi,%ecx,4), %edx	C src+4*size, one past the source end
	cmpl	%edi, %esi
	jae	L(use_movsl)		C src >= dst

	cmpl	%edi, %edx
	movl	4(%esi,%ecx,4), %edx	C src[size-2] again, flags unaffected
	jbe	L(use_movsl)		C src+4*size <= dst


L(top):
	C eax	prev high limb
	C ebx
	C ecx	counter, size-3 down to 0 or -1, inclusive, by 2s
	C edx	prev low limb
	C esi	src
	C edi	dst
	C ebp
	C
	C Each pass stores the two limbs fetched previously and fetches the
	C next two lower limbs, so loads run one pair ahead of the stores.

	movl	%eax, 8(%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax

	movl	%edx, 4(%edi,%ecx,4)
	movl	-4(%esi,%ecx,4), %edx

	subl	$2, %ecx
	jnbe	L(top)			C loop while no borrow and non-zero


L(done_loop):
	C ecx is 0 or -1 here; store the pair still held in eax and edx.
	movl	%eax, 8(%edi,%ecx,4)
	movl	%edx, 4(%edi,%ecx,4)

	C copy low limb (needed if size was odd, but will already have been
	C done in the loop if size was even)
	movl	(%esi), %eax
L(one):
	movl	%eax, (%edi)
	movl	SAVE_EDI, %edi
	movl	SAVE_ESI, %esi

	ret


L(use_movsl):
	C eax
	C ebx
	C ecx	size-3
	C edx
	C esi	src
	C edi	dst
	C ebp

	addl	$3, %ecx		C back to the full limb count

	cld		C better safe than sorry, see mpn/x86/README

	rep
	movsl

L(zero):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	ret

EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>