Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/copyi.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
! 2: dnl
! 3: dnl alignment dst/src, A=0mod8 N=4mod8
! 4: dnl A/A A/N N/A N/N
! 5: dnl K7 0.75 1.0 1.0 0.75
! 6:
! 7:
! 8: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 9: dnl
! 10: dnl This file is part of the GNU MP Library.
! 11: dnl
! 12: dnl The GNU MP Library is free software; you can redistribute it and/or
! 13: dnl modify it under the terms of the GNU Lesser General Public License as
! 14: dnl published by the Free Software Foundation; either version 2.1 of the
! 15: dnl License, or (at your option) any later version.
! 16: dnl
! 17: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 18: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 19: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 20: dnl Lesser General Public License for more details.
! 21: dnl
! 22: dnl You should have received a copy of the GNU Lesser General Public
! 23: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 24: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 25: dnl Suite 330, Boston, MA 02111-1307, USA.
! 26:
! 27:
! 28: include(`../config.m4')
! 29:
! 30:
! 31: C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
! 32: C
! 33: C Copy src,size to dst,size.
! 34: C
! 35: C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
! 36: C 1.33 c/l.
! 37: C
! 38: C The K7 can do two loads, or two stores, or a load and a store, in one
! 39: C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
! 40: C however nothing under 0.7 c/l is known.
! 41: C
! 42: C If both source and destination are unaligned then one limb is processed at
! 43: C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
! 44: C used unaligned it would be 1.5 c/l.
! 45:
! 46: defframe(PARAM_SIZE,12) C size argument; the offset is presumably relative to the stack pointer at entry, confirm against defframe
! 47: defframe(PARAM_SRC, 8) C src argument
! 48: defframe(PARAM_DST, 4) C dst argument
! 49:
! 50: dnl parameter space reused: ebx is stored in the size slot after size has been loaded into ecx
! 51: define(SAVE_EBX,`PARAM_SIZE')
! 52:
! 53: dnl minimum 5 since the unrolled code can't handle less than 5
! 54: deflit(UNROLL_THRESHOLD, 5) C sizes below this are copied by the simple one limb loop
! 55:
! 56: .text
! 57: ALIGN(32)
! 58: PROLOGUE(mpn_copyi)
! 59: deflit(`FRAME',0)
! 60: C Copy size limbs from src to dst, lowest address first. Small sizes use a plain loop, the rest use MMX.
! 61: movl PARAM_SIZE, %ecx C ecx = size, read before its stack slot is overwritten
! 62: movl %ebx, SAVE_EBX C keep the incoming ebx in the slot that held size
! 63:
! 64: movl PARAM_SRC, %eax C eax = src
! 65: movl PARAM_DST, %edx C edx = dst
! 66:
! 67: cmpl $UNROLL_THRESHOLD, %ecx
! 68: jae L(unroll) C unsigned compare, size at or above the threshold takes the MMX path
! 69:
! 70: orl %ecx, %ecx C sets ZF when size is zero
! 71: jz L(simple_done) C nothing to copy
! 72:
! 73: L(simple):
! 74: C eax src, incrementing
! 75: C ebx scratch
! 76: C ecx counter
! 77: C edx dst, incrementing
! 78: C
! 79: C this loop is 2 cycles/limb
! 80:
! 81: movl (%eax), %ebx
! 82: movl %ebx, (%edx)
! 83: decl %ecx C sets ZF for the jnz below
! 84: leal 4(%eax), %eax C pointers advance with leal, which leaves the flags from decl intact
! 85: leal 4(%edx), %edx
! 86: jnz L(simple) C loop until the counter reaches zero
! 87:
! 88: L(simple_done):
! 89: movl SAVE_EBX, %ebx C restore the ebx value saved at entry, no emms needed as this path uses no MMX
! 90: ret
! 91:
! 92:
! 93: L(unroll):
! 94: movl %eax, %ebx C ebx = src, combined with dst below for the alignment test
! 95: leal -12(%eax,%ecx,4), %eax C src end - 12
! 96: subl $3, %ecx C size-3
! 97:
! 98: andl %edx, %ebx C ebx = src AND dst
! 99: leal (%edx,%ecx,4), %edx C dst end - 12
! 100: negl %ecx C ecx = -(size-3), so indexing eax and edx by ecx*4 addresses the first limb
! 101:
! 102: testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
! 103: jz L(aligned) C bit 2 clear in src AND dst, so at least one pointer has bit 2 clear
! 104:
! 105: C both src and dst unaligned, process one limb to align them
! 106: movl (%eax,%ecx,4), %ebx
! 107: movl %ebx, (%edx,%ecx,4)
! 108: incl %ecx C one limb fewer left to copy
! 109: L(aligned):
! 110:
! 111:
! 112: ALIGN(16)
! 113: L(top):
! 114: C eax src end - 12
! 115: C ebx
! 116: C ecx counter, negative, limbs
! 117: C edx dst end - 12
! 118: C each pass copies four limbs as two 8-byte loads followed by two 8-byte stores
! 119: movq (%eax,%ecx,4), %mm0
! 120: movq 8(%eax,%ecx,4), %mm1
! 121: addl $4, %ecx C flags from this add feed the ja below, the stores use -16 to undo the advance
! 122: movq %mm0, -16(%edx,%ecx,4)
! 123: movq %mm1, -16+8(%edx,%ecx,4)
! 124: ja L(top) C jump no carry and not zero
! 125:
! 126:
! 127: C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
! 128:
! 129: testb $2, %cl
! 130: jnz L(finish_not_two) C ecx is 2 or 3, one or no limbs remain, so skip the two limb copy
! 131:
! 132: movq (%eax,%ecx,4), %mm0 C ecx is 0 or 1, copy two limbs with one 8-byte move
! 133: movq %mm0, (%edx,%ecx,4)
! 134: L(finish_not_two):
! 135:
! 136: testb $1, %cl
! 137: jnz L(done) C ecx is 1 or 3, nothing more to copy
! 138:
! 139: movl 8(%eax), %ebx C ecx is 0 or 2, copy the final limb which sits at end - 4
! 140: movl %ebx, 8(%edx)
! 141:
! 142: L(done):
! 143: movl SAVE_EBX, %ebx C restore the ebx value saved at entry
! 144: emms C empty the MMX state before returning
! 145: ret
! 146:
! 147: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>