Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/copyi.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
2: dnl
3: dnl alignment dst/src, A=0mod8 N=4mod8
4: dnl A/A A/N N/A N/N
5: dnl K7 0.75 1.0 1.0 0.75
6:
7:
8: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
9: dnl
10: dnl This file is part of the GNU MP Library.
11: dnl
12: dnl The GNU MP Library is free software; you can redistribute it and/or
13: dnl modify it under the terms of the GNU Lesser General Public License as
14: dnl published by the Free Software Foundation; either version 2.1 of the
15: dnl License, or (at your option) any later version.
16: dnl
17: dnl The GNU MP Library is distributed in the hope that it will be useful,
18: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
19: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20: dnl Lesser General Public License for more details.
21: dnl
22: dnl You should have received a copy of the GNU Lesser General Public
23: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
24: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
25: dnl Suite 330, Boston, MA 02111-1307, USA.
26:
27:
28: include(`../config.m4')
29:
30:
31: C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
32: C
33: C Copy src,size to dst,size.
34: C
35: C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
36: C 1.33 c/l.
37: C
38: C The K7 can do two loads, or two stores, or a load and a store, in one
39: C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
40: C however nothing under 0.7 c/l is known.
41: C
42: C If both source and destination are unaligned then one limb is processed at
43: C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
44: C used unaligned it would be 1.5 c/l.
45:
46: defframe(PARAM_SIZE,12) C size argument, a count of limbs
47: defframe(PARAM_SRC, 8) C src argument, pointer the limbs are read from
48: defframe(PARAM_DST, 4) C dst argument, pointer the limbs are written to
49:
50: dnl parameter space reused: PARAM_SIZE is loaded into %ecx before %ebx is stored in its slot, so the size is never needed from memory again
51: define(SAVE_EBX,`PARAM_SIZE')
52:
53: dnl minimum 5 since the unrolled code can't handle less than 5: it may copy one aligning limb and then always runs L(top) at least once, which copies 4 more
54: deflit(UNROLL_THRESHOLD, 5)
55:
56: .text
57: ALIGN(32)
58: PROLOGUE(mpn_copyi)
59: deflit(`FRAME',0)
60: C Copy size limbs from src to dst, lowest address first. Sizes below UNROLL_THRESHOLD use L(simple), all others use the MMX loop at L(top).
61: movl PARAM_SIZE, %ecx C ecx = size, read before its stack slot is overwritten below
62: movl %ebx, SAVE_EBX C save ebx in the slot that held size (SAVE_EBX aliases PARAM_SIZE)
63:
64: movl PARAM_SRC, %eax C eax = src
65: movl PARAM_DST, %edx C edx = dst
66:
67: cmpl $UNROLL_THRESHOLD, %ecx
68: jae L(unroll) C unsigned compare, size >= UNROLL_THRESHOLD takes the MMX path
69:
70: orl %ecx, %ecx C sets ZF when size is zero, ecx itself is unchanged
71: jz L(simple_done) C nothing to copy
72:
73: L(simple):
74: C eax src, incrementing
75: C ebx scratch
76: C ecx counter
77: C edx dst, incrementing
78: C
79: C this loop is 2 cycles/limb
80:
81: movl (%eax), %ebx C load one limb
82: movl %ebx, (%edx) C store it
83: decl %ecx C sets ZF for the jnz at the bottom of the loop
84: leal 4(%eax), %eax C advance src, leal does not modify the flags set by decl
85: leal 4(%edx), %edx C advance dst, again leaving the flags alone
86: jnz L(simple) C repeat until the counter reaches zero
87:
88: L(simple_done):
89: movl SAVE_EBX, %ebx C restore ebx
90: ret C no emms here, this path used no MMX registers
91:
92:
93: L(unroll):
94: movl %eax, %ebx C ebx = src, kept for the alignment test below
95: leal -12(%eax,%ecx,4), %eax C src end - 12
96: subl $3, %ecx C size-3
97:
98: andl %edx, %ebx C ebx = src & dst, bit 2 survives only if it is set in both pointers
99: leal (%edx,%ecx,4), %edx C dst end - 12
100: negl %ecx C ecx = -(size-3), a negative index that counts up
101:
102: testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
103: jz L(aligned) C at least one pointer has bit 2 clear, skip the aligning copy
104:
105: C both src and dst unaligned, process one limb to align them
106: movl (%eax,%ecx,4), %ebx C (src end - 12) - 4*(size-3) is src itself, so this is the first limb
107: movl %ebx, (%edx,%ecx,4) C same index relative to dst end - 12, i.e. the first dst limb
108: incl %ecx C one limb done
109: L(aligned):
110:
111:
112: ALIGN(16)
113: L(top):
114: C eax src end - 12
115: C ebx not referenced inside the loop
116: C ecx counter, negative, limbs
117: C edx dst end - 12
118:
119: movq (%eax,%ecx,4), %mm0 C load limbs 0 and 1 of this group of four
120: movq 8(%eax,%ecx,4), %mm1 C load limbs 2 and 3
121: addl $4, %ecx C advance the index, this sets the flags tested by ja
122: movq %mm0, -16(%edx,%ecx,4) C the -16 compensates for the addl so the store index matches the load
123: movq %mm1, -16+8(%edx,%ecx,4) C movq does not modify flags, so ja still sees the addl result
124: ja L(top) C jump no carry and not zero
125:
126:
127: C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
128:
129: testb $2, %cl C bit 1 clear (ecx 0 or 1) means at least two limbs remain
130: jnz L(finish_not_two)
131:
132: movq (%eax,%ecx,4), %mm0 C copy two limbs with a single 64-bit load and store
133: movq %mm0, (%edx,%ecx,4)
134: L(finish_not_two):
135:
136: testb $1, %cl C bit 0 clear (ecx 0 or 2) means one final limb remains
137: jnz L(done)
138:
139: movl 8(%eax), %ebx C the final limb sits at src end - 4, which is (src end - 12) + 8
140: movl %ebx, 8(%edx)
141:
142: L(done):
143: movl SAVE_EBX, %ebx C restore ebx
144: emms C empty the MMX state before returning since %mm0 and %mm1 were used
145: ret
146:
147: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>