Annotation of OpenXM_contrib/gmp/mpn/x86/p6/copyd.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Intel P6 mpn_copyd -- copy limb vector backwards.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
25: C P6: 1.75 cycles/limb, or 0.75 if no overlap
26:
27:
28: C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
29: C
30: C An explicit loop is used because a decrementing rep movsl is a bit slow at
31: C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
32: C code here stands a chance of being faster if the branches predict well.
33: C
34: C The slightly strange loop form seems necessary for the claimed speed.
35: C Maybe load/store ordering affects it.
36: C
37: C The source and destination are checked to see if they're actually
38: C overlapping, since it might be possible to use an incrementing rep movsl
39: C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
40: C version.)
41: C
42: C Enhancements:
43: C
44: C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
45: C one store each cycle. Unrolling the loop below would approach 1.0, but
46: C it'd be good to know why something like store/load/subl + store/load/jnz
47: C doesn't already run at 1.0 c/l. It looks like it should decode in 2
48: C cycles, but doesn't run that way.
49:
50: defframe(PARAM_SIZE,12)
51: defframe(PARAM_SRC, 8)
52: defframe(PARAM_DST, 4)
53:
54: dnl Re-using parameter space: SAVE_ESI and SAVE_EDI alias the size and src argument slots, which the code below only overwrites after it has loaded those arguments into ecx and esi.
55: define(SAVE_ESI,`PARAM_SIZE')
56: define(SAVE_EDI,`PARAM_SRC')
57:
58: TEXT
59: ALIGN(16)
60:
61: PROLOGUE(mpn_copyd)
62: deflit(`FRAME',0)
63:
64: movl PARAM_SIZE, %ecx
65:
66: movl %esi, SAVE_ESI C size is in ecx now, so its slot can hold esi
67: movl PARAM_SRC, %esi
68:
69: movl %edi, SAVE_EDI C src is in esi now, so its slot can hold edi
70: movl PARAM_DST, %edi
71:
72: subl $1, %ecx C ecx = size-1
73: jb L(zero) C size==0, nothing to copy
74:
75: movl (%esi,%ecx,4), %eax C src[size-1]
76: jz L(one) C size==1 (flags are still those of the subl, movl leaves them alone)
77:
78: movl -4(%esi,%ecx,4), %edx C src[size-2]
79: subl $2, %ecx C ecx = size-3
80: jbe L(done_loop) C 2 or 3 limbs only
81:
82:
83: C The usual overlap is
84: C
85: C     high                   low
86: C     +------------------+
87: C     |               dst|
88: C     +------------------+
89: C           +------------------+
90: C           |               src|
91: C           +------------------+
92: C
93: C We can use an incrementing copy in the following circumstances.
94: C
95: C     src+4*size<=dst, since then the regions are disjoint
96: C
97: C     src==dst, clearly (though this shouldn't occur normally)
98: C
99: C     src>dst, since in that case it's a requirement of the
100: C              parameters that src>=dst+size*4, and hence the
101: C              regions are disjoint
102: C
103: C Here ecx is size-3 (at least 1), eax is src[size-1], and nothing has been stored yet.
104: leal 12(%esi,%ecx,4), %edx C edx = src+4*size, the 12 making up for ecx being size-3
105: cmpl %edi, %esi
106: jae L(use_movsl) C src >= dst
107:
108: cmpl %edi, %edx C flags for the jbe below, the movl in between doesn't touch them
109: movl 4(%esi,%ecx,4), %edx C src[size-2] again
110: jbe L(use_movsl) C src+4*size <= dst
111:
112:
113: L(top):
114: C eax prev high limb
115: C ebx
116: C ecx counter, size-3 down to 0 or -1, inclusive, by 2s
117: C edx prev low limb
118: C esi src
119: C edi dst
120: C ebp
121:
122: movl %eax, 8(%edi,%ecx,4)
123: movl (%esi,%ecx,4), %eax
124:
125: movl %edx, 4(%edi,%ecx,4)
126: movl -4(%esi,%ecx,4), %edx
127:
128: subl $2, %ecx
129: jnbe L(top) C loop while the subl neither borrowed nor reached zero
130:
131:
132: L(done_loop):
133: movl %eax, 8(%edi,%ecx,4) C ecx is 0 (size odd) or -1 (size even), so this is dst[2] or dst[1]
134: movl %edx, 4(%edi,%ecx,4) C and this is dst[1] or dst[0]
135:
136: C copy low limb (needed if size was odd, but will already have been
137: C done in the loop if size was even)
138: movl (%esi), %eax
139: L(one):
140: movl %eax, (%edi)
141: movl SAVE_EDI, %edi
142: movl SAVE_ESI, %esi
143:
144: ret
145:
146:
147: L(use_movsl):
148: C eax
149: C ebx
150: C ecx size-3
151: C edx
152: C esi src
153: C edi dst
154: C ebp
155:
156: addl $3, %ecx C ecx = size, the rep count
157:
158: cld C better safe than sorry, see mpn/x86/README
159:
160: rep
161: movsl
162:
163: L(zero):
164: movl SAVE_ESI, %esi
165: movl SAVE_EDI, %edi
166:
167: ret
168:
169: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>