Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/rshift.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl Intel Pentium mpn_rshift -- mpn right shift.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
1.1 maekawa 4: dnl Foundation, Inc.
5: dnl
6: dnl This file is part of the GNU MP Library.
7: dnl
8: dnl The GNU MP Library is free software; you can redistribute it and/or
9: dnl modify it under the terms of the GNU Lesser General Public License as
10: dnl published by the Free Software Foundation; either version 2.1 of the
11: dnl License, or (at your option) any later version.
12: dnl
13: dnl The GNU MP Library is distributed in the hope that it will be useful,
14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16: dnl Lesser General Public License for more details.
17: dnl
18: dnl You should have received a copy of the GNU Lesser General Public
19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21: dnl Suite 330, Boston, MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25:
1.1.1.2 ! ohara 26: C cycles/limb
! 27: C P5,P54: 6.0
! 28: C P55: 5.375
! 29:
! 30:
1.1 maekawa 31: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
32: C unsigned shift);
33: C
34: C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
35: C but P5 and P54 run only at 6.0 c/l, which is 5 cycles per 8-limb loop lost somewhere.
36:
37: defframe(PARAM_SHIFT,16)
38: defframe(PARAM_SIZE, 12)
39: defframe(PARAM_SRC, 8)
40: defframe(PARAM_DST, 4)
41:
1.1.1.2 ! ohara 42: TEXT
1.1 maekawa 43: ALIGN(8)
44: PROLOGUE(mpn_rshift)
45: C Save the four registers used below; each exit path pops them before ret.
46: pushl %edi
47: pushl %esi
48: pushl %ebx
49: pushl %ebp
50: deflit(`FRAME',16) C 16 matches the bytes pushed by the four pushl above
51: C Load arguments: edi = dst, esi = src, ebp = size, ecx = shift.
52: movl PARAM_DST,%edi
53: movl PARAM_SRC,%esi
54: movl PARAM_SIZE,%ebp
55: movl PARAM_SHIFT,%ecx
56:
57: C We can use faster code for shift-by-1 under certain conditions.
58: cmp $1,%ecx
59: jne L(normal)
60: leal 4(%edi),%eax C eax = dst + 4 bytes (the loops below step 4 bytes per limb)
61: cmpl %esi,%eax
62: jnc L(special) C jump if res_ptr + 1 >= s_ptr
63: leal (%edi,%ebp,4),%eax C eax = dst + 4*size bytes
64: cmpl %eax,%esi
65: jnc L(special) C jump if s_ptr >= res_ptr + size
66: C General case: walk upward through memory starting at the first limb.
67: L(normal):
68: movl (%esi),%edx C edx = first source limb
69: addl $4,%esi
70: xorl %eax,%eax
71: shrdl( %cl, %edx, %eax) C compute carry limb
72: pushl %eax C push carry limb onto stack
73:
74: decl %ebp C ebp = size - 1
75: pushl %ebp C saved; its low 3 bits give the leftover count at L(end)
76: shrl $3,%ebp C ebp = number of 8-limb iterations
77: jz L(end)
78:
79: movl (%edi),%eax C fetch destination cache line
80:
81: ALIGN(4)
82: L(oop): movl 28(%edi),%eax C fetch destination cache line
83: movl %edx,%ebx C ebx = limb carried over from the previous step
84:
85: movl (%esi),%eax
86: movl 4(%esi),%edx
87: shrdl( %cl, %eax, %ebx)
88: shrdl( %cl, %edx, %eax)
89: movl %ebx,(%edi)
90: movl %eax,4(%edi)
91:
92: movl 8(%esi),%ebx
93: movl 12(%esi),%eax
94: shrdl( %cl, %ebx, %edx)
95: shrdl( %cl, %eax, %ebx)
96: movl %edx,8(%edi)
97: movl %ebx,12(%edi)
98:
99: movl 16(%esi),%edx
100: movl 20(%esi),%ebx
101: shrdl( %cl, %edx, %eax)
102: shrdl( %cl, %ebx, %edx)
103: movl %eax,16(%edi)
104: movl %edx,20(%edi)
105:
106: movl 24(%esi),%eax
107: movl 28(%esi),%edx C edx is left unshifted; it is the carried-over limb for the next step
108: shrdl( %cl, %eax, %ebx)
109: shrdl( %cl, %edx, %eax)
110: movl %ebx,24(%edi)
111: movl %eax,28(%edi)
112:
113: addl $32,%esi C advance both pointers by 8 limbs
114: addl $32,%edi
115: decl %ebp
116: jnz L(oop)
117:
118: L(end): popl %ebp C recover size - 1
119: andl $7,%ebp C limbs left over after the 8-limb loop
120: jz L(end2)
121: L(oop2):
122: movl (%esi),%eax
123: shrdl( %cl,%eax,%edx) C compute result limb
124: movl %edx,(%edi)
125: movl %eax,%edx C this limb becomes the low input of the next shift
126: addl $4,%esi
127: addl $4,%edi
128: decl %ebp
129: jnz L(oop2)
130:
131: L(end2):
132: shrl %cl,%edx C compute most significant limb
133: movl %edx,(%edi) C store it
134:
135: popl %eax C pop carry limb
136: C Restore the saved registers; eax holds the carry limb at ret.
137: popl %ebp
138: popl %ebx
139: popl %esi
140: popl %edi
141: ret
142:
143:
144: C We loop from the most significant end of the arrays, which is only
145: C permissible if the source and destination don't overlap, since the
146: C function is documented to work for overlapping source and destination.
147:
148: L(special):
149: leal -4(%edi,%ebp,4),%edi C edi = address of the last destination limb
150: leal -4(%esi,%ebp,4),%esi C esi = address of the last source limb
151:
152: movl (%esi),%edx C edx = last (most significant) source limb
153: subl $4,%esi
154:
155: decl %ebp C ebp = size - 1
156: pushl %ebp C saved; its low 3 bits give the leftover count at L(Lend)
157: shrl $3,%ebp C ebp = number of 8-limb iterations
158:
159: shrl %edx C shift top limb right by 1; the bit shifted out goes to carry
160: incl %ebp C inc/dec pair tests ebp for zero while leaving carry intact
161: decl %ebp
162: jz L(Lend)
163:
164: movl (%edi),%eax C fetch destination cache line
165:
166: ALIGN(4)
167: L(Loop):
168: movl -28(%edi),%eax C fetch destination cache line
169: movl %edx,%ebx C ebx = already-shifted limb waiting to be stored
170:
171: movl (%esi),%eax
172: movl -4(%esi),%edx
173: rcrl %eax C rotate through carry: carry enters bit 31, bit 0 becomes carry
174: movl %ebx,(%edi)
175: rcrl %edx
176: movl %eax,-4(%edi)
177:
178: movl -8(%esi),%ebx
179: movl -12(%esi),%eax
180: rcrl %ebx
181: movl %edx,-8(%edi)
182: rcrl %eax
183: movl %ebx,-12(%edi)
184:
185: movl -16(%esi),%edx
186: movl -20(%esi),%ebx
187: rcrl %edx
188: movl %eax,-16(%edi)
189: rcrl %ebx
190: movl %edx,-20(%edi)
191:
192: movl -24(%esi),%eax
193: movl -28(%esi),%edx
194: rcrl %eax
195: movl %ebx,-24(%edi)
196: rcrl %edx C edx is stored at the top of the next iteration or after the loop
197: movl %eax,-28(%edi)
198:
199: leal -32(%esi),%esi C use leal not to clobber carry
200: leal -32(%edi),%edi
201: decl %ebp C decl sets ZF for the jnz but leaves carry intact
202: jnz L(Loop)
203:
204: L(Lend):
205: popl %ebp C recover size - 1
206: sbbl %eax,%eax C save carry in %eax
207: andl $7,%ebp C limbs left over after the 8-limb loop (andl clobbers carry)
208: jz L(Lend2)
209: addl %eax,%eax C restore carry from eax
210: L(Loop2):
211: movl %edx,%ebx C ebx = already-shifted limb waiting to be stored
212: movl (%esi),%edx
213: rcrl %edx
214: movl %ebx,(%edi)
215:
216: leal -4(%esi),%esi C use leal not to clobber carry
217: leal -4(%edi),%edi
218: decl %ebp
219: jnz L(Loop2)
220:
221: jmp L(L1)
222: L(Lend2):
223: addl %eax,%eax C restore carry from eax
224: L(L1): movl %edx,(%edi) C store last limb
225:
226: movl $0,%eax C movl rather than xorl so that carry is left intact
227: rcrl %eax C return value: the final carry ends up in bit 31 of eax
228:
229: popl %ebp
230: popl %ebx
231: popl %esi
232: popl %edi
233: ret
234:
235: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>