Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/rshift.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl Intel Pentium mpn_rshift -- mpn right shift.
2: dnl
3: dnl cycles/limb
4: dnl P5,P54: 6.0
5: dnl P55: 5.375
6:
7:
8: dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
9: dnl Foundation, Inc.
10: dnl
11: dnl This file is part of the GNU MP Library.
12: dnl
13: dnl The GNU MP Library is free software; you can redistribute it and/or
14: dnl modify it under the terms of the GNU Lesser General Public License as
15: dnl published by the Free Software Foundation; either version 2.1 of the
16: dnl License, or (at your option) any later version.
17: dnl
18: dnl The GNU MP Library is distributed in the hope that it will be useful,
19: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
20: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21: dnl Lesser General Public License for more details.
22: dnl
23: dnl You should have received a copy of the GNU Lesser General Public
24: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
25: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
26: dnl Suite 330, Boston, MA 02111-1307, USA.
27:
28:
29: include(`../config.m4')
30:
31:
32: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
33: C unsigned shift);
34: C Shifts the size limbs at src right by shift bits and stores size limbs at dst; the bits shifted out come back in the high bits of the return value.
35: C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
36: C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
37: C Stack arguments. NOTE(review): the numbers look like byte offsets above the return address -- confirm against the defframe definition.
38: defframe(PARAM_SHIFT,16)
39: defframe(PARAM_SIZE, 12)
40: defframe(PARAM_SRC, 8)
41: defframe(PARAM_DST, 4)
42:
43: .text
44: ALIGN(8)
45: PROLOGUE(mpn_rshift)
46: C Save the four registers used below; both exit paths pop them again.
47: pushl %edi
48: pushl %esi
49: pushl %ebx
50: pushl %ebp
51: deflit(`FRAME',16) C matches the four pushl above (4 x 4 bytes)
52:
53: movl PARAM_DST,%edi C edi = dst
54: movl PARAM_SRC,%esi C esi = src
55: movl PARAM_SIZE,%ebp C ebp = size in limbs
56: movl PARAM_SHIFT,%ecx C ecx = shift count, used through cl below
57:
58: C We can use faster code for shift-by-1 under certain conditions.
59: cmp $1,%ecx
60: jne L(normal) C other shift counts always take the general path
61: leal 4(%edi),%eax C eax = dst + 1 limb
62: cmpl %esi,%eax
63: jnc L(special) C jump if res_ptr + 1 >= s_ptr
64: leal (%edi,%ebp,4),%eax C eax = dst + size limbs
65: cmpl %eax,%esi
66: jnc L(special) C jump if s_ptr >= res_ptr + size
67: C Shift by 1 with neither test passing falls through to the general path.
68: L(normal):
69: movl (%esi),%edx C edx = least significant source limb
70: addl $4,%esi
71: xorl %eax,%eax
72: shrdl( %cl, %edx, %eax) C compute carry limb
73: pushl %eax C push carry limb onto stack
74:
75: decl %ebp C ebp = size-1
76: pushl %ebp C keep size-1 for the leftover loop
77: shrl $3,%ebp C ebp = number of 8-limb passes through the main loop
78: jz L(end)
79:
80: movl (%edi),%eax C fetch destination cache line
81:
82: ALIGN(4)
83: L(oop): movl 28(%edi),%eax C fetch destination cache line
84: movl %edx,%ebx C ebx = limb carried in from before this pass
85: C Eight limbs per pass, two at a time: load a pair, shrdl each lower limb with its upper neighbour, store the pair.
86: movl (%esi),%eax
87: movl 4(%esi),%edx
88: shrdl( %cl, %eax, %ebx)
89: shrdl( %cl, %edx, %eax)
90: movl %ebx,(%edi)
91: movl %eax,4(%edi)
92:
93: movl 8(%esi),%ebx
94: movl 12(%esi),%eax
95: shrdl( %cl, %ebx, %edx)
96: shrdl( %cl, %eax, %ebx)
97: movl %edx,8(%edi)
98: movl %ebx,12(%edi)
99:
100: movl 16(%esi),%edx
101: movl 20(%esi),%ebx
102: shrdl( %cl, %edx, %eax)
103: shrdl( %cl, %ebx, %edx)
104: movl %eax,16(%edi)
105: movl %edx,20(%edi)
106:
107: movl 24(%esi),%eax
108: movl 28(%esi),%edx
109: shrdl( %cl, %eax, %ebx)
110: shrdl( %cl, %edx, %eax)
111: movl %ebx,24(%edi)
112: movl %eax,28(%edi)
113:
114: addl $32,%esi C advance both pointers by 8 limbs
115: addl $32,%edi
116: decl %ebp
117: jnz L(oop)
118:
119: L(end): popl %ebp C ebp = size-1 again
120: andl $7,%ebp C limbs not covered by the 8-limb passes
121: jz L(end2)
122: L(oop2):
123: movl (%esi),%eax
124: shrdl( %cl,%eax,%edx) C compute result limb
125: movl %edx,(%edi)
126: movl %eax,%edx C this limb is the lower half of the next step
127: addl $4,%esi
128: addl $4,%edi
129: decl %ebp
130: jnz L(oop2)
131:
132: L(end2):
133: shrl %cl,%edx C compute most significant limb
134: movl %edx,(%edi) C store it
135:
136: popl %eax C pop carry limb
137:
138: popl %ebp
139: popl %ebx
140: popl %esi
141: popl %edi
142: ret
143:
144:
145: C We loop from the most significant end of the arrays, which is only
146: C permissible under the pointer conditions tested at entry, since the
147: C function is documented to work for overlapping source and destination.
148:
149: L(special):
150: leal -4(%edi,%ebp,4),%edi C edi = address of the last destination limb
151: leal -4(%esi,%ebp,4),%esi C esi = address of the last source limb
152:
153: movl (%esi),%edx C edx = most significant source limb
154: subl $4,%esi
155:
156: decl %ebp
157: pushl %ebp C keep size-1 for the leftover loop
158: shrl $3,%ebp C ebp = number of 8-limb passes through the main loop
159:
160: shrl %edx C shift the top limb by one; its low bit lands in the carry flag
161: incl %ebp C incl then decl sets the zero flag from ebp ...
162: decl %ebp C ... without disturbing the carry flag
163: jz L(Lend)
164:
165: movl (%edi),%eax C fetch destination cache line
166:
167: ALIGN(4)
168: L(Loop):
169: movl -28(%edi),%eax C fetch destination cache line
170: movl %edx,%ebx
171: C Each rcrl shifts a limb right by one through the carry flag, so the bit dropped from one limb enters the top of the next lower one.
172: movl (%esi),%eax
173: movl -4(%esi),%edx
174: rcrl %eax
175: movl %ebx,(%edi)
176: rcrl %edx
177: movl %eax,-4(%edi)
178:
179: movl -8(%esi),%ebx
180: movl -12(%esi),%eax
181: rcrl %ebx
182: movl %edx,-8(%edi)
183: rcrl %eax
184: movl %ebx,-12(%edi)
185:
186: movl -16(%esi),%edx
187: movl -20(%esi),%ebx
188: rcrl %edx
189: movl %eax,-16(%edi)
190: rcrl %ebx
191: movl %edx,-20(%edi)
192:
193: movl -24(%esi),%eax
194: movl -28(%esi),%edx
195: rcrl %eax
196: movl %ebx,-24(%edi)
197: rcrl %edx
198: movl %eax,-28(%edi)
199:
200: leal -32(%esi),%esi C use leal not to clobber carry
201: leal -32(%edi),%edi
202: decl %ebp C decl leaves the carry flag untouched
203: jnz L(Loop)
204:
205: L(Lend):
206: popl %ebp
207: sbbl %eax,%eax C save carry in %eax
208: andl $7,%ebp C andl clobbers the carry flag, hence the save above
209: jz L(Lend2)
210: addl %eax,%eax C restore carry from eax
211: L(Loop2):
212: movl %edx,%ebx
213: movl (%esi),%edx
214: rcrl %edx
215: movl %ebx,(%edi)
216:
217: leal -4(%esi),%esi C use leal not to clobber carry
218: leal -4(%edi),%edi
219: decl %ebp
220: jnz L(Loop2)
221:
222: jmp L(L1)
223: L(Lend2):
224: addl %eax,%eax C restore carry from eax
225: L(L1): movl %edx,(%edi) C store last limb
226:
227: movl $0,%eax C movl does not touch the flags, so the carry survives
228: rcrl %eax C return the shifted-out bit in the top bit of eax
229:
230: popl %ebp
231: popl %ebx
232: popl %esi
233: popl %edi
234: ret
235:
236: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>