Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/lshift.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl Intel Pentium mpn_lshift -- mpn left shift.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
1.1 maekawa 4: dnl Foundation, Inc.
5: dnl
6: dnl This file is part of the GNU MP Library.
7: dnl
8: dnl The GNU MP Library is free software; you can redistribute it and/or
9: dnl modify it under the terms of the GNU Lesser General Public License as
10: dnl published by the Free Software Foundation; either version 2.1 of the
11: dnl License, or (at your option) any later version.
12: dnl
13: dnl The GNU MP Library is distributed in the hope that it will be useful,
14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16: dnl Lesser General Public License for more details.
17: dnl
18: dnl You should have received a copy of the GNU Lesser General Public
19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
21: dnl Suite 330, Boston, MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25:
1.1.1.2 ! ohara 26: C cycles/limb
! 27: C P5,P54: 6.0
! 28: C P55: 5.375
! 29:
! 30:
1.1 maekawa 31: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
32: C unsigned shift);
33: C
34: C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
35: C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
36:
37: defframe(PARAM_SHIFT,16) C shift count in bits; loaded into %ecx below
38: defframe(PARAM_SIZE, 12) C number of limbs; loaded into %ebp below
39: defframe(PARAM_SRC, 8) C source limb pointer; loaded into %esi below
40: defframe(PARAM_DST, 4) C destination limb pointer; loaded into %edi below
41:
1.1.1.2 ! ohara 42: TEXT
1.1 maekawa 43: ALIGN(8)
44: PROLOGUE(mpn_lshift) C shift size 4-byte limbs at src left by shift bits into dst; result in %eax
45:
46: pushl %edi C these four registers are restored before each ret
47: pushl %esi
48: pushl %ebx
49: pushl %ebp
50: deflit(`FRAME',16) C 16 = bytes pushed so far; presumably used by the PARAM_ offsets - confirm in x86-defs.m4
51:
52: movl PARAM_DST,%edi
53: movl PARAM_SRC,%esi
54: movl PARAM_SIZE,%ebp
55: movl PARAM_SHIFT,%ecx
56:
57: C We can use faster code for shift-by-1 under certain conditions.
58: cmp $1,%ecx
59: jne L(normal) C shift count is not 1: use the general shldl path
60: leal 4(%esi),%eax C eax = src + 1 limb
61: cmpl %edi,%eax
62: jnc L(special) C jump if s_ptr + 1 >= res_ptr
63: leal (%esi,%ebp,4),%eax C eax = src + size limbs
64: cmpl %eax,%edi
65: jnc L(special) C jump if res_ptr >= s_ptr + size
66:
67: L(normal):
68: leal -4(%edi,%ebp,4),%edi C edi = address of the most significant dst limb
69: leal -4(%esi,%ebp,4),%esi C esi = address of the most significant src limb
70:
71: movl (%esi),%edx C edx = most significant src limb
72: subl $4,%esi C esi now points at the next lower src limb
73: xorl %eax,%eax
74: shldl( %cl, %edx, %eax) C compute carry limb
75: pushl %eax C push carry limb onto stack
76:
77: decl %ebp C ebp = size-1 = src limbs still to be read
78: pushl %ebp C keep size-1 for the remainder loop at L(end)
79: shrl $3,%ebp C ebp = number of 8-limb unrolled iterations
80: jz L(end) C fewer than 8 limbs left: skip the unrolled loop
81:
82: movl (%edi),%eax C fetch destination cache line
83:
84: ALIGN(4)
85: L(oop): movl -28(%edi),%eax C fetch destination cache line
86: movl %edx,%ebx C ebx = higher limb carried in from the previous step
87:
88: movl (%esi),%eax
89: movl -4(%esi),%edx
90: shldl( %cl, %eax, %ebx) C ebx = ebx shifted left with top bits of eax filling the low end
91: shldl( %cl, %edx, %eax)
92: movl %ebx,(%edi)
93: movl %eax,-4(%edi)
94:
95: movl -8(%esi),%ebx
96: movl -12(%esi),%eax
97: shldl( %cl, %ebx, %edx)
98: shldl( %cl, %eax, %ebx)
99: movl %edx,-8(%edi)
100: movl %ebx,-12(%edi)
101:
102: movl -16(%esi),%edx
103: movl -20(%esi),%ebx
104: shldl( %cl, %edx, %eax)
105: shldl( %cl, %ebx, %edx)
106: movl %eax,-16(%edi)
107: movl %edx,-20(%edi)
108:
109: movl -24(%esi),%eax
110: movl -28(%esi),%edx C left unshifted in edx: it is the carried-in limb for the next step
111: shldl( %cl, %eax, %ebx)
112: shldl( %cl, %edx, %eax)
113: movl %ebx,-24(%edi)
114: movl %eax,-28(%edi)
115:
116: subl $32,%esi C step both pointers down by 8 limbs
117: subl $32,%edi
118: decl %ebp
119: jnz L(oop)
120:
121: L(end): popl %ebp C ebp = size-1 as saved above
122: andl $7,%ebp C ebp = limbs left over after the 8-limb iterations
123: jz L(end2)
124: L(oop2):
125: movl (%esi),%eax C eax = next lower src limb
126: shldl( %cl,%eax,%edx) C edx = higher limb shifted with top bits of eax shifted in
127: movl %edx,(%edi)
128: movl %eax,%edx C the limb just read becomes the higher limb of the next step
129: subl $4,%esi
130: subl $4,%edi
131: decl %ebp
132: jnz L(oop2)
133:
134: L(end2):
135: shll %cl,%edx C compute least significant limb
136: movl %edx,(%edi) C store it
137:
138: popl %eax C pop carry limb
139:
140: popl %ebp
141: popl %ebx
142: popl %esi
143: popl %edi
144: ret
145:
146:
147: C We loop from least significant end of the arrays, which is only
148: C permissible if the source and destination don't overlap, since the
149: C function is documented to work for overlapping source and destination.
150:
151: L(special):
152: movl (%esi),%edx C edx = least significant src limb
153: addl $4,%esi
154:
155: decl %ebp C ebp = size-1 = src limbs still to be read
156: pushl %ebp C keep size-1 for the remainder loop at L(Lend)
157: shrl $3,%ebp C ebp = number of 8-limb unrolled iterations
158:
159: addl %edx,%edx C shift lowest limb left by one; carry flag = bit shifted out
160: incl %ebp C incl then decl sets the zero flag from ebp while leaving
161: decl %ebp C the carry flag from the addl untouched
162: jz L(Lend)
163:
164: movl (%edi),%eax C fetch destination cache line
165:
166: ALIGN(4)
167: L(Loop):
168: movl 28(%edi),%eax C fetch destination cache line
169: movl %edx,%ebx C ebx = already shifted limb waiting to be stored
170:
171: movl (%esi),%eax
172: movl 4(%esi),%edx
173: adcl %eax,%eax C shift left by one with the carry from the limb below shifted in
174: movl %ebx,(%edi)
175: adcl %edx,%edx
176: movl %eax,4(%edi)
177:
178: movl 8(%esi),%ebx
179: movl 12(%esi),%eax
180: adcl %ebx,%ebx
181: movl %edx,8(%edi)
182: adcl %eax,%eax
183: movl %ebx,12(%edi)
184:
185: movl 16(%esi),%edx
186: movl 20(%esi),%ebx
187: adcl %edx,%edx
188: movl %eax,16(%edi)
189: adcl %ebx,%ebx
190: movl %edx,20(%edi)
191:
192: movl 24(%esi),%eax
193: movl 28(%esi),%edx
194: adcl %eax,%eax
195: movl %ebx,24(%edi)
196: adcl %edx,%edx C edx is not stored here: it is carried to the next step
197: movl %eax,28(%edi)
198:
199: leal 32(%esi),%esi C use leal not to clobber carry
200: leal 32(%edi),%edi
201: decl %ebp C decl leaves the carry flag intact for the next adcl
202: jnz L(Loop)
203:
204: L(Lend):
205: popl %ebp C ebp = size-1 as saved above
206: sbbl %eax,%eax C save carry in %eax
207: andl $7,%ebp C ebp = leftover limbs; this overwrites the carry flag
208: jz L(Lend2)
209: addl %eax,%eax C restore carry from eax
210: L(Loop2):
211: movl %edx,%ebx C ebx = already shifted limb waiting to be stored
212: movl (%esi),%edx
213: adcl %edx,%edx C shift next limb left by one with carry shifted in
214: movl %ebx,(%edi)
215:
216: leal 4(%esi),%esi C use leal not to clobber carry
217: leal 4(%edi),%edi
218: decl %ebp
219: jnz L(Loop2)
220:
221: jmp L(L1) C carry flag is already live from the loop: skip the restore
222: L(Lend2):
223: addl %eax,%eax C restore carry from eax
224: L(L1): movl %edx,(%edi) C store last limb
225:
226: sbbl %eax,%eax C eax = 0 or -1 according to the final carry
227: negl %eax C eax = 0 or 1: the bit shifted out of the top limb
228:
229: popl %ebp
230: popl %ebx
231: popl %esi
232: popl %edi
233: ret
234:
235: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>