Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/lshift.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl Intel Pentium mpn_lshift -- mpn left shift.
2: dnl
3: dnl cycles/limb
4: dnl P5,P54: 6.0
5: dnl P55: 5.375
6:
7:
8: dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
9: dnl Foundation, Inc.
10: dnl
11: dnl This file is part of the GNU MP Library.
12: dnl
13: dnl The GNU MP Library is free software; you can redistribute it and/or
14: dnl modify it under the terms of the GNU Lesser General Public License as
15: dnl published by the Free Software Foundation; either version 2.1 of the
16: dnl License, or (at your option) any later version.
17: dnl
18: dnl The GNU MP Library is distributed in the hope that it will be useful,
19: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
20: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21: dnl Lesser General Public License for more details.
22: dnl
23: dnl You should have received a copy of the GNU Lesser General Public
24: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
25: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
26: dnl Suite 330, Boston, MA 02111-1307, USA.
27:
28:
29: include(`../config.m4')
30:
31:
32: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
33: C unsigned shift);
34: C Writes {src,size} << shift to {dst,size}; returns in %eax the bits shifted out of the top limb.
35: C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
36: C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
37: C NOTE(review): a source limb is read unconditionally on both paths, so size >= 1 is presumably required -- confirm.
38: defframe(PARAM_SHIFT,16)
39: defframe(PARAM_SIZE, 12)
40: defframe(PARAM_SRC, 8)
41: defframe(PARAM_DST, 4)
42:
43: .text
44: ALIGN(8)
45: PROLOGUE(mpn_lshift)
46: C Register roles: edi = dst pointer, esi = src pointer, ebp = limb counter, ecx = shift count; eax, ebx, edx hold limbs.
47: pushl %edi C save edi, esi, ebx, ebp; all four are popped again before each ret
48: pushl %esi
49: pushl %ebx
50: pushl %ebp
51: deflit(`FRAME',16) C the four pushes above put 16 bytes on the stack
52:
53: movl PARAM_DST,%edi
54: movl PARAM_SRC,%esi
55: movl PARAM_SIZE,%ebp
56: movl PARAM_SHIFT,%ecx
57:
58: C We can use faster code for shift-by-1 under certain conditions.
59: cmp $1,%ecx C shift == 1 ?
60: jne L(normal)
61: leal 4(%esi),%eax C eax = address of src[1]
62: cmpl %edi,%eax
63: jnc L(special) C jump if s_ptr + 1 >= res_ptr
64: leal (%esi,%ebp,4),%eax C eax = address just past src[size-1]
65: cmpl %eax,%edi
66: jnc L(special) C jump if res_ptr >= s_ptr + size
67:
68: L(normal): C General case: walk from the most significant limb down to the least significant.
69: leal -4(%edi,%ebp,4),%edi C edi = address of dst[size-1]
70: leal -4(%esi,%ebp,4),%esi C esi = address of src[size-1]
71:
72: movl (%esi),%edx C edx = most significant source limb
73: subl $4,%esi
74: xorl %eax,%eax
75: shldl( %cl, %edx, %eax) C compute carry limb
76: pushl %eax C push carry limb onto stack
77:
78: decl %ebp C ebp = size-1; the last limb is stored separately at L(end2)
79: pushl %ebp C keep size-1 for the leftover loop
80: shrl $3,%ebp C ebp = number of 8-limb unrolled iterations
81: jz L(end)
82:
83: movl (%edi),%eax C fetch destination cache line
84:
85: ALIGN(4)
86: L(oop): movl -28(%edi),%eax C fetch destination cache line
87: movl %edx,%ebx C edx holds the limb loaded last; move it so edx can be reloaded
88:
89: movl (%esi),%eax
90: movl -4(%esi),%edx
91: shldl( %cl, %eax, %ebx) C ebx = (ebx << cl) | top cl bits of eax
92: shldl( %cl, %edx, %eax)
93: movl %ebx,(%edi)
94: movl %eax,-4(%edi)
95:
96: movl -8(%esi),%ebx
97: movl -12(%esi),%eax
98: shldl( %cl, %ebx, %edx)
99: shldl( %cl, %eax, %ebx)
100: movl %edx,-8(%edi)
101: movl %ebx,-12(%edi)
102:
103: movl -16(%esi),%edx
104: movl -20(%esi),%ebx
105: shldl( %cl, %edx, %eax)
106: shldl( %cl, %ebx, %edx)
107: movl %eax,-16(%edi)
108: movl %edx,-20(%edi)
109:
110: movl -24(%esi),%eax
111: movl -28(%esi),%edx
112: shldl( %cl, %eax, %ebx)
113: shldl( %cl, %edx, %eax)
114: movl %ebx,-24(%edi)
115: movl %eax,-28(%edi)
116:
117: subl $32,%esi C step both pointers down by 8 limbs
118: subl $32,%edi
119: decl %ebp
120: jnz L(oop)
121:
122: L(end): popl %ebp C ebp = size-1 as saved above
123: andl $7,%ebp C limbs left over after the unrolled loop
124: jz L(end2)
125: L(oop2): C Leftover loop: one limb per iteration.
126: movl (%esi),%eax
127: shldl( %cl,%eax,%edx) C edx = (edx << cl) | top cl bits of eax
128: movl %edx,(%edi)
129: movl %eax,%edx C the limb just loaded becomes the pending limb
130: subl $4,%esi
131: subl $4,%edi
132: decl %ebp
133: jnz L(oop2)
134:
135: L(end2):
136: shll %cl,%edx C compute least significant limb
137: movl %edx,(%edi) C store it
138:
139: popl %eax C pop carry limb
140:
141: popl %ebp
142: popl %ebx
143: popl %esi
144: popl %edi
145: ret
146:
147:
148: C We loop from least significant end of the arrays, which is only
149: C permissible if the source and destination don't overlap (see the checks
150: C before L(normal)), since the function is documented to work for overlapping source and destination.
151:
152: L(special): C Shift-by-1 path: walk upwards, doubling each limb with add/adc.
153: movl (%esi),%edx C edx = least significant source limb
154: addl $4,%esi
155:
156: decl %ebp C ebp = size-1; the last limb is stored separately at L(L1)
157: pushl %ebp C keep size-1 for the leftover loop
158: shrl $3,%ebp C ebp = number of 8-limb unrolled iterations
159:
160: addl %edx,%edx C double the first limb; its top bit goes to the carry flag
161: incl %ebp C inc/dec pair sets the zero flag from ebp without changing the carry flag
162: decl %ebp
163: jz L(Lend)
164:
165: movl (%edi),%eax C fetch destination cache line
166:
167: ALIGN(4)
168: L(Loop):
169: movl 28(%edi),%eax C fetch destination cache line
170: movl %edx,%ebx C ebx = already-doubled limb waiting to be stored
171:
172: movl (%esi),%eax
173: movl 4(%esi),%edx
174: adcl %eax,%eax C eax = 2*eax + carry in; top bit becomes the carry out
175: movl %ebx,(%edi)
176: adcl %edx,%edx
177: movl %eax,4(%edi)
178:
179: movl 8(%esi),%ebx
180: movl 12(%esi),%eax
181: adcl %ebx,%ebx
182: movl %edx,8(%edi)
183: adcl %eax,%eax
184: movl %ebx,12(%edi)
185:
186: movl 16(%esi),%edx
187: movl 20(%esi),%ebx
188: adcl %edx,%edx
189: movl %eax,16(%edi)
190: adcl %ebx,%ebx
191: movl %edx,20(%edi)
192:
193: movl 24(%esi),%eax
194: movl 28(%esi),%edx
195: adcl %eax,%eax
196: movl %ebx,24(%edi)
197: adcl %edx,%edx
198: movl %eax,28(%edi)
199:
200: leal 32(%esi),%esi C use leal not to clobber carry
201: leal 32(%edi),%edi
202: decl %ebp
203: jnz L(Loop)
204:
205: L(Lend):
206: popl %ebp C ebp = size-1 as saved above
207: sbbl %eax,%eax C save carry in %eax
208: andl $7,%ebp C leftover limb count; andl changes the carry flag, hence the save above
209: jz L(Lend2)
210: addl %eax,%eax C restore carry from eax
211: L(Loop2): C Leftover loop: one limb per iteration.
212: movl %edx,%ebx
213: movl (%esi),%edx
214: adcl %edx,%edx
215: movl %ebx,(%edi)
216:
217: leal 4(%esi),%esi C use leal not to clobber carry
218: leal 4(%edi),%edi
219: decl %ebp
220: jnz L(Loop2)
221:
222: jmp L(L1)
223: L(Lend2):
224: addl %eax,%eax C restore carry from eax
225: L(L1): movl %edx,(%edi) C store last limb
226:
227: sbbl %eax,%eax C eax = -1 if the top limb carried out, else 0
228: negl %eax C return value: 1 or 0
229:
230: popl %ebp
231: popl %ebx
232: popl %esi
233: popl %edi
234: ret
235:
236: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>