Annotation of OpenXM_contrib/gmp/mpn/x86/mul_basecase.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
2: dnl in a third limb vector.
3:
4:
5: dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation,
6: dnl Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C void mpn_mul_basecase (mp_ptr wp,
30: C mp_srcptr xp, mp_size_t xsize,
31: C mp_srcptr yp, mp_size_t ysize);
32: C
33: C This was written in haste since the Pentium optimized code that was used
34: C for all x86 machines was slow for the Pentium II. This code would benefit
35: C from some cleanup.
36: C
37: C To shave off some percentage of the run-time, one should make 4 variants
38: C of the L(outer) loop, for the four different outcomes of xsize mod 4. That
39: C would avoid L(oop0) altogether. Code expansion would be > 4-fold for that
40: C part of the function, but since it is not very large, that would be
41: C acceptable.
42: C
43: C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
44: C unknown.
45:
46: defframe(PARAM_WP, 4) C 1st argument: result limb vector
47: defframe(PARAM_XP, 8) C 2nd argument: x limb vector
48: defframe(PARAM_XSIZE,12) C 3rd argument: limb count of x
49: defframe(PARAM_YP, 16) C 4th argument: y limb vector
50: defframe(PARAM_YSIZE,20) C 5th argument: limb count of y
51: C Locals sit below the entry stack pointer, in the VAR_STACK_SPACE bytes the prologue reserves.
52: defframe(VAR_COUNTER, -8) C outer-loop rows still to do
53: defframe(VAR_MULTIPLIER, -4) C y limb used by the current row
54: deflit(VAR_STACK_SPACE, 8) C bytes needed for the two locals above
55:
56: .text
57: ALIGN(8)
58: C xp[0] and yp[0] are read unconditionally, so both operands need at least one limb.
59: PROLOGUE(mpn_mul_basecase)
60: deflit(`FRAME',0)
61: C Reserve the two local slots, then save esi, ebp and edi (restored before each ret).
62: subl $VAR_STACK_SPACE,%esp
63: pushl %esi
64: pushl %ebp
65: pushl %edi
66: deflit(`FRAME',eval(VAR_STACK_SPACE+12))
67: C Register roles from here on: esi = xp, edi = wp, ebp = yp.
68: movl PARAM_XP,%esi
69: movl PARAM_WP,%edi
70: movl PARAM_YP,%ebp
71: C First row: wp[0..xsize] = xp[] * yp[0], beginning with the xp[0]*yp[0] product.
72: movl (%esi),%eax C load xp[0]
73: mull (%ebp) C multiply by yp[0]
74: movl %eax,(%edi) C store to wp[0]
75: movl PARAM_XSIZE,%ecx C xsize
76: decl %ecx C If xsize = 1, ysize = 1 too
77: jz L(done)
78: C ebx is only used on the multi-limb path, so it is saved after the early exit above.
79: pushl %ebx
80: FRAME_pushl() C presumably bumps FRAME for the push above; see the x86 m4 definitions
81: movl %edx,%ebx C ebx = carry limb, the high half of xp[0]*yp[0]
82: C Step both pointers past the limb already handled.
83: leal 4(%esi),%esi
84: leal 4(%edi),%edi
85: C ecx = xsize-1 limbs left; each pass stores low half + carry limb and keeps the new carry limb in ebx.
86: L(oopM):
87: movl (%esi),%eax C load next limb at xp[j]
88: leal 4(%esi),%esi
89: mull (%ebp) C edx:eax = xp[j] * yp[0]
90: addl %ebx,%eax C add the carry limb of the previous product
91: movl %edx,%ebx C mov leaves the carry flag from the addl intact
92: adcl $0,%ebx C new carry limb = high half + carry flag
93: movl %eax,(%edi)
94: leal 4(%edi),%edi
95: decl %ecx
96: jnz L(oopM)
97:
98: movl %ebx,(%edi) C most significant limb of product
99: addl $4,%edi C increment wp
100: movl PARAM_XSIZE,%eax
101: shll $2,%eax C xsize limbs expressed in bytes
102: subl %eax,%edi C edi = wp+4, where the next row accumulates
103: subl %eax,%esi C esi = xp again
104:
105: movl PARAM_YSIZE,%eax C ysize
106: decl %eax
107: jz L(skip) C single-limb y: the first row is the whole product
108: movl %eax,VAR_COUNTER C rows still to do = ysize-1
109: C Each outer pass adds xp[] * (next y limb) into wp, starting one limb higher than the last.
110: L(outer):
111: movl PARAM_YP,%ebp C yp
112: addl $4,%ebp C make ebp point to next v limb
113: movl %ebp,PARAM_YP C the advanced yp is kept in the argument slot
114: movl (%ebp),%eax C copy y limb ...
115: movl %eax,VAR_MULTIPLIER C ... to stack slot
116: movl PARAM_XSIZE,%ecx
117:
118: xorl %ebx,%ebx C cylimb = 0 at the start of every row
119: andl $3,%ecx C ecx = xsize mod 4
120: jz L(end0)
121: C One limb per pass for the xsize mod 4 leftover limbs.
122: L(oop0):
123: movl (%esi),%eax
124: mull VAR_MULTIPLIER C edx:eax = x limb * current y limb
125: leal 4(%esi),%esi
126: addl %ebx,%eax C low half + cylimb
127: movl $0,%ebx C zero via mov: xorl would destroy the carry flag needed next
128: adcl %ebx,%edx C edx = high half + carry
129: addl %eax,(%edi) C accumulate into the wp limb
130: adcl %edx,%ebx C propagate carry into cylimb
131:
132: leal 4(%edi),%edi
133: decl %ecx
134: jnz L(oop0)
135:
136: L(end0):
137: movl PARAM_XSIZE,%ecx
138: shrl $2,%ecx C ecx = number of 4-limb groups
139: jz L(endX)
140: C 4-way unrolled row loop; the carry limb alternates between ebx and ebp.
141: ALIGN(8)
142: L(oopX):
143: movl (%esi),%eax
144: mull VAR_MULTIPLIER
145: addl %eax,%ebx C ebx = low half + incoming cylimb
146: movl $0,%ebp C mov keeps the carry flag from the addl
147: adcl %edx,%ebp C ebp = high half + carry
148:
149: movl 4(%esi),%eax
150: mull VAR_MULTIPLIER
151: addl %ebx,(%edi) C accumulate the first limb of the group into wp
152: adcl %eax,%ebp C new lo + cylimb
153: movl $0,%ebx
154: adcl %edx,%ebx
155:
156: movl 8(%esi),%eax
157: mull VAR_MULTIPLIER
158: addl %ebp,4(%edi)
159: adcl %eax,%ebx C new lo + cylimb
160: movl $0,%ebp
161: adcl %edx,%ebp
162:
163: movl 12(%esi),%eax
164: mull VAR_MULTIPLIER
165: addl %ebx,8(%edi)
166: adcl %eax,%ebp C new lo + cylimb
167: movl $0,%ebx
168: adcl %edx,%ebx
169:
170: addl %ebp,12(%edi)
171: adcl $0,%ebx C propagate carry into cylimb
172:
173: leal 16(%esi),%esi
174: leal 16(%edi),%edi
175: decl %ecx
176: jnz L(oopX)
177:
178: L(endX):
179: movl %ebx,(%edi) C final cylimb becomes the top limb of this row (stored, not added)
180: addl $4,%edi
181:
182: C we incremented wp and xp in the loop above; compensate
183: movl PARAM_XSIZE,%eax
184: shll $2,%eax
185: subl %eax,%edi C net effect: edi is one limb above this row's start
186: subl %eax,%esi C esi = xp again
187:
188: movl VAR_COUNTER,%eax
189: decl %eax
190: movl %eax,VAR_COUNTER
191: jnz L(outer)
192: C Multi-limb exit: four registers were pushed, so pop them in reverse order.
193: L(skip):
194: popl %ebx
195: popl %edi
196: popl %ebp
197: popl %esi
198: addl $VAR_STACK_SPACE,%esp C release the locals by the same amount the prologue reserved
199: ret
200: C xsize == 1 exit: ebx was never pushed on this path, so only three registers are popped.
201: L(done):
202: movl %edx,4(%edi) C store to wp[1]
203: popl %edi
204: popl %ebp
205: popl %esi
206: addl $VAR_STACK_SPACE,%esp C release the locals by the same amount the prologue reserved
207: ret
208:
209: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>