Annotation of OpenXM_contrib/gmp/mpn/x86/mul_basecase.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
2: dnl in a third limb vector.
3:
1.1.1.2 ! ohara 4: dnl Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
! 5: dnl Foundation, Inc.
1.1 maekawa 6: dnl
7: dnl This file is part of the GNU MP Library.
8: dnl
9: dnl The GNU MP Library is free software; you can redistribute it and/or
10: dnl modify it under the terms of the GNU Lesser General Public License as
11: dnl published by the Free Software Foundation; either version 2.1 of the
12: dnl License, or (at your option) any later version.
13: dnl
14: dnl The GNU MP Library is distributed in the hope that it will be useful,
15: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17: dnl Lesser General Public License for more details.
18: dnl
19: dnl You should have received a copy of the GNU Lesser General Public
20: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
21: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
22: dnl Suite 330, Boston, MA 02111-1307, USA.
23:
24: include(`../config.m4')
25:
26:
1.1.1.2 ! ohara 27: C cycles/crossproduct
! 28: C P5: 15
! 29: C P6: 7.5
! 30: C K6: 12.5
! 31: C K7: 5.5
! 32: C P4: 24
! 33:
! 34:
1.1 maekawa 35: C void mpn_mul_basecase (mp_ptr wp,
36: C mp_srcptr xp, mp_size_t xsize,
37: C mp_srcptr yp, mp_size_t ysize);
38: C
39: C This was written in haste since the Pentium optimized code that was used
40: C for all x86 machines was slow for the Pentium II. This code would benefit
41: C from some cleanup.
42: C
43: C To shave off some percentage of the run-time, one should make 4 variants
44: C of the L(outer) loop, for the four different outcomes of xsize mod 4. That
45: C would avoid L(oop0) altogether. Code expansion would be > 4-fold for that
46: C part of the function, but since it is not very large, that would be
47: C acceptable.
48: C
49: C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
50: C unknown.
51:
52: defframe(PARAM_YSIZE,20) C 5th argument: ysize
53: defframe(PARAM_YP, 16) C 4th argument: yp
54: defframe(PARAM_XSIZE,12) C 3rd argument: xsize
55: defframe(PARAM_XP, 8) C 2nd argument: xp
56: defframe(PARAM_WP, 4) C 1st argument: wp
57:
58: defframe(VAR_MULTIPLIER, -4) C local: y limb used by the current L(outer) pass
59: defframe(VAR_COUNTER, -8) C local: number of L(outer) passes still to run
60: deflit(VAR_STACK_SPACE, 8) C bytes taken from %esp for the two locals
61:
1.1.1.2 ! ohara 62: TEXT
1.1 maekawa 63: ALIGN(8)
64:
65: PROLOGUE(mpn_mul_basecase)
66: deflit(`FRAME',0)
67:
68: subl $VAR_STACK_SPACE,%esp C reserve the VAR_MULTIPLIER and VAR_COUNTER slots
69: pushl %esi
70: pushl %ebp
71: pushl %edi
72: deflit(`FRAME',eval(VAR_STACK_SPACE+12))
73:
74: movl PARAM_XP,%esi C esi = xp
75: movl PARAM_WP,%edi C edi = wp
76: movl PARAM_YP,%ebp C ebp = yp
77:
78: movl (%esi),%eax C load xp[0]
79: mull (%ebp) C multiply by yp[0]
80: movl %eax,(%edi) C store to wp[0]
81: movl PARAM_XSIZE,%ecx C xsize
82: decl %ecx C If xsize = 1, ysize = 1 too
83: jz L(done) C one-limb product: only wp[1] remains, %ebx not pushed yet
84:
85: pushl %ebx C ebx is the carry limb from here on
86: FRAME_pushl()
87: movl %edx,%ebx C carry = high limb of xp[0]*yp[0]
88:
89: leal 4(%esi),%esi C advance to xp[1]
90: leal 4(%edi),%edi C advance to wp[1]
91:
92: L(oopM): C first row, xsize-1 passes: wp[j] = low(xp[j]*yp[0]) + carry
93: movl (%esi),%eax C load next limb at xp[j]
94: leal 4(%esi),%esi
95: mull (%ebp)
96: addl %ebx,%eax C low limb + incoming carry
97: movl %edx,%ebx C movl leaves CF from the addl intact
98: adcl $0,%ebx C new carry = high limb + CF
99: movl %eax,(%edi)
100: leal 4(%edi),%edi
101: decl %ecx
102: jnz L(oopM)
103:
104: movl %ebx,(%edi) C most significant limb of product
105: addl $4,%edi C increment wp
106: movl PARAM_XSIZE,%eax
107: shll $2,%eax C xsize in bytes
108: subl %eax,%edi C rewind: edi = wp+4 (second result limb)
109: subl %eax,%esi C rewind: esi = xp
110:
111: movl PARAM_YSIZE,%eax C ysize
112: decl %eax
113: jz L(skip) C ysize = 1: the first row is the whole product
114: movl %eax,VAR_COUNTER C remaining rows = ysize-1
115:
116: L(outer): C one pass per remaining y limb: add xp[]*y into wp[] at edi
117: movl PARAM_YP,%ebp C yp
118: addl $4,%ebp C make ebp point to next v limb
119: movl %ebp,PARAM_YP C the argument slot doubles as the running y pointer
120: movl (%ebp),%eax C copy y limb ...
121: movl %eax,VAR_MULTIPLIER C ... to stack slot
122: movl PARAM_XSIZE,%ecx
123:
124: xorl %ebx,%ebx C carry limb starts at zero for this row
125: andl $3,%ecx C xsize mod 4 limbs go through L(oop0)
126: jz L(end0)
127:
128: L(oop0): C one limb per pass
129: movl (%esi),%eax
130: mull VAR_MULTIPLIER
131: leal 4(%esi),%esi
132: addl %ebx,%eax C low limb + incoming carry
133: movl $0,%ebx C movl, not xorl: CF from the addl must survive
134: adcl %ebx,%edx C high limb + CF
135: addl %eax,(%edi) C accumulate into the result limb
136: adcl %edx,%ebx C propagate carry into cylimb
137:
138: leal 4(%edi),%edi
139: decl %ecx
140: jnz L(oop0)
141:
142: L(end0):
143: movl PARAM_XSIZE,%ecx
144: shrl $2,%ecx C xsize/4 passes of the 4-way unrolled loop
145: jz L(endX)
146:
147: ALIGN(8)
148: L(oopX): C four limbs per pass; ebx and ebp alternate as the carry limb
149: movl (%esi),%eax
150: mull VAR_MULTIPLIER
151: addl %eax,%ebx C incoming carry + low limb 0
152: movl $0,%ebp C movl keeps CF; ebp is scratch here, reloaded at L(outer)
153: adcl %edx,%ebp C ebp = high limb 0 + CF
154:
155: movl 4(%esi),%eax
156: mull VAR_MULTIPLIER
157: addl %ebx,(%edi) C result limb 0
158: adcl %eax,%ebp C new lo + cylimb
159: movl $0,%ebx C flag-preserving clear
160: adcl %edx,%ebx C ebx = high limb 1 + CF
161:
162: movl 8(%esi),%eax
163: mull VAR_MULTIPLIER
164: addl %ebp,4(%edi) C result limb 1
165: adcl %eax,%ebx C new lo + cylimb
166: movl $0,%ebp C flag-preserving clear
167: adcl %edx,%ebp C ebp = high limb 2 + CF
168:
169: movl 12(%esi),%eax
170: mull VAR_MULTIPLIER
171: addl %ebx,8(%edi) C result limb 2
172: adcl %eax,%ebp C new lo + cylimb
173: movl $0,%ebx C flag-preserving clear
174: adcl %edx,%ebx C ebx = high limb 3 + CF
175:
176: addl %ebp,12(%edi) C result limb 3
177: adcl $0,%ebx C propagate carry into cylimb
178:
179: leal 16(%esi),%esi
180: leal 16(%edi),%edi
181: decl %ecx
182: jnz L(oopX)
183:
184: L(endX):
185: movl %ebx,(%edi) C final carry becomes the top limb of this row
186: addl $4,%edi
187:
188: C we incremented wp and xp in the loop above; compensate
189: movl PARAM_XSIZE,%eax
190: shll $2,%eax C xsize in bytes
191: subl %eax,%edi C net effect: edi is one limb past the start of this row
192: subl %eax,%esi C esi = xp again
193:
194: movl VAR_COUNTER,%eax
195: decl %eax
196: movl %eax,VAR_COUNTER
197: jnz L(outer)
198:
199: L(skip): C common exit: restore in reverse push order
200: popl %ebx
201: popl %edi
202: popl %ebp
203: popl %esi
204: addl $VAR_STACK_SPACE,%esp C release the locals reserved on entry
205: ret
206:
207: L(done): C xsize = 1 exit, taken before %ebx was pushed
208: movl %edx,4(%edi) C store to wp[1]
209: popl %edi
210: popl %ebp
211: popl %esi
212: addl $VAR_STACK_SPACE,%esp C release the locals reserved on entry
213: ret
214:
215: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>