Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K6: 6.25 cycles/limb.
! 26:
! 27:
1.1 maekawa 28: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29: C mp_limb_t multiplier);
30: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
31: C mp_limb_t multiplier, mp_limb_t carry);
32: C
33: C Multiply src,size by multiplier and store the result in dst,size.
34: C Return the carry limb from the top of the result.
35: C
36: C mpn_mul_1c() accepts an initial carry for the calculation; it is added
37: C into the low limb of the result.
38:
39: defframe(PARAM_CARRY, 20) C fifth argument, taken by mpn_mul_1c only
40: defframe(PARAM_MULTIPLIER,16)
41: defframe(PARAM_SIZE, 12)
42: defframe(PARAM_SRC, 8)
43: defframe(PARAM_DST, 4) C first argument; offsets rise by 4 in prototype order
44:
45: dnl minimum 5 because the unrolled code can't handle less
46: deflit(UNROLL_THRESHOLD, 5) C sizes >= this go to L(unroll): one 4-limb pass plus 1 to 4 finishing limbs
47:
1.1.1.2 ! ohara 48: TEXT
1.1 maekawa 49: ALIGN(32) C presumably 32-byte alignment for the entry code that follows -- confirm against the ALIGN macro
50:
51: PROLOGUE(mpn_mul_1c) C entry point taking a caller-supplied initial carry
52: pushl %esi C saved here, restored by the exit paths in mpn_mul_1
53: deflit(`FRAME',4)
54: movl PARAM_CARRY, %esi C esi = initial carry from the caller
1.1.1.2 ! ohara 55: jmp L(start_nc) C join mpn_mul_1 past its own push and carry clear
1.1 maekawa 56: EPILOGUE()
57:
58:
59: PROLOGUE(mpn_mul_1) C entry point without a carry argument
60: push %esi C saved here, restored just before each ret
61: deflit(`FRAME',4)
62: xorl %esi, %esi C initial carry
63:
64: L(start_nc): C mpn_mul_1c jumps in here with esi = its carry argument
65: mov PARAM_SIZE, %ecx C ecx = size in limbs
66: push %ebx
67: FRAME_pushl()
68:
69: movl PARAM_SRC, %ebx C ebx = src
70: push %edi
71: FRAME_pushl()
72:
73: movl PARAM_DST, %edi C edi = dst
74: pushl %ebp
75: FRAME_pushl()
76:
77: cmpl $UNROLL_THRESHOLD, %ecx
78: movl PARAM_MULTIPLIER, %ebp C ebp = multiplier; movl leaves the cmpl flags intact
79:
80: jae L(unroll) C unsigned size >= UNROLL_THRESHOLD takes the unrolled path
81:
82:
83: C code offset 0x22 here, close enough to aligned
84: L(simple):
85: C eax scratch
86: C ebx src
87: C ecx counter
88: C edx scratch
89: C esi carry
90: C edi dst
91: C ebp multiplier
92: C
93: C this loop 8 cycles/limb
94: C NOTE(review): the body runs before ecx is tested, so size 0 is not handled here; presumably callers guarantee size >= 1 -- confirm
95: movl (%ebx), %eax C eax = next src limb
96: addl $4, %ebx
97:
98: mull %ebp C edx:eax = limb * multiplier
99:
100: addl %esi, %eax C low half += carry in, CF = overflow
101: movl $0, %esi C zero esi with movl so CF survives (xorl would clear it)
102:
103: adcl %edx, %esi C carry out = high half + CF
104:
105: movl %eax, (%edi) C store result limb
106: addl $4, %edi
107:
108: loop L(simple) C decrement ecx, repeat while non-zero
109:
110:
111: popl %ebp
112:
113: popl %edi
114: popl %ebx
115:
116: movl %esi, %eax C return the final carry limb
117: popl %esi
118:
119: ret
120:
121:
122: C -----------------------------------------------------------------------------
123: C The code for each limb is 6 cycles, with instruction decoding being the
124: C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
125: C cycles/limb in total.
126: C
127: C The secret ingredient to get 6.25 is to start the loop with the mul and
128: C have the load/store pair at the end. Rotating the load/store to the top
129: C is a 0.5 c/l slowdown. (Some address generation effect probably.)
130: C
131: C The whole unrolled loop fits nicely in exactly 80 bytes.
132:
133:
134: ALIGN(16) C already aligned to 16 here actually
135: L(unroll):
136: movl (%ebx), %eax C preload src[0] for the first mull
137: leal -16(%ebx,%ecx,4), %ebx C ebx = &src[size-4]
138:
139: leal -16(%edi,%ecx,4), %edi C edi = &dst[size-4]
140: subl $4, %ecx
141:
142: negl %ecx C ecx = 4-size, a negative index that counts up
143:
144:
145: ALIGN(16) C one byte nop for this alignment
146: L(top):
147: C eax scratch
148: C ebx &src[size-4]
149: C ecx counter
150: C edx scratch
151: C esi carry
152: C edi &dst[size-4]
153: C ebp multiplier
154:
155: mull %ebp C limb 1 of 4: edx:eax = limb * multiplier
156:
157: addl %esi, %eax C low half += carry in
158: movl $0, %esi C zero esi with movl so CF survives
159:
160: adcl %edx, %esi C carry out = high half + CF
161:
162: movl %eax, (%edi,%ecx,4) C store result, then fetch the next src limb
163: movl 4(%ebx,%ecx,4), %eax
164:
165:
166: mull %ebp C limb 2 of 4, same sequence
167:
168: addl %esi, %eax
169: movl $0, %esi
170:
171: adcl %edx, %esi
172:
173: movl %eax, 4(%edi,%ecx,4)
174: movl 8(%ebx,%ecx,4), %eax
175:
176:
177: mull %ebp C limb 3 of 4, same sequence
178:
179: addl %esi, %eax
180: movl $0, %esi
181:
182: adcl %edx, %esi
183:
184: movl %eax, 8(%edi,%ecx,4)
185: movl 12(%ebx,%ecx,4), %eax
186:
187:
188: mull %ebp C limb 4 of 4, same sequence
189:
190: addl %esi, %eax
191: movl $0, %esi
192:
193: adcl %edx, %esi
194:
195: movl %eax, 12(%edi,%ecx,4)
196: movl 16(%ebx,%ecx,4), %eax C preload the limb for the next pass or the finish code
197:
198:
199: addl $4, %ecx
200: js L(top) C loop while the index is still negative
201:
202:
203:
204: C eax next src limb
205: C ebx &src[size-4]
206: C ecx 0 to 3 representing respectively 4 to 1 further limbs
207: C edx
208: C esi carry
209: C edi &dst[size-4]
210:
211: testb $2, %cl
212: jnz L(finish_not_two) C ecx 2 or 3: at most two limbs left, skip the pair below
213:
214: mull %ebp C first limb of the pair
215:
216: addl %esi, %eax
217: movl $0, %esi
218:
219: adcl %edx, %esi
220:
221: movl %eax, (%edi,%ecx,4)
222: movl 4(%ebx,%ecx,4), %eax
223:
224:
225: mull %ebp C second limb of the pair
226:
227: addl %esi, %eax
228: movl $0, %esi
229:
230: adcl %edx, %esi
231:
232: movl %eax, 4(%edi,%ecx,4)
233: movl 8(%ebx,%ecx,4), %eax
234:
235: addl $2, %ecx C account for the two limbs just done; ecx is now 2 or 3
236: L(finish_not_two):
237:
238:
239: testb $1, %cl
240: jnz L(finish_not_one) C ecx odd: only the last limb is left
241:
242: mull %ebp C second-to-last limb
243:
244: addl %esi, %eax
245: movl $0, %esi
246:
247: adcl %edx, %esi
248:
249: movl %eax, 8(%edi) C dst[size-2]
250: movl 12(%ebx), %eax C eax = src[size-1]
251: L(finish_not_one):
252:
253:
254: mull %ebp C last limb; the multiplier in ebp is not needed after this
255:
256: addl %esi, %eax
257: popl %ebp C restore ebp; popl does not touch CF
258:
259: adcl $0, %edx C fold CF into the high half, giving the return carry
260:
261: movl %eax, 12(%edi) C dst[size-1]
262: popl %edi
263:
264: popl %ebx
265: movl %edx, %eax C return the carry limb
266:
267: popl %esi
268:
269: ret
270:
271: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>