Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mul_1.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
2: dnl
3: dnl K6: 6.25 cycles/limb.
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
30: C mp_limb_t multiplier);
31: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
32: C mp_limb_t multiplier, mp_limb_t carry);
33: C
34: C Multiply src,size by multiplier and store the result in dst,size.
35: C Return the carry limb from the top of the result.
36: C
37: C mpn_mul_1c() accepts an initial carry for the calculation; it is added
38: C into the low limb of the result.
39:
40: defframe(PARAM_CARRY, 20) C fifth argument, taken by mpn_mul_1c only
41: defframe(PARAM_MULTIPLIER,16)
42: defframe(PARAM_SIZE, 12)
43: defframe(PARAM_SRC, 8)
44: defframe(PARAM_DST, 4)
45: C Offsets rise by 4 in prototype argument order; presumably relative to esp at entry -- TODO confirm against defframe
46: dnl minimum 5 because the unrolled code cannot handle fewer limbs: it
46: dnl always runs one 4-limb pass and then at least one finishing limb
47: deflit(UNROLL_THRESHOLD, 5)
48:
49: .text
50: ALIGN(32)
51:
52: PROLOGUE(mpn_mul_1c) C entry point that takes an explicit initial carry
53: pushl %esi C save esi; restored by the shared exit code in mpn_mul_1
54: deflit(`FRAME',4) C presumably records the 4 bytes just pushed so PARAM_ offsets stay right -- confirm
55: movl PARAM_CARRY, %esi C esi = initial carry supplied by the caller
56: jmp LF(mpn_mul_1,start_nc) C join mpn_mul_1 after the point where it zeroes esi
57: EPILOGUE()
58:
59:
60: PROLOGUE(mpn_mul_1) C dst,size = src,size * multiplier with zero initial carry; returns the carry-out limb
61: push %esi C save esi (restored before ret); it holds the running carry
62: deflit(`FRAME',4)
63: xorl %esi, %esi C initial carry
64: C mpn_mul_1c jumps in at start_nc with esi already saved and holding its carry
65: L(start_nc):
66: mov PARAM_SIZE, %ecx C ecx = size
67: push %ebx
68: FRAME_pushl()
69:
70: movl PARAM_SRC, %ebx C ebx = src
71: push %edi
72: FRAME_pushl()
73:
74: movl PARAM_DST, %edi C edi = dst
75: pushl %ebp
76: FRAME_pushl()
77:
78: cmpl $UNROLL_THRESHOLD, %ecx
79: movl PARAM_MULTIPLIER, %ebp C ebp = multiplier (movl leaves the cmpl flags intact)
80:
81: jae L(unroll) C size >= UNROLL_THRESHOLD, compared as unsigned
82:
83:
84: C code offset 0x22 here, close enough to aligned
85: L(simple):
86: C eax scratch
87: C ebx src
88: C ecx counter
89: C edx scratch
90: C esi carry
91: C edi dst
92: C ebp multiplier
93: C
94: C this loop 8 cycles/limb
95:
96: movl (%ebx), %eax C eax = next src limb
97: addl $4, %ebx
98:
99: mull %ebp C edx:eax = eax * multiplier
100:
101: addl %esi, %eax C low half + carry in, sets CF
102: movl $0, %esi C zero esi with movl so CF survives
103:
104: adcl %edx, %esi C carry out = high half + CF
105:
106: movl %eax, (%edi) C store result limb
107: addl $4, %edi
108:
109: loop L(simple) C decrement ecx, repeat while ecx is non-zero
110: C NOTE(review): a size of 0 would wrap ecx in the loop above; presumably callers guarantee size >= 1 -- confirm
111:
112: popl %ebp
113:
114: popl %edi
115: popl %ebx
116:
117: movl %esi, %eax C return value = final carry limb
118: popl %esi
119:
120: ret
121:
122:
123: C -----------------------------------------------------------------------------
124: C The code for each limb is 6 cycles, with instruction decoding being the
125: C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
126: C cycles/limb in total.
127: C
128: C The secret ingredient to get 6.25 is to start the loop with the mul and
129: C have the load/store pair at the end. Rotating the load/store to the top
130: C is a 0.5 c/l slowdown. (Some address generation effect probably.)
131: C
132: C The whole unrolled loop fits nicely in exactly 80 bytes.
133:
134:
135: ALIGN(16) C already aligned to 16 here actually
136: L(unroll):
137: movl (%ebx), %eax C preload src[0] for the first mull
138: leal -16(%ebx,%ecx,4), %ebx C ebx = &src[size-4]
139:
140: leal -16(%edi,%ecx,4), %edi C edi = &dst[size-4]
141: subl $4, %ecx
142:
143: negl %ecx C ecx = 4-size, a negative limb index that counts up
144:
145:
146: ALIGN(16) C one byte nop for this alignment
147: L(top):
148: C eax scratch
149: C ebx &src[size-4]
150: C ecx counter
151: C edx scratch
152: C esi carry
153: C edi &dst[size-4]
154: C ebp multiplier
155: C Four copies of: mull, add carry in, form carry out, store limb, load next limb
156: mull %ebp
157:
158: addl %esi, %eax
159: movl $0, %esi
160:
161: adcl %edx, %esi
162:
163: movl %eax, (%edi,%ecx,4)
164: movl 4(%ebx,%ecx,4), %eax
165:
166:
167: mull %ebp
168:
169: addl %esi, %eax
170: movl $0, %esi
171:
172: adcl %edx, %esi
173:
174: movl %eax, 4(%edi,%ecx,4)
175: movl 8(%ebx,%ecx,4), %eax
176:
177:
178: mull %ebp
179:
180: addl %esi, %eax
181: movl $0, %esi
182:
183: adcl %edx, %esi
184:
185: movl %eax, 8(%edi,%ecx,4)
186: movl 12(%ebx,%ecx,4), %eax
187:
188:
189: mull %ebp
190:
191: addl %esi, %eax
192: movl $0, %esi
193:
194: adcl %edx, %esi
195:
196: movl %eax, 12(%edi,%ecx,4)
197: movl 16(%ebx,%ecx,4), %eax C limb for the next pass or for the finish code
198:
199:
200: addl $4, %ecx
201: js L(top) C loop while the index is still negative
202:
203:
204:
205: C eax next src limb
206: C ebx &src[size-4]
207: C ecx 0 to 3 representing respectively 4 to 1 further limbs
208: C edx
209: C esi carry
210: C edi &dst[size-4]
211:
212: testb $2, %cl C bit 1 set (ecx 2 or 3): at most 2 limbs left, skip the pair
213: jnz L(finish_not_two)
214:
215: mull %ebp
216:
217: addl %esi, %eax
218: movl $0, %esi
219:
220: adcl %edx, %esi
221:
222: movl %eax, (%edi,%ecx,4)
223: movl 4(%ebx,%ecx,4), %eax
224:
225:
226: mull %ebp
227:
228: addl %esi, %eax
229: movl $0, %esi
230:
231: adcl %edx, %esi
232:
233: movl %eax, 4(%edi,%ecx,4)
234: movl 8(%ebx,%ecx,4), %eax
235:
236: addl $2, %ecx C two limbs done; ecx is now 2 or 3
237: L(finish_not_two):
238:
239:
240: testb $1, %cl C bit 0 set (ecx 3): only the last limb is left
241: jnz L(finish_not_one)
242:
243: mull %ebp
244:
245: addl %esi, %eax
246: movl $0, %esi
247:
248: adcl %edx, %esi
249:
250: movl %eax, 8(%edi) C dst[size-2]
251: movl 12(%ebx), %eax C src[size-1]
252: L(finish_not_one):
253:
254:
255: mull %ebp C last limb
256:
257: addl %esi, %eax
258: popl %ebp C multiplier no longer needed; popl leaves CF for the adcl
259:
260: adcl $0, %edx C edx = carry-out limb
261:
262: movl %eax, 12(%edi) C dst[size-1]
263: popl %edi
264:
265: popl %ebx
266: movl %edx, %eax C return value = carry-out limb
267:
268: popl %esi
269:
270: ret
271:
272: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>