Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_mul_1 -- mpn by limb multiply.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K7: 3.4 cycles/limb (at 16 limbs/loop).
! 26:
! 27:
! 28:
1.1 maekawa 29: dnl K7: UNROLL_COUNT cycles/limb
30: dnl 8 3.9
31: dnl 16 3.4
32: dnl 32 3.4
33: dnl 64 3.35
34: dnl Maximum possible with the current code is 64.
35:
36: deflit(UNROLL_COUNT, 16)
37:
38:
39: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40: C mp_limb_t multiplier);
41: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
42: C mp_limb_t multiplier, mp_limb_t carry);
43: C
44: C Multiply src,size by multiplier and store the result in dst,size.
45: C Return the carry limb from the top of the result.
46: C
47: C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
48: C the low limb of the destination.
49: C
50: C Variations on the unrolled loop have been tried, with the current
51: C registers or with the counter on the stack to free up ecx. The current
52: C code is the fastest found.
53: C
54: C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
55: C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code
56: C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
57: C without having to change the computed jump. There's obviously something
58: C fishy going on, perhaps with what execution units the mul needs.
59:
60: defframe(PARAM_CARRY, 20) C read only by the mpn_mul_1c entry point
61: defframe(PARAM_MULTIPLIER,16)
62: defframe(PARAM_SIZE, 12)
63: defframe(PARAM_SRC, 8)
64: defframe(PARAM_DST, 4)
65:
66: defframe(SAVE_EBP, -4) C the four SAVE_ slots hold the registers this routine overwrites
67: defframe(SAVE_EDI, -8)
68: defframe(SAVE_ESI, -12)
69: defframe(SAVE_EBX, -16)
70: deflit(STACK_SPACE, 16) C bytes subtracted from esp to make room for the four SAVE_ slots
71:
72: dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. Sizes below the threshold take the L(simple) loop.
73: ifdef(`PIC',`
74: deflit(UNROLL_THRESHOLD, 7) C higher under PIC, presumably to cover the extra call/ret of the computed jump -- confirm
75: ',`
76: deflit(UNROLL_THRESHOLD, 5)
77: ')
78:
1.1.1.2 ! ohara 79: TEXT
1.1 maekawa 80: ALIGN(32)
81: PROLOGUE(mpn_mul_1c)
82: deflit(`FRAME',0)
83: movl PARAM_CARRY, %edx C edx = caller-supplied initial carry
1.1.1.2 ! ohara 84: jmp L(start_nc) C join mpn_mul_1 just after the instruction that zeroes its carry
1.1 maekawa 85: EPILOGUE()
86:
87:
88: PROLOGUE(mpn_mul_1)
89: deflit(`FRAME',0)
90: xorl %edx, %edx C initial carry is zero for the plain entry point
91: L(start_nc): C mpn_mul_1c jumps in here with its carry already in edx
92: movl PARAM_SIZE, %ecx C ecx = size in limbs
93: subl $STACK_SPACE, %esp C make room for the four SAVE_ slots
94: deflit(`FRAME', STACK_SPACE)
95:
96: movl %edi, SAVE_EDI
97: movl %ebx, SAVE_EBX
98: movl %edx, %ebx C ebx = running carry from here on
99:
100: movl %esi, SAVE_ESI
101: movl PARAM_SRC, %esi
102: cmpl $UNROLL_THRESHOLD, %ecx C compare now; the two movl before the jae leave the flags untouched
103:
104: movl PARAM_DST, %edi
105: movl %ebp, SAVE_EBP
106: jae L(unroll) C size >= UNROLL_THRESHOLD (unsigned): take the unrolled loop
107:
108: leal (%esi,%ecx,4), %esi C esi = one past the last src limb
109: leal (%edi,%ecx,4), %edi C edi = one past the last dst limb
110: negl %ecx C ecx = -size, counted up towards zero
111:
112: movl PARAM_MULTIPLIER, %ebp
113:
114: L(simple):
115: C eax scratch
116: C ebx carry
117: C ecx counter (negative)
118: C edx scratch
119: C esi src
120: C edi dst
121: C ebp multiplier
122:
123: movl (%esi,%ecx,4), %eax C eax = current src limb
124:
125: mull %ebp C edx:eax = limb * multiplier
126:
127: addl %ebx, %eax C low half + carry-in, may set the carry flag
128: movl %eax, (%edi,%ecx,4) C store the result limb
129: movl $0, %ebx C mov rather than xor, so the carry flag from the addl survives
130:
131: adcl %edx, %ebx C next carry = high half + carry flag
132: incl %ecx
133: jnz L(simple) C loop until the counter reaches zero; the body runs before the test, so size must be >= 1
134:
135: movl %ebx, %eax C return value = final carry limb
136: movl SAVE_EBX, %ebx
137: movl SAVE_ESI, %esi
138:
139: movl SAVE_EDI, %edi
140: movl SAVE_EBP, %ebp
141: addl $STACK_SPACE, %esp
142:
143: ret
144:
145:
146: C -----------------------------------------------------------------------------
147: C The mov to load the next source limb is done well ahead of the mul; this
148: C is necessary for full speed. It leads to one limb being handled separately
149: C after the loop.
150: C
151: C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
152: C to avoid having an 0x80 displacement in the code for the last limb in the
153: C unrolled loop. This is for a fair comparison between 16 and 32 unrolling.
154:
155: ifelse(eval(UNROLL_COUNT >= 32),1,`
156: deflit(SRC_OFFSET,4) C bias the src pointer by one limb when unrolling 32 or more
157: ',`
158: deflit(SRC_OFFSET,) C empty: no bias on the src pointer
159: ')
160:
161: C this is offset 0x62, so close enough to aligned
162: L(unroll):
163: C eax
164: C ebx initial carry
165: C ecx size
166: C edx
167: C esi src
168: C edi dst
169: C ebp
170: deflit(`FRAME', STACK_SPACE)
171:
172: leal -1(%ecx), %edx C one limb handled at end
173: leal -2(%ecx), %ecx C and ecx is one less than edx
174: movl %ebp, SAVE_EBP C NOTE(review): ebp was already stored just before the jae to L(unroll); looks redundant -- confirm before removing
175:
176: negl %edx C edx = -(size-1)
177: shrl $UNROLL_LOG2, %ecx C unrolled loop counter
178: movl (%esi), %eax C src low limb
179:
180: andl $UNROLL_MASK, %edx C edx = number of unrolled slots to skip on the first pass (assuming UNROLL_MASK is UNROLL_COUNT-1 -- confirm)
181: movl PARAM_DST, %edi C NOTE(review): edi already holds dst on entry to L(unroll); reload looks redundant -- confirm
182:
183: movl %edx, %ebp C ebp = skip
184: shll $4, %edx C edx = 16*skip
185:
186: C 17 code bytes per limb, so the jump target is L(entry) + 16*skip + skip
187: ifdef(`PIC',`
188: call L(add_eip_to_edx) C helper returns with edx = absolute address of the entry slot
189: L(here):
190: ',`
191: leal L(entry) (%edx,%ebp), %edx C edx = L(entry) + 17*skip
192: ')
193: negl %ebp C ebp = -skip
194:
195: leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi C esi = src - 4*skip, plus SRC_OFFSET (and 128 when UNROLL_BYTES is 256)
196: leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi C edi = dst - 4*skip (plus 128 when UNROLL_BYTES is 256)
197: movl PARAM_MULTIPLIER, %ebp C ebp = multiplier for the loop
198:
199: jmp *%edx C enter the unrolled code part-way through
200:
201:
202: ifdef(`PIC',`
203: L(add_eip_to_edx):
1.1.1.2 ! ohara 204: C See mpn/x86/README about old gas bugs
1.1 maekawa 205: leal (%edx,%ebp), %edx C edx = 16*skip + skip = byte offset of the entry slot
206: addl $L(entry)-L(here), %edx C plus the distance from L(here) to L(entry)
207: addl (%esp), %edx C plus the return address, which is L(here), giving an absolute target
208: ret C back to L(here) with the jump target in edx
209: ')
210:
211:
212: C ----------------------------------------------------------------------------
213: ALIGN(32)
214: L(top):
215: C eax next src limb
216: C ebx carry
217: C ecx counter
218: C edx scratch
219: C esi src+4
220: C edi dst
221: C ebp multiplier
222: C
223: C 17 code bytes per limb processed; Zdisp presumably keeps a zero displacement encoded so every slot has that same length -- confirm in the shared x86 m4 definitions
224:
225: L(entry): C base address for the computed jump made by the setup code
226: forloop(i, 0, UNROLL_COUNT-1, `
227: deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
228: deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
229:
230: mull %ebp C edx:eax = limb already in eax * multiplier
231:
232: addl %eax, %ebx C low half + carry-in, may set the carry flag
233: Zdisp( movl, disp_src,(%esi), %eax)
234: Zdisp( movl, %ebx, disp_dst,(%edi))
235:
236: movl $0, %ebx C mov rather than xor, so the carry flag from the addl survives
237: adcl %edx, %ebx C next carry = high half + carry flag
238: ')
239:
240: decl %ecx C one pass done; sets the sign flag tested by the jns below
241:
242: leal UNROLL_BYTES(%esi), %esi C advance src by one pass; leal leaves the flags alone
243: leal UNROLL_BYTES(%edi), %edi C advance dst by one pass; leal leaves the flags alone
244: jns L(top) C repeat while the pass counter is still non-negative
245:
246:
247: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
248:
249: mull %ebp C last limb, loaded into eax by the final slot of the loop
250:
251: addl %eax, %ebx C low half + carry-in
252: movl $0, %eax C mov rather than xor, so the carry flag from the addl survives
253: movl SAVE_ESI, %esi
254:
255: movl %ebx, disp0(%edi) C store the final result limb
256: movl SAVE_EBX, %ebx
257: movl SAVE_EDI, %edi
258:
259: adcl %edx, %eax C return value = high half + carry flag
260: movl SAVE_EBP, %ebp
261: addl $STACK_SPACE, %esp
262:
263: ret
264:
265: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>