Annotation of OpenXM_contrib/gmp/mpn/x86/p6/aorsmul_1.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2: dnl
3: dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27:
28:
29: dnl P6 UNROLL_COUNT cycles/limb
30: dnl 8 6.7
31: dnl 16 6.35
32: dnl 32 6.3
33: dnl 64 6.3
34: dnl Maximum possible with the current code is 64.
35:
36: deflit(UNROLL_COUNT, 16)
37: dnl Choose the add or subtract flavour: M4_inst is the instruction applied
38: dnl to dst, and M4_function_1 / M4_function_1c are the two entry point names.
39: ifdef(`OPERATION_addmul_1', `
40: define(M4_inst, addl)
41: define(M4_function_1, mpn_addmul_1)
42: define(M4_function_1c, mpn_addmul_1c)
43: define(M4_description, add it to)
44: define(M4_desc_retval, carry)
45: ',`ifdef(`OPERATION_submul_1', `
46: define(M4_inst, subl)
47: define(M4_function_1, mpn_submul_1)
48: define(M4_function_1c, mpn_submul_1c)
49: define(M4_description, subtract it from)
50: define(M4_desc_retval, borrow)
51: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
52: ')')')
53: dnl The four names this source can provide; MULFUNC_PROLOGUE is defined outside this file.
54: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
55:
56:
57: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
58: C mp_limb_t mult);
59: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
60: C mp_limb_t mult, mp_limb_t carry);
61: C
62: C Calculate src,size multiplied by mult and M4_description dst,size.
63: C Return the M4_desc_retval limb from the top of the result.
64: C
65: C This code is pretty much the same as the K6 code. The unrolled loop is
66: C the same, but there are just a few scheduling tweaks in the setups and
67: C the simple loop.
68: C
69: C A number of variations have been tried for the unrolled loop, with one or
70: C two carries, and with loads scheduled earlier, but nothing faster than 6
71: C cycles/limb has been found.
72: dnl Sizes below UNROLL_THRESHOLD use the simple loop. PIC and non-PIC currently share the value 5.
73: ifdef(`PIC',`
74: deflit(UNROLL_THRESHOLD, 5)
75: ',`
76: deflit(UNROLL_THRESHOLD, 5)
77: ')
78: dnl Argument slots (presumably byte offsets from the return address - confirm against defframe). PARAM_CARRY is read only by the M4_function_1c entry point.
79: defframe(PARAM_CARRY, 20)
80: defframe(PARAM_MULTIPLIER,16)
81: defframe(PARAM_SIZE, 12)
82: defframe(PARAM_SRC, 8)
83: defframe(PARAM_DST, 4)
84:
85: .text
86: ALIGN(32)
87:
88: PROLOGUE(M4_function_1c)
89: pushl %ebx C ebx will hold the carry; it is restored before each ret
90: deflit(`FRAME',4)
91: movl PARAM_CARRY, %ebx C caller-supplied initial carry
92: jmp LF(M4_function_1,start_nc) C join the common code with the carry already in ebx
93: EPILOGUE()
94:
95: PROLOGUE(M4_function_1)
96: pushl %ebx C ebx carries the running carry; restored before each ret
97: deflit(`FRAME',4)
98: xorl %ebx, %ebx C initial carry
99: C Common code, also entered from M4_function_1c with its carry already in ebx.
100: L(start_nc):
101: movl PARAM_SIZE, %ecx
102: pushl %esi
103: deflit(`FRAME',8)
104:
105: movl PARAM_SRC, %esi
106: pushl %edi
107: deflit(`FRAME',12)
108:
109: movl PARAM_DST, %edi
110: pushl %ebp
111: deflit(`FRAME',16)
112: cmpl $UNROLL_THRESHOLD, %ecx
113:
114: movl PARAM_MULTIPLIER, %ebp C movl leaves the cmpl flags intact
115: jae L(unroll) C unsigned size >= UNROLL_THRESHOLD
116:
117:
118: C simple loop: one limb per iteration, reached by fall-through when size is below UNROLL_THRESHOLD
119: C this is offset 0x22, so close enough to aligned
120: L(simple):
121: C eax scratch
122: C ebx carry
123: C ecx counter
124: C edx scratch
125: C esi src
126: C edi dst
127: C ebp multiplier
128:
129: movl (%esi), %eax C next src limb
130: addl $4, %edi C advance dst now; the update below uses -4(%edi)
131:
132: mull %ebp C edx:eax = src limb * multiplier
133:
134: addl %ebx, %eax C add the incoming carry to the low half
135: adcl $0, %edx
136:
137: M4_inst %eax, -4(%edi) C apply the low half to the dst limb
138: movl %edx, %ebx C movl keeps the carry flag from the dst update
139:
140: adcl $0, %ebx C new carry = high half + carry/borrow out of dst
141: decl %ecx
142:
143: leal 4(%esi), %esi C leal rather than addl so the decl flags survive for jnz
144: jnz L(simple)
145:
146: C restore the saved registers in reverse push order and return the carry limb
147: popl %ebp
148: popl %edi
149:
150: popl %esi
151: movl %ebx, %eax C return value is the final carry
152:
153: popl %ebx
154: ret
155:
156:
157:
158: C------------------------------------------------------------------------------
159: C VAR_JUMP holds the computed jump temporarily because there are not enough
160: C registers when doing the mul for the initial two carry limbs.
161: C
162: C The add/adc for the initial carry in %ebx is necessary only for the
163: C mpn_add/submul_1c entry points. Duplicating the startup code to
164: C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
165: C idea.
166:
167: dnl overlapping with parameters already fetched
168: define(VAR_COUNTER,`PARAM_SIZE')
169: define(VAR_JUMP, `PARAM_DST')
170:
171: C this is offset 0x43, so close enough to aligned
172: L(unroll):
173: C eax
174: C ebx initial carry
175: C ecx size
176: C edx
177: C esi src
178: C edi dst
179: C ebp
180:
181: movl %ecx, %edx
182: decl %ecx
183:
184: subl $2, %edx C edx = size-2
185: negl %ecx C ecx = -(size-1)
186:
187: shrl $UNROLL_LOG2, %edx C presumably (size-2) / UNROLL_COUNT - UNROLL_LOG2 is defined elsewhere
188: andl $UNROLL_MASK, %ecx C presumably -(size-1) mod UNROLL_COUNT - UNROLL_MASK is defined elsewhere
189:
190: movl %edx, VAR_COUNTER C loop counter lives in the PARAM_SIZE slot
191: movl %ecx, %edx
192:
193: C 15 code bytes per limb, so the jump target is L(entry) + 16*n - n with n the masked count in ecx
194: ifdef(`PIC',`
195: call L(pic_calc)
196: L(here):
197: ',`
198: shll $4, %edx
199: negl %ecx
200:
201: leal L(entry) (%edx,%ecx,1), %edx
202: ')
203: movl (%esi), %eax C src low limb
204:
205: movl %edx, VAR_JUMP C park the jump target; mull is about to overwrite edx
206: leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi C bias src by the negated count for the computed entry
207:
208: mull %ebp C edx:eax = first src limb * multiplier
209:
210: addl %ebx, %eax C initial carry (from _1c)
211: adcl $0, %edx
212:
213: movl %edx, %ebx C high carry
214: leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi C bias dst the same way
215:
216: movl VAR_JUMP, %edx
217: testl $1, %ecx C odd count enters at the second limb of a two-limb chunk
218: movl %eax, %ecx C low carry
219:
220: cmovnz( %ebx, %ecx) C high,low carry other way around
221: cmovnz( %eax, %ebx)
222:
223: jmp *%edx C enter the unrolled code part way through
224:
225: C PIC helper: same address calculation, using the return address on the stack (which is L(here)) as the base
226: ifdef(`PIC',`
227: L(pic_calc):
228: shll $4, %edx
229: negl %ecx
230:
231: C See README.family about old gas bugs
232: leal (%edx,%ecx,1), %edx
233: addl $L(entry)-L(here), %edx
234:
235: addl (%esp), %edx
236:
237: ret
238: ')
239:
240:
241: C -----------------------------------------------------------
242: ALIGN(32)
243: L(top):
244: deflit(`FRAME',16)
245: C eax scratch
246: C ebx carry hi
247: C ecx carry lo
248: C edx scratch
249: C esi src
250: C edi dst
251: C ebp multiplier
252: C
253: C VAR_COUNTER loop counter
254: C
255: C 15 code bytes per limb
256: C Each chunk handles two limbs; ecx and ebx swap low/high carry roles between the two halves.
257: addl $UNROLL_BYTES, %edi C dst advances at the top, so the first pass (entered by the computed jump) skips it
258:
259: L(entry):
260: deflit(CHUNK_COUNT,2)
261: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
262: deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
263: deflit(`disp1', eval(disp0 + 4))
264:
265: Zdisp( movl, disp0,(%esi), %eax)
266: mull %ebp
267: Zdisp( M4_inst,%ecx, disp0,(%edi))
268: adcl %eax, %ebx
269: movl %edx, %ecx
270: adcl $0, %ecx
271:
272: movl disp1(%esi), %eax
273: mull %ebp
274: M4_inst %ebx, disp1(%edi)
275: adcl %eax, %ecx
276: movl %edx, %ebx
277: adcl $0, %ebx
278: ')
279:
280: decl VAR_COUNTER
281: leal UNROLL_BYTES(%esi), %esi C leal keeps the decl flags for jns
282:
283: jns L(top)
284:
285: C The last low carry still has to be applied to dst; edi was not advanced after the final pass, hence the UNROLL_BYTES displacement.
286: deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
287:
288: M4_inst %ecx, disp0(%edi)
289: movl %ebx, %eax C return value starts as the high carry
290:
291: popl %ebp
292: popl %edi
293:
294: popl %esi
295: popl %ebx
296: adcl $0, %eax C plus the carry/borrow from the last dst update; movl and popl leave the flags alone
297:
298: ret
299:
300: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>