Annotation of OpenXM_contrib/gmp/mpn/x86/k7/aorsmul_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K7: 3.9 cycles/limb.
! 26: C
! 27: C Future: It should be possible to avoid the separate mul after the
! 28: C unrolled loop by moving the movl/adcl to the top.
! 29:
! 30:
! 31:
1.1 maekawa 32: dnl K7: UNROLL_COUNT cycles/limb
33: dnl 4 4.42
34: dnl 8 4.16
35: dnl 16 3.9
36: dnl 32 3.9
37: dnl 64 3.87
38: dnl Maximum possible with the current code is 64.
39:
40: deflit(UNROLL_COUNT, 16) dnl limbs handled per pass of the unrolled loop
41:
42: dnl Pick the instruction and the names for the add or the subtract variant.
43: ifdef(`OPERATION_addmul_1',`
44: define(M4_inst, addl)
45: define(M4_function_1, mpn_addmul_1)
46: define(M4_function_1c, mpn_addmul_1c)
47: define(M4_description, add it to)
48: define(M4_desc_retval, carry)
49: ',`ifdef(`OPERATION_submul_1',`
50: define(M4_inst, subl)
51: define(M4_function_1, mpn_submul_1)
52: define(M4_function_1c, mpn_submul_1c)
53: define(M4_description, subtract it from)
54: define(M4_desc_retval, borrow)
55: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
56: ')')')
57: dnl NOTE(review): presumably lists the entry points this file can provide; confirm against the macro definition.
58: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
59:
60:
61: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
62: C mp_limb_t mult);
63: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
64: C mp_limb_t mult, mp_limb_t carry);
65: C
66: C Calculate src,size multiplied by mult and M4_description dst,size.
67: C Return the M4_desc_retval limb from the top of the result.
68: C The unrolled loop is used when size-1 is above UNROLL_THRESHOLD, otherwise the simple loop.
69: ifdef(`PIC',`
70: deflit(UNROLL_THRESHOLD, 9)
71: ',`
72: deflit(UNROLL_THRESHOLD, 6)
73: ')
74: C Argument slots. NOTE(review): offsets look like bytes above the return address at entry; confirm against defframe.
75: defframe(PARAM_CARRY, 20)
76: defframe(PARAM_MULTIPLIER,16)
77: defframe(PARAM_SIZE, 12)
78: defframe(PARAM_SRC, 8)
79: defframe(PARAM_DST, 4)
80: deflit(`FRAME',0)
81: C Slots used to save ebx, esi, edi and ebp once esp has been lowered by SAVE_SIZE.
82: defframe(SAVE_EBX, -4)
83: defframe(SAVE_ESI, -8)
84: defframe(SAVE_EDI, -12)
85: defframe(SAVE_EBP, -16)
86: deflit(SAVE_SIZE, 16)
87:
1.1.1.2 ! ohara 88: TEXT
1.1 maekawa 89: ALIGN(32)
90: PROLOGUE(M4_function_1)
91: movl PARAM_SIZE, %edx
92: movl PARAM_SRC, %eax
93: xorl %ecx, %ecx C no carry-in for this entry point
94:
95: decl %edx C edx = size-1
1.1.1.2 ! ohara 96: jnz L(start_1) C more than one limb, join the shared multi-limb code
1.1 maekawa 97: C size==1 is handled here without saving any registers
98: movl (%eax), %eax C the single src limb
99: movl PARAM_DST, %ecx
100:
101: mull PARAM_MULTIPLIER C edx:eax = src limb * mult
102:
103: M4_inst %eax, (%ecx) C apply the low product limb to dst
104: adcl $0, %edx C fold the carry or borrow into the high limb
105: movl %edx, %eax C return value
106:
107: ret
108: EPILOGUE()
109:
110: ALIGN(16)
111: PROLOGUE(M4_function_1c)
112: movl PARAM_SIZE, %edx
113: movl PARAM_SRC, %eax
114:
115: decl %edx C edx = size-1
116: jnz L(more_than_one_limb)
117: C size==1 is handled here without saving any registers
118: movl (%eax), %eax C the single src limb
119: movl PARAM_DST, %ecx
120:
121: mull PARAM_MULTIPLIER C edx:eax = src limb * mult
122:
123: addl PARAM_CARRY, %eax C add the carry-in to the low product limb
124:
125: adcl $0, %edx
126: M4_inst %eax, (%ecx) C apply the low limb to dst
127:
128: adcl $0, %edx C fold the carry or borrow into the high limb
129: movl %edx, %eax C return value
130:
131: ret
132:
133:
134: C offset 0x44 so close enough to aligned
135: L(more_than_one_limb):
136: movl PARAM_CARRY, %ecx C carry-in supplied by the caller
137: L(start_1):
138: C eax src
139: C ecx initial carry
140: C edx size-1
141: subl $SAVE_SIZE, %esp C room for the four saved registers
142: deflit(`FRAME',16)
143:
144: movl %ebx, SAVE_EBX
145: movl %esi, SAVE_ESI
146: movl %edx, %ebx C size-1
147:
148: movl PARAM_SRC, %esi
149: movl %ebp, SAVE_EBP
150: cmpl $UNROLL_THRESHOLD, %edx C flags are used by the ja below
151:
152: movl PARAM_MULTIPLIER, %ebp
153: movl %edi, SAVE_EDI
154:
155: movl (%esi), %eax C src low limb
156: movl PARAM_DST, %edi
157: ja L(unroll) C size-1 above the threshold, unsigned compare
158:
159:
160: C simple loop
161:
162: leal 4(%esi,%ebx,4), %esi C point one limb past last
163: leal (%edi,%ebx,4), %edi C point at last limb
164: negl %ebx C ebx = -(size-1), counts up to zero
165:
166: C The movl to load the next source limb is done well ahead of the
167: C mul. This is necessary for full speed, and leads to one limb
168: C handled separately at the end.
169:
170: L(simple):
171: C eax src limb
172: C ebx loop counter
173: C ecx carry limb
174: C edx scratch
175: C esi src
176: C edi dst
177: C ebp multiplier
178:
179: mull %ebp C edx:eax = src limb * multiplier
180:
181: addl %eax, %ecx C low product limb plus incoming carry
182: adcl $0, %edx
183:
184: M4_inst %ecx, (%edi,%ebx,4) C apply to the current dst limb
185: movl (%esi,%ebx,4), %eax C load the next src limb ahead of its mul
186: adcl $0, %edx C carry or borrow from the dst update
187:
188: incl %ebx C sets the zero flag tested by the jnz
189: movl %edx, %ecx C high limb becomes the next carry
190: jnz L(simple)
191:
192: C last limb, its source was already loaded inside the loop
193: mull %ebp
194:
195: movl SAVE_EBX, %ebx
196: movl SAVE_ESI, %esi
197: movl SAVE_EBP, %ebp
198:
199: addl %eax, %ecx
200: adcl $0, %edx
201:
202: M4_inst %ecx, (%edi) C edi was left pointing at the last dst limb
203: adcl $0, %edx
204: movl SAVE_EDI, %edi
205:
206: addl $SAVE_SIZE, %esp
207: movl %edx, %eax C return the final carry or borrow limb
208: ret
209:
210:
211:
212: C -----------------------------------------------------------------------------
213: ALIGN(16)
214: L(unroll):
215: C eax src low limb
216: C ebx size-1
217: C ecx carry
218: C edx size-1
219: C esi src
220: C edi dst
221: C ebp multiplier
222:
223: dnl overlapping with parameters no longer needed
224: define(VAR_COUNTER,`PARAM_SIZE')
225: define(VAR_JUMP, `PARAM_MULTIPLIER')
226:
227: subl $2, %ebx C (size-2)-1
228: decl %edx C size-2
229: C NOTE(review): UNROLL_LOG2 and UNROLL_MASK are not defined in this file, presumably derived from UNROLL_COUNT by the included m4 definitions
230: shrl $UNROLL_LOG2, %ebx C loop count, stored below as VAR_COUNTER
231: negl %edx C -(size-2)
232:
233: movl %ebx, VAR_COUNTER
234: andl $UNROLL_MASK, %edx C number of limb slots to skip on the first pass
235:
236: movl %edx, %ebx C keep an unshifted copy
237: shll $4, %edx C 16 times the skip count, so edx+ebx is 17 times it, matching 17 code bytes per limb
238:
239: ifdef(`PIC',`
240: call L(pic_calc)
241: L(here):
242: ',`
243: leal L(entry) (%edx,%ebx,1), %edx
244: ')
245: negl %ebx C minus the skip count, used to bias the pointers below
246: movl %edx, VAR_JUMP C computed entry address inside the unrolled code
247:
248: mull %ebp C first limb, eax was loaded before branching here
249:
250: addl %eax, %ecx C initial carry, becomes low carry
251: adcl $0, %edx
252: testb $1, %bl C flags are used by the cmovnz pair below
253:
254: movl 4(%esi), %eax C src second limb
255: leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
256: leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
257:
258: movl %edx, %ebx C high carry
259: cmovnz( %ecx, %ebx) C high,low carry other way around
260: cmovnz( %edx, %ecx)
261:
262: jmp *VAR_JUMP C enter the unrolled code part way through
263:
264:
265: ifdef(`PIC',`
266: L(pic_calc):
1.1.1.2 ! ohara 267: C See mpn/x86/README about old gas bugs
1.1 maekawa 268: leal (%edx,%ebx,1), %edx
269: addl $L(entry)-L(here), %edx
270: addl (%esp), %edx C (%esp) holds the return address, which is L(here)
271: ret
272: ')
273:
274:
275: C -----------------------------------------------------------------------------
276: C This code uses a "two carry limbs" scheme. At the top of the loop the
277: C carries are ebx=hi, ecx=lo, then they swap for each limb processed. For
278: C the computed jump an odd size means they start one way around, an even
279: C size the other. Either way one limb is handled separately at the start of
280: C the loop.
281: C
282: C The positioning of the movl to load the next source limb is important.
283: C Moving it after the adcl with a view to avoiding a separate mul at the end
284: C of the loop slows the code down.
285:
286: ALIGN(32)
287: L(top):
288: C eax src limb
289: C ebx carry high
290: C ecx carry low
291: C edx scratch
292: C esi src+8
293: C edi dst
294: C ebp multiplier
295: C
296: C VAR_COUNTER loop counter
297: C
298: C 17 bytes each limb
299:
300: L(entry):
301: deflit(CHUNK_COUNT,2)
302: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
303: deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
304: deflit(`disp1', eval(disp0 + 4))
305:
306: mull %ebp
307: C NOTE(review): Zdisp presumably keeps a displacement byte even when disp0 is zero so each limb stays 17 bytes; confirm
308: Zdisp( M4_inst,%ecx, disp0,(%edi))
309: movl $0, %ecx C movl rather than xorl so the carry flag survives for the adcl
310:
311: adcl %eax, %ebx C low product limb plus carry joins the other carry register
312:
313: Zdisp( movl, disp0,(%esi), %eax)
314: adcl %edx, %ecx C high product limb starts the new carry
315:
316: C second limb of the chunk, same steps with ebx and ecx swapped
317: mull %ebp
318:
319: M4_inst %ebx, disp1(%edi)
320: movl $0, %ebx
321:
322: adcl %eax, %ecx
323:
324: movl disp1(%esi), %eax
325: adcl %edx, %ebx
326: ')
327:
328: decl VAR_COUNTER
329: leal UNROLL_BYTES(%esi), %esi C leal leaves the flags from the decl intact
330: leal UNROLL_BYTES(%edi), %edi
331:
332: jns L(top) C loop while VAR_COUNTER has not gone negative
333:
334:
335: C eax src limb
336: C ebx carry high
337: C ecx carry low
338: C edx
339: C esi
340: C edi dst (points at second last limb)
341: C ebp multiplier
342: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
343: deflit(`disp1', eval(disp0-0 + 4))
344:
345: mull %ebp C last limb, its source was loaded inside the loop
346:
347: M4_inst %ecx, disp0(%edi) C second last dst limb
348: movl SAVE_EBP, %ebp
349:
350: adcl %ebx, %eax C low product limb plus high carry plus carry flag
351: movl SAVE_EBX, %ebx
352: movl SAVE_ESI, %esi
353:
354: adcl $0, %edx
355: M4_inst %eax, disp1(%edi) C last dst limb
356: movl SAVE_EDI, %edi
357:
358: adcl $0, %edx
359: addl $SAVE_SIZE, %esp
360:
361: movl %edx, %eax C return the final carry or borrow limb
362: ret
363:
364: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>