Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aors_n.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
2:
1.1.1.2 ! ohara 3: dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24:
1.1.1.2 ! ohara 25: C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
! 26:
! 27:
1.1 maekawa 28: ifdef(`OPERATION_add_n', `
29: define(M4_inst, adcl) C add with carry, applied to every limb
30: define(M4_function_n, mpn_add_n) C entry point without a carry-in argument
31: define(M4_function_nc, mpn_add_nc) C entry point taking a carry-in argument
32: define(M4_description, add) C appears only in the header comment of this file
33: ',`ifdef(`OPERATION_sub_n', `
34: define(M4_inst, sbbl) C subtract with borrow, applied to every limb
35: define(M4_function_n, mpn_sub_n) C entry point without a borrow-in argument
36: define(M4_function_nc, mpn_sub_nc) C entry point taking a borrow-in argument
37: define(M4_description, subtract) C appears only in the header comment of this file
38: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
39: ')')')
40: C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it lists the entry points this source can provide -- confirm in the m4 support files.
41: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
42:
43:
44: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
45: C mp_size_t size);
46: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
47: C mp_size_t size, mp_limb_t carry);
48: C
49: C Calculate src1,size M4_description src2,size, and store the result in
50: C dst,size. The return value is the carry bit from the top of the result
51: C (1 or 0).
52: C
53: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
54: C the calculation. Note values other than 1 or 0 here will lead to garbage
55: C results.
56: C
57: C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
58: C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
59: C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
60:
61: define(PARAM_CARRY, `FRAME+20(%esp)') C 5th argument, read only by the _nc entry point
62: define(PARAM_SIZE, `FRAME+16(%esp)') C 4th argument, number of limbs
63: define(PARAM_SRC2, `FRAME+12(%esp)') C 3rd argument
64: define(PARAM_SRC1, `FRAME+8(%esp)') C 2nd argument
65: define(PARAM_DST, `FRAME+4(%esp)') C 1st argument, just above the return address
66: deflit(`FRAME',0) C nothing has been pushed yet at function entry
67: C NOTE(review): FRAME is presumably advanced by each FRAME_pushl so the PARAM offsets stay valid after a push -- confirm in the m4 support files.
68: dnl minimum 5 because the unrolled code can't handle less
69: deflit(UNROLL_THRESHOLD, 5) C the in-place loop needs size-1 rounded down to a multiple of 4 to be at least 4
70:
1.1.1.2 ! ohara 71: TEXT
1.1 maekawa 72: ALIGN(32)
73: C Carry-in entry point: fetch the extra argument, then share the body of M4_function_n.
74: PROLOGUE(M4_function_nc)
75: movl PARAM_CARRY, %eax C carry-in, expected to be 0 or 1
1.1.1.2 ! ohara 76: jmp L(start) C lands just after the xorl that clears eax in M4_function_n
1.1 maekawa 77: EPILOGUE()
78:
79:
80: PROLOGUE(M4_function_n)
81: xorl %eax, %eax C plain entry point: no carry-in
82: L(start): C M4_function_nc jumps here with the carry-in already in eax
83: movl PARAM_SIZE, %ecx
84: pushl %ebx
85: FRAME_pushl()
86:
87: movl PARAM_SRC1, %ebx
88: pushl %edi
89: FRAME_pushl()
90:
91: movl PARAM_SRC2, %edx
92: cmpl $UNROLL_THRESHOLD, %ecx
93:
94: movl PARAM_DST, %edi C movl leaves the cmpl flags intact
95: jae L(unroll) C taken when size >= UNROLL_THRESHOLD, unsigned compare
96: C NOTE(review): size == 0 would also reach L(simple) and the loop counter would wrap; presumably callers guarantee size >= 1 -- confirm.
97:
98: shrl %eax C initial carry flag: bit 0 of eax moves into CF
99:
100: C offset 0x21 here, close enough to aligned
101: L(simple):
102: C eax scratch
103: C ebx src1
104: C ecx counter, limbs remaining
105: C edx src2
106: C esi
107: C edi dst
108: C ebp
109: C
110: C The store to (%edi) could be done with a stosl; it'd be smaller
111: C code, but there's no speed gain and a cld would have to be added
1.1.1.2 ! ohara 112: C (per mpn/x86/README).
1.1 maekawa 113: C CF carries from one pass to the next: movl, leal and loop leave it untouched.
114: movl (%ebx), %eax
115: leal 4(%ebx), %ebx
116:
117: M4_inst (%edx), %eax
118:
119: movl %eax, (%edi)
120: leal 4(%edi), %edi
121:
122: leal 4(%edx), %edx
123: loop L(simple)
124:
125:
126: movl $0, %eax C movl rather than xorl so CF survives until the setc
127: popl %edi
128:
129: setc %al C return value: carry (or borrow) out of the top limb
130:
131: popl %ebx
132: ret
133:
134:
135: C -----------------------------------------------------------------------------
136: L(unroll):
137: C eax carry-in
138: C ebx src1
139: C ecx size
140: C edx src2
141: C esi
142: C edi dst
143: C ebp
144:
145: cmpl %edi, %ebx C dst == src1 ?
146: pushl %esi C no FRAME_pushl: no PARAM_ access follows; pushl keeps the flags
147:
148: je L(inplace)
149:
150: ifdef(`OPERATION_add_n',`
151: cmpl %edi, %edx C dst == src2 ?
152:
153: je L(inplace_reverse) C addition commutes, so src1 can act as the source
154: ')
155:
156: movl %ecx, %esi
157:
158: andl $-4, %ecx C limbs handled by the 4-way loop
159: andl $3, %esi C leftover limbs, 0 to 3
160:
161: leal (%ebx,%ecx,4), %ebx C point each array just past its unrolled part ...
162: leal (%edx,%ecx,4), %edx
163: leal (%edi,%ecx,4), %edi
164:
165: negl %ecx C ... so a negative index can count up towards zero
166: shrl %eax C carry-in to CF; must come after negl, which also writes CF
167:
168: ALIGN(32)
169: L(normal_top):
170: C eax scratch
171: C ebx src1
172: C ecx counter, limbs, negative
173: C edx src2
174: C esi leftover limb count
175: C edi dst
176: C ebp
177:
178: movl (%ebx,%ecx,4), %eax
179: leal 5(%ecx), %ecx C +5 then -1 from loop = 4 limbs per pass; the -20 displacements undo the +5
180: M4_inst -20(%edx,%ecx,4), %eax
181: movl %eax, -20(%edi,%ecx,4)
182:
183: movl 4-20(%ebx,%ecx,4), %eax
184: M4_inst 4-20(%edx,%ecx,4), %eax
185: movl %eax, 4-20(%edi,%ecx,4)
186:
187: movl 8-20(%ebx,%ecx,4), %eax
188: M4_inst 8-20(%edx,%ecx,4), %eax
189: movl %eax, 8-20(%edi,%ecx,4)
190:
191: movl 12-20(%ebx,%ecx,4), %eax
192: M4_inst 12-20(%edx,%ecx,4), %eax
193: movl %eax, 12-20(%edi,%ecx,4)
194:
195: loop L(normal_top)
196:
197:
198: decl %esi C decl sets ZF and SF but leaves CF alone
199: jz L(normal_finish_one) C exactly one limb left; ecx is 0 here
200: js L(normal_done) C nothing left
201:
202: C two or three more limbs
203:
204: movl (%ebx), %eax
205: M4_inst (%edx), %eax
206: movl %eax, (%edi)
207:
208: movl 4(%ebx), %eax
209: M4_inst 4(%edx), %eax
210: decl %esi
211: movl %eax, 4(%edi)
212:
213: jz L(normal_done) C tests the decl above; movl does not change flags
214: movl $2, %ecx C index of the third leftover limb
215:
216: L(normal_finish_one):
217: movl (%ebx,%ecx,4), %eax
218: M4_inst (%edx,%ecx,4), %eax
219: movl %eax, (%edi,%ecx,4)
220:
221: L(normal_done):
222: popl %esi
223: popl %edi
224:
225: movl $0, %eax C movl rather than xorl so CF survives until the setc
226: popl %ebx
227:
228: setc %al C return value: carry (or borrow) out of the top limb
229:
230: ret
231:
232:
233: C -----------------------------------------------------------------------------
234:
235: ifdef(`OPERATION_add_n',`
236: L(inplace_reverse):
237: C dst==src2
238:
239: movl %ebx, %edx C src1 becomes the source; dst already holds src2
240: ')
241:
242: L(inplace):
243: C eax initial carry
244: C ebx free, loaded below with the low src limb
245: C ecx size
246: C edx src
247: C esi
248: C edi dst
249: C ebp
250:
251: leal -1(%ecx), %esi
252: decl %ecx C size-1: one limb is always left for the code after the loop
253:
254: andl $-4, %ecx C limbs handled by the 4-way loop
255: andl $3, %esi C (size-1) mod 4
256:
257: movl (%edx), %ebx C src low limb
258: leal (%edx,%ecx,4), %edx
259:
260: leal (%edi,%ecx,4), %edi
261: negl %ecx
262:
263: shrl %eax C carry-in to CF; must come after negl, which also writes CF
264:
265:
266: ALIGN(32)
267: L(inplace_top):
268: C eax scratch
269: C ebx next src limb
270: C ecx counter, limbs, negative
271: C edx src
272: C esi (size-1) mod 4, consumed after the loop
273: C edi dst
274: C ebp
275:
276: M4_inst %ebx, (%edi,%ecx,4) C read-modify-write directly on dst
277:
278: movl 4(%edx,%ecx,4), %eax
279: leal 5(%ecx), %ecx C +5 then -1 from loop = 4 limbs per pass; the -20 displacements undo the +5
280:
281: M4_inst %eax, 4-20(%edi,%ecx,4)
282:
283: movl 8-20(%edx,%ecx,4), %eax
284: movl 12-20(%edx,%ecx,4), %ebx
285:
286: M4_inst %eax, 8-20(%edi,%ecx,4)
287: M4_inst %ebx, 12-20(%edi,%ecx,4)
288:
289: movl 16-20(%edx,%ecx,4), %ebx C preload the first limb of the next pass, or of the tail
290: loop L(inplace_top)
291:
292:
293: C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
294:
295: M4_inst %ebx, (%edi)
296:
297: decl %esi C decl sets ZF and SF but leaves CF alone
298: jz L(inplace_finish_one) C one more limb; ecx is 0 here
299: js L(inplace_done) C nothing more
300:
301: C two or three more limbs
302:
303: movl 4(%edx), %eax
304: movl 8(%edx), %ebx
305: M4_inst %eax, 4(%edi)
306: M4_inst %ebx, 8(%edi)
307:
308: decl %esi
309: movl $2, %ecx C index used by L(inplace_finish_one); movl keeps the decl flags
310:
311: jz L(normal_done) C same epilogue as L(inplace_done)
312:
313: L(inplace_finish_one):
314: movl 4(%edx,%ecx,4), %eax C ecx is 0 when reached by the jz above, 2 when falling through
315: M4_inst %eax, 4(%edi,%ecx,4)
316:
317: L(inplace_done):
318: popl %esi
319: popl %edi
320:
321: movl $0, %eax C movl rather than xorl so CF survives until the setc
322: popl %ebx
323:
324: setc %al C return value: carry (or borrow) out of the top limb
325:
326: ret
327:
328: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>