Annotation of OpenXM_contrib/gmp/mpn/x86/k6/aors_n.asm, Revision 1.1.1.1
1.1 maekawa 1: dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
2: dnl
3: dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
4:
5:
6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
7: dnl
8: dnl This file is part of the GNU MP Library.
9: dnl
10: dnl The GNU MP Library is free software; you can redistribute it and/or
11: dnl modify it under the terms of the GNU Lesser General Public License as
12: dnl published by the Free Software Foundation; either version 2.1 of the
13: dnl License, or (at your option) any later version.
14: dnl
15: dnl The GNU MP Library is distributed in the hope that it will be useful,
16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18: dnl Lesser General Public License for more details.
19: dnl
20: dnl You should have received a copy of the GNU Lesser General Public
21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
23: dnl Suite 330, Boston, MA 02111-1307, USA.
24:
25:
26: include(`../config.m4')
27: dnl Select the carry instruction and the exported names for this build:
28: dnl OPERATION_add_n is tested first, then OPERATION_sub_n, else m4_error.
29: ifdef(`OPERATION_add_n', `
30: define(M4_inst, adcl)
31: define(M4_function_n, mpn_add_n)
32: define(M4_function_nc, mpn_add_nc)
33: define(M4_description, add)
34: ',`ifdef(`OPERATION_sub_n', `
35: define(M4_inst, sbbl)
36: define(M4_function_n, mpn_sub_n)
37: define(M4_function_nc, mpn_sub_nc)
38: define(M4_description, subtract)
39: ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
40: ')')')
41:
42: MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
43:
44:
45: C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
46: C mp_size_t size);
47: C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
48: C mp_size_t size, mp_limb_t carry);
49: C
50: C Calculate src1,size M4_description src2,size, and store the result in
51: C dst,size. The return value is the carry bit from the top of the result
52: C (1 or 0).
53: C
54: C The _nc version accepts 1 or 0 for an initial carry into the low limb of
55: C the calculation. Note values other than 1 or 0 here will lead to garbage
56: C results.
57: C
58: C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
59: C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
60: C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
61: dnl Stack arguments at FRAME+4 upward from %esp, dst first and carry last; FRAME starts at 0.
62: define(PARAM_CARRY, `FRAME+20(%esp)')
63: define(PARAM_SIZE, `FRAME+16(%esp)')
64: define(PARAM_SRC2, `FRAME+12(%esp)')
65: define(PARAM_SRC1, `FRAME+8(%esp)')
66: define(PARAM_DST, `FRAME+4(%esp)')
67: deflit(`FRAME',0)
68:
69: dnl minimum 5 because the unrolled code can't handle less
70: deflit(UNROLL_THRESHOLD, 5)
71:
72: .text
73: ALIGN(32)
74: C Carry-in entry point: loads the carry argument into eax and continues in M4_function_n.
75: PROLOGUE(M4_function_nc)
76: movl PARAM_CARRY, %eax C eax = initial carry, expected to be 0 or 1
77: jmp LF(M4_function_n,start) C presumably resolves to L(start) inside M4_function_n
78: EPILOGUE()
79: C M4_function_n: shared body for both entry points, with the carry-in (0 or 1) in eax.
80: C Returns in eax the carry flag left by the last M4_inst (1 or 0).
81: PROLOGUE(M4_function_n)
82: xorl %eax, %eax C plain _n entry: carry-in is zero
83: L(start): C M4_function_nc jumps here with eax = its carry argument
84: movl PARAM_SIZE, %ecx
85: pushl %ebx
86: FRAME_pushl() C presumably adjusts FRAME so PARAM_* stay valid after the push
87:
88: movl PARAM_SRC1, %ebx
89: pushl %edi
90: FRAME_pushl()
91:
92: movl PARAM_SRC2, %edx
93: cmpl $UNROLL_THRESHOLD, %ecx
94:
95: movl PARAM_DST, %edi C movl leaves the cmpl flags intact for the jae
96: jae L(unroll) C size >= UNROLL_THRESHOLD, unsigned compare
97:
98:
99: shrl %eax C initial carry flag: bit 0 of eax shifts out into CF
100:
101: C offset 0x21 here, close enough to aligned
102: L(simple):
103: C eax scratch
104: C ebx src1
105: C ecx counter, limbs remaining
106: C edx src2
107: C esi
108: C edi dst
109: C ebp
110: C
111: C The store to (%edi) could be done with a stosl; it'd be smaller
112: C code, but there's no speed gain and a cld would have to be added
113: C (per mpn/x86/README.family).
114:
115: movl (%ebx), %eax
116: leal 4(%ebx), %ebx C leal advances the pointer without touching CF
117:
118: M4_inst (%edx), %eax C adcl or sbbl: consumes CF and produces the next CF
119:
120: movl %eax, (%edi)
121: leal 4(%edi), %edi
122:
123: leal 4(%edx), %edx
124: loop L(simple) C loop decrements ecx without changing any flags
125:
126:
127: movl $0, %eax C movl rather than xorl, so CF survives for the setc
128: popl %edi
129:
130: setc %al C return value = final carry
131:
132: popl %ebx
133: ret
134:
135:
136: C -----------------------------------------------------------------------------
137: L(unroll):
138: C eax carry
139: C ebx src1
140: C ecx counter (size, at least UNROLL_THRESHOLD)
141: C edx src2
142: C esi (saved below, then used for the leftover count)
143: C edi dst
144: C ebp
145:
146: cmpl %edi, %ebx C src1 == dst ?
147: pushl %esi C pushl does not disturb the flags from the cmpl
148:
149: je L(inplace)
150:
151: ifdef(`OPERATION_add_n',`
152: cmpl %edi, %edx C src2 == dst ? only built for add, where operand order does not matter
153:
154: je L(inplace_reverse)
155: ')
156:
157: movl %ecx, %esi
158:
159: andl $-4, %ecx C ecx = size rounded down to a multiple of 4
160: andl $3, %esi C esi = size mod 4, limbs left for the tail code
161:
162: leal (%ebx,%ecx,4), %ebx C point all three operands just past the unrolled part
163: leal (%edx,%ecx,4), %edx
164: leal (%edi,%ecx,4), %edi
165:
166: negl %ecx C negative limb index, counts up to zero
167: shrl %eax C carry-in to CF; from here only M4_inst changes CF
168:
169: ALIGN(32)
170: L(normal_top):
171: C eax scratch
172: C ebx src1
173: C ecx counter, limbs, negative
174: C edx src2
175: C esi size mod 4
176: C edi dst
177: C ebp
178:
179: movl (%ebx,%ecx,4), %eax
180: leal 5(%ecx), %ecx C +5 now and -1 from loop: net +4 limbs per pass; the -20 offsets undo the +5
181: M4_inst -20(%edx,%ecx,4), %eax
182: movl %eax, -20(%edi,%ecx,4)
183:
184: movl 4-20(%ebx,%ecx,4), %eax
185: M4_inst 4-20(%edx,%ecx,4), %eax
186: movl %eax, 4-20(%edi,%ecx,4)
187:
188: movl 8-20(%ebx,%ecx,4), %eax
189: M4_inst 8-20(%edx,%ecx,4), %eax
190: movl %eax, 8-20(%edi,%ecx,4)
191:
192: movl 12-20(%ebx,%ecx,4), %eax
193: M4_inst 12-20(%edx,%ecx,4), %eax
194: movl %eax, 12-20(%edi,%ecx,4)
195:
196: loop L(normal_top) C falls through once ecx reaches zero
197:
198:
199: decl %esi C decl leaves CF alone
200: jz L(normal_finish_one) C exactly one limb left; ecx is 0 here
201: js L(normal_done) C no limbs left
202:
203: C two or three more limbs
204:
205: movl (%ebx), %eax
206: M4_inst (%edx), %eax
207: movl %eax, (%edi)
208:
209: movl 4(%ebx), %eax
210: M4_inst 4(%edx), %eax
211: decl %esi
212: movl %eax, 4(%edi) C movl keeps the decl flags for the jz
213:
214: jz L(normal_done) C only two limbs were left
215: movl $2, %ecx C index of the third leftover limb
216:
217: L(normal_finish_one): C ecx is 0 (single leftover limb) or 2 (third leftover limb)
218: movl (%ebx,%ecx,4), %eax
219: M4_inst (%edx,%ecx,4), %eax
220: movl %eax, (%edi,%ecx,4)
221:
222: L(normal_done):
223: popl %esi
224: popl %edi
225:
226: movl $0, %eax C movl keeps CF, unlike xorl
227: popl %ebx
228:
229: setc %al C return value = final carry
230:
231: ret
232:
233:
234: C -----------------------------------------------------------------------------
235:
236: ifdef(`OPERATION_add_n',`
237: L(inplace_reverse):
238: C dst==src2
239:
240: movl %ebx, %edx C src1 becomes the source operand of the in-place loop
241: ')
242:
243: L(inplace):
244: C eax initial carry
245: C ebx (free from here on)
246: C ecx size
247: C edx src
248: C esi
249: C edi dst
250: C ebp
251:
252: leal -1(%ecx), %esi
253: decl %ecx C size-1: one limb is always handled after the loop
254:
255: andl $-4, %ecx C ecx = size-1 rounded down to a multiple of 4
256: andl $3, %esi C esi = (size-1) mod 4
257:
258: movl (%edx), %ebx C src low limb
259: leal (%edx,%ecx,4), %edx
260:
261: leal (%edi,%ecx,4), %edi
262: negl %ecx C negative limb index, counts up to zero
263:
264: shrl %eax C carry-in to CF; from here only M4_inst changes CF
265:
266:
267: ALIGN(32)
268: L(inplace_top):
269: C eax scratch
270: C ebx next src limb
271: C ecx counter, limbs, negative
272: C edx src
273: C esi (size-1) mod 4
274: C edi dst
275: C ebp
276:
277: M4_inst %ebx, (%edi,%ecx,4) C dst limb combined in memory with the preloaded src limb
278:
279: movl 4(%edx,%ecx,4), %eax
280: leal 5(%ecx), %ecx C +5 now and -1 from loop: net +4 limbs per pass; the -20 offsets undo the +5
281:
282: M4_inst %eax, 4-20(%edi,%ecx,4)
283:
284: movl 8-20(%edx,%ecx,4), %eax
285: movl 12-20(%edx,%ecx,4), %ebx
286:
287: M4_inst %eax, 8-20(%edi,%ecx,4)
288: M4_inst %ebx, 12-20(%edi,%ecx,4)
289:
290: movl 16-20(%edx,%ecx,4), %ebx C preload the first src limb for the next pass, or for the tail
291: loop L(inplace_top) C falls through once ecx reaches zero
292:
293:
294: C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
295:
296: M4_inst %ebx, (%edi) C limb preloaded by the last pass of the loop
297:
298: decl %esi C decl leaves CF alone
299: jz L(inplace_finish_one) C exactly one more limb; ecx is 0 here
300: js L(inplace_done) C nothing more to do
301:
302: C two or three more limbs
303:
304: movl 4(%edx), %eax
305: movl 8(%edx), %ebx
306: M4_inst %eax, 4(%edi)
307: M4_inst %ebx, 8(%edi)
308:
309: decl %esi
310: movl $2, %ecx C movl keeps the decl flags for the jz
311:
312: jz L(normal_done) C two limbs were left; L(normal_done) is the same epilogue as L(inplace_done)
313:
314: L(inplace_finish_one): C ecx is 0 (one more limb) or 2 (third extra limb)
315: movl 4(%edx,%ecx,4), %eax
316: M4_inst %eax, 4(%edi,%ecx,4)
317:
318: L(inplace_done):
319: popl %esi
320: popl %edi
321:
322: movl $0, %eax C movl keeps CF, unlike xorl
323: popl %ebx
324:
325: setc %al C return value = final carry
326:
327: ret
328:
329: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>