Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/mul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 nails mpn_mul_1.
2:
3: dnl Copyright 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: dnl INPUT PARAMETERS
25: define(`rp',`r16')
26: define(`up',`r17')
27: define(`n',`r18')
28: define(`vl0',`r19')
29:
30: define(`numb_mask',`r24')
31:
32: define(`m0a',`r0')
33: define(`m0b',`r1')
34: define(`m1a',`r2')
35: define(`m1b',`r3')
36: define(`m2a',`r20')
37: define(`m2b',`r21')
38: define(`m3a',`r22')
39: define(`m3b',`r23')
40:
41: define(`acc0',`r27')
42: define(`acc1',`r25')
43:
44: define(`ul0',`r4')
45: define(`ul1',`r5')
46: define(`ul2',`r6')
47: define(`ul3',`r7')
48:
49: C unused scratch: none
50: C unused saved: r9 r10 r11 r12 r13 r14
51:
52: define(`NAIL_BITS',`GMP_NAIL_BITS')
53: define(`NUMB_BITS',`GMP_NUMB_BITS')
54:
55: dnl This declaration is munged by configure
56: NAILS_SUPPORT(1-63)
57:
58: dnl Runs at 3.5 cycles/limb. Naively made from addmul_1.asm. A better
59: dnl implementation could bring speed to 2.75 cycles/limb.
60:
61: dnl Register usage:
62: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
63: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
64: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
65: dnl return address: 26
66: dnl global pointer: 29
67: dnl stack pointer: 30
68:
69: ASM_START()
C mpn_mul_1 -- multiply the n limbs read from up by vl0, store the
C numb-masked result limbs at rp, and leave the final carry-out limb in r0.
C
C Inputs, per the register defines earlier in this file:
C   rp = r16, up = r17, n = r18, vl0 = r19
C
C Method: vl0 is shifted left by NAIL_BITS once on entry.  For each limb u
C   lo = mulq(vl0,u) >> NAIL_BITS        hi = umulh(vl0,u)
C and each result limb is formed as
C   acc   = lo(this limb) + hi(previous limb) + carry
C   limb  = acc & numb_mask              carry = acc >> NUMB_BITS
C The carry lives in r15, the only callee-saved register this routine
C touches.  The hi part of the most recent limb is always in m3b when a
C group of limbs is complete.
C
C The first n AND 3 limbs are handled one at a time in Loop0.  The rest are
C handled in groups of four by the unrolled, software-pipelined code below.
C n itself is never reduced by the Loop0 count; that is harmless because n
C is only ever tested for sign after subtracting multiples of 4.
C
C Register aliasing to keep in mind: m0a is r0 (also the return value
C register used at the end) and acc1 is r25 (also the Loop0 counter).
70: PROLOGUE(mpn_mul_1)
71: lda r30, -240(r30) C allocate a 240 byte stack frame
72: C stq r9, 8(r30)
73: C stq r10, 16(r30)
74: C stq r11, 24(r30)
75: C stq r12, 32(r30)
76: C stq r13, 40(r30)
77: C stq r14, 48(r30)
78: stq r15, 56(r30) C save r15, used below as the carry
79:
80: sll vl0, NAIL_BITS, vl0 C pre-shift the multiplier by the nail width
81: lda numb_mask, -1(r31) C all ones
82: srl numb_mask, NAIL_BITS, numb_mask C clear the top NAIL_BITS bits
83:
84: bic r31, r31, r15 C carry = 0
85: bic r31, r31, m3b C no previous high product yet
86:
87: and n, 3, r25 C r25 = number of leftover limbs, n AND 3
88: beq r25, L4
C Loop0: one limb per iteration, r25 counts down to zero.
89: Loop0:
90: ldq ul0, 0(up)
91: mulq vl0, ul0, m0a C U1
92: srl m0a,NAIL_BITS, r8 C r8 = lo part of this product
93: addq r8, m3b, acc0 C add hi part of the previous product
94: addq r15, acc0, acc0 C add carry
95: umulh vl0, ul0, m3b C U1
96: srl acc0,NUMB_BITS, r15 C new carry
97: and acc0,numb_mask, r28 C result limb
98: stq r28, 0(rp)
99: lda rp, 8(rp)
100: lda up, 8(up)
101: lda r25, -1(r25)
102: bne r25, Loop0
103:
C L4: n still holds the entry value here.  If n - 4 is negative there is no
C full group of four, so Loop0 has already produced every result limb.
104: L4:
105: lda n, -4(n)
106: bge n, L_4_or_more
C Nothing left: the return value is the pending hi part plus the carry.
107: L_0_to_3:
108: addq m3b, r15, r0
109: br r31, Lret
110:
C At least one group of four: load it and advance up past it.
111: L_4_or_more:
112: ldq ul0, 0(up)
113: ldq ul1, 8(up)
114: ldq ul2, 16(up)
115: ldq ul3, 24(up)
116: lda n, -4(n)
117: lda up, 32(up)
118: bge n, L_8_or_more
C Exactly one group of four: multiply it, chain the four accumulations,
C store at 0..24(rp) and form the return value.  No further loads.
119: L_4_to_8:
120: mulq vl0, ul0, m0a C U1
121: umulh vl0, ul0, m0b C U1
122: mulq vl0, ul1, m1a C U1
123: umulh vl0, ul1, m1b C U1
124: mulq vl0, ul2, m2a C U1
125: umulh vl0, ul2, m2b C U1
126: srl m0a,NAIL_BITS, r8
127: mulq vl0, ul3, m3a C U1
128: addq r8, m3b, acc0
129: umulh vl0, ul3, m3b C U1
130: srl m1a,NAIL_BITS, r8
131: addq r15, acc0, acc0
132:
133: addq r8, m0b, acc1
134: srl acc0,NUMB_BITS, r15
135: and acc0,numb_mask, r28
136: srl m2a,NAIL_BITS, r8
137: addq r15, acc1, acc1
138: bis r31, r31, r31 C nop
139: addq r8, m1b, acc0
140: srl acc1,NUMB_BITS, r15
141: stq r28, 0(rp)
142: and acc1,numb_mask, r28
143: srl m3a,NAIL_BITS, r8
144: addq r15, acc0, acc0
145: bis r31, r31, r31 C nop
146: addq r8, m2b, acc1
147: srl acc0,NUMB_BITS, r15
148: stq r28, 8(rp)
149: and acc0,numb_mask, r28
150: addq r15, acc1, acc1
151: bis r31, r31, r31 C nop
152: srl acc1,NUMB_BITS, r15
153: stq r28, 16(rp)
154: and acc1,numb_mask, r28
155: addq m3b, r15, acc0 C last hi part plus carry
156: stq r28, 24(rp)
157: and acc0,numb_mask, r0 C return value
158:
159: br r31, Lret
160:
C At least two groups of four: start the products of the group already in
C ul0..ul3 while loading the next group into the same registers.  rp is
C advanced by 32 here, so the stores that follow use offsets from -32.
C On exit acc0 holds the first accumulation and r8 the lo part of limb 1.
161: L_8_or_more:
162: mulq vl0, ul0, m0a C U1
163: umulh vl0, ul0, m0b C U1
164: ldq ul0, 0(up)
165: mulq vl0, ul1, m1a C U1
166: umulh vl0, ul1, m1b C U1
167: ldq ul1, 8(up)
168: mulq vl0, ul2, m2a C U1
169: umulh vl0, ul2, m2b C U1
170: ldq ul2, 16(up)
171: srl m0a,NAIL_BITS, r8
172: mulq vl0, ul3, m3a C U1
173: addq r8, m3b, acc0
174: umulh vl0, ul3, m3b C U1
175: ldq ul3, 24(up)
176: srl m1a,NAIL_BITS, r8
177: addq r15, acc0, acc0
178: lda n, -4(n)
179: lda up, 32(up)
180: lda rp, 32(rp)
181: bge n, L_12_or_more C U0
C Exactly two groups of four: finish the first group (stores at -32..-8)
C while multiplying the second, then finish the second (stores at 0..24)
C and form the return value.  No further loads.
182: L_8_to_11:
183: mulq vl0, ul0, m0a C U1
184: addq r8, m0b, acc1
185: srl acc0,NUMB_BITS, r15
186: umulh vl0, ul0, m0b C U1
187: and acc0,numb_mask, r28
188: srl m2a,NAIL_BITS, r8
189: addq r15, acc1, acc1
190: bis r31, r31, r31 C nop
191: mulq vl0, ul1, m1a C U1
192: addq r8, m1b, acc0
193: srl acc1,NUMB_BITS, r15
194: stq r28, -32(rp)
195: umulh vl0, ul1, m1b C U1
196: and acc1,numb_mask, r28
197: srl m3a,NAIL_BITS, r8
198: addq r15, acc0, acc0
199: bis r31, r31, r31 C nop
200: mulq vl0, ul2, m2a C U1
201: addq r8, m2b, acc1
202: srl acc0,NUMB_BITS, r15
203: stq r28, -24(rp)
204: umulh vl0, ul2, m2b C U1
205: and acc0,numb_mask, r28
206: srl m0a,NAIL_BITS, r8
207: addq r15, acc1, acc1
208: bis r31, r31, r31 C nop
209: mulq vl0, ul3, m3a C U1
210: addq r8, m3b, acc0
211: srl acc1,NUMB_BITS, r15
212: stq r28, -16(rp)
213: umulh vl0, ul3, m3b C U1
214: and acc1,numb_mask, r28
215: srl m1a,NAIL_BITS, r8
216: addq r15, acc0, acc0
217:
218: addq r8, m0b, acc1
219: srl acc0,NUMB_BITS, r15
220: stq r28, -8(rp)
221: and acc0,numb_mask, r28
222: srl m2a,NAIL_BITS, r8
223: addq r15, acc1, acc1
224: bis r31, r31, r31 C nop
225: addq r8, m1b, acc0
226: srl acc1,NUMB_BITS, r15
227: stq r28, 0(rp)
228: and acc1,numb_mask, r28
229: srl m3a,NAIL_BITS, r8
230: addq r15, acc0, acc0
231: bis r31, r31, r31 C nop
232: addq r8, m2b, acc1
233: srl acc0,NUMB_BITS, r15
234: stq r28, 8(rp)
235: and acc0,numb_mask, r28
236: addq r15, acc1, acc1
237: bis r31, r31, r31 C nop
238: srl acc1,NUMB_BITS, r15
239: stq r28, 16(rp)
240: and acc1,numb_mask, r28
241: addq m3b, r15, acc0 C last hi part plus carry
242: stq r28, 24(rp)
243: and acc0,numb_mask, r0 C return value
244:
245: br r31, Lret
246:
C At least three groups of four: pipeline fill.  Finishes limbs 0..2 of the
C group whose products are pending (stores at -32, -24, -16), starts the
C products of the group now in ul0..ul3, and loads the following group.
C The fourth finished limb is left in r28; after rp advances by 32 it is
C stored at -40(rp) by the first instruction group of Loop or L_end.
247: L_12_or_more:
248: mulq vl0, ul0, m0a C U1
249: addq r8, m0b, acc1
250: srl acc0,NUMB_BITS, r15
251: umulh vl0, ul0, m0b C U1
252: ldq ul0, 0(up)
253: and acc0,numb_mask, r28
254: srl m2a,NAIL_BITS, r8
255: addq r15, acc1, acc1
256: bis r31, r31, r31 C nop
257: mulq vl0, ul1, m1a C U1
258: addq r8, m1b, acc0
259: srl acc1,NUMB_BITS, r15
260: stq r28, -32(rp)
261: umulh vl0, ul1, m1b C U1
262: ldq ul1, 8(up)
263: and acc1,numb_mask, r28
264: srl m3a,NAIL_BITS, r8
265: addq r15, acc0, acc0
266: bis r31, r31, r31 C nop
267: mulq vl0, ul2, m2a C U1
268: addq r8, m2b, acc1
269: srl acc0,NUMB_BITS, r15
270: stq r28, -24(rp)
271: umulh vl0, ul2, m2b C U1
272: ldq ul2, 16(up)
273: and acc0,numb_mask, r28
274: srl m0a,NAIL_BITS, r8
275: addq r15, acc1, acc1
276: bis r31, r31, r31 C nop
277: mulq vl0, ul3, m3a C U1
278: addq r8, m3b, acc0
279: srl acc1,NUMB_BITS, r15
280: stq r28, -16(rp)
281: umulh vl0, ul3, m3b C U1
282: ldq ul3, 24(up)
283: and acc1,numb_mask, r28
284: srl m1a,NAIL_BITS, r8
285: addq r15, acc0, acc0
286: bis r31, r31, r31 C nop
287: bis r31, r31, r31 C nop
288: bis r31, r31, r31 C nop
289: bis r31, r31, r31 C nop
290: bis r31, r31, r31 C nop
291: lda n, -4(n)
292: lda up, 32(up)
293: lda rp, 32(rp)
294: blt n, L_end C U0
295:
C Loop: steady state.  Each iteration stores four finished limbs (the one
C pending in r28 at -40, then -32, -24, -16 relative to the advanced rp),
C starts the four products of the group in ul0..ul3, and loads the next
C group of four from up.  It repeats while n stays non-negative after the
C subtraction of 4.
296: Loop:
297: C
298: mulq vl0, ul0, m0a C U1
299: addq r8, m0b, acc1
300: srl acc0,NUMB_BITS, r15
301: stq r28, -40(rp)
302: C
303: umulh vl0, ul0, m0b C U1
304: ldq ul0, 0(up)
305: bis r31, r31, r31 C nop
306: and acc0,numb_mask, r28
307: C
308: srl m2a,NAIL_BITS, r8
309: bis r31, r31, r31 C nop
310: addq r15, acc1, acc1
311: bis r31, r31, r31 C nop
312: C
313: mulq vl0, ul1, m1a C U1
314: addq r8, m1b, acc0
315: srl acc1,NUMB_BITS, r15
316: stq r28, -32(rp)
317: C
318: umulh vl0, ul1, m1b C U1
319: ldq ul1, 8(up)
320: bis r31, r31, r31 C nop
321: and acc1,numb_mask, r28
322: C
323: srl m3a,NAIL_BITS, r8
324: bis r31, r31, r31 C nop
325: addq r15, acc0, acc0
326: bis r31, r31, r31 C nop
327: C
328: mulq vl0, ul2, m2a C U1
329: addq r8, m2b, acc1
330: srl acc0,NUMB_BITS, r15
331: stq r28, -24(rp)
332: C
333: umulh vl0, ul2, m2b C U1
334: ldq ul2, 16(up)
335: bis r31, r31, r31 C nop
336: and acc0,numb_mask, r28
337: C
338: srl m0a,NAIL_BITS, r8
339: bis r31, r31, r31 C nop
340: addq r15, acc1, acc1
341: bis r31, r31, r31 C nop
342: C
343: mulq vl0, ul3, m3a C U1
344: addq r8, m3b, acc0
345: srl acc1,NUMB_BITS, r15
346: stq r28, -16(rp)
347: C
348: umulh vl0, ul3, m3b C U1
349: ldq ul3, 24(up)
350: bis r31, r31, r31 C nop
351: and acc1,numb_mask, r28
352: C
353: srl m1a,NAIL_BITS, r8
354: bis r31, r31, r31 C nop
355: addq r15, acc0, acc0
356: bis r31, r31, r31 C nop
357: C
358: lda n, -4(n)
359: lda up, 32(up)
360: lda rp, 32(rp)
361: bge n, Loop C U0
362:
C L_end: pipeline drain, no more loads.  First half: same work as one Loop
C iteration minus the loads, multiplying the last group in ul0..ul3.
C Second half, after rp advances by 32: stores the pending limb at -40,
C finishes the last group at -32..-8, and forms the return value.
363: L_end:
364: mulq vl0, ul0, m0a C U1
365: addq r8, m0b, acc1
366: srl acc0,NUMB_BITS, r15
367: stq r28, -40(rp)
368: umulh vl0, ul0, m0b C U1
369: and acc0,numb_mask, r28
370: srl m2a,NAIL_BITS, r8
371: addq r15, acc1, acc1
372: bis r31, r31, r31 C nop
373: mulq vl0, ul1, m1a C U1
374: addq r8, m1b, acc0
375: srl acc1,NUMB_BITS, r15
376: stq r28, -32(rp)
377: umulh vl0, ul1, m1b C U1
378: and acc1,numb_mask, r28
379: srl m3a,NAIL_BITS, r8
380: addq r15, acc0, acc0
381: bis r31, r31, r31 C nop
382: mulq vl0, ul2, m2a C U1
383: addq r8, m2b, acc1
384: srl acc0,NUMB_BITS, r15
385: stq r28, -24(rp)
386: umulh vl0, ul2, m2b C U1
387: and acc0,numb_mask, r28
388: srl m0a,NAIL_BITS, r8
389: addq r15, acc1, acc1
390: bis r31, r31, r31 C nop
391: mulq vl0, ul3, m3a C U1
392: addq r8, m3b, acc0
393: srl acc1,NUMB_BITS, r15
394: stq r28, -16(rp)
395: umulh vl0, ul3, m3b C U1
396: and acc1,numb_mask, r28
397: srl m1a,NAIL_BITS, r8
398: addq r15, acc0, acc0
399: bis r31, r31, r31 C nop
400: bis r31, r31, r31 C nop
401: bis r31, r31, r31 C nop
402: bis r31, r31, r31 C nop
403: bis r31, r31, r31 C nop
404: lda rp, 32(rp)
405:
406: addq r8, m0b, acc1
407: srl acc0,NUMB_BITS, r15
408: stq r28, -40(rp)
409: and acc0,numb_mask, r28
410: srl m2a,NAIL_BITS, r8
411: addq r15, acc1, acc1
412: bis r31, r31, r31 C nop
413: addq r8, m1b, acc0
414: srl acc1,NUMB_BITS, r15
415: stq r28, -32(rp)
416: and acc1,numb_mask, r28
417: srl m3a,NAIL_BITS, r8
418: addq r15, acc0, acc0
419: bis r31, r31, r31 C nop
420: addq r8, m2b, acc1
421: srl acc0,NUMB_BITS, r15
422: stq r28, -24(rp)
423: and acc0,numb_mask, r28
424: addq r15, acc1, acc1
425: bis r31, r31, r31 C nop
426: srl acc1,NUMB_BITS, r15
427: stq r28, -16(rp)
428: and acc1,numb_mask, r28
429: addq m3b, r15, acc0 C last hi part plus carry
430: stq r28, -8(rp)
431: and acc0,numb_mask, r0 C return value
C Lret: common exit.  Restore r15, release the 240 byte frame, and return
C through r26.
432: Lret:
433: C ldq r9, 8(r30)
434: C ldq r10, 16(r30)
435: C ldq r11, 24(r30)
436: C ldq r12, 32(r30)
437: C ldq r13, 40(r30)
438: C ldq r14, 48(r30)
439: ldq r15, 56(r30)
440: lda r30, 240(r30)
441: ret r31, (r26), 1
442: EPILOGUE(mpn_mul_1)
443: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>