Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 nails mpn_addmul_1.
2:
3: dnl Copyright 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: dnl INPUT PARAMETERS
C The four arguments.  In the code below rp is both loaded from and stored
C to, up is only loaded from, n is the limb count (stepped down by 4 per
C unrolled block) and vl0 is the single multiplier limb.
25: define(`rp',`r16')
26: define(`up',`r17')
27: define(`n',`r18')
28: define(`vl0',`r19')
29:
C Mask with the low bits set; built at function entry as -1 >> NAIL_BITS.
30: define(`numb_mask',`r14')
31:
C Product halves for the four limbs of an unrolled block: mKa receives the
C mulq result (low 64 bits) and mKb the umulh result (high 64 bits) for
C limb K.  m0a aliases r0, which every exit path writes last with the
C return value.  m3a and m3b live in callee-saved registers, so the
C function saves and restores r12 and r13.
32: define(`m0a',`r0')
33: define(`m0b',`r1')
34: define(`m1a',`r2')
35: define(`m1b',`r3')
36: define(`m2a',`r20')
37: define(`m2b',`r21')
38: define(`m3a',`r12')
39: define(`m3b',`r13')
40:
C Two accumulators used alternately for consecutive limb sums.
41: define(`acc0',`r9')
42: define(`acc1',`r27')
43:
C Limbs loaded from up.
44: define(`ul0',`r4')
45: define(`ul1',`r5')
46: define(`ul2',`r6')
47: define(`ul3',`r7')
48:
C Limbs loaded from rp.  r25 (rl3) doubles as the Loop0 iteration counter
C before the unrolled code first loads rl3.
49: define(`rl0',`r22')
50: define(`rl1',`r23')
51: define(`rl2',`r24')
52: define(`rl3',`r25')
53:
54: C unused scratch
55: C unused saved r10 r11
56:
C Short names for the nail and numb bit counts, which are defined outside
C this file.
57: define(`NAIL_BITS',`GMP_NAIL_BITS')
58: define(`NUMB_BITS',`GMP_NUMB_BITS')
59:
60: dnl This declaration is munged by configure
C NOTE(review): presumably this restricts the file to builds with 2..63 nail
C bits.  A minimum of 2 looks necessary because each limb sum adds three
C NUMB_BITS-wide values plus a carry inside one 64-bit register -- confirm.
61: NAILS_SUPPORT(2-63)
62:
63: dnl Runs at 4.5 cycles/limb. Local scheduling should bring that down to 3.5
64: dnl cycles/limb. It would be possible to reach 3.25 cycles/limb with 8-way
65: dnl unrolling.
66:
67: dnl Register usage:
68: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
69: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
70: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
71: dnl return address: r26
72: dnl global pointer: r29
73: dnl stack pointer: r30
74:
C mpn_addmul_1 (rp, up, n, vl0), nails variant.
C For each of the n limbs: load one limb from up and one from rp, multiply
C the up limb by vl0, add the product to the rp limb with carry
C propagation, and store the sum masked with numb_mask back to rp.
C Every exit path leaves the final high product half plus the final carry
C in r0.
C r8, r25 and r28 are used as temporaries throughout; r15 holds the carry.
75: ASM_START()
76: PROLOGUE(mpn_addmul_1)
C Allocate a 240-byte frame and save the callee-saved registers that get
C overwritten: r9 (acc0), r12/r13 (m3a/m3b), r14 (numb_mask), r15 (carry).
C r10 and r11 are never written, so their saves stay commented out.
C Only offsets 8..56 of the frame are used in this function.
77: lda r30, -240(r30)
78: stq r9, 8(r30)
79: C stq r10, 16(r30)
80: C stq r11, 24(r30)
81: stq r12, 32(r30)
82: stq r13, 40(r30)
83: stq r14, 48(r30)
84: stq r15, 56(r30)
85:
C Pre-shift the multiplier left by NAIL_BITS.  Afterwards the umulh result
C is used unshifted as the high part of each product and the mulq result
C only needs a right shift by NAIL_BITS to give the low part.
C NOTE(review): this relies on NUMB_BITS + NAIL_BITS = 64 -- confirm.
86: sll vl0, NAIL_BITS, vl0
C numb_mask = all ones, logically shifted right by NAIL_BITS.
87: lda numb_mask, -1(r31)
88: srl numb_mask, NAIL_BITS, numb_mask
89:
C Clear the running carry (r15) and the previous-limb high half (m3b).
90: bic r31, r31, r15
91: bic r31, r31, m3b
92:
C r25 = n mod 4 = number of limbs handled one at a time in Loop0.
C When n is a multiple of four, skip Loop0 and go straight to L4.
93: and n, 3, r25
94: beq r25, L4
C Loop0: handle n mod 4 limbs, one per iteration, counting r25 down to 0.
C Each pass forms
C   acc0 = (low product >> NAIL_BITS) + m3b + rl0 + r15
C where m3b is the high product half of the previous limb and r15 is the
C previous carry.  It then stores acc0 & numb_mask and keeps
C acc0 >> NUMB_BITS as the new carry.  n itself is not changed here.
95: Loop0:
96: ldq ul0, 0(up)
97: ldq rl0, 0(rp)
98: mulq vl0, ul0, m0a C U1
99: srl m0a,NAIL_BITS, r8
100: addq r8, m3b, acc0
101: addq rl0, acc0, acc0
102: addq r15, acc0, acc0
C The old m3b has been consumed above, so it can now take this limb's high
C half for use by the next limb (or by the return value).
103: umulh vl0, ul0, m3b C U1
104: srl acc0,NUMB_BITS, r15
105: and acc0,numb_mask, r28
106: stq r28, 0(rp)
C Step both pointers by one 8-byte limb and count down.
107: lda rp, 8(rp)
108: lda up, 8(up)
109: lda r25, -1(r25)
110: bne r25, Loop0
111:
C L4: n still holds the full limb count, since Loop0 does not modify it.
C Subtract 4; a non-negative result means at least one whole 4-limb block
C is left for the unrolled code.
112: L4:
113: lda n, -4(n)
114: bge n, L_4_or_more
C Fewer than four limbs in total, all of them already handled by Loop0.
C Return high half + carry.  Unlike the other exit paths, this sum is not
C masked with numb_mask before it is placed in r0.
115: L_0_to_3:
116: addq m3b, r15, r0
117: br r31, Lret
118:
C L_4_or_more: load the four up limbs of the first 4-limb block and the
C first rp limb, advance up past the block, and take another 4 off n to
C test whether a second block exists.  rp is not advanced here.
119: L_4_or_more:
120: ldq ul0, 0(up)
121: ldq ul1, 8(up)
122: ldq ul2, 16(up)
123: ldq ul3, 24(up)
124: ldq rl0, 0(rp)
125: lda n, -4(n)
126: lda up, 32(up)
127: bge n, L_8_or_more
C L_4_to_8: fall-through case where exactly one 4-limb block is left.
C It is reached when the original n is 4..7; despite the label name, n = 8
C takes the L_8_or_more branch above.
C All eight products are issued first, interleaved with the remaining rp
C loads, then the four limb sums are formed alternately in acc0 and acc1:
C   limb K sum = (mKa >> NAIL_BITS) + high half of limb K-1 + rlK + carry
C For limb 0 the previous high half is m3b as left by the code above; it
C is read before the umulh for limb 3 overwrites m3b.
128: L_4_to_8:
129: mulq vl0, ul0, m0a C U1
130: umulh vl0, ul0, m0b C U1
131: ldq rl1, 8(rp)
132: mulq vl0, ul1, m1a C U1
133: umulh vl0, ul1, m1b C U1
134: ldq rl2, 16(rp)
135: mulq vl0, ul2, m2a C U1
136: umulh vl0, ul2, m2b C U1
137: srl m0a,NAIL_BITS, r8
138: ldq rl3, 24(rp)
139: mulq vl0, ul3, m3a C U1
140: addq r8, m3b, acc0
141: umulh vl0, ul3, m3b C U1
142: addq rl0, acc0, acc0
143: srl m1a,NAIL_BITS, r8
144: addq r15, acc0, acc0
145:
C Limb 1 sum in acc1; the carry out of limb 0 is extracted into r15 and the
C masked limb 0 result is staged in r28 for the store below.
146: addq r8, m0b, acc1
147: srl acc0,NUMB_BITS, r15
148: addq rl1, acc1, acc1
149: and acc0,numb_mask, r28
150: srl m2a,NAIL_BITS, r8
151: addq r15, acc1, acc1
152: bis r31, r31, r31 C nop
C Limb 2 sum in acc0.
153: addq r8, m1b, acc0
154: srl acc1,NUMB_BITS, r15
155: stq r28, 0(rp)
156: addq rl2, acc0, acc0
157: and acc1,numb_mask, r28
158: srl m3a,NAIL_BITS, r8
159: addq r15, acc0, acc0
160: bis r31, r31, r31 C nop
C Limb 3 sum in acc1.
161: addq r8, m2b, acc1
162: srl acc0,NUMB_BITS, r15
163: stq r28, 8(rp)
164: addq rl3, acc1, acc1
165: and acc0,numb_mask, r28
166: addq r15, acc1, acc1
167: bis r31, r31, r31 C nop
168: srl acc1,NUMB_BITS, r15
169: stq r28, 16(rp)
170: and acc1,numb_mask, r28
C Return value: high half of the last product plus the last carry, masked.
171: addq m3b, r15, acc0
172: stq r28, 24(rp)
173: and acc0,numb_mask, r0
174:
175: br r31, Lret
176:
C L_8_or_more: at least two 4-limb blocks are left.
C Issue the eight products for the first block.  up has already been
C advanced by 32, so as soon as each umulh has consumed ulK, ulK is
C reloaded from up with the matching limb of the second block.
C rl1-rl3 come from the first block; rl0 is reloaded from 32(rp), i.e. limb 0
C of the second block, after the old rl0 has been added into acc0.
C On exit acc0 holds the complete limb 0 sum of the first block (carry not
C yet extracted), r8 holds m1a >> NAIL_BITS, and up and rp have both been
C advanced by 32, so the first block is addressed as -32..-8(rp) from here.
177: L_8_or_more:
178: mulq vl0, ul0, m0a C U1
179: umulh vl0, ul0, m0b C U1
180: ldq ul0, 0(up)
181: ldq rl1, 8(rp)
182: mulq vl0, ul1, m1a C U1
183: umulh vl0, ul1, m1b C U1
184: ldq ul1, 8(up)
185: ldq rl2, 16(rp)
186: mulq vl0, ul2, m2a C U1
187: umulh vl0, ul2, m2b C U1
188: ldq ul2, 16(up)
189: srl m0a,NAIL_BITS, r8
190: ldq rl3, 24(rp)
191: mulq vl0, ul3, m3a C U1
C m3b still holds the previous high half here; the umulh two lines below
C replaces it only after this add has read it.
192: addq r8, m3b, acc0
193: umulh vl0, ul3, m3b C U1
194: ldq ul3, 24(up)
195: addq rl0, acc0, acc0
196: srl m1a,NAIL_BITS, r8
197: ldq rl0, 32(rp)
198: addq r15, acc0, acc0
C Take another 4 off n: non-negative means a third block exists.
199: lda n, -4(n)
200: lda up, 32(up)
201: lda rp, 32(rp)
202: bge n, L_12_or_more C U0
C L_8_to_11: fall-through case where exactly two 4-limb blocks are left
C (original n of 8..11).  rp points at the second block.
C First half: finish limbs 1-3 of the first block, using acc0 and r8 as
C left by the code above, and store its four results at -32..-8(rp), while
C issuing the eight products for the second block whose up limbs are
C already in ul0-ul3.  rl1-rl3 are reloaded from the second block once
C their first-block values have been added.
C Second half: form and store the four sums of the second block at
C 0..24(rp), then return (m3b + carry) & numb_mask in r0.
203: L_8_to_11:
204: mulq vl0, ul0, m0a C U1
205: addq r8, m0b, acc1
206: srl acc0,NUMB_BITS, r15
207: umulh vl0, ul0, m0b C U1
208: addq rl1, acc1, acc1
209: and acc0,numb_mask, r28
210: srl m2a,NAIL_BITS, r8
211: ldq rl1, 8(rp)
212: addq r15, acc1, acc1
213: bis r31, r31, r31 C nop
214: mulq vl0, ul1, m1a C U1
215: addq r8, m1b, acc0
216: srl acc1,NUMB_BITS, r15
217: stq r28, -32(rp)
218: umulh vl0, ul1, m1b C U1
219: addq rl2, acc0, acc0
220: and acc1,numb_mask, r28
221: srl m3a,NAIL_BITS, r8
222: ldq rl2, 16(rp)
223: addq r15, acc0, acc0
224: bis r31, r31, r31 C nop
225: mulq vl0, ul2, m2a C U1
226: addq r8, m2b, acc1
227: srl acc0,NUMB_BITS, r15
228: stq r28, -24(rp)
229: umulh vl0, ul2, m2b C U1
230: addq rl3, acc1, acc1
231: and acc0,numb_mask, r28
232: srl m0a,NAIL_BITS, r8
233: ldq rl3, 24(rp)
234: addq r15, acc1, acc1
235: bis r31, r31, r31 C nop
236: mulq vl0, ul3, m3a C U1
C Limb 0 of the second block: m3b is still the first block's limb 3 high
C half at this point.
237: addq r8, m3b, acc0
238: srl acc1,NUMB_BITS, r15
239: stq r28, -16(rp)
240: umulh vl0, ul3, m3b C U1
241: addq rl0, acc0, acc0
242: and acc1,numb_mask, r28
243: srl m1a,NAIL_BITS, r8
244: addq r15, acc0, acc0
245:
C Second half: no multiplies remain, only the sums for the second block.
246: addq r8, m0b, acc1
247: srl acc0,NUMB_BITS, r15
248: stq r28, -8(rp)
249: addq rl1, acc1, acc1
250: and acc0,numb_mask, r28
251: srl m2a,NAIL_BITS, r8
252: addq r15, acc1, acc1
253: bis r31, r31, r31 C nop
254: addq r8, m1b, acc0
255: srl acc1,NUMB_BITS, r15
256: stq r28, 0(rp)
257: addq rl2, acc0, acc0
258: and acc1,numb_mask, r28
259: srl m3a,NAIL_BITS, r8
260: addq r15, acc0, acc0
261: bis r31, r31, r31 C nop
262: addq r8, m2b, acc1
263: srl acc0,NUMB_BITS, r15
264: stq r28, 8(rp)
265: addq rl3, acc1, acc1
266: and acc0,numb_mask, r28
267: addq r15, acc1, acc1
268: bis r31, r31, r31 C nop
269: srl acc1,NUMB_BITS, r15
270: stq r28, 16(rp)
271: and acc1,numb_mask, r28
C Return value: high half of the last product plus the last carry, masked.
272: addq m3b, r15, acc0
273: stq r28, 24(rp)
274: and acc0,numb_mask, r0
275:
276: br r31, Lret
277:
C L_12_or_more: at least three 4-limb blocks are left; this fills the
C software pipeline for Loop.  Offsets below are relative to rp on entry.
C  - finishes limbs 1-3 of the block at -32(rp), using acc0 and r8 as left
C    by the code above, and stores limbs 0-2 at -32, -24 and -16(rp); the
C    masked limb 3 result stays in r28 and is stored by the code that
C    follows (at -40 relative to the advanced rp)
C  - issues the eight products for the block at 0(rp), whose up limbs are
C    already in ul0-ul3, and reloads each ulK from up right after its umulh
C  - reloads rl1-rl3 from 8..24(rp) and rl0 from 32(rp)
C  - leaves the limb 0 sum of the block at 0(rp) in acc0, carry not yet
C    extracted, and m1a >> NAIL_BITS in r8
C Then n is stepped down by 4 and up and rp are advanced by 32.
278: L_12_or_more:
279: mulq vl0, ul0, m0a C U1
280: addq r8, m0b, acc1
281: srl acc0,NUMB_BITS, r15
282: umulh vl0, ul0, m0b C U1
283: ldq ul0, 0(up)
284: addq rl1, acc1, acc1
285: and acc0,numb_mask, r28
286: srl m2a,NAIL_BITS, r8
287: ldq rl1, 8(rp)
288: addq r15, acc1, acc1
289: bis r31, r31, r31 C nop
290: mulq vl0, ul1, m1a C U1
291: addq r8, m1b, acc0
292: srl acc1,NUMB_BITS, r15
293: stq r28, -32(rp)
294: umulh vl0, ul1, m1b C U1
295: ldq ul1, 8(up)
296: addq rl2, acc0, acc0
297: and acc1,numb_mask, r28
298: srl m3a,NAIL_BITS, r8
299: ldq rl2, 16(rp)
300: addq r15, acc0, acc0
301: bis r31, r31, r31 C nop
302: mulq vl0, ul2, m2a C U1
303: addq r8, m2b, acc1
304: srl acc0,NUMB_BITS, r15
305: stq r28, -24(rp)
306: umulh vl0, ul2, m2b C U1
307: ldq ul2, 16(up)
308: addq rl3, acc1, acc1
309: and acc0,numb_mask, r28
310: srl m0a,NAIL_BITS, r8
311: ldq rl3, 24(rp)
312: addq r15, acc1, acc1
313: bis r31, r31, r31 C nop
314: mulq vl0, ul3, m3a C U1
315: addq r8, m3b, acc0
316: srl acc1,NUMB_BITS, r15
317: stq r28, -16(rp)
318: umulh vl0, ul3, m3b C U1
319: ldq ul3, 24(up)
320: addq rl0, acc0, acc0
321: and acc1,numb_mask, r28
322: srl m1a,NAIL_BITS, r8
323: ldq rl0, 32(rp)
324: addq r15, acc0, acc0
C NOTE(review): the run of nops presumably pads the stream for instruction
C scheduling or alignment of the loop that follows -- confirm.
325: bis r31, r31, r31 C nop
326: bis r31, r31, r31 C nop
327: bis r31, r31, r31 C nop
328: bis r31, r31, r31 C nop
329: bis r31, r31, r31 C nop
330: lda n, -4(n)
331: lda up, 32(up)
332: lda rp, 32(rp)
C Negative n: no further block to load from up, so skip Loop and drain.
333: blt n, L_end C U0
334:
C Loop: steady-state software-pipelined body, one 4-limb block per pass.
C Offsets below are relative to rp at the top of the pass.
C  - stores the limb 3 result still pending in r28 at -40(rp)
C  - finishes limbs 1-3 of the block at -32(rp) and stores limbs 0-2 at
C    -32, -24 and -16(rp); its masked limb 3 is left in r28 for the next pass
C  - issues the eight products for the block at 0(rp) from ul0-ul3 and
C    reloads each ulK from up right after its umulh has consumed it
C  - reloads rl1-rl3 from 8..24(rp) and rl0 from 32(rp), each after its old
C    value has been added
C  - leaves the limb 0 sum of the block at 0(rp) in acc0, carry not yet
C    extracted, and m1a >> NAIL_BITS in r8
C The body is written as groups of four instructions separated by empty C
C lines.  NOTE(review): presumably one group per EV6 fetch block, with the
C nops as padding -- confirm.
335: Loop:
336: C
337: mulq vl0, ul0, m0a C U1
338: addq r8, m0b, acc1
339: srl acc0,NUMB_BITS, r15
340: stq r28, -40(rp)
341: C
342: umulh vl0, ul0, m0b C U1
343: ldq ul0, 0(up)
344: addq rl1, acc1, acc1
345: and acc0,numb_mask, r28
346: C
347: srl m2a,NAIL_BITS, r8
348: ldq rl1, 8(rp)
349: addq r15, acc1, acc1
350: bis r31, r31, r31 C nop
351: C
352: mulq vl0, ul1, m1a C U1
353: addq r8, m1b, acc0
354: srl acc1,NUMB_BITS, r15
355: stq r28, -32(rp)
356: C
357: umulh vl0, ul1, m1b C U1
358: ldq ul1, 8(up)
359: addq rl2, acc0, acc0
360: and acc1,numb_mask, r28
361: C
362: srl m3a,NAIL_BITS, r8
363: ldq rl2, 16(rp)
364: addq r15, acc0, acc0
365: bis r31, r31, r31 C nop
366: C
367: mulq vl0, ul2, m2a C U1
368: addq r8, m2b, acc1
369: srl acc0,NUMB_BITS, r15
370: stq r28, -24(rp)
371: C
372: umulh vl0, ul2, m2b C U1
373: ldq ul2, 16(up)
374: addq rl3, acc1, acc1
375: and acc0,numb_mask, r28
376: C
377: srl m0a,NAIL_BITS, r8
378: ldq rl3, 24(rp)
379: addq r15, acc1, acc1
380: bis r31, r31, r31 C nop
381: C
382: mulq vl0, ul3, m3a C U1
383: addq r8, m3b, acc0
384: srl acc1,NUMB_BITS, r15
385: stq r28, -16(rp)
386: C
387: umulh vl0, ul3, m3b C U1
388: ldq ul3, 24(up)
389: addq rl0, acc0, acc0
390: and acc1,numb_mask, r28
391: C
392: srl m1a,NAIL_BITS, r8
393: ldq rl0, 32(rp)
394: addq r15, acc0, acc0
395: bis r31, r31, r31 C nop
396: C
397: bis r31, r31, r31 C nop
398: bis r31, r31, r31 C nop
399: bis r31, r31, r31 C nop
400: bis r31, r31, r31 C nop
401: C
C Step n down by one block, advance both pointers, and repeat while n >= 0.
402: lda n, -4(n)
403: lda up, 32(up)
404: lda rp, 32(rp)
405: bge n, Loop C U0
406:
C L_end: drain the software pipeline.  Nothing more is loaded from up; the
C up limbs of the final block are already in ul0-ul3.
C First half (offsets relative to rp on entry): store the pending limb 3
C result from r28 at -40(rp), finish limbs 1-3 of the block at -32(rp) and
C store limbs 0-2 at -32..-16(rp), issue the eight products for the final
C block at 0(rp), load its rl1-rl3, and form its limb 0 sum in acc0.
C rp is then advanced by 32.
C Second half (offsets relative to the advanced rp): store the previous
C block's limb 3 at -40(rp), form the remaining sums of the final block and
C store its four limbs at -32..-8(rp).
C The return value (m3b + carry) & numb_mask is left in r0, and execution
C falls through to Lret.
407: L_end:
408: mulq vl0, ul0, m0a C U1
409: addq r8, m0b, acc1
410: srl acc0,NUMB_BITS, r15
411: stq r28, -40(rp)
412: umulh vl0, ul0, m0b C U1
413: addq rl1, acc1, acc1
414: and acc0,numb_mask, r28
415: srl m2a,NAIL_BITS, r8
416: ldq rl1, 8(rp)
417: addq r15, acc1, acc1
418: bis r31, r31, r31 C nop
419: mulq vl0, ul1, m1a C U1
420: addq r8, m1b, acc0
421: srl acc1,NUMB_BITS, r15
422: stq r28, -32(rp)
423: umulh vl0, ul1, m1b C U1
424: addq rl2, acc0, acc0
425: and acc1,numb_mask, r28
426: srl m3a,NAIL_BITS, r8
427: ldq rl2, 16(rp)
428: addq r15, acc0, acc0
429: bis r31, r31, r31 C nop
430: mulq vl0, ul2, m2a C U1
431: addq r8, m2b, acc1
432: srl acc0,NUMB_BITS, r15
433: stq r28, -24(rp)
434: umulh vl0, ul2, m2b C U1
435: addq rl3, acc1, acc1
436: and acc0,numb_mask, r28
437: srl m0a,NAIL_BITS, r8
438: ldq rl3, 24(rp)
439: addq r15, acc1, acc1
440: bis r31, r31, r31 C nop
441: mulq vl0, ul3, m3a C U1
442: addq r8, m3b, acc0
443: srl acc1,NUMB_BITS, r15
444: stq r28, -16(rp)
445: umulh vl0, ul3, m3b C U1
446: addq rl0, acc0, acc0
447: and acc1,numb_mask, r28
448: srl m1a,NAIL_BITS, r8
449: addq r15, acc0, acc0
450: bis r31, r31, r31 C nop
451: bis r31, r31, r31 C nop
452: bis r31, r31, r31 C nop
453: bis r31, r31, r31 C nop
454: bis r31, r31, r31 C nop
C Advance rp so that the final block is addressed as -32..-8(rp) below.
455: lda rp, 32(rp)
456:
C Second half: only the sums and stores for the final block remain.
457: addq r8, m0b, acc1
458: srl acc0,NUMB_BITS, r15
459: stq r28, -40(rp)
460: addq rl1, acc1, acc1
461: and acc0,numb_mask, r28
462: srl m2a,NAIL_BITS, r8
463: addq r15, acc1, acc1
464: bis r31, r31, r31 C nop
465: addq r8, m1b, acc0
466: srl acc1,NUMB_BITS, r15
467: stq r28, -32(rp)
468: addq rl2, acc0, acc0
469: and acc1,numb_mask, r28
470: srl m3a,NAIL_BITS, r8
471: addq r15, acc0, acc0
472: bis r31, r31, r31 C nop
473: addq r8, m2b, acc1
474: srl acc0,NUMB_BITS, r15
475: stq r28, -24(rp)
476: addq rl3, acc1, acc1
477: and acc0,numb_mask, r28
478: addq r15, acc1, acc1
479: bis r31, r31, r31 C nop
480: srl acc1,NUMB_BITS, r15
481: stq r28, -16(rp)
482: and acc1,numb_mask, r28
C Return value: high half of the last product plus the last carry, masked.
483: addq m3b, r15, acc0
484: stq r28, -8(rp)
485: and acc0,numb_mask, r0
C Lret: common exit for every path; r0 already holds the return value.
C Reload r9 and r12-r15 from the frame slots written at function entry
C (the r10/r11 reloads stay commented out to match the commented-out
C saves), release the 240-byte frame and return through r26.
486: Lret:
487: ldq r9, 8(r30)
488: C ldq r10, 16(r30)
489: C ldq r11, 24(r30)
490: ldq r12, 32(r30)
491: ldq r13, 40(r30)
492: ldq r14, 48(r30)
493: ldq r15, 56(r30)
494: lda r30, 240(r30)
495: ret r31, (r26), 1
496: EPILOGUE(mpn_addmul_1)
497: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>