Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_3.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 nails mpn_addmul_3.
2:
3: dnl Copyright 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: dnl INPUT PARAMETERS: rp = limbs read and written, up = limbs multiplied, n = count of up limbs, vp = three multiplier limbs
25: define(`rp',`r16')
26: define(`up',`r17')
27: define(`n',`r18')
28: define(`vp',`r19')
29:
30: dnl Useful register aliases
31: define(`numb_mask',`r24')
32: define(`ulimb',`r25')
33: define(`rlimb',`r27')
34: dnl mNa receives the mulq result and mNb the umulh result of vN times ulimb
35: define(`m0a',`r0')
36: define(`m0b',`r1')
37: define(`m1a',`r2')
38: define(`m1b',`r3')
39: define(`m2a',`r20')
40: define(`m2b',`r21')
41: dnl Running sums carried from one output limb to the next
42: define(`acc0',`r4')
43: define(`acc1',`r5')
44: define(`acc2',`r22')
45: dnl Multiplier limbs loaded from 0, 8 and 16 bytes past vp
46: define(`v0',`r6')
47: define(`v1',`r7')
48: define(`v2',`r23')
49:
50: dnl Used for temps: r8 r19 r28 (r19 is vp, reused once the vN have been loaded)
51:
52: define(`NAIL_BITS',`GMP_NAIL_BITS')
53: define(`NUMB_BITS',`GMP_NUMB_BITS')
54:
55: dnl This declaration is munged by configure
56: NAILS_SUPPORT(3-63)
57:
58: dnl Runs at 3.0 cycles/limb. With unrolling, the ulimb load and the 3
59: dnl bookkeeping increments and the `bis' that copies from r21 to r22 (m2b to acc2) could be
60: dnl removed and the instruction count reduced from 26 to 21. We could
61: dnl thereby probably reach 2 cycles/limb, the IMUL bandwidth.
62:
63: dnl If this is going to be a Karatsuba basecase building block, we need some
64: dnl of the combinations below. That way, we won't ever hit the
65: dnl slower mpn_addmul_1 for any huge multiplication.
66: dnl
67: dnl   Alt 3      Alt 4      Alt 5      Alt 6
68: dnl   addmul_2   addmul_2   addmul_3   addmul_3
69: dnl   addmul_3   addmul_3   addmul_4   addmul_4
70: dnl              addmul_4   addmul_5   addmul_5
71: dnl                                    addmul_6
72:
73: dnl Register usage:
74: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
75: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
76: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
77: dnl return address: 26
78: dnl global pointer: 29
79: dnl stack pointer: 30
80:
81: ASM_START()
82: PROLOGUE(mpn_addmul_3) C adds the product of the n limbs at up and the 3 limbs at vp into rp[0..n-1], then stores two more limbs above them; requires n >= 1 because n is decremented before it is first tested
83: lda numb_mask,-1(r31) C numb_mask = all ones
84: srl numb_mask,NAIL_BITS,numb_mask C clear the top NAIL_BITS bits, leaving a mask for the numb field
85: C Fetch the three multiplier limbs; vp (r19) is not read again after these loads
86: ldq v0, 0(vp)
87: ldq v1, 8(vp)
88: ldq v2, 16(vp)
89: C Pre-shift each vN left by NAIL_BITS: umulh then gives the product bits above the numb field and mulq >> NAIL_BITS gives the numb field (assumes NAIL_BITS + NUMB_BITS = 64 -- TODO confirm)
90: bis r31, r31, acc0 C zero acc0
91: sll v0,NAIL_BITS, v0
92: bis r31, r31, acc1 C zero acc1
93: sll v1,NAIL_BITS, v1
94: bis r31, r31, acc2 C zero acc2
95: sll v2,NAIL_BITS, v2
96: bis r31, r31, r19 C r19 is reused as the nail carry between limbs, initially zero
97: C Issue the products for the first up limb here; each loop pass consumes the products issued one pass earlier
98: C MAIN LOOP
99: ldq ulimb, 0(up)
100: lda up, 8(up)
101: mulq v0, ulimb, m0a C U1
102: umulh v0, ulimb, m0b C U1
103: mulq v1, ulimb, m1a C U1
104: umulh v1, ulimb, m1b C U1
105: lda n, -1(n) C count the limb just fetched
106: mulq v2, ulimb, m2a C U1
107: umulh v2, ulimb, m2b C U1
108: beq n, Lend C U0, n was 1: skip the loop and go straight to the wind-down
109: ALIGN(16)
110: Loop:
111: bis r31, r31, r31 C nop
112: ldq rlimb, 0(rp) C destination limb to be added to
113: ldq ulimb, 0(up) C next source limb, for the products issued in this pass
114: addq r19, acc0, acc0 C propagate nail
115:
116: lda rp, 8(rp) C advance rp now; the finished limb is stored through -8(rp) below
117: srl m0a,NAIL_BITS, r8 C U0
118: lda up, 8(up)
119: mulq v0, ulimb, m0a C U1
120:
121: addq r8, acc0, r19 C r19 = numb part of previous v0 product + acc0
122: addq m0b, acc1, acc0 C acc0 = high part of previous v0 product + acc1, read before umulh overwrites m0b
123: umulh v0, ulimb, m0b C U1
124: bis r31, r31, r31 C nop
125:
126: addq rlimb, r19, r19 C add in the destination limb
127: srl m1a,NAIL_BITS, r8 C U0
128: bis r31, r31, r31 C nop
129: mulq v1, ulimb, m1a C U1
130:
131: addq r8, acc0, acc0 C acc0 += numb part of previous v1 product
132: addq m1b, acc2, acc1 C acc1 = high part of previous v1 product + acc2
133: umulh v1, ulimb, m1b C U1
134: and r19,numb_mask, r28 C extract numb part
135:
136: bis r31, r31, r31 C nop
137: srl m2a,NAIL_BITS, r8 C U0
138: lda n, -1(n)
139: mulq v2, ulimb, m2a C U1
140:
141: addq r8, acc1, acc1 C acc1 += numb part of previous v2 product
142: bis r31, m2b, acc2 C acc2 = high part of previous v2 product
143: umulh v2, ulimb, m2b C U1
144: srl r19,NUMB_BITS, r19 C extract nail part
145:
146: bis r31, r31, r31 C nop
147: stq r28, -8(rp) C store the finished limb; rp was already advanced past it
148:
149: bne n, Loop C U0
150: C END LOOP
151: Lend: C wind-down: the same steps as one loop pass, but with no further up loads or multiplies
152: ldq rlimb, 0(rp)
153: addq r19, acc0, acc0 C propagate nail
154: lda rp, 8(rp)
155: srl m0a,NAIL_BITS, r8 C U0
156: addq r8, acc0, r19
157: addq m0b, acc1, acc0
158: addq rlimb, r19, r19
159: srl m1a,NAIL_BITS, r8 C U0
160: addq r8, acc0, acc0
161: addq m1b, acc2, acc1
162: and r19,numb_mask, r28 C extract limb
163: srl m2a,NAIL_BITS, r8 C U0
164: addq r8, acc1, acc1
165: bis r31, m2b, acc2
166: srl r19,NUMB_BITS, r19 C extract nail
167: stq r28, -8(rp) C last of the n limbs that were read from rp and added to
168: C The next two limbs are stored without being loaded first, so they are written rather than accumulated
169: addq r19, acc0, acc0 C propagate nail
170: and acc0,numb_mask, r28
171: stq r28, 0(rp) C rp has advanced n limbs by now, so this is limb n counted from the entry value of rp
172: srl acc0,NUMB_BITS, r19
173: addq r19, acc1, acc1
174:
175: and acc1,numb_mask, r28
176: stq r28, 8(rp) C limb n+1 counted from the entry value of rp
177: srl acc1,NUMB_BITS, r19
178: addq r19, acc2, m0a C top limb left in m0a (r0), neither masked nor stored; presumably the function result -- confirm against the calling convention
179:
180: ret r31, (r26), 1
181: EPILOGUE(mpn_addmul_3)
182: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>