Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_4.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 nails mpn_addmul_4.
2:
3: dnl Copyright 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23: dnl config.m4 presumably supplies PROLOGUE, ALIGN and the GMP_*_BITS values - TODO confirm
24: dnl INPUT PARAMETERS (registers that hold the four arguments on entry)
25: define(`rp',`r16')
26: define(`up',`r17')
27: define(`n',`r18')
28: define(`vp',`r19')
29: dnl vp shares r19 with a temp: it is only read to load v0-v3 and is then zeroed.
30: dnl Useful register aliases
31: define(`numb_mask',`r24')
32: define(`ulimb',`r25')
33: define(`rlimb',`r27')
34: dnl mNa and mNb receive the mulq and umulh results of vN times ulimb.
35: define(`m0a',`r0')
36: define(`m0b',`r1')
37: define(`m1a',`r2')
38: define(`m1b',`r3')
39: define(`m2a',`r20')
40: define(`m2b',`r21')
41: define(`m3a',`r12')
42: define(`m3b',`r13')
43: dnl acc0-acc3 hold running sums that are handed down one position per u limb.
44: define(`acc0',`r4')
45: define(`acc1',`r5')
46: define(`acc2',`r22')
47: define(`acc3',`r14')
48: dnl v0-v3 hold the four limbs loaded from vp, shifted left by NAIL_BITS.
49: define(`v0',`r6')
50: define(`v1',`r7')
51: define(`v2',`r23')
52: define(`v3',`r15')
53: dnl r12-r15 (m3a, m3b, acc3, v3) are callee-saves and are spilled by the function.
54: dnl Used for temps: r8 (shifted low product), r19 (limb sum and nail carry), r28 (masked limb to store)
55: dnl Shorter names for GMP_NAIL_BITS and GMP_NUMB_BITS, which are not defined in this file.
56: define(`NAIL_BITS',`GMP_NAIL_BITS')
57: define(`NUMB_BITS',`GMP_NUMB_BITS')
58:
59: dnl This declaration is munged by configure; presumably 4-63 is the supported range of nail bits - TODO confirm
60: NAILS_SUPPORT(4-63)
61:
62: dnl Runs at 2.5 cycles/limb. With unrolling, the ulimb load and the 3
63: dnl bookkeeping increments and the `bis' that copies from r23 to r7 could be
64: dnl removed and the instruction count reduced from 31 to 26. We could
65: dnl thereby surely reach 2 cycles/limb, the IMUL bandwidth.
66:
67: dnl If this is going to be a Karatsuba basecase building block, we need some
68: dnl of the combinations below. That way, we won't ever hit the
69: dnl slower mpn_addmul_1 for any huge multiplication.
70: dnl
71: dnl Alt 3 Alt 4 Alt 5 Alt 6
72: dnl addmul_2 addmul_2 addmul_3 addmul_3
73: dnl addmul_3 addmul_3 addmul_4 addmul_4
74: dnl addmul_4 addmul_5 addmul_5
75: dnl addmul_6
76:
77: dnl Register usage:
78: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
79: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
80: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
81: dnl return address: 26
82: dnl global pointer: 29
83: dnl stack pointer: 30
84:
85: ASM_START()
86: PROLOGUE(mpn_addmul_4) C rp[0..n-1] += up[0..n-1] times vp[0..3]; three more limbs are stored above and the top limb is returned in r0
87: lda r30, -240(r30) C allocate a 240 byte stack frame
88: stq r12, 32(r30) C save callee-saved r12 (m3a)
89: stq r13, 40(r30) C save callee-saved r13 (m3b)
90: stq r14, 48(r30) C save callee-saved r14 (acc3)
91: stq r15, 56(r30) C save callee-saved r15 (v3)
92: C Build numb_mask: all ones shifted right by NAIL_BITS
93: lda numb_mask,-1(r31)
94: srl numb_mask,NAIL_BITS,numb_mask
95: C Load the four v limbs; vp is not read again after this
96: ldq v0, 0(vp)
97: ldq v1, 8(vp)
98: ldq v2, 16(vp)
99: ldq v3, 24(vp)
100: C Clear the accumulators and pre-shift each v limb left by NAIL_BITS
101: bis r31, r31, acc0 C zero acc0
102: sll v0,NAIL_BITS, v0
103: bis r31, r31, acc1 C zero acc1
104: sll v1,NAIL_BITS, v1
105: bis r31, r31, acc2 C zero acc2
106: sll v2,NAIL_BITS, v2
107: bis r31, r31, acc3 C zero acc3
108: sll v3,NAIL_BITS, v3
109: bis r31, r31, r19 C zero the nail carry (r19 was vp)
110: C Start the products for the first u limb; they are consumed one pass later, in Loop or Lend
111: C MAIN LOOP
112: ldq ulimb, 0(up) C first u limb
113: lda up, 8(up) C up now points at the next u limb
114: mulq v0, ulimb, m0a C U1
115: umulh v0, ulimb, m0b C U1
116: mulq v1, ulimb, m1a C U1
117: umulh v1, ulimb, m1b C U1
118: lda n, -1(n) C n is decremented before it is tested; assumes n >= 1 on entry - TODO confirm
119: mulq v2, ulimb, m2a C U1
120: umulh v2, ulimb, m2b C U1
121: mulq v3, ulimb, m3a C U1
122: umulh v3, ulimb, m3b C U1
123: beq n, Lend C U0, no more u limbs: skip the loop
124: ALIGN(16)
125: Loop:
126: bis r31, r31, r31 C nop
127: ldq rlimb, 0(rp) C r limb for this position
128: ldq ulimb, 0(up) C next u limb, feeds the multiplies below
129: addq r19, acc0, acc0 C propagate nail
130: C Below, low part N is mNa shifted right by NAIL_BITS and high part N is mNb
131: lda rp, 8(rp) C advance rp early, the store below uses -8(rp)
132: srl m0a,NAIL_BITS, r8 C U0, read old m0a before mulq overwrites it
133: lda up, 8(up) C advance up
134: mulq v0, ulimb, m0a C U1
135: C r19 becomes the new result limb; each accumulator takes over from the one above
136: addq r8, acc0, r19 C r19 = low part 0 + acc0
137: addq m0b, acc1, acc0 C acc0 = high part 0 + old acc1
138: umulh v0, ulimb, m0b C U1
139: bis r31, r31, r31 C nop
140: C Same pattern for v1: use the old product halves, then start the new multiply
141: addq rlimb, r19, r19 C add in the r limb
142: srl m1a,NAIL_BITS, r8 C U0
143: bis r31, r31, r31 C nop
144: mulq v1, ulimb, m1a C U1
145:
146: addq r8, acc0, acc0 C acc0 += low part 1
147: addq m1b, acc2, acc1 C acc1 = high part 1 + old acc2
148: umulh v1, ulimb, m1b C U1
149: and r19,numb_mask, r28 C extract numb part
150: C Same pattern for v2
151: bis r31, r31, r31 C nop
152: srl m2a,NAIL_BITS, r8 C U0
153: lda n, -1(n) C one fewer u limb left
154: mulq v2, ulimb, m2a C U1
155:
156: addq r8, acc1, acc1 C acc1 += low part 2
157: addq m2b, acc3, acc2 C acc2 = high part 2 + old acc3
158: umulh v2, ulimb, m2b C U1
159: srl r19,NUMB_BITS, r19 C extract nail part
160: C Same pattern for v3; acc3 has nothing above it, so it is just reloaded
161: bis r31, r31, r31 C nop
162: srl m3a,NAIL_BITS, r8 C U0
163: stq r28, -8(rp) C store the result limb at the position rlimb was loaded from
164: mulq v3, ulimb, m3a C U1
165:
166: addq r8, acc2, acc2 C acc2 += low part 3
167: bis r31, m3b, acc3 C acc3 = high part 3
168: umulh v3, ulimb, m3b C U1
169: bne n, Loop C U0
170: C END LOOP
171: Lend: C same steps as one loop pass, but no further u limb is loaded or multiplied
172: ldq rlimb, 0(rp) C r limb for the last position
173: addq r19, acc0, acc0 C propagate nail
174: lda rp, 8(rp) C advance rp, the store below uses -8(rp)
175: srl m0a,NAIL_BITS, r8 C U0
176: addq r8, acc0, r19 C r19 = low part 0 + acc0
177: addq m0b, acc1, acc0 C acc0 = high part 0 + old acc1
178: addq rlimb, r19, r19 C add in the r limb
179: srl m1a,NAIL_BITS, r8 C U0
180: addq r8, acc0, acc0 C acc0 += low part 1
181: addq m1b, acc2, acc1 C acc1 = high part 1 + old acc2
182: and r19,numb_mask, r28 C extract limb
183: srl m2a,NAIL_BITS, r8 C U0
184: addq r8, acc1, acc1 C acc1 += low part 2
185: addq m2b, acc3, acc2 C acc2 = high part 2 + old acc3
186: srl r19,NUMB_BITS, r19 C extract nail
187: srl m3a,NAIL_BITS, r8 C U0
188: stq r28, -8(rp) C store the last read-modify-write limb
189: addq r8, acc2, acc2 C acc2 += low part 3
190: bis r31, m3b, acc3 C acc3 = high part 3
191: C Wind down: emit acc0, acc1 and acc2 as three more limbs, passing the nail part upward
192: addq r19, acc0, acc0 C propagate nail
193: and acc0,numb_mask, r28 C numb part of acc0
194: stq r28, 0(rp) C stored without reading the old contents of this location
195: srl acc0,NUMB_BITS, r19 C nail part of acc0
196: addq r19, acc1, acc1 C carry it into acc1
197:
198: and acc1,numb_mask, r28 C numb part of acc1
199: stq r28, 8(rp)
200: srl acc1,NUMB_BITS, r19 C nail part of acc1
201: addq r19, acc2, acc2 C carry it into acc2
202:
203: and acc2,numb_mask, r28 C numb part of acc2
204: stq r28, 16(rp)
205: srl acc2,NUMB_BITS, r19 C nail part of acc2
206: addq r19, acc3, r0 C return value: acc3 plus the nail part of acc2
207: C Restore the saved registers and release the stack frame
208: ldq r12, 32(r30)
209: ldq r13, 40(r30)
210: ldq r14, 48(r30)
211: ldq r15, 56(r30)
212: lda r30, 240(r30)
213: ret r31, (r26), 1 C return through r26
214: EPILOGUE(mpn_addmul_4)
215: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>