Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_2.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 nails mpn_addmul_2.
2:
3: dnl Copyright 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: dnl INPUT PARAMETERS
25: define(`rp',`r16') dnl destination limbs: each is loaded, updated and stored back
26: define(`up',`r17') dnl source limbs: one is loaded per multiply step
27: define(`n',`r18') dnl limb count: decremented once per source limb loaded
28: define(`vp',`r19') dnl pointer to the two multiplier limbs, read at 0(vp) and 8(vp)
29:
30: dnl Useful register aliases
31: define(`numb_mask',`r24') dnl set to all ones shifted right by NAIL_BITS
32: define(`ulimb',`r25') dnl limb most recently loaded from up
33: define(`rlimb',`r27') dnl limb most recently loaded from rp
34:
35: define(`m0a',`r0') dnl mulq result of v0 and ulimb; r0 also receives the final result
36: define(`m0b',`r1') dnl umulh result of v0 and ulimb
37: define(`m1a',`r2') dnl mulq result of v1 and ulimb
38: define(`m1b',`r3') dnl umulh result of v1 and ulimb
39:
40: define(`acc0',`r4') dnl running sum for the limb after the one being stored
41: define(`acc1',`r5') dnl holds the previous m1b, feeding the sum one limb later
42:
43: define(`v0',`r6') dnl limb from 0(vp), shifted left by NAIL_BITS
44: define(`v1',`r7') dnl limb from 8(vp), shifted left by NAIL_BITS
45:
46: dnl Used for temps: r8 r19 r28 (r19 is vp; it is reused once both limbs at vp are loaded)
47:
48: define(`NAIL_BITS',`GMP_NAIL_BITS') dnl short name for GMP_NAIL_BITS
49: define(`NUMB_BITS',`GMP_NUMB_BITS') dnl short name for GMP_NUMB_BITS
50:
51: dnl This declaration is munged by configure (presumably 3-63 is the supported range of nail bits -- confirm)
52: NAILS_SUPPORT(3-63)
53:
54: dnl Runs at 4.0 cycles/limb. With unrolling, the ulimb load and the 3
55: dnl bookkeeping increments and the `bis' that copies m1b (r3) to acc1 (r5)
56: dnl could be removed and the instruction count reduced from 21 to 16. We
57: dnl could thereby reach about 2.3 cycles/limb.
58:
59: dnl If this is going to be a Karatsuba basecase building block, we need some
60: dnl of the combinations below. That way, we won't ever hit the
61: dnl slower mpn_addmul_1 for any huge multiplication.
62: dnl
63: dnl   Alt 3      Alt 4      Alt 5      Alt 6
64: dnl   addmul_2   addmul_2   addmul_3   addmul_3
65: dnl   addmul_3   addmul_3   addmul_4   addmul_4
66: dnl              addmul_4   addmul_5   addmul_5
67: dnl                                    addmul_6
68:
69: dnl Register usage:
70: dnl callee-saves: r9 r10 r11 r12 r13 r14 r15
71: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
72: dnl r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
73: dnl return address: 26
74: dnl global pointer: 29
75: dnl stack pointer: 30
76:
C mpn_addmul_2 -- multiply the n limbs at up by the two limbs at vp and add
C the product into the limbs at rp (nails build).
C
C What the code below does:
C   - rp[0] .. rp[n-1] are each loaded, summed with product words and the
C     running carries, masked with numb_mask and stored back;
C   - rp[n] is stored (masked the same way) without being loaded first;
C   - the carry out of rp[n] plus the last high product word ends up in r0.
C
C n must be at least 1: the first limb is multiplied before n is tested, and
C the loop only exits when the decremented count is exactly zero.
C
C v0 and v1 are shifted left by NAIL_BITS up front.  Assuming NUMB_BITS +
C NAIL_BITS is the 64-bit register width (not stated in this file -- confirm),
C umulh then returns the product bits from NUMB_BITS upwards and
C mulq >> NAIL_BITS returns the bits below NUMB_BITS.
C
C The loop is software pipelined: the four multiplies for the next limb are
C issued while the product words of the current limb are being summed, so
C each product register is read just before the multiply that overwrites it.
C
C The U0/U1 tags on some lines are from the original author; presumably they
C name the EV6 issue pipe the instruction is scheduled for -- confirm.
77: ASM_START()
78: PROLOGUE(mpn_addmul_2)
79: lda numb_mask,-1(r31) C numb_mask = -1, i.e. all bits set
80: srl numb_mask,NAIL_BITS,numb_mask C clear the top NAIL_BITS bits of the mask
81:
82: ldq v0, 0(vp) C v0 = first limb at vp
83: ldq v1, 8(vp) C v1 = second limb at vp
84:
85: bis r31, r31, acc0 C zero acc0
86: sll v0,NAIL_BITS, v0 C pre-shift v0 (see header)
87: bis r31, r31, acc1 C zero acc1
88: sll v1,NAIL_BITS, v1 C pre-shift v1
89: bis r31, r31, r19 C zero r19: vp is no longer needed, r19 now carries the nail part
90:
91: C MAIN LOOP -- first compute the four product words of the first limb
92: ldq ulimb, 0(up) C ulimb = first source limb
93: lda up, 8(up) C advance up by one 8-byte limb
94: mulq v0, ulimb, m0a C U1 m0a = low word of v0*ulimb
95: umulh v0, ulimb, m0b C U1 m0b = high word of v0*ulimb
96: mulq v1, ulimb, m1a C U1 m1a = low word of v1*ulimb
97: umulh v1, ulimb, m1b C U1 m1b = high word of v1*ulimb
98: lda n, -1(n) C one source limb has been consumed
99: beq n, Lend C U0 single limb: go straight to the tail
100: ALIGN(16)
101: Loop:
102: bis r31, r31, r31 C nop
103: ldq rlimb, 0(rp) C rlimb = destination limb being updated
104: ldq ulimb, 0(up) C ulimb = next source limb
105: addq r19, acc0, acc0 C propagate nail from the previously stored limb
106:
107: lda rp, 8(rp) C advance rp; the limb being updated is now at -8(rp)
108: srl m0a,NAIL_BITS,r8 C U0 r8 = low part of the current v0 product
109: lda up, 8(up) C advance up past the limb just loaded
110: mulq v0, ulimb, m0a C U1 start next v0 product (low word); old m0a already read
111:
112: addq r8, acc0, r19 C r19 = low v0 part + accumulated carries
113: addq m0b, acc1, acc0 C acc0 = high v0 word + previous high v1 word (for next limb)
114: umulh v0, ulimb, m0b C U1 start next v0 product (high word); old m0b already read
115: bis r31, r31, r31 C nop
116:
117: addq rlimb, r19, r19 C add the destination limb: r19 is the full sum for this limb
118: srl m1a,NAIL_BITS,r8 C U0 r8 = low part of the current v1 product
119: bis r31, r31, r31 C nop
120: mulq v1, ulimb, m1a C U1 start next v1 product (low word); old m1a already read
121:
122: addq r8, acc0, acc0 C acc0 += low v1 part
123: bis r31, m1b, acc1 C acc1 = current high v1 word, consumed one limb later
124: umulh v1, ulimb, m1b C U1 start next v1 product (high word); old m1b already copied
125: and r19,numb_mask, r28 C extract numb part
126:
127: lda n, -1(n) C one more source limb consumed
128: srl r19,NUMB_BITS, r19 C extract nail part
129: stq r28, -8(rp) C store the numb part back to the limb loaded above
130: bne n, Loop C U0 repeat while source limbs remain
131: C END LOOP
132: Lend:
133: ldq rlimb, 0(rp) C tail: same summation as the loop, with no further loads from up or multiplies
134: addq r19, acc0, acc0 C propagate nail
135: lda rp, 8(rp) C advance rp; the limb being updated is now at -8(rp)
136: srl m0a,NAIL_BITS,r8 C U0 r8 = low part of the last v0 product
137: addq r8, acc0, r19 C r19 = low v0 part + accumulated carries
138: addq m0b, acc1, acc0 C acc0 = high v0 word + previous high v1 word
139: addq rlimb, r19, r19 C add the destination limb
140: srl m1a,NAIL_BITS,r8 C U0 r8 = low part of the last v1 product
141: addq r8, acc0, acc0 C acc0 += low v1 part
142: bis r31, m1b, acc1 C acc1 = last high v1 word
143: and r19,numb_mask, r28 C extract limb
144:
145: srl r19,NUMB_BITS, r19 C extract nail
146: stq r28, -8(rp) C store the last read-modify-written limb
147:
148: addq r19, acc0, acc0 C propagate nail
149: and acc0,numb_mask, r28 C numb part of the limb above the last one updated
150: stq r28, 0(rp) C store it; this location was never loaded
151: srl acc0,NUMB_BITS, r19 C carry out of that limb
152: addq r19, acc1, r0 C r0 = that carry + last high v1 word
153:
154: ret r31, (r26), 1 C return through r26, result left in r0
155: EPILOGUE(mpn_addmul_2)
156: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>