Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/sub_n.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 nails mpn_sub_n.
2:
3: dnl Copyright 2002 Free Software Foundation, Inc.
4: dnl
5: dnl This file is part of the GNU MP Library.
6: dnl
7: dnl The GNU MP Library is free software; you can redistribute it and/or
8: dnl modify it under the terms of the GNU Lesser General Public License as
9: dnl published by the Free Software Foundation; either version 2.1 of the
10: dnl License, or (at your option) any later version.
11: dnl
12: dnl The GNU MP Library is distributed in the hope that it will be useful,
13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15: dnl Lesser General Public License for more details.
16: dnl
17: dnl You should have received a copy of the GNU Lesser General Public
18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
20: dnl Suite 330, Boston, MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: dnl INPUT PARAMETERS
dnl In mpn_sub_n below, results are stored through rp, the minuend limbs
dnl are loaded through up, the subtrahend limbs are loaded through vp, and
dnl n is the limb count.
25: define(`rp',`r16')
26: define(`up',`r17')
27: define(`vp',`r18')
28: define(`n',`r19')
29:
dnl rl0..rl3 hold the unmasked differences of the four limbs of a group.
30: define(`rl0',`r0')
31: define(`rl1',`r1')
32: define(`rl2',`r2')
33: define(`rl3',`r3')
34:
dnl ul0..ul3 hold the limbs loaded through up.
35: define(`ul0',`r4')
36: define(`ul1',`r5')
37: define(`ul2',`r6')
38: define(`ul3',`r7')
39:
dnl vl0..vl3 hold the limbs loaded through vp.  vl3 is r25, which
dnl mpn_sub_n also uses under its raw name as the Loop0 counter; that
dnl counter is finished with before vl3 is first loaded.
40: define(`vl0',`r22')
41: define(`vl1',`r23')
42: define(`vl2',`r24')
43: define(`vl3',`r25')
44:
dnl numb_mask is ANDed with every difference before it is stored.
45: define(`numb_mask',`r21')
46:
dnl NAIL_BITS is the shift count used to build numb_mask.  CYSH is the shift
dnl count that moves the borrow bit of an unmasked difference down to bit 0.
47: define(`NAIL_BITS',`GMP_NAIL_BITS')
48: define(`CYSH',63)
49:
50: dnl This declaration is munged by configure
dnl NOTE(review): 1-63 presumably is the range of nail counts this file
dnl supports -- confirm against the configure script.  Keep the line text
dnl unchanged, since configure rewrites it.
51: NAILS_SUPPORT(1-63)
52:
53: dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
54: dnl with 8-way unrolling.
55:
dnl mpn_sub_n, nails variant: subtract the n limbs loaded through vp from
dnl the n limbs loaded through up, store each difference ANDed with
dnl numb_mask through rp, and leave the final borrow (0 or 1) in r0 before
dnl returning through r26.
dnl
dnl Register use (the names are the define() aliases set up earlier):
dnl   rp, up, vp   store / minuend / subtrahend pointers, 8 bytes per limb
dnl   n            limb count on entry, afterwards a biased down-counter
dnl   r20          borrow passed from one limb to the next
dnl   r25          Loop0 pass counter (n mod 4); reused as vl3 afterwards
dnl   r27, r28     masked limbs waiting to be stored
dnl
dnl Structure: Loop0 handles n mod 4 limbs one at a time, the rest is done
dnl in groups of four.  In the pipelined path the loads for the next group
dnl are interleaved with the arithmetic of the current one, and limbs 2 and
dnl 3 of a group are stored while the following group is being processed.
dnl
dnl The borrow is taken from bit CYSH of the unmasked difference.
dnl NOTE(review): this relies on the input limbs having their nail bits
dnl clear and on NAIL_BITS being at least 1 -- confirm against callers.
dnl NOTE(review): n = 0 is not special-cased; it falls into the four-limb
dnl path and would load and store four limbs, so callers presumably must
dnl pass n >= 1.
56: ASM_START()
57: PROLOGUE(mpn_sub_n)
58: lda numb_mask, -1(r31) C numb_mask = -1, all bits set
59: srl numb_mask, NAIL_BITS, numb_mask C clear the top NAIL_BITS bits
60: bis r31, r31, r20 C borrow = 0
61:
62: and n, 3, r25 C r25 = n mod 4, limbs to handle one at a time
63: lda n, -4(n) C n -= 4
64: beq r25, L_4_or_more C n is a multiple of 4, skip Loop0
65:
66: Loop0: ldq ul0, 0(up) C one limb per pass, r25 passes
67: lda up, 8(up)
68: ldq vl0, 0(vp)
69: lda vp, 8(vp)
70: lda rp, 8(rp) C rp advanced early, hence the -8(rp) store below
71: lda r25, -1(r25)
72: subq ul0, vl0, rl0 C main-sub
73: subq rl0, r20, rl0 C subtract the incoming borrow
74: and rl0, numb_mask, r28 C strip the nail bits of the stored limb
75: stq r28, -8(rp)
76: srl rl0, CYSH, r20 C new borrow = bit CYSH of the unmasked difference
77: bne r25, Loop0
78:
79: blt n, Lret C n is (entry n) - 4 here; negative means every limb is done
80:
81: L_4_or_more: C the remaining limbs are handled four at a time
82: ldq ul0, 0(up) C preload one group of four limb pairs
83: ldq vl0, 0(vp)
84: ldq ul1, 8(up)
85: ldq vl1, 8(vp)
86: ldq ul2, 16(up)
87: ldq vl2, 16(vp)
88: ldq ul3, 24(up)
89: ldq vl3, 24(vp)
90: lda up, 32(up)
91: lda vp, 32(vp)
92: lda n, -4(n)
93: bge n, L_8_or_more C another group follows, take the pipelined path
94: L_0_to_7: C only the preloaded group is left
95: subq ul0, vl0, rl0 C main-sub 0
96: subq rl0, r20, rl0 C borrow-sub 0
97: subq ul1, vl1, rl1 C main-sub 1
98: srl rl0, CYSH, r20 C gen borrow 0
99: subq rl1, r20, rl1 C borrow-sub 1
100: and rl0,numb_mask, r27 C masked limb 0, stored after Lcj0
101: br r31, Lcj0 C rp still addresses limb 0 of this group, as Lcj0 expects
102:
103: L_8_or_more: C first group; the loads fetch the next group
104: subq ul0, vl0, rl0 C main-sub 0
105: ldq ul0, 0(up)
106: ldq vl0, 0(vp)
107: subq rl0, r20, rl0 C borrow-sub 0
108: subq ul1, vl1, rl1 C main-sub 1
109: srl rl0, CYSH, r20 C gen borrow 0
110: ldq ul1, 8(up)
111: ldq vl1, 8(vp)
112: subq rl1, r20, rl1 C borrow-sub 1
113: and rl0,numb_mask, r27
114: subq ul2, vl2, rl2 C main-sub 2
115: srl rl1, CYSH, r20 C gen borrow 1
116: ldq ul2, 16(up)
117: ldq vl2, 16(vp)
118: subq rl2, r20, rl2 C borrow-sub 2
119: and rl1,numb_mask, r28
120: stq r27, 0(rp)
121: subq ul3, vl3, rl3 C main-sub 3
122: srl rl2, CYSH, r20 C gen borrow 2
123: ldq ul3, 24(up)
124: ldq vl3, 24(vp)
125: subq rl3, r20, rl3 C borrow-sub 3
126: and rl2,numb_mask, r27 C masked limb 2, stored later at -16(rp)
127: stq r28, 8(rp)
128: lda rp, 32(rp) C limb 3 is still unmasked in rl3, stored later at -8(rp)
129: lda up, 32(up)
130: lda vp, 32(vp)
131: lda n, -4(n)
132: blt n, L_end C the group just loaded is the last one
133:
134: ALIGN(32)
135: Loop: C finish limbs 2,3 of the previous group, do this group, load the next
136: subq ul0, vl0, rl0 C main-sub 0
137: srl rl3, CYSH, r20 C gen borrow 3, from the previous group
138: ldq ul0, 0(up)
139: ldq vl0, 0(vp)
140:
141: subq rl0, r20, rl0 C borrow-sub 0
142: and rl3,numb_mask, r28
143: stq r27, -16(rp) C limb 2 of the previous group
144: bis r31, r31, r31 C no-op, result goes to r31. NOTE(review): presumably scheduling filler
145:
146: subq ul1, vl1, rl1 C main-sub 1
147: srl rl0, CYSH, r20 C gen borrow 0
148: ldq ul1, 8(up)
149: ldq vl1, 8(vp)
150:
151: subq rl1, r20, rl1 C borrow-sub 1
152: and rl0,numb_mask, r27
153: stq r28, -8(rp) C limb 3 of the previous group
154: bis r31, r31, r31
155:
156: subq ul2, vl2, rl2 C main-sub 2
157: srl rl1, CYSH, r20 C gen borrow 1
158: ldq ul2, 16(up)
159: ldq vl2, 16(vp)
160:
161: subq rl2, r20, rl2 C borrow-sub 2
162: and rl1,numb_mask, r28
163: stq r27, 0(rp) C limb 0 of this group
164: bis r31, r31, r31
165:
166: subq ul3, vl3, rl3 C main-sub 3
167: srl rl2, CYSH, r20 C gen borrow 2
168: ldq ul3, 24(up)
169: ldq vl3, 24(vp)
170:
171: subq rl3, r20, rl3 C borrow-sub 3
172: and rl2,numb_mask, r27
173: stq r28, 8(rp) C limb 1 of this group
174: bis r31, r31, r31
175:
176: bis r31, r31, r31
177: lda n, -4(n)
178: lda up, 32(up)
179: lda vp, 32(vp)
180:
181: bis r31, r31, r31
182: bis r31, r31, r31
183: lda rp, 32(rp)
184: bge n, Loop C n >= 0: the group just loaded is not the last
185: L_end: C drain: same arithmetic as Loop, without further loads
186: subq ul0, vl0, rl0 C main-sub 0
187: srl rl3, CYSH, r20 C gen borrow 3, from the previous group
188: subq rl0, r20, rl0 C borrow-sub 0
189: and rl3,numb_mask, r28
190: stq r27, -16(rp)
191: subq ul1, vl1, rl1 C main-sub 1
192: srl rl0, CYSH, r20 C gen borrow 0
193: subq rl1, r20, rl1 C borrow-sub 1
194: and rl0,numb_mask, r27
195: stq r28, -8(rp)
196: Lcj0: subq ul2, vl2, rl2 C main-sub 2; L_0_to_7 joins here
197: srl rl1, CYSH, r20 C gen borrow 1
198: subq rl2, r20, rl2 C borrow-sub 2
199: and rl1,numb_mask, r28
200: stq r27, 0(rp)
201: subq ul3, vl3, rl3 C main-sub 3
202: srl rl2, CYSH, r20 C gen borrow 2
203: subq rl3, r20, rl3 C borrow-sub 3
204: and rl2,numb_mask, r27
205: stq r28, 8(rp)
206:
207: srl rl3, CYSH, r20 C gen borrow 3
208: and rl3,numb_mask, r28
209: stq r27, 16(rp)
210: stq r28, 24(rp)
211: Lret:
212: and r20, 1, r0 C r0 = final borrow, 0 or 1
213: ret r31, (r26), 1
214: EPILOGUE(mpn_sub_n)
215: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>