Annotation of OpenXM_contrib/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and
2: dnl subtract the result from a second limb vector.
3:
4: dnl Copyright 1995, 2000, 2001, 2002 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25: C INPUT PARAMETERS
26: define(`res_ptr',`%r26') C limb vector that is loaded, reduced and stored back
27: define(`s1_ptr',`%r25') C limb vector whose limbs are multiplied by s2_limb
28: define(`size_param',`%r24') C limb count, counted down by the loops
29: define(`s2_limb',`%r23') C multiplier limb, moved to %fr31R via the stack on entry
30: C WORKING REGISTERS
31: define(`cylimb',`%r28') C running carry limb, left in %r28 at the return
32: define(`s0',`%r19') C s0-s3: limbs loaded from res_ptr
33: define(`s1',`%r20')
34: define(`s2',`%r3') C %r3-%r7 are saved and restored by the pipelined path
35: define(`s3',`%r4')
36: define(`lo0',`%r21') C lo0-lo3: product words subtracted from the res limbs
37: define(`lo1',`%r5')
38: define(`lo2',`%r6')
39: define(`lo3',`%r7')
40: define(`hi0',`%r22') C hi0-hi3: product words carried into the next limb
41: define(`hi1',`%r23') C safe to reuse: s2_limb has been copied to %fr31R by then
42: define(`hi2',`%r29')
43: define(`hi3',`%r1')
44:
45: ASM_START()
46: PROLOGUE(mpn_submul_1)
C mpn_submul_1: multiply each limb at s1_ptr by s2_limb and subtract the
C products, with carry propagation, from the limbs at res_ptr in place.
C
C In:   res_ptr (%r26), s1_ptr (%r25), size_param (%r24), s2_limb (%r23)
C Out:  cylimb (%r28) holds the final carry limb at the return
C       NOTE(review): presumably this is the function result -- confirm
C       against the calling convention.
C Uses: %r1, %r19-%r23, %r29, %r31, %fr4-%fr6, %fr8-%fr10, %fr31R.
C       %r3-%r7 are saved and restored around the pipelined path.
C       A 128-byte frame is taken from %r30 and released at the return.
C Structure: an optional single limb (see the bb test on s1_ptr), then a
C software-pipelined loop doing four limbs per pass, then a one-limb loop
C for the 0-3 limbs that are left.
C Carry convention: between limbs the carry bit holds the pending borrow,
C which is why each sub/subb group is followed by an invert-cy step before
C the next addc folds it into a product word.
47: C .callinfo frame=128,no_calls
48: C Prologue: take the frame and move s2_limb into %fr31R through the stack.
49: ldo 128(%r30),%r30
50: stws s2_limb,-16(%r30)
51: add %r0,%r0,cylimb C clear cy and cylimb
52: addib,< -4,size_param,L(few_limbs) C size -= 4; under 4 limbs -> one-limb loop
53: fldws -16(%r30),%fr31R C follows the branch; both paths need the multiplier here
54:
55: ldo -112(%r30),%r31 C %r31 = base of the product slots at -16, -8, 0, 8(%r31)
56: stw %r3,-96(%r30) C save %r3-%r7, used below as s2, s3, lo1, lo2, lo3
57: stw %r4,-92(%r30)
58: stw %r5,-88(%r30)
59: stw %r6,-84(%r30)
60: stw %r7,-80(%r30)
61: C NOTE(review): looks like an 8-byte alignment test on s1_ptr for the fldds loads below (bit 29 = the 4s bit, >= = bit clear) -- confirm
62: bb,>=,n s1_ptr,29,L(0)
63: C Single limb: res[0] -= low product word, high word seeds cylimb.
64: fldws,ma 4(s1_ptr),%fr4 C load one s1 limb, s1_ptr += 4
65: ldws 0(res_ptr),s0
66: xmpyu %fr4,%fr31R,%fr5 C 64-bit product of the limb and the multiplier
67: fstds %fr5,-16(%r31) C pass the product to the integer side through memory
68: ldws -16(%r31),cylimb C word that carries into the next limb
69: ldws -12(%r31),lo0 C word that is subtracted from this limb
70: sub s0,lo0,s0
71: add s0,lo0,%r0 C invert cy
72: addib,< -1,size_param,L(few_limbs) C size -= 1; under 4 limbs left -> one-limb loop
73: stws,ma s0,4(res_ptr) C follows the branch: store the limb, res_ptr += 4
74:
75: C start software pipeline ----------------------------------------------------
76: .label L(0)
77: fldds,ma 8(s1_ptr),%fr4 C two s1 limbs per load, s1_ptr += 8
78: fldds,ma 8(s1_ptr),%fr8
79: C four products, one per s1 limb
80: xmpyu %fr4L,%fr31R,%fr5
81: xmpyu %fr4R,%fr31R,%fr6
82: xmpyu %fr8L,%fr31R,%fr9
83: xmpyu %fr8R,%fr31R,%fr10
84: C store the products to the stack slots ...
85: fstds %fr5,-16(%r31)
86: fstds %fr6,-8(%r31)
87: fstds %fr9,0(%r31)
88: fstds %fr10,8(%r31)
89: C ... and reload them as hi/lo word pairs in integer registers
90: ldws -16(%r31),hi0
91: ldws -12(%r31),lo0
92: ldws -8(%r31),hi1
93: ldws -4(%r31),lo1
94: ldws 0(%r31),hi2
95: ldws 4(%r31),lo2
96: ldws 8(%r31),hi3
97: ldws 12(%r31),lo3
98: C each lo word += hi word of the previous product + carry bit
99: addc lo0,cylimb,lo0
100: addc lo1,hi0,lo1
101: addc lo2,hi1,lo2
102: addc lo3,hi2,lo3
103:
104: addib,< -4,size_param,L(end) C size -= 4; no further group of four -> drain
105: addc %r0,hi3,cylimb C propagate carry into cylimb
106: C main loop ------------------------------------------------------------------
107: .label L(loop)
108: fldds,ma 8(s1_ptr),%fr4 C next four s1 limbs
109: fldds,ma 8(s1_ptr),%fr8
110: C res limb loads are interleaved with the next four multiplies
111: ldws 0(res_ptr),s0
112: xmpyu %fr4L,%fr31R,%fr5
113: ldws 4(res_ptr),s1
114: xmpyu %fr4R,%fr31R,%fr6
115: ldws 8(res_ptr),s2
116: xmpyu %fr8L,%fr31R,%fr9
117: ldws 12(res_ptr),s3
118: xmpyu %fr8R,%fr31R,%fr10
119: C store the new products while subtracting the previous lo0-lo3 from res
120: fstds %fr5,-16(%r31)
121: sub s0,lo0,s0
122: fstds %fr6,-8(%r31)
123: subb s1,lo1,s1
124: fstds %fr9,0(%r31)
125: subb s2,lo2,s2
126: fstds %fr10,8(%r31)
127: subb s3,lo3,s3
128: subb %r0,%r0,lo0 C these two insns ...
129: add lo0,lo0,%r0 C ... just invert cy
130: C the old lo words are consumed; reload hi/lo from the new products
131: ldws -16(%r31),hi0
132: ldws -12(%r31),lo0
133: ldws -8(%r31),hi1
134: ldws -4(%r31),lo1
135: ldws 0(%r31),hi2
136: ldws 4(%r31),lo2
137: ldws 8(%r31),hi3
138: ldws 12(%r31),lo3
139: C fold carries into the new lo words while storing the finished limbs
140: addc lo0,cylimb,lo0
141: stws,ma s0,4(res_ptr)
142: addc lo1,hi0,lo1
143: stws,ma s1,4(res_ptr)
144: addc lo2,hi1,lo2
145: stws,ma s2,4(res_ptr)
146: addc lo3,hi2,lo3
147: stws,ma s3,4(res_ptr)
148: C NOTE(review): the addc after addib needs addib to leave the carry bit alone -- confirm
149: addib,>= -4,size_param,L(loop) C size -= 4; loop while a further group of four remains
150: addc %r0,hi3,cylimb C propagate carry into cylimb
151: C finish software pipeline ---------------------------------------------------
152: .label L(end)
153: ldws 0(res_ptr),s0
154: ldws 4(res_ptr),s1
155: ldws 8(res_ptr),s2
156: ldws 12(res_ptr),s3
157: C subtract the last four product words still held in lo0-lo3
158: sub s0,lo0,s0
159: stws,ma s0,4(res_ptr)
160: subb s1,lo1,s1
161: stws,ma s1,4(res_ptr)
162: subb s2,lo2,s2
163: stws,ma s2,4(res_ptr)
164: subb s3,lo3,s3
165: stws,ma s3,4(res_ptr)
166: subb %r0,%r0,lo0 C these two insns ...
167: add lo0,lo0,%r0 C ... invert cy
168:
169: C restore callee-saves registers ---------------------------------------------
170: ldw -96(%r30),%r3
171: ldw -92(%r30),%r4
172: ldw -88(%r30),%r5
173: ldw -84(%r30),%r6
174: ldw -80(%r30),%r7
175: C Reached by fall-through and from both early addib,< exits above.
176: .label L(few_limbs)
177: addib,=,n 4,size_param,L(ret) C size += 4 undoes the bias; zero limbs left -> return
178: C One-limb loop; the product is passed through the slot at -16(%r30).
179: .label L(loop2)
180: fldws,ma 4(s1_ptr),%fr4 C load one s1 limb, s1_ptr += 4
181: ldws 0(res_ptr),s0
182: xmpyu %fr4,%fr31R,%fr5
183: fstds %fr5,-16(%r30)
184: ldws -16(%r30),hi0
185: ldws -12(%r30),lo0
186: addc lo0,cylimb,lo0 C lo0 += previous carry limb + carry bit
187: addc %r0,hi0,cylimb C cylimb = hi0 + carry out of that add
188: sub s0,lo0,s0
189: add s0,lo0,%r0 C invert cy
190: stws,ma s0,4(res_ptr) C store the limb, res_ptr += 4
191: addib,<> -1,size_param,L(loop2) C size -= 1; loop until it reaches zero
192: nop C nothing to schedule after the branch
193: C Return: fold the carry bit into cylimb, release the frame after the bv.
194: .label L(ret)
195: addc %r0,cylimb,cylimb
196: bv 0(%r2) C return to the address in %r2
197: ldo -128(%r30),%r30 C follows the branch: undo the ldo 128 from the prologue
198: EPILOGUE(mpn_submul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>