Annotation of OpenXM_contrib/gmp/mpn/sparc64/sqr_diagonal.asm, Revision 1.1.1.1
1.1 ohara 1: dnl SPARC v9 64-bit mpn_sqr_diagonal.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4:
5: dnl This file is part of the GNU MP Library.
6:
7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
8: dnl it under the terms of the GNU Lesser General Public License as published
9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
10: dnl your option) any later version.
11:
12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: dnl License for more details.
16:
17: dnl You should have received a copy of the GNU Lesser General Public License
18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: dnl MA 02111-1307, USA.
21:
22: include(`../config.m4')
23:
24: C This was generated by the Sun C compiler. It runs at 22 cycles/limb on the
25: C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal
26: C code using the same algorithm. For 1-3 limbs, a special loop was generated,
27: C which causes performance problems in particular for 2 and 3 limbs.
28: C Ultimately, this should be replaced by hand-written code in the same software
29: C pipeline style as e.g., addmul_1.asm.
30:
31: ASM_START()
C %g2 and %g3 are declared as scratch registers before the function uses them.
32: REGISTER(%g2,#scratch)
33: REGISTER(%g3,#scratch)
C ---------------------------------------------------------------------------
C mpn_sqr_diagonal
C
C Register use after the save, as read from the code below:
C   %i0  destination pointer; two 64-bit words are stored per source word
C   %i1  source pointer; one 64-bit word is loaded per source word
C   %i2  number of source words to process
C (%i0 and %i1 are copied to %o0 and %g1 at entry and are later reused as
C plain temporaries inside the main loop.)
C
C Method: each source word x is split into three pieces
C   lo = x & 0x1fffff    mid = (x >> 21) & 0x1fffff    hi = x >> 42
C which are converted to double (fxtod), multiplied with fmuld and converted
C back to 64-bit integers (fdtox).  With t = mid*mid + 2*hi*lo the integer
C unit then assembles
C   word 0 = lo*lo + ((mid*lo) << 22) + (t << 42)
C   word 1 = hi*mid + ((hi*hi) << 20) + (t >> 22) + carry
C   carry  = bit 63 of ((word 0 >> 42) - (t & 0x3fffff))
C which are the low and high halves of the 128-bit product x*x.
C
C Stack scratch slots (integer and FP registers exchange data via memory):
C   [%sp+2263] lo       [%sp+2271] mid       [%sp+2279] hi
C   [%sp+2239] lo*lo    [%sp+2231] mid*lo    [%sp+2223] t
C   [%sp+2247] hi*mid   [%sp+2255] hi*hi
C NOTE(review): the offsets presumably include the SPARC V9 stack bias of
C 2047 - confirm against the ABI.
C ---------------------------------------------------------------------------
34: PROLOGUE(mpn_sqr_diagonal)
35: save %sp, -240, %sp
36:
C Set up constants and cursors:
C   %o7 = 0x1ffc00 + 1023 = 0x1fffff   mask for the lo and mid pieces
C   %o4 = 0x3ffc00 + 1023 = 0x3fffff   mask used in the carry test
C   %g1 = source cursor, %o0 = destination cursor
37: sethi %hi(0x1ffc00), %o0
38: sethi %hi(0x3ffc00), %o1
39: add %o0, 1023, %o7
40: cmp %i2, 4
41: add %o1, 1023, %o4
42: or %g0, %i1, %g1
43: or %g0, %i0, %o0
C Counts below 4 (signed compare) use the simple loop at .Lsmall.  The
C instruction in the delay slot zeroes the word counter %g2 on both paths.
44: bl,pn %xcc, .Lsmall
45: or %g0, 0, %g2
46:
C Pipeline fill for counts of 4 or more: split word 0 and form its five
C products, split word 1 and load its pieces into %f0/%f2/%f4, and fetch
C word 2 into %o2 (its hi piece is left in %o1).  %g1 now points at word 3
C and %g2 = 3 counts the words fetched so far.
47: ldx [%i1], %o1
48: add %i1, 24, %g1
49: or %g0, 3, %g2
50: srlx %o1, 42, %g3
51: stx %g3, [%sp+2279]
52: and %o1, %o7, %o2
53: stx %o2, [%sp+2263]
54: srlx %o1, 21, %o1
55: ldd [%sp+2279], %f0
56: and %o1, %o7, %o1
57: stx %o1, [%sp+2271]
58: ldx [%i1+8], %o2
59: fxtod %f0, %f12
60: srlx %o2, 21, %o1
61: and %o2, %o7, %g3
62: ldd [%sp+2263], %f2
63: fmuld %f12, %f12, %f10
64: srlx %o2, 42, %o2
65: ldd [%sp+2271], %f0
66: and %o1, %o7, %o1
67: fxtod %f2, %f8
68: stx %o2, [%sp+2279]
69: stx %o1, [%sp+2271]
70: fxtod %f0, %f0
71: stx %g3, [%sp+2263]
72: fdtox %f10, %f14
73: fmuld %f12, %f8, %f6
74: ldx [%i1+16], %o2
75: std %f14, [%sp+2255]
76: fmuld %f0, %f0, %f2
77: fmuld %f8, %f8, %f10
78: srlx %o2, 42, %o1
79: faddd %f6, %f6, %f6
80: fmuld %f12, %f0, %f12
81: fmuld %f0, %f8, %f8
82: ldd [%sp+2279], %f0
83: ldd [%sp+2263], %f4
84: fdtox %f10, %f10
85: std %f10, [%sp+2239]
86: faddd %f2, %f6, %f6
87: ldd [%sp+2271], %f2
88: fdtox %f12, %f12
89: std %f12, [%sp+2247]
90: fdtox %f8, %f8
91: std %f8, [%sp+2231]
92: fdtox %f6, %f6
93: std %f6, [%sp+2223]
94:
C Steady state, one source word per pass, three words in flight:
C   - the pieces of the newest word (%o2, hi piece in %o1) go to the stack,
C   - the pieces of the previous word (%f0, %f2, %f4) are multiplied and the
C     products written to the product slots,
C   - the products of the word before that, read from the product slots at
C     the top of the pass, are combined into word 0 / word 1 and stored at
C     [%o0-16] and [%o0-8].
C The next source word is loaded from [%g1-8].  The loop repeats while
C %g2 < %i2; the final store sits in the branch delay slot.
95: .Loop: srlx %o2, 21, %g3
96: stx %o1, [%sp+2279]
97: add %g2, 1, %g2
98: and %g3, %o7, %o1
99: ldx [%sp+2255], %g4
100: cmp %g2, %i2
101: stx %o1, [%sp+2271]
102: add %g1, 8, %g1
103: add %o0, 16, %o0
104: ldx [%sp+2239], %o1
105: fxtod %f0, %f10
106: fxtod %f4, %f14
107: ldx [%sp+2231], %i0
108: ldx [%sp+2223], %g5
109: ldx [%sp+2247], %g3
110: and %o2, %o7, %o2
111: fxtod %f2, %f8
112: fmuld %f10, %f10, %f0
113: stx %o2, [%sp+2263]
114: fmuld %f10, %f14, %f6
115: ldx [%g1-8], %o2
116: fmuld %f10, %f8, %f12
117: fdtox %f0, %f2
118: ldd [%sp+2279], %f0
119: fmuld %f8, %f8, %f4
120: faddd %f6, %f6, %f6
121: fmuld %f14, %f14, %f10
122: std %f2, [%sp+2255]
123: sllx %g4, 20, %g4
124: ldd [%sp+2271], %f2
125: fmuld %f8, %f14, %f8
126: sllx %i0, 22, %i1
127: fdtox %f12, %f12
128: std %f12, [%sp+2247]
129: sllx %g5, 42, %i0
130: add %o1, %i1, %o1
131: faddd %f4, %f6, %f6
132: ldd [%sp+2263], %f4
133: add %o1, %i0, %o1
134: add %g3, %g4, %g3
135: fdtox %f10, %f10
136: std %f10, [%sp+2239]
137: srlx %o1, 42, %g4
138: and %g5, %o4, %i0
139: fdtox %f8, %f8
140: std %f8, [%sp+2231]
141: srlx %g5, 22, %g5
142: sub %g4, %i0, %g4
143: fdtox %f6, %f6
144: std %f6, [%sp+2223]
145: srlx %g4, 63, %g4
146: add %g3, %g5, %g3
147: add %g3, %g4, %g3
148: stx %o1, [%o0-16]
149: srlx %o2, 42, %o1
150: bl,pt %xcc, .Loop
151: stx %g3, [%o0-8]
152:
C Pipeline drain: three results are still outstanding.  %o0 is advanced by
C 48 once and the three word pairs are stored at [%o0-48] .. [%o0-8].  The
C first pair comes from the products already in the slots, the second from
C the pieces held in %f0/%f2/%f4, and the third from the last word loaded
C (%o2, with its hi piece in %o1).
153: stx %o1, [%sp+2279]
154: srlx %o2, 21, %o1
155: fxtod %f0, %f16
156: ldx [%sp+2223], %g3
157: fxtod %f4, %f6
158: and %o2, %o7, %o3
159: stx %o3, [%sp+2263]
160: fxtod %f2, %f4
161: and %o1, %o7, %o1
162: ldx [%sp+2231], %o2
163: sllx %g3, 42, %g4
164: fmuld %f16, %f16, %f14
165: stx %o1, [%sp+2271]
166: fmuld %f16, %f6, %f8
167: add %o0, 48, %o0
168: ldx [%sp+2239], %o1
169: sllx %o2, 22, %o2
170: fmuld %f4, %f4, %f10
171: ldx [%sp+2255], %o3
172: fdtox %f14, %f14
173: fmuld %f4, %f6, %f2
174: std %f14, [%sp+2255]
175: faddd %f8, %f8, %f12
176: add %o1, %o2, %o2
177: fmuld %f16, %f4, %f4
178: ldd [%sp+2279], %f0
179: sllx %o3, 20, %g5
180: add %o2, %g4, %o2
181: fmuld %f6, %f6, %f6
182: srlx %o2, 42, %o3
183: and %g3, %o4, %g4
184: srlx %g3, 22, %g3
185: faddd %f10, %f12, %f16
186: ldd [%sp+2271], %f12
187: ldd [%sp+2263], %f8
188: fxtod %f0, %f0
189: sub %o3, %g4, %o3
190: ldx [%sp+2247], %o1
191: srlx %o3, 63, %o3
192: fdtox %f2, %f10
193: fxtod %f8, %f8
194: std %f10, [%sp+2231]
195: fdtox %f6, %f6
196: std %f6, [%sp+2239]
197: add %o1, %g5, %o1
198: fmuld %f0, %f0, %f2
199: fdtox %f16, %f16
200: std %f16, [%sp+2223]
201: add %o1, %g3, %o1
202: fdtox %f4, %f4
203: std %f4, [%sp+2247]
204: fmuld %f0, %f8, %f10
205: fxtod %f12, %f12
206: add %o1, %o3, %o1
C First drained pair.
207: stx %o2, [%o0-48]
208: fmuld %f8, %f8, %f6
209: stx %o1, [%o0-40]
210: fdtox %f2, %f2
211: ldx [%sp+2231], %o2
212: faddd %f10, %f10, %f10
213: ldx [%sp+2223], %g3
214: fmuld %f12, %f12, %f4
215: fdtox %f6, %f6
216: ldx [%sp+2239], %o1
217: sllx %o2, 22, %o2
218: fmuld %f12, %f8, %f8
219: sllx %g3, 42, %g5
220: ldx [%sp+2255], %o3
221: fmuld %f0, %f12, %f0
222: add %o1, %o2, %o2
223: faddd %f4, %f10, %f4
224: ldx [%sp+2247], %o1
225: add %o2, %g5, %o2
226: and %g3, %o4, %g4
227: fdtox %f8, %f8
228: sllx %o3, 20, %g5
229: std %f8, [%sp+2231]
230: fdtox %f0, %f0
231: srlx %o2, 42, %o3
232: add %o1, %g5, %o1
233: fdtox %f4, %f4
234: srlx %g3, 22, %g3
235: sub %o3, %g4, %o3
236: std %f6, [%sp+2239]
237: std %f4, [%sp+2223]
238: srlx %o3, 63, %o3
239: add %o1, %g3, %o1
240: std %f2, [%sp+2255]
241: add %o1, %o3, %o1
242: std %f0, [%sp+2247]
C Second drained pair.
243: stx %o2, [%o0-32]
244: stx %o1, [%o0-24]
C Third drained pair: integer-only combination of the last set of products.
245: ldx [%sp+2231], %o2
246: ldx [%sp+2223], %o3
247: ldx [%sp+2239], %o1
248: sllx %o2, 22, %o2
249: sllx %o3, 42, %g5
250: ldx [%sp+2255], %g4
251: and %o3, %o4, %g3
252: add %o1, %o2, %o2
253: ldx [%sp+2247], %o1
254: add %o2, %g5, %o2
255: stx %o2, [%o0-16]
256: sllx %g4, 20, %g4
257: srlx %o2, 42, %o2
258: add %o1, %g4, %o1
259: srlx %o3, 22, %o3
260: sub %o2, %g3, %o2
261: srlx %o2, 63, %o2
262: add %o1, %o3, %o1
263: add %o1, %o2, %o1
264: stx %o1, [%o0-8]
C Return; the restore in the delay slot releases the register window.
265: ret
266: restore %g0, %g0, %g0
C Short path for counts below 4: one source word is split, multiplied and
C combined per pass with no overlap between words.  %g2 counts the words done.
C NOTE(review): the count is tested only after the body has run, so a count
C of zero or less still reads one source word and writes two result words.
267: .Lsmall:
268: ldx [%g1], %o2
269: .Loop0:
C Split the word in %o2 into lo / mid / hi and pass the pieces to the FPU.
270: and %o2, %o7, %o1
271: stx %o1, [%sp+2263]
272: add %g2, 1, %g2
273: srlx %o2, 21, %o1
274: add %g1, 8, %g1
275: srlx %o2, 42, %o2
276: stx %o2, [%sp+2279]
277: and %o1, %o7, %o1
278: ldd [%sp+2263], %f0
279: cmp %g2, %i2
280: stx %o1, [%sp+2271]
C %f6 = lo, %f2 = hi, %f10 = mid as doubles; form the five products.
281: fxtod %f0, %f6
282: ldd [%sp+2279], %f0
283: ldd [%sp+2271], %f4
284: fxtod %f0, %f2
285: fmuld %f6, %f6, %f0
286: fxtod %f4, %f10
287: fmuld %f2, %f6, %f4
288: fdtox %f0, %f0
289: std %f0, [%sp+2239]
290: fmuld %f10, %f6, %f8
291: fmuld %f10, %f10, %f0
292: faddd %f4, %f4, %f6
293: fmuld %f2, %f2, %f4
294: fdtox %f8, %f8
295: std %f8, [%sp+2231]
296: fmuld %f2, %f10, %f2
297: faddd %f0, %f6, %f0
298: fdtox %f4, %f4
299: std %f4, [%sp+2255]
300: fdtox %f2, %f2
301: std %f2, [%sp+2247]
302: fdtox %f0, %f0
303: std %f0, [%sp+2223]
C Reload the integer products and assemble word 0 and word 1 (with carry).
304: ldx [%sp+2239], %o1
305: ldx [%sp+2255], %g4
306: ldx [%sp+2231], %o2
307: sllx %g4, 20, %g4
308: ldx [%sp+2223], %o3
309: sllx %o2, 22, %o2
310: sllx %o3, 42, %g5
311: add %o1, %o2, %o2
312: ldx [%sp+2247], %o1
313: add %o2, %g5, %o2
314: stx %o2, [%o0]
315: and %o3, %o4, %g3
316: srlx %o2, 42, %o2
317: add %o1, %g4, %o1
318: srlx %o3, 22, %o3
319: sub %o2, %g3, %o2
320: srlx %o2, 63, %o2
321: add %o1, %o3, %o1
322: add %o1, %o2, %o1
323: stx %o1, [%o0+8]
324: add %o0, 16, %o0
C Annulled branch on the compare made earlier in the pass (%g2 < %i2): the
C load of the next source word in the delay slot runs only when the loop
C continues.
325: bl,a,pt %xcc, .Loop0
326: ldx [%g1], %o2
327: ret
328: restore %g0, %g0, %g0
329: EPILOGUE(mpn_sqr_diagonal)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>