Annotation of OpenXM_contrib/gmp/mpn/pa64/sqr_diagonal.asm, Revision 1.1.1.1
1.1 ohara 1: dnl HP-PA 2.0 64-bit mpn_sqr_diagonal.
2:
3: dnl Copyright 2001, 2002 Free Software Foundation, Inc.
4:
5: dnl This file is part of the GNU MP Library.
6:
7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
8: dnl it under the terms of the GNU Lesser General Public License as published
9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
10: dnl your option) any later version.
11:
12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: dnl License for more details.
16:
17: dnl You should have received a copy of the GNU Lesser General Public License
18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: dnl MA 02111-1307, USA.
21:
22:
23: dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
24: dnl PA8500. The cache would saturate at 5 cycles/limb, so there is some room
25: dnl for optimization.
26:
27: include(`../config.m4')
28:
29: C INPUT PARAMETERS
30: define(`rp',`%r26') C destination pointer: two 64-bit words are stored per source limb
31: define(`up',`%r25') C source pointer: one 64-bit limb is loaded per fldds and up advances by 8
32: define(`n',`%r24') C count of source limbs not yet loaded, decremented by each addib
33:
34: define(`p00',`%r28') C low result word (weight 2^0) reloaded from -16(rp) for accumulation
35: define(`p32',`%r29') C cross product hi*lo (weight 2^32) reloaded from its stack slot
36: define(`p64',`%r31') C high result word (weight 2^64) reloaded from -8(rp) for accumulation
37: define(`t0',`%r19') C scratch: shifted cross product part that is added into p00
38: define(`t1',`%r20') C scratch: shifted cross product part that is added into p64
39:
40: ifdef(`HAVE_ABI_2_0w',
41: ` .level 2.0W
42: ',` .level 2.0N
43: ')
44: PROLOGUE(mpn_sqr_diagonal) C for each of the n limbs at up, store its 128-bit square as two words at rp
45: .proc
46: .entry
47: ldo 128(%r30),%r30 C reserve 128 bytes of stack; slots -128 and -120 hold cross products
48:
49: fldds,ma 8(up),%fr8 C load first limb, then advance up by 8
50: addib,= -1,n,L(end1) C n -= 1; single limb case goes to end1. NOTE(review): n = 0 is not handled, presumably callers pass n >= 1
51: nop C branch delay slot
52: fldds,ma 8(up),%fr4 C load second limb
53: xmpyu %fr8l,%fr8r,%fr10 C cross product hi*lo of the limb in fr8
54: fstd %fr10,-120(%r30) C park it in stack slot -120 for the integer unit
55: xmpyu %fr8r,%fr8r,%fr9 C lo*lo
56: fstd %fr9,0(rp) C becomes the low result word
57: xmpyu %fr8l,%fr8l,%fr11 C hi*hi
58: fstd %fr11,8(rp) C becomes the high result word
59: addib,= -1,n,L(end2) C n -= 1; exactly two limbs: finish in end2
60: ldo 16(rp),rp C delay slot: step rp past the pair just stored
61:
62: L(loop) fldds,ma 8(up),%fr8 C load next up limb
63: xmpyu %fr4l,%fr4r,%fr6 C cross product of the limb in fr4
64: fstd %fr6,-128(%r30) C park it in stack slot -128
65: xmpyu %fr4r,%fr4r,%fr5 C multiply in fp regs
66: fstd %fr5,0(rp)
67: xmpyu %fr4l,%fr4l,%fr7
68: fstd %fr7,8(rp)
69: ldd -120(%r30),p32 C cross product of the previous limb
70: ldd -16(rp),p00 C accumulate in int regs
71: ldd -8(rp),p64
72: depd,z p32,30,31,t0 C t0 = p32 << 33, the low word of 2*p32*2^32
73: add t0,p00,p00
74: std p00,-16(rp)
75: extrd,u p32,32,33,t1 C t1 = p32 >> 31, the high word of 2*p32*2^32
76: add,dc t1,p64,p64 C also adds the carry out of the low word add
77: std p64,-8(rp)
78: addib,= -1,n,L(exit) C n -= 1; limb in fr8 is the last one: drain via exit
79: ldo 16(rp),rp C delay slot: step rp to the next result pair
80:
81: fldds,ma 8(up),%fr4 C second loop half: same steps with fr8/fr4 and the two slots swapped
82: xmpyu %fr8l,%fr8r,%fr10
83: fstd %fr10,-120(%r30)
84: xmpyu %fr8r,%fr8r,%fr9
85: fstd %fr9,0(rp)
86: xmpyu %fr8l,%fr8l,%fr11
87: fstd %fr11,8(rp)
88: ldd -128(%r30),p32 C cross product parked by the first loop half
89: ldd -16(rp),p00
90: ldd -8(rp),p64
91: depd,z p32,30,31,t0
92: add t0,p00,p00
93: std p00,-16(rp)
94: extrd,u p32,32,33,t1
95: add,dc t1,p64,p64
96: std p64,-8(rp)
97: addib,<> -1,n,L(loop) C n -= 1; keep looping while limbs remain, else fall into end2
98: ldo 16(rp),rp C delay slot
99:
100: L(end2) xmpyu %fr4l,%fr4r,%fr6 C wind down: last limb is in fr4, previous cross product is in slot -120
101: fstd %fr6,-128(%r30)
102: xmpyu %fr4r,%fr4r,%fr5
103: fstd %fr5,0(rp)
104: xmpyu %fr4l,%fr4l,%fr7
105: fstd %fr7,8(rp)
106: ldd -120(%r30),p32 C finish the previous limb first
107: ldd -16(rp),p00
108: ldd -8(rp),p64
109: depd,z p32,30,31,t0
110: add t0,p00,p00
111: std p00,-16(rp)
112: extrd,u p32,32,33,t1
113: add,dc t1,p64,p64
114: std p64,-8(rp)
115: ldo 16(rp),rp C step to the pair of the last limb
116: ldd -128(%r30),p32 C then fold in the cross product of the last limb
117: ldd -16(rp),p00
118: ldd -8(rp),p64
119: depd,z p32,30,31,t0
120: add t0,p00,p00
121: std p00,-16(rp)
122: extrd,u p32,32,33,t1
123: add,dc t1,p64,p64
124: std p64,-8(rp)
125: bve (%r2) C return to caller
126: ldo -128(%r30),%r30 C delay slot: release the 128 byte stack area
127:
128: L(exit) xmpyu %fr8l,%fr8r,%fr10 C wind down: last limb is in fr8, previous cross product is in slot -128
129: fstd %fr10,-120(%r30)
130: xmpyu %fr8r,%fr8r,%fr9
131: fstd %fr9,0(rp)
132: xmpyu %fr8l,%fr8l,%fr11
133: fstd %fr11,8(rp)
134: ldd -128(%r30),p32 C finish the previous limb first
135: ldd -16(rp),p00
136: ldd -8(rp),p64
137: depd,z p32,31,32,t0 C t0 = p32 << 32 (this path shifts by 32 and adds the pair twice)
138: add t0,p00,p00
139: extrd,u p32,31,32,t1 C t1 = p32 >> 32
140: add,dc t1,p64,p64
141: add t0,p00,p00 C second addition of t1:t0, giving 2*p32*2^32 in total
142: add,dc t1,p64,p64
143: std p00,-16(rp)
144: std p64,-8(rp)
145: ldo 16(rp),rp C step to the pair of the last limb
146: ldd -120(%r30),p32 C then fold in the cross product of the last limb
147: ldd -16(rp),p00
148: ldd -8(rp),p64
149: depd,z p32,31,32,t0
150: add t0,p00,p00
151: extrd,u p32,31,32,t1
152: add,dc t1,p64,p64
153: add t0,p00,p00
154: add,dc t1,p64,p64
155: std p00,-16(rp)
156: std p64,-8(rp)
157: bve (%r2) C return to caller
158: ldo -128(%r30),%r30 C delay slot: release the 128 byte stack area
159:
160: L(end1) xmpyu %fr8l,%fr8r,%fr10 C single limb case: the only limb is in fr8
161: fstd %fr10,-128(%r30)
162: xmpyu %fr8r,%fr8r,%fr9
163: fstd %fr9,0(rp)
164: xmpyu %fr8l,%fr8l,%fr11
165: fstd %fr11,8(rp)
166: ldo 16(rp),rp C rp now points just past the pair, hence the negative offsets below
167: ldd -128(%r30),p32
168: ldd -16(rp),p00
169: ldd -8(rp),p64
170: depd,z p32,31,32,t0 C t0 = p32 << 32
171: add t0,p00,p00
172: extrd,u p32,31,32,t1 C t1 = p32 >> 32
173: add,dc t1,p64,p64
174: add t0,p00,p00 C second addition of t1:t0, giving 2*p32*2^32 in total
175: add,dc t1,p64,p64
176: std p00,-16(rp)
177: std p64,-8(rp)
178: bve (%r2) C return to caller
179: ldo -128(%r30),%r30 C delay slot: release the 128 byte stack area
180: .procend
181: EPILOGUE(mpn_sqr_diagonal)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>