Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/sub_n.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
2: dnl and store difference in a third limb vector.
3:
4: dnl Copyright 2000 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25: dnl INPUT PARAMETERS
26: dnl res_ptr r16
27: dnl s1_ptr r17
28: dnl s2_ptr r18
29: dnl size r19
30:
31: dnl This code runs at 5.4 cycles/limb on EV5, and 2.1 cycles/limb on EV6.
32:
33: dnl This code was written in close cooperation with ev6 pipeline expert
34: dnl Steve Root. Any errors are tege's fault, though.
35:
36: dnl work triplet r0-r2 (s1 limb, s2 limb, raw difference)
37: dnl work triplet r3-r5
38: dnl work triplet r6-r8
39: dnl work triplet r9-r11
40: dnl borrows r20-r23
41:
42: dnl sustains 8 subtracts in 17 cycles !
43: dnl (from the d_cache)
44:
45: dnl pair loads and stores where possible
46: dnl store pairs oct-aligned where possible
47: dnl (didn't need it here)
48: dnl stores are delayed every third cycle
49: dnl loads and stores are delayed by fills
50: dnl U stays still, put code there where possible
51: dnl (note alternation of U1 and U0)
52: dnl L moves because of loads and stores
53: dnl note dampers in L to limit damage
54: dnl note, load ahead of time where possible
55:
56: dnl This odd-looking optimization expects
57: dnl that we are having random bits in our data, so
58: dnl that a pure zero result is unlikely. So we
59: dnl penalize the unlikely case to help the
60: dnl common case.
61:
62: ASM_START()
63: PROLOGUE(mpn_sub_n) C {r16,size} = {r17,size} - {r18,size} limb by limb, final borrow returned in r0
64: lda r30, -240(r30) C reserve 240 bytes below r30, only offsets 8..24 are used
65: stq r9, 8(r30) C save r9-r11, they serve as the fourth work triplet
66: stq r10, 16(r30) C (all three are reloaded at $Lret)
67: stq r11, 24(r30)
68:
69: lda r19, -8(r19) C L1 move counter
70:
71: bis r31, r31, r23 C r23 = 0, no borrow into the first limb
72: blt r19, $Lsmall C fewer than 8 limbs, use the one-limb-at-a-time code
73: C Pipeline fill: load limbs 0-7 of both sources, store limb 0, start limbs 1-3.
74: ldq r0, 0(r17) C L0 get next ones
75: ldq r1, 0(r18) C L1
76: ldq r3, 8(r17) C L0 get next ones
77: ldq r4, 8(r18) C L1
78: ldq r6, 16(r17) C L0 get next ones
79: ldq r7, 16(r18) C L1
80:
81: ldq r9, 24(r17) C L0 get next ones
82: ldq r10, 24(r18) C L1
83:
84: subq r0, r1, r2 C U1 sub two data
85:
86: cmpult r0, r1, r20 C U1 did it borrow
87:
88: ldq r0, 32(r17) C L0 get next ones
89: ldq r1, 32(r18) C L1
90:
91: subq r3, r4, r5 C U0 sub two data
92:
93: cmpult r3, r4, r21 C U0 did it borrow
94: ldq r3, 40(r17) C L0 get next ones
95: ldq r4, 40(r18) C L1
96:
97: subq r6, r7, r8 C U1 sub two data
98: subq r5, r20, r24 C U0 borrow from last
99: stq r2, 0(r16) C L1 limb 0 has no borrow in, store it as is
100:
101: cmpult r6, r7, r22 C U1 did it borrow
102: beq r5, $fix5w C U0 fix exact zero
103: $ret5w: ldq r6, 48(r17) C L0 get next ones
104: ldq r7, 48(r18) C L1
105:
106: bis r31, r31, r31 C L damp out
107: subq r8, r21, r25 C U1 borrow from last
108: bis r31, r31, r31 C L moves in L !
109: subq r9, r10, r11 C U0 sub two data
110:
111: beq r8, $fix6w C U1 fix exact zero
112: $ret6w: cmpult r9, r10, r23 C U0 did it borrow
113: ldq r9, 56(r17) C L0 get next ones
114: ldq r10, 56(r18) C L1
115:
116: lda r17, 64(r17) C L0 move pointer
117: bis r31, r31, r31 C U
118: lda r18, 64(r18) C L1 move pointer
119:
120: lda r19, -8(r19) C L1 move counter
121: blt r19, $Lend C fewer than 16 limbs in all, skip the main loop and drain
122:
123: C Main loop. 8-way unrolled.
124: ALIGN(8)
125: $Loop: C each pass stores 8 result limbs and loads the next 8 limb pairs
126: subq r0, r1, r2 C U1 sub two data
127: stq r24, 8(r16) C L0 put an answer
128: subq r11, r22, r24 C U0 borrow from last
129: stq r25, 16(r16) C L1 pair
130:
131: cmpult r0, r1, r20 C U1 did it borrow
132: beq r11, $fix7 C U0 fix exact 0
133: $ret7: ldq r0, 0(r17) C L0 get next ones
134: ldq r1, 0(r18) C L1
135:
136: bis r31, r31, r31 C L damp out
137: subq r2, r23, r25 C U1 borrow from last
138: bis r31, r31, r31 C L moves in L !
139: subq r3, r4, r5 C U0 sub two data
140:
141: beq r2, $fix0 C U1 fix exact zero
142: $ret0: cmpult r3, r4, r21 C U0 did it borrow
143: ldq r3, 8(r17) C L0 get next ones
144: ldq r4, 8(r18) C L1
145:
146: subq r6, r7, r8 C U1 sub two data
147: stq r24, 24(r16) C L0 store pair
148: subq r5, r20, r24 C U0 borrow from last
149: stq r25, 32(r16) C L1
150:
151: cmpult r6, r7, r22 C U1 did it borrow
152: beq r5, $fix1 C U0 fix exact zero
153: $ret1: ldq r6, 16(r17) C L0 get next ones
154: ldq r7, 16(r18) C L1
155:
156: lda r16, 64(r16) C L0 move pointer
157: subq r8, r21, r25 C U1 borrow from last
158: lda r19, -8(r19) C L1 move counter
159: subq r9, r10, r11 C U0 sub two data
160:
161: beq r8, $fix2 C U1 fix exact zero
162: $ret2: cmpult r9, r10, r23 C U0 did it borrow
163: ldq r9, 24(r17) C L0 get next ones
164: ldq r10, 24(r18) C L1
165: C r16 has already moved on by 64, so the next stores use negative offsets.
166: subq r0, r1, r2 C U1 sub two data
167: stq r24, -24(r16) C L0 put an answer
168: subq r11, r22, r24 C U0 borrow from last
169: stq r25, -16(r16) C L1 pair
170:
171: cmpult r0, r1, r20 C U1 did it borrow
172: beq r11, $fix3 C U0 fix exact 0
173: $ret3: ldq r0, 32(r17) C L0 get next ones
174: ldq r1, 32(r18) C L1
175:
176: bis r31, r31, r31 C L damp out
177: subq r2, r23, r25 C U1 borrow from last
178: bis r31, r31, r31 C L moves in L !
179: subq r3, r4, r5 C U0 sub two data
180:
181: beq r2, $fix4 C U1 fix exact zero
182: $ret4: cmpult r3, r4, r21 C U0 did it borrow
183: ldq r3, 40(r17) C L0 get next ones
184: ldq r4, 40(r18) C L1
185:
186: subq r6, r7, r8 C U1 sub two data
187: stq r24, -8(r16) C L0 store pair
188: subq r5, r20, r24 C U0 borrow from last
189: stq r25, 0(r16) C L1
190:
191: cmpult r6, r7, r22 C U1 did it borrow
192: beq r5, $fix5 C U0 fix exact zero
193: $ret5: ldq r6, 48(r17) C L0 get next ones
194: ldq r7, 48(r18) C L1
195:
196: bis r31, r31, r31 C L damp out
197: subq r8, r21, r25 C U1 borrow from last
198: bis r31, r31, r31 C L moves in L !
199: subq r9, r10, r11 C U0 sub two data
200:
201: beq r8, $fix6 C U1 fix exact zero
202: $ret6: cmpult r9, r10, r23 C U0 did it borrow
203: ldq r9, 56(r17) C L0 get next ones
204: ldq r10, 56(r18) C L1
205:
206: lda r17, 64(r17) C L0 move pointer
207: bis r31, r31, r31 C U
208: lda r18, 64(r18) C L1 move pointer
209: bge r19, $Loop C U1 loop control
210: C ==== main loop end
211: C Drain: finish the 7 limbs that are loaded but not yet stored. No further loads.
212: $Lend:
213: subq r0, r1, r2 C U1 sub two data
214: stq r24, 8(r16) C L0 put an answer
215: subq r11, r22, r24 C U0 borrow from last
216: stq r25, 16(r16) C L1 pair
217:
218: cmpult r0, r1, r20 C U1 did it borrow
219: beq r11, $fix7c C U0 fix exact 0
220: $ret7c:
221: subq r2, r23, r25 C U1 borrow from last
222: subq r3, r4, r5 C U0 sub two data
223:
224: beq r2, $fix0c C U1 fix exact zero
225: $ret0c: cmpult r3, r4, r21 C U0 did it borrow
226:
227: subq r6, r7, r8 C U1 sub two data
228: stq r24, 24(r16) C L0 store pair
229: subq r5, r20, r24 C U0 borrow from last
230: stq r25, 32(r16) C L1
231:
232: cmpult r6, r7, r22 C U1 did it borrow
233: beq r5, $fix1c C U0 fix exact zero
234: $ret1c:
235: lda r16, 64(r16) C L0 move pointer
236: subq r8, r21, r25 C U1 borrow from last
237: subq r9, r10, r11 C U0 sub two data
238:
239: beq r8, $fix2c C U1 fix exact zero
240: $ret2c: cmpult r9, r10, r23 C U0 did it borrow
241:
242: stq r24, -24(r16) C L0 put an answer
243: subq r11, r22, r24 C U0 borrow from last
244: stq r25, -16(r16) C L1 pair
245:
246: beq r11, $fix3c C U0 fix exact 0
247: $ret3c:
248: stq r24, -8(r16) C L0 store pair
249: C Fall through: r23 holds the borrow out of the unrolled part.
250:
251: $Lsmall: C tail, handles the last 0 to 7 limbs one per pass
252: lda r19, 8(r19) C undo the -8 bias, r19 = limbs still to do
253: beq r19, $Lret C none left
254:
255: ldq r0, 0(r17) C first s1 limb of the tail
256: ldq r1, 0(r18) C first s2 limb of the tail
257: lda r19, -1(r19) C count the limb pair just loaded
258: beq r19, $Lend0 C it was the only one, nothing to preload
259:
260: ALIGN(8)
261: $Loop0: subq r0, r1, r2 C main sub
262: cmpult r0, r1, r8 C compute bw from last sub
263: ldq r0, 8(r17) C preload next s1 limb
264: ldq r1, 8(r18) C preload next s2 limb
265: subq r2, r23, r20 C borrow sub
266: lda r17, 8(r17) C advance s1 pointer
267: lda r18, 8(r18) C advance s2 pointer
268: stq r20, 0(r16) C store result limb
269: cmpult r2, r23, r23 C compute bw from last sub
270: lda r19, -1(r19) C decr loop cnt
271: bis r8, r23, r23 C combine bw from the two subs
272: lda r16, 8(r16) C advance result pointer
273: bne r19, $Loop0 C loop until only the preloaded pair remains
274: $Lend0: subq r0, r1, r2 C main sub
275: cmpult r0, r1, r8 C compute bw from last sub
276: subq r2, r23, r20 C borrow sub
277: cmpult r2, r23, r23 C compute bw from last sub
278: stq r20, 0(r16) C store last result limb
279: bis r8, r23, r23 C combine bw from the two subs
280:
281: $Lret:
282: lda r0, 0(r23) C copy borrow into return register
283:
284: ldq r9, 8(r30) C reload r9-r11 saved on entry
285: ldq r10, 16(r30)
286: ldq r11, 24(r30)
287: lda r30, 240(r30) C give back the 240 bytes reserved on entry
288: ret r31,(r26),1 C return through the address in r26
289: C Out-of-line fixups, entered when a raw difference is exactly zero.
290: C Such a limb hands its incoming borrow straight on, so OR it into the outgoing borrow and rejoin.
291: $fix5w: bis r21, r20, r21 C bring forward borrow
292: br r31, $ret5w
293: $fix6w: bis r22, r21, r22 C bring forward borrow
294: br r31, $ret6w
295: $fix0: bis r20, r23, r20 C bring forward borrow
296: br r31, $ret0
297: $fix1: bis r21, r20, r21 C bring forward borrow
298: br r31, $ret1
299: $fix2: bis r22, r21, r22 C bring forward borrow
300: br r31, $ret2
301: $fix3: bis r23, r22, r23 C bring forward borrow
302: br r31, $ret3
303: $fix4: bis r20, r23, r20 C bring forward borrow
304: br r31, $ret4
305: $fix5: bis r20, r21, r21 C bring forward borrow
306: br r31, $ret5
307: $fix6: bis r22, r21, r22 C bring forward borrow
308: br r31, $ret6
309: $fix7: bis r23, r22, r23 C bring forward borrow
310: br r31, $ret7
311: $fix0c: bis r20, r23, r20 C bring forward borrow
312: br r31, $ret0c
313: $fix1c: bis r21, r20, r21 C bring forward borrow
314: br r31, $ret1c
315: $fix2c: bis r22, r21, r22 C bring forward borrow
316: br r31, $ret2c
317: $fix3c: bis r23, r22, r23 C bring forward borrow
318: br r31, $ret3c
319: $fix7c: bis r23, r22, r23 C bring forward borrow
320: br r31, $ret7c
321:
322: EPILOGUE(mpn_sub_n)
323: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>