Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/submul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
2: dnl the result from a second limb vector.
3:
4: dnl Copyright 2000 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25: dnl INPUT PARAMETERS
26: dnl res_ptr r16
27: dnl s1_ptr r17
28: dnl size r18
29: dnl s2_limb r19
30:
31: dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
32: dnl exactly 3.5 cycles/limb on EV6...
33:
34: dnl This code was written in close cooperation with ev6 pipeline expert
35: dnl Steve Root. Any errors are tege's fault, though.
36: dnl
37: dnl Register usages for unrolled loop:
38: dnl 0-3 mul's
39: dnl 4-7 acc's
40: dnl 8-15 mul results
41: dnl 20,21 carry's
42: dnl 22,23 save for stores
43:
44: dnl Sustains 8 mul-adds in 28 cycles in the unrolled inner loop.
45:
46: dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
47: dnl them, so that further disturbance to the schedule is damped.
48:
49: dnl We couldn't pair the loads, because the entangled schedule of the
50: dnl carry's has to happen on one side {0} of the machine. Note, the total
51: dnl use of U0, and the total use of L0 (after attending to the stores),
52: dnl which is part of the reason why....
53:
54: dnl This is a great schedule for the d_cache, a poor schedule for the
55: dnl b_cache. The lockup on U0 means that any stall can't be recovered
56: dnl from. Consider a ldq in L1. say that load gets stalled because it
57: dnl collides with a fill from the b_Cache. On the next cycle, this load
58: dnl gets priority. It first looks at L0, and goes there. The instruction
59: dnl we intended for L0 gets to look at L1, which is NOT where we want
60: dnl it. It either stalls 1, because it can't go in L0, or goes there, and
61: dnl causes a further instruction to stall.
62:
63: dnl So for b_cache, we're likely going to want to put one or more cycles
64: dnl back into the code! And, of course, put in prefetches. For the
65: dnl accumulator, lds, intent to modify. For the multiplier, you might
66: dnl want ldq, evict next, if you're not wanting to use it again soon. Use
67: dnl 256 ahead of present pointer value. At a place where we have an mt
68: dnl followed by a bookkeeping, put the bookkeeping in upper, and the
69: dnl prefetch into lower.
70:
71: dnl Note, the usage of physical registers per cycle is smoothed off, as
72: dnl much as possible.
73:
74: dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
75: dnl like not to have a ldq or stq precede a conditional branch in a
76: dnl quadpack. The conditional branch moves the retire pointer one cycle
77: dnl later.
78:
79: dnl Optimization notes:
80: dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
81: dnl Reserved regs: r29 r30 r31
82: dnl Free caller-saves regs in unrolled code: r24 r25 r28
83: dnl We should swap some of the callee-saves regs for some of the free
84: dnl caller-saves regs, saving some overhead cycles.
85: dnl Most importantly, we should write fast code for the 0-7 case.
86: dnl The code we use there is for the 21164, and runs at 7 cycles/limb
87: dnl on the 21264. Should not be hard, if we write specialized code for
88: dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
89: dnl need a jump table indexed by the low 3 bits of the count argument.
90:
91:
92: ASM_START()
93: PROLOGUE(mpn_submul_1) C res[] -= s1[] * s2_limb over size limbs; the final carry limb is left in r0
94: cmpult r18, 8, r1 C r1 = (size < 8)
95: beq r1, $Large C size >= 8: take the unrolled path
96:
97: ldq r2, 0(r17) C r2 = s1_limb
98: addq r17, 8, r17 C s1_ptr++
99: subq r18, 1, r18 C size--
100: mulq r2, r19, r3 C r3 = prod_low
101: ldq r5, 0(r16) C r5 = *res_ptr
102: umulh r2, r19, r0 C r0 = prod_high
103: beq r18, $Lend0b C jump if size was == 1
104: ldq r2, 0(r17) C r2 = s1_limb
105: addq r17, 8, r17 C s1_ptr++
106: subq r18, 1, r18 C size--
107: subq r5, r3, r3 C r3 = *res_ptr - prod_low
108: cmpult r5, r3, r4 C r4 = borrow out of that subtract
109: stq r3, 0(r16) C store result limb
110: addq r16, 8, r16 C res_ptr++
111: beq r18, $Lend0a C jump if size was == 2
112:
113: ALIGN(8)
114: $Loop0: mulq r2, r19, r3 C r3 = prod_low
115: ldq r5, 0(r16) C r5 = *res_ptr
116: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
117: subq r18, 1, r18 C size--
118: umulh r2, r19, r4 C r4 = cy_limb
119: ldq r2, 0(r17) C r2 = s1_limb
120: addq r17, 8, r17 C s1_ptr++
121: addq r3, r0, r3 C r3 = cy_limb + prod_low
122: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
123: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
124: cmpult r5, r3, r5 C r5 = borrow out of that subtract
125: stq r3, 0(r16) C store result limb
126: addq r16, 8, r16 C res_ptr++
127: addq r5, r0, r0 C combine carries
128: bne r18, $Loop0 C repeat until only the last loaded limb is left
129: $Lend0a: C last limb: same step, but no further load or pointer update
130: mulq r2, r19, r3 C r3 = prod_low
131: ldq r5, 0(r16) C r5 = *res_ptr
132: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
133: umulh r2, r19, r4 C r4 = cy_limb
134: addq r3, r0, r3 C r3 = cy_limb + prod_low
135: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
136: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
137: cmpult r5, r3, r5 C r5 = borrow out of that subtract
138: stq r3, 0(r16) C store result limb
139: addq r5, r0, r0 C combine carries
140: addq r4, r0, r0 C cy_limb = prod_high + cy
141: ret r31, (r26), 1 C return with cy_limb in r0
142: $Lend0b: C size was 1: finish the single limb
143: subq r5, r3, r3 C r3 = *res_ptr - prod_low
144: cmpult r5, r3, r5 C r5 = borrow out of that subtract
145: stq r3, 0(r16) C store result limb
146: addq r0, r5, r0 C cy_limb = prod_high + borrow
147: ret r31, (r26), 1 C return with cy_limb in r0
148:
149: $Large:
150: lda $30, -240($30) C make a 240-byte stack frame
151: stq $9, 8($30) C save $9-$15 in the frame (reloaded before the final ret)
152: stq $10, 16($30)
153: stq $11, 24($30)
154: stq $12, 32($30)
155: stq $13, 40($30)
156: stq $14, 48($30)
157: stq $15, 56($30)
158:
159: and r18, 7, r20 C count for the first loop, 0-7
160: srl r18, 3, r18 C count for unrolled loop
161: bis r31, r31, r0 C r0 = 0: carry-in for the unrolled code if the first loop is skipped
162: beq r20, $Lunroll C size is a multiple of 8: skip the first loop
163: ldq r2, 0(r17) C r2 = s1_limb
164: addq r17, 8, r17 C s1_ptr++
165: subq r20, 1, r20 C size--
166: mulq r2, r19, r3 C r3 = prod_low
167: ldq r5, 0(r16) C r5 = *res_ptr
168: umulh r2, r19, r0 C r0 = prod_high
169: beq r20, $Lend1b C jump if size was == 1
170: ldq r2, 0(r17) C r2 = s1_limb
171: addq r17, 8, r17 C s1_ptr++
172: subq r20, 1, r20 C size--
173: subq r5, r3, r3 C r3 = *res_ptr - prod_low
174: cmpult r5, r3, r4 C r4 = borrow out of that subtract
175: stq r3, 0(r16) C store result limb
176: addq r16, 8, r16 C res_ptr++
177: beq r20, $Lend1a C jump if size was == 2
178:
179: ALIGN(8)
180: $Loop1: mulq r2, r19, r3 C r3 = prod_low
181: ldq r5, 0(r16) C r5 = *res_ptr
182: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
183: subq r20, 1, r20 C size--
184: umulh r2, r19, r4 C r4 = cy_limb
185: ldq r2, 0(r17) C r2 = s1_limb
186: addq r17, 8, r17 C s1_ptr++
187: addq r3, r0, r3 C r3 = cy_limb + prod_low
188: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
189: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
190: cmpult r5, r3, r5 C r5 = borrow out of that subtract
191: stq r3, 0(r16) C store result limb
192: addq r16, 8, r16 C res_ptr++
193: addq r5, r0, r0 C combine carries
194: bne r20, $Loop1
195:
196: $Lend1a:
197: mulq r2, r19, r3 C r3 = prod_low
198: ldq r5, 0(r16) C r5 = *res_ptr
199: addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
200: umulh r2, r19, r4 C r4 = cy_limb
201: addq r3, r0, r3 C r3 = cy_limb + prod_low
202: cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
203: subq r5, r3, r3 C r3 = *res_ptr - (cy_limb + prod_low)
204: cmpult r5, r3, r5 C r5 = borrow out of that subtract
205: stq r3, 0(r16) C store result limb
206: addq r16, 8, r16 C res_ptr++
207: addq r5, r0, r0 C combine carries
208: addq r4, r0, r0 C cy_limb = prod_high + cy
209: br r31, $Lunroll C continue with the unrolled code, carry in r0
210: $Lend1b:
211: subq r5, r3, r3 C r3 = *res_ptr - prod_low
212: cmpult r5, r3, r5 C r5 = borrow out of that subtract
213: stq r3, 0(r16) C store result limb
214: addq r16, 8, r16 C res_ptr++
215: addq r0, r5, r0 C cy_limb = prod_high + borrow
216:
217: $Lunroll:
218: lda r17, -16(r17) C L1 bookkeeping
219: lda r16, -16(r16) C L1 bookkeeping
220: bis r0, r31, r12 C r12 = carry in from the first loop (0 if it was skipped)
221:
222: C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
223:
224: ldq r2, 16(r17) C L1
225: ldq r3, 24(r17) C L1
226: lda r18, -1(r18) C L1 bookkeeping
227: ldq r6, 16(r16) C L1
228: ldq r7, 24(r16) C L1
229: ldq r0, 32(r17) C L1
230: mulq r19, r2, r13 C U1
231: ldq r1, 40(r17) C L1
232: umulh r19, r2, r14 C U1
233: mulq r19, r3, r15 C U1
234: lda r17, 64(r17) C L1 bookkeeping
235: ldq r4, 32(r16) C L1
236: ldq r5, 40(r16) C L1
237: umulh r19, r3, r8 C U1
238: ldq r2, -16(r17) C L1
239: mulq r19, r0, r9 C U1
240: ldq r3, -8(r17) C L1
241: umulh r19, r0, r10 C U1
242: subq r6, r13, r13 C L0 lo + acc
243: mulq r19, r1, r11 C U1
244: cmpult r6, r13, r20 C L0 lo add => carry
245: lda r16, 64(r16) C L1 bookkeeping
246: subq r13, r12, r22 C U0 hi add => answer
247: cmpult r13, r12, r21 C L0 hi add => carry
248: addq r14, r20, r14 C U0 hi mul + carry
249: ldq r6, -16(r16) C L1
250: subq r7, r15, r28 C L0 lo + acc
251: addq r14, r21, r14 C U0 hi mul + carry
252: cmpult r7, r15, r20 C L0 lo add => carry
253: ldq r7, -8(r16) C L1
254: umulh r19, r1, r12 C U1
255: subq r28, r14, r23 C U0 hi add => answer
256: ldq r0, 0(r17) C L1
257: mulq r19, r2, r13 C U1
258: cmpult r28, r14, r21 C L0 hi add => carry
259: addq r8, r20, r8 C U0 hi mul + carry
260: ldq r1, 8(r17) C L1
261: umulh r19, r2, r14 C U1
262: subq r4, r9, r9 C L0 lo + acc
263: stq r22, -48(r16) C L0
264: stq r23, -40(r16) C L1
265: mulq r19, r3, r15 C U1
266: addq r8, r21, r8 C U0 hi mul + carry
267: cmpult r4, r9, r20 C L0 lo add => carry
268: subq r9, r8, r22 C U0 hi add => answer
269: ble r18, $Lend C U1 bookkeeping
270:
271: C ____ MAIN UNROLLED LOOP ____
272: ALIGN(16)
273: $Loop:
274: bis r31, r31, r31 C U1 mt
275: cmpult r9, r8, r21 C L0 hi add => carry
276: addq r10, r20, r10 C U0 hi mul + carry
277: ldq r4, 0(r16) C L1
278:
279: bis r31, r31, r31 C U1 mt
280: subq r5, r11, r23 C L0 lo + acc
281: addq r10, r21, r10 C L0 hi mul + carry
282: ldq r2, 16(r17) C L1
283:
284: umulh r19, r3, r8 C U1
285: cmpult r5, r11, r20 C L0 lo add => carry
286: subq r23, r10, r28 C U0 hi add => answer
287: ldq r5, 8(r16) C L1
288:
289: mulq r19, r0, r9 C U1
290: cmpult r23, r10, r21 C L0 hi add => carry
291: addq r12, r20, r12 C U0 hi mul + carry
292: ldq r3, 24(r17) C L1
293:
294: umulh r19, r0, r10 C U1
295: subq r6, r13, r13 C L0 lo + acc
296: stq r22, -32(r16) C L0
297: stq r28, -24(r16) C L1
298:
299: bis r31, r31, r31 C L0 st slosh
300: mulq r19, r1, r11 C U1
301: bis r31, r31, r31 C L1 st slosh
302: addq r12, r21, r12 C U0 hi mul + carry
303:
304: cmpult r6, r13, r20 C L0 lo add => carry
305: bis r31, r31, r31 C U1 mt
306: lda r18, -1(r18) C L1 bookkeeping
307: subq r13, r12, r22 C U0 hi add => answer
308:
309: bis r31, r31, r31 C U1 mt
310: cmpult r13, r12, r21 C L0 hi add => carry
311: addq r14, r20, r14 C U0 hi mul + carry
312: ldq r6, 16(r16) C L1
313:
314: bis r31, r31, r31 C U1 mt
315: subq r7, r15, r23 C L0 lo + acc
316: addq r14, r21, r14 C U0 hi mul + carry
317: ldq r0, 32(r17) C L1
318:
319: umulh r19, r1, r12 C U1
320: cmpult r7, r15, r20 C L0 lo add => carry
321: subq r23, r14, r28 C U0 hi add => answer
322: ldq r7, 24(r16) C L1
323:
324: mulq r19, r2, r13 C U1
325: cmpult r23, r14, r21 C L0 hi add => carry
326: addq r8, r20, r8 C U0 hi mul + carry
327: ldq r1, 40(r17) C L1
328:
329: umulh r19, r2, r14 C U1
330: subq r4, r9, r9 C U0 lo + acc
331: stq r22, -16(r16) C L0
332: stq r28, -8(r16) C L1
333:
334: bis r31, r31, r31 C L0 st slosh
335: mulq r19, r3, r15 C U1
336: bis r31, r31, r31 C L1 st slosh
337: addq r8, r21, r8 C L0 hi mul + carry
338:
339: cmpult r4, r9, r20 C L0 lo add => carry
340: bis r31, r31, r31 C U1 mt
341: lda r17, 64(r17) C L1 bookkeeping
342: subq r9, r8, r22 C U0 hi add => answer
343:
344: bis r31, r31, r31 C U1 mt
345: cmpult r9, r8, r21 C L0 hi add => carry
346: addq r10, r20, r10 C U0 hi mul + carry
347: ldq r4, 32(r16) C L1
348:
349: bis r31, r31, r31 C U1 mt
350: subq r5, r11, r23 C L0 lo + acc
351: addq r10, r21, r10 C L0 hi mul + carry
352: ldq r2, -16(r17) C L1
353:
354: umulh r19, r3, r8 C U1
355: cmpult r5, r11, r20 C L0 lo add => carry
356: subq r23, r10, r28 C U0 hi add => answer
357: ldq r5, 40(r16) C L1
358:
359: mulq r19, r0, r9 C U1
360: cmpult r23, r10, r21 C L0 hi add => carry
361: addq r12, r20, r12 C U0 hi mul + carry
362: ldq r3, -8(r17) C L1
363:
364: umulh r19, r0, r10 C U1
365: subq r6, r13, r13 C L0 lo + acc
366: stq r22, 0(r16) C L0
367: stq r28, 8(r16) C L1
368:
369: bis r31, r31, r31 C L0 st slosh
370: mulq r19, r1, r11 C U1
371: bis r31, r31, r31 C L1 st slosh
372: addq r12, r21, r12 C U0 hi mul + carry
373:
374: cmpult r6, r13, r20 C L0 lo add => carry
375: bis r31, r31, r31 C U1 mt
376: lda r16, 64(r16) C L1 bookkeeping
377: subq r13, r12, r22 C U0 hi add => answer
378:
379: bis r31, r31, r31 C U1 mt
380: cmpult r13, r12, r21 C L0 hi add => carry
381: addq r14, r20, r14 C U0 hi mul + carry
382: ldq r6, -16(r16) C L1
383:
384: bis r31, r31, r31 C U1 mt
385: subq r7, r15, r23 C L0 lo + acc
386: addq r14, r21, r14 C U0 hi mul + carry
387: ldq r0, 0(r17) C L1
388:
389: umulh r19, r1, r12 C U1
390: cmpult r7, r15, r20 C L0 lo add => carry
391: subq r23, r14, r28 C U0 hi add => answer
392: ldq r7, -8(r16) C L1
393:
394: mulq r19, r2, r13 C U1
395: cmpult r23, r14, r21 C L0 hi add => carry
396: addq r8, r20, r8 C U0 hi mul + carry
397: ldq r1, 8(r17) C L1
398:
399: umulh r19, r2, r14 C U1
400: subq r4, r9, r9 C L0 lo + acc
401: stq r22, -48(r16) C L0
402: stq r28, -40(r16) C L1
403:
404: bis r31, r31, r31 C L0 st slosh
405: mulq r19, r3, r15 C U1
406: bis r31, r31, r31 C L1 st slosh
407: addq r8, r21, r8 C U0 hi mul + carry
408:
409: cmpult r4, r9, r20 C L0 lo add => carry
410: subq r9, r8, r22 C U0 hi add => answer
411: bis r31, r31, r31 C L1 mt
412: bgt r18, $Loop C U1 bookkeeping
413:
414: C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
415: $Lend:
416: cmpult r9, r8, r21 C L0 hi add => carry
417: addq r10, r20, r10 C U0 hi mul + carry
418: ldq r4, 0(r16) C L1
419: subq r5, r11, r23 C L0 lo + acc
420: addq r10, r21, r10 C L0 hi mul + carry
421: umulh r19, r3, r8 C U1
422: cmpult r5, r11, r20 C L0 lo add => carry
423: subq r23, r10, r28 C U0 hi add => answer
424: ldq r5, 8(r16) C L1
425: mulq r19, r0, r9 C U1
426: cmpult r23, r10, r21 C L0 hi add => carry
427: addq r12, r20, r12 C U0 hi mul + carry
428: umulh r19, r0, r10 C U1
429: subq r6, r13, r13 C L0 lo + acc
430: stq r22, -32(r16) C L0
431: stq r28, -24(r16) C L1
432: mulq r19, r1, r11 C U1
433: addq r12, r21, r12 C U0 hi mul + carry
434: cmpult r6, r13, r20 C L0 lo add => carry
435: subq r13, r12, r22 C U0 hi add => answer
436: cmpult r13, r12, r21 C L0 hi add => carry
437: addq r14, r20, r14 C U0 hi mul + carry
438: subq r7, r15, r23 C L0 lo + acc
439: addq r14, r21, r14 C U0 hi mul + carry
440: umulh r19, r1, r12 C U1
441: cmpult r7, r15, r20 C L0 lo add => carry
442: subq r23, r14, r28 C U0 hi add => answer
443: cmpult r23, r14, r21 C L0 hi add => carry
444: addq r8, r20, r8 C U0 hi mul + carry
445: subq r4, r9, r9 C U0 lo + acc
446: stq r22, -16(r16) C L0
447: stq r28, -8(r16) C L1
448: addq r8, r21, r8 C L0 hi mul + carry
449: cmpult r4, r9, r20 C L0 lo add => carry
450: subq r9, r8, r22 C U0 hi add => answer
451: cmpult r9, r8, r21 C L0 hi add => carry
452: addq r10, r20, r10 C U0 hi mul + carry
453: subq r5, r11, r23 C L0 lo + acc
454: addq r10, r21, r10 C L0 hi mul + carry
455: cmpult r5, r11, r20 C L0 lo add => carry
456: subq r23, r10, r28 C U0 hi add => answer
457: cmpult r23, r10, r21 C L0 hi add => carry
458: addq r12, r20, r12 C U0 hi mul + carry
459: stq r22, 0(r16) C L0
460: stq r28, 8(r16) C L1
461: addq r12, r21, r0 C U0 hi mul + carry
462:
463: ldq $9, 8($30) C reload $9-$15 from the slots written at $Large
464: ldq $10, 16($30)
465: ldq $11, 24($30)
466: ldq $12, 32($30)
467: ldq $13, 40($30)
468: ldq $14, 48($30)
469: ldq $15, 56($30)
470: lda $30, 240($30) C release the 240-byte stack frame
471: ret r31, (r26), 1 C return with the final carry limb in r0
472: EPILOGUE(mpn_submul_1)
473: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>