Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/mul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2: dnl result in a second limb vector.
3:
4: dnl Copyright 2000, 2001 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
25: C INPUT PARAMETERS
26: C res_ptr r16
27: C s1_ptr r17
28: C size r18
29: C s2_limb r19
30:
31: C This code runs at 2.25 cycles/limb on EV6.
32:
33: C This code was written in close cooperation with ev6 pipeline expert
34: C Steve Root. Any errors are tege's fault, though.
35:
36: C Code structure:
37:
38: C code for n < 8
39: C code for n >= 8 code for (n mod 8)
40: C code for (n div 8) feed-in code
41: C 8-way unrolled loop
42: C wind-down code
43:
44: C Some notes about unrolled loop:
45: C
46: C r1-r8 multiplies and workup
47: C r21-r28 multiplies and workup
48: C r9-r12 loads
49: C r0 -1
50: C r20,r29,r13-r15 scramble
51: C
52: C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
53: C put-the-carry-into-hi. The idea is that these branches are very rarely
54: C taken, and since a non-taken branch consumes no resources, that is better
55: C than an addq.
56: C
57: C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
58: C add NEXT cycle #09 which feeds a store in NEXT cycle #02
59:
60: C The code could use some further work:
61: C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
62: C faster than this for size < 3.
63: C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
64: C that is too costly.
65: C 3. Consider using 4-way unrolling, even if that runs slower.
66: C 4. Reduce register usage. In particular, try to avoid using r29.
67:
68:
69: ASM_START()
C mpn_mul_1 -- multiply the size-limb vector at s1_ptr (r17) by s2_limb (r19),
C store the resulting limbs at res_ptr (r16) and return the carry-out limb in r0.
C Sizes below 8 are handled by the simple loop at $Lsmall; sizes of 8 or more
C branch to $Large, which runs a short loop for (size mod 8) limbs and then the
C 8-way unrolled, software-pipelined code.
C NOTE(review): the L0/L1/U0/U1 tags in the comments look like the intended
C EV6 issue pipes and the #NN tags the cycle within the unrolled loop -- confirm
C against the EV6 hardware reference before rescheduling anything.
70: PROLOGUE(mpn_mul_1)
71: cmpult r18, 8, r1 C r1 = (size < 8), unsigned
72: beq r1, $Large C size >= 8: use the unrolled code
73: $Lsmall:
74: ldq r2,0(r17) C r2 = s1_limb
75: lda r18,-1(r18) C size--
76: mulq r2,r19,r3 C r3 = prod_low
77: bic r31,r31,r4 C clear cy_limb
78: umulh r2,r19,r0 C r0 = prod_high
79: beq r18,$Le1a C jump if size was == 1
80: ldq r2,8(r17) C r2 = s1_limb
81: lda r18,-1(r18) C size--
82: stq r3,0(r16)
83: beq r18,$Le2a C jump if size was == 2
84: ALIGN(8)
85: $Lopa: mulq r2,r19,r3 C r3 = prod_low
86: addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
87: lda r18,-1(r18) C size--
88: umulh r2,r19,r4 C r4 = cy_limb
89: ldq r2,16(r17) C r2 = s1_limb
90: lda r17,8(r17) C s1_ptr++
91: addq r3,r0,r3 C r3 = cy_limb + prod_low
92: stq r3,8(r16)
93: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
94: lda r16,8(r16) C res_ptr++
95: bne r18,$Lopa
96:
97: $Le2a: mulq r2,r19,r3 C r3 = prod_low
98: addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
99: umulh r2,r19,r4 C r4 = cy_limb
100: addq r3,r0,r3 C r3 = cy_limb + prod_low
101: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
102: stq r3,8(r16)
103: addq r4,r0,r0 C cy_limb = prod_high + cy
104: ret r31,(r26),1 C return, r0 = carry-out limb
105: $Le1a: stq r3,0(r16) C size == 1: store the only low limb
106: ret r31,(r26),1 C return, r0 = prod_high
107:
108: $Large:
C Make a 224-byte stack frame and save r26, r9-r15 and r29; the code below
C overwrites all of them.
109: lda r30, -224(r30)
110: stq r26, 0(r30)
111: stq r9, 8(r30)
112: stq r10, 16(r30)
113: stq r11, 24(r30)
114: stq r12, 32(r30)
115: stq r13, 40(r30)
116: stq r14, 48(r30)
117: stq r15, 56(r30)
118: stq r29, 64(r30)
119:
120: and r18, 7, r20 C count for the first loop, 0-7
121: srl r18, 3, r18 C count for unrolled loop
122: bis r31, r31, r21 C r21 = 0, carry-in if first loop is skipped
123: beq r20, $L_8_or_more C skip first loop
124:
125: $L_9_or_more:
126: ldq r2,0(r17) C r2 = s1_limb
127: lda r17,8(r17) C s1_ptr++
128: lda r20,-1(r20) C size--
129: mulq r2,r19,r3 C r3 = prod_low
130: umulh r2,r19,r21 C r21 = prod_high
131: beq r20,$Le1b C jump if size was == 1
132: bis r31, r31, r0 C r0 = 0, no carry yet. FIXME: shouldn't need this
133: ldq r2,0(r17) C r2 = s1_limb
134: lda r17,8(r17) C s1_ptr++
135: lda r20,-1(r20) C size--
136: stq r3,0(r16)
137: lda r16,8(r16) C res_ptr++
138: beq r20,$Le2b C jump if size was == 2
139: ALIGN(8)
140: $Lopb: mulq r2,r19,r3 C r3 = prod_low
141: addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
142: lda r20,-1(r20) C size--
143: umulh r2,r19,r21 C r21 = prod_high
144: ldq r2,0(r17) C r2 = s1_limb
145: lda r17,8(r17) C s1_ptr++
146: addq r3,r0,r3 C r3 = cy_limb + prod_low
147: stq r3,0(r16)
148: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
149: lda r16,8(r16) C res_ptr++
150: bne r20,$Lopb
151:
152: $Le2b: mulq r2,r19,r3 C r3 = prod_low
153: addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
154: umulh r2,r19,r21 C r21 = prod_high
155: addq r3,r0,r3 C r3 = cy_limb + prod_low
156: cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
157: stq r3,0(r16)
158: lda r16,8(r16) C res_ptr++
159: addq r21,r0,r21 C cy_limb = prod_high + cy
160: br r31, $L_8_or_more C r21 = carry limb for the unrolled code
161: $Le1b: stq r3,0(r16) C (size mod 8) == 1: r21 is already prod_high
162: lda r16,8(r16) C res_ptr++
163:
164: $L_8_or_more:
165: lda r0, -1(r31) C put -1 in r0, for tricky loop control
166: lda r17, -32(r17) C L1 bookkeeping
167: lda r18, -1(r18) C decrement count
168:
169: ldq r9, 32(r17) C L1
170: ldq r10, 40(r17) C L1
171: mulq r9, r19, r22 C U1 #07
172: ldq r11, 48(r17) C L1
173: umulh r9, r19, r23 C U1 #08
174: ldq r12, 56(r17) C L1
175: mulq r10, r19, r24 C U1 #09
176: ldq r9, 64(r17) C L1
177:
178: lda r17, 64(r17) C L1 bookkeeping
179:
180: umulh r10, r19, r25 C U1 #11
181: mulq r11, r19, r26 C U1 #12
182: umulh r11, r19, r27 C U1 #13
183: mulq r12, r19, r28 C U1 #14
184: ldq r10, 8(r17) C L1
185: umulh r12, r19, r1 C U1 #15
186: ldq r11, 16(r17) C L1
187: mulq r9, r19, r2 C U1 #16
188: ldq r12, 24(r17) C L1
189: umulh r9, r19, r3 C U1 #17
190: addq r21, r22, r13 C L1 mov
191: mulq r10, r19, r4 C U1 #18
192: addq r23, r24, r22 C L0 sum 2 mul's
193: cmpult r13, r21, r14 C L1 carry from sum
194: bgt r18, $L_16_or_more C two or more groups of 8 limbs remain
195:
C Exactly one group of 8 limbs remains: finish its multiplies here and enter
C the wind-down code at ret0c.
196: cmpult r22, r24, r24 C U0 carry from sum
197: umulh r10, r19, r5 C U1 #02
198: addq r25, r26, r23 C U0 sum 2 mul's
199: mulq r11, r19, r6 C U1 #03
200: cmpult r23, r26, r25 C U0 carry from sum
201: umulh r11, r19, r7 C U1 #04
202: addq r27, r28, r28 C U0 sum 2 mul's
203: mulq r12, r19, r8 C U1 #05
204: cmpult r28, r27, r15 C L0 carry from sum
205: lda r16, 32(r16) C L1 bookkeeping
206: addq r13, r31, r13 C U0 start carry cascade
207: umulh r12, r19, r21 C U1 #06
208: br r31, ret0c
209:
210: $L_16_or_more:
211: C ---------------------------------------------------------------
212: subq r18,1,r18 C decrement count again
213: cmpult r22, r24, r24 C U0 carry from sum
214: ldq r9, 32(r17) C L1
215:
216: umulh r10, r19, r5 C U1 #02
217: addq r25, r26, r23 C U0 sum 2 mul's
218: mulq r11, r19, r6 C U1 #03
219: cmpult r23, r26, r25 C U0 carry from sum
220: umulh r11, r19, r7 C U1 #04
221: addq r27, r28, r28 C U0 sum 2 mul's
222: mulq r12, r19, r8 C U1 #05
223: cmpult r28, r27, r15 C L0 carry from sum
224: lda r16, 32(r16) C L1 bookkeeping
225: addq r13, r31, r13 C U0 start carry cascade
226:
227: umulh r12, r19, r21 C U1 #06
228: C beq r13, fix0w C U0
229: ret0w: addq r22, r14, r26 C L0
230: ldq r10, 40(r17) C L1
231:
232: mulq r9, r19, r22 C U1 #07
233: beq r26, fix1w C U0
234: ret1w: addq r23, r24, r27 C L0
235: ldq r11, 48(r17) C L1
236:
237: umulh r9, r19, r23 C U1 #08
238: beq r27, fix2w C U0
239: ret2w: addq r28, r25, r28 C L0
240: ldq r12, 56(r17) C L1
241:
242: mulq r10, r19, r24 C U1 #09
243: beq r28, fix3w C U0
244: ret3w: addq r1, r2, r20 C L0 sum 2 mul's
245: ldq r9, 64(r17) C L1
246:
247: addq r3, r4, r2 C L0 #10 2 mul's
248: lda r17, 64(r17) C L1 bookkeeping
249: cmpult r20, r1, r29 C U0 carry from sum
250:
251: umulh r10, r19, r25 C U1 #11
252: cmpult r2, r4, r4 C U0 carry from sum
253: stq r13, -32(r16) C L0
254: stq r26, -24(r16) C L1
255:
256: mulq r11, r19, r26 C U1 #12
257: addq r5, r6, r14 C U0 sum 2 mul's
258: stq r27, -16(r16) C L0
259: stq r28, -8(r16) C L1
260:
261: umulh r11, r19, r27 C U1 #13
262: cmpult r14, r6, r3 C U0 carry from sum
263: C could do cross-jumping here:
264: C bra $L_middle_of_unrolled_loop
265: mulq r12, r19, r28 C U1 #14
266: addq r7, r3, r5 C L0 eat carry
267: addq r20, r15, r20 C U0 carry cascade
268: ldq r10, 8(r17) C L1
269:
270: umulh r12, r19, r1 C U1 #15
271: beq r20, fix4 C U0
272: ret4w: addq r2, r29, r6 C L0
273: ldq r11, 16(r17) C L1
274:
275: mulq r9, r19, r2 C U1 #16
276: beq r6, fix5 C U0
277: ret5w: addq r14, r4, r7 C L0
278: ldq r12, 24(r17) C L1
279:
280: umulh r9, r19, r3 C U1 #17
281: beq r7, fix6 C U0
282: ret6w: addq r5, r8, r8 C L0 sum 2
283: addq r21, r22, r13 C L1 sum 2 mul's
284:
285: mulq r10, r19, r4 C U1 #18
286: addq r23, r24, r22 C L0 sum 2 mul's
287: cmpult r13, r21, r14 C L1 carry from sum
288: ble r18, $Lend C U0 no groups left: skip the loop
289: C ---------------------------------------------------------------
290: ALIGN(16)
291: $Loop:
C r0 is -1 (set at $L_8_or_more), so the high half of r0 * r18 is r18 - 1 for
C r18 >= 1: the umulh below decrements the loop counter.
292: umulh r0, r18, r18 C U1 #01 decrement r18!
293: cmpult r8, r5, r29 C L0 carry from last bunch
294: cmpult r22, r24, r24 C U0 carry from sum
295: ldq r9, 32(r17) C L1
296:
297: umulh r10, r19, r5 C U1 #02
298: addq r25, r26, r23 C U0 sum 2 mul's
299: stq r20, 0(r16) C L0
300: stq r6, 8(r16) C L1
301:
302: mulq r11, r19, r6 C U1 #03
303: cmpult r23, r26, r25 C U0 carry from sum
304: stq r7, 16(r16) C L0
305: stq r8, 24(r16) C L1
306:
307: umulh r11, r19, r7 C U1 #04
308: bis r31, r31, r31 C L0 st slosh
309: bis r31, r31, r31 C L1 st slosh
310: addq r27, r28, r28 C U0 sum 2 mul's
311:
312: mulq r12, r19, r8 C U1 #05
313: cmpult r28, r27, r15 C L0 carry from sum
314: lda r16, 64(r16) C L1 bookkeeping
315: addq r13, r29, r13 C U0 start carry cascade
316:
317: umulh r12, r19, r21 C U1 #06
318: beq r13, fix0 C U0
319: ret0: addq r22, r14, r26 C L0
320: ldq r10, 40(r17) C L1
321:
322: mulq r9, r19, r22 C U1 #07
323: beq r26, fix1 C U0
324: ret1: addq r23, r24, r27 C L0
325: ldq r11, 48(r17) C L1
326:
327: umulh r9, r19, r23 C U1 #08
328: beq r27, fix2 C U0
329: ret2: addq r28, r25, r28 C L0
330: ldq r12, 56(r17) C L1
331:
332: mulq r10, r19, r24 C U1 #09
333: beq r28, fix3 C U0
334: ret3: addq r1, r2, r20 C L0 sum 2 mul's
335: ldq r9, 64(r17) C L1
336:
337: addq r3, r4, r2 C L0 #10 2 mul's
338: bis r31, r31, r31 C U1 mul hole
339: lda r17, 64(r17) C L1 bookkeeping
340: cmpult r20, r1, r29 C U0 carry from sum
341:
342: umulh r10, r19, r25 C U1 #11
343: cmpult r2, r4, r4 C U0 carry from sum
344: stq r13, -32(r16) C L0
345: stq r26, -24(r16) C L1
346:
347: mulq r11, r19, r26 C U1 #12
348: addq r5, r6, r14 C U0 sum 2 mul's
349: stq r27, -16(r16) C L0
350: stq r28, -8(r16) C L1
351:
352: umulh r11, r19, r27 C U1 #13
353: bis r31, r31, r31 C L0 st slosh
354: bis r31, r31, r31 C L1 st slosh
355: cmpult r14, r6, r3 C U0 carry from sum
356: $L_middle_of_unrolled_loop:
357: mulq r12, r19, r28 C U1 #14
358: addq r7, r3, r5 C L0 eat carry
359: addq r20, r15, r20 C U0 carry cascade
360: ldq r10, 8(r17) C L1
361:
362: umulh r12, r19, r1 C U1 #15
363: beq r20, fix4 C U0
364: ret4: addq r2, r29, r6 C L0
365: ldq r11, 16(r17) C L1
366:
367: mulq r9, r19, r2 C U1 #16
368: beq r6, fix5 C U0
369: ret5: addq r14, r4, r7 C L0
370: ldq r12, 24(r17) C L1
371:
372: umulh r9, r19, r3 C U1 #17
373: beq r7, fix6 C U0
374: ret6: addq r5, r8, r8 C L0 sum 2
375: addq r21, r22, r13 C L1 sum 2 mul's
376:
377: mulq r10, r19, r4 C U1 #18
378: addq r23, r24, r22 C L0 sum 2 mul's
379: cmpult r13, r21, r14 C L1 carry from sum
380: bgt r18, $Loop C U0
381: C ---------------------------------------------------------------
382: $Lend:
C Wind-down: no more loads from s1_ptr; finish the pending multiplies and
C carry propagation, and store the remaining limbs.
383: cmpult r8, r5, r29 C L0 carry from last bunch
384: cmpult r22, r24, r24 C U0 carry from sum
385:
386: umulh r10, r19, r5 C U1 #02
387: addq r25, r26, r23 C U0 sum 2 mul's
388: stq r20, 0(r16) C L0
389: stq r6, 8(r16) C L1
390:
391: mulq r11, r19, r6 C U1 #03
392: cmpult r23, r26, r25 C U0 carry from sum
393: stq r7, 16(r16) C L0
394: stq r8, 24(r16) C L1
395:
396: umulh r11, r19, r7 C U1 #04
397: addq r27, r28, r28 C U0 sum 2 mul's
398:
399: mulq r12, r19, r8 C U1 #05
400: cmpult r28, r27, r15 C L0 carry from sum
401: lda r16, 64(r16) C L1 bookkeeping
402: addq r13, r29, r13 C U0 start carry cascade
403:
404: umulh r12, r19, r21 C U1 #06
405: beq r13, fix0c C U0
406: ret0c: addq r22, r14, r26 C L0
407: beq r26, fix1c C U0
408: ret1c: addq r23, r24, r27 C L0
409: beq r27, fix2c C U0
410: ret2c: addq r28, r25, r28 C L0
411: beq r28, fix3c C U0
412: ret3c: addq r1, r2, r20 C L0 sum 2 mul's
413: addq r3, r4, r2 C L0 #10 2 mul's
414: lda r17, 64(r17) C L1 bookkeeping
415: cmpult r20, r1, r29 C U0 carry from sum
416: cmpult r2, r4, r4 C U0 carry from sum
417: stq r13, -32(r16) C L0
418: stq r26, -24(r16) C L1
419: addq r5, r6, r14 C U0 sum 2 mul's
420: stq r27, -16(r16) C L0
421: stq r28, -8(r16) C L1
422: cmpult r14, r6, r3 C U0 carry from sum
423: addq r7, r3, r5 C L0 eat carry
424: addq r20, r15, r20 C U0 carry cascade
425: beq r20, fix4c C U0
426: ret4c: addq r2, r29, r6 C L0
427: beq r6, fix5c C U0
428: ret5c: addq r14, r4, r7 C L0
429: beq r7, fix6c C U0
430: ret6c: addq r5, r8, r8 C L0 sum 2
431: cmpult r8, r5, r29 C L0 carry from last bunch
432: stq r20, 0(r16) C L0
433: stq r6, 8(r16) C L1
434: stq r7, 16(r16) C L0
435: stq r8, 24(r16) C L1
436: addq r29, r21, r0 C r0 = return value: last prod_high + carry
437:
C Restore the saved registers, release the frame and return.
438: ldq r26, 0(r30)
439: ldq r9, 8(r30)
440: ldq r10, 16(r30)
441: ldq r11, 24(r30)
442: ldq r12, 32(r30)
443: ldq r13, 40(r30)
444: ldq r14, 48(r30)
445: ldq r15, 56(r30)
446: ldq r29, 64(r30)
447: lda r30, 224(r30)
448: ret r31, (r26), 1
449:
C Carry fix-up stubs, reached when a sum that has just absorbed a carry-in is
C zero. Each one merges that carry-in into the carry the next step will add
C (fix6 and fix6c add it into r5 instead) and branches back to the matching
C retN label.
450: C fix0w: bis r14, r29, r14 C join carries
451: C br r31, ret0w
452: fix1w: bis r24, r14, r24 C join carries
453: br r31, ret1w
454: fix2w: bis r25, r24, r25 C join carries
455: br r31, ret2w
456: fix3w: bis r15, r25, r15 C join carries
457: br r31, ret3w
458: fix0: bis r14, r29, r14 C join carries
459: br r31, ret0
460: fix1: bis r24, r14, r24 C join carries
461: br r31, ret1
462: fix2: bis r25, r24, r25 C join carries
463: br r31, ret2
464: fix3: bis r15, r25, r15 C join carries
465: br r31, ret3
466: fix4: bis r29, r15, r29 C join carries
467: br r31, ret4
468: fix5: bis r4, r29, r4 C join carries
469: br r31, ret5
470: fix6: addq r5, r4, r5 C can't carry twice!
471: br r31, ret6
472: fix0c: bis r14, r29, r14 C join carries
473: br r31, ret0c
474: fix1c: bis r24, r14, r24 C join carries
475: br r31, ret1c
476: fix2c: bis r25, r24, r25 C join carries
477: br r31, ret2c
478: fix3c: bis r15, r25, r15 C join carries
479: br r31, ret3c
480: fix4c: bis r29, r15, r29 C join carries
481: br r31, ret4c
482: fix5c: bis r4, r29, r4 C join carries
483: br r31, ret5c
484: fix6c: addq r5, r4, r5 C can't carry twice!
485: br r31, ret6c
486:
487: EPILOGUE(mpn_mul_1)
488: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>