Annotation of OpenXM_contrib/gmp/mpn/pa64/addmul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
2: dnl add the result to a second limb vector.
3:
4: dnl Copyright 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23:
24: dnl This approaches 7.0 cycles/limb on PA8000 and 6.375 cycles/limb on PA8500
25: dnl for huge operands. It should be possible to do 6.0 cycles/limb with the
26: dnl current instructions and unrolling level. It is unknown why the code runs
27: dnl somewhat slower.
28:
29: dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
30: dnl could be saved there per call.
31:
32: dnl DESCRIPTION:
33: dnl The main loop "BIG" is 4-way unrolled, mainly to allow
34: dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
35: dnl registers to the IU registers, have demanded a deep software pipeline, and
36: dnl a lot of stack slots for partial products in flight.
37: dnl
38: dnl CODE STRUCTURE:
39: dnl save-some-registers
40: dnl do 0, 1, 2, or 3 limbs
41: dnl if done, restore-some-regs and return
42: dnl save-many-regs
43: dnl do 4, 8, ... limbs
44: dnl restore-all-regs
45:
46: dnl STACK LAYOUT:
47: dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
48: dnl slots marked FREE, as well as some slots in the caller's "frame marker".
49: dnl
50: dnl -00 <- r30
51: dnl -08 FREE
52: dnl -10 tmp
53: dnl -18 tmp
54: dnl -20 tmp
55: dnl -28 tmp
56: dnl -30 tmp
57: dnl -38 tmp
58: dnl -40 tmp
59: dnl -48 tmp
60: dnl -50 tmp
61: dnl -58 tmp
62: dnl -60 tmp
63: dnl -68 tmp
64: dnl -70 tmp
65: dnl -78 tmp
66: dnl -80 tmp
67: dnl -88 tmp
68: dnl -90 FREE
69: dnl -98 FREE
70: dnl -a0 FREE
71: dnl -a8 FREE
72: dnl -b0 r13
73: dnl -b8 r12
74: dnl -c0 r11
75: dnl -c8 r10
76: dnl -d0 r9
77: dnl -d8 r8
78: dnl -e0 r7
79: dnl -e8 r6
80: dnl -f0 r5
81: dnl -f8 r4
82: dnl -100 r3
83: dnl Previous frame:
84: dnl [unused area]
85: dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
86:
87:
88: include(`../config.m4')
89:
90: dnl INPUT PARAMETERS:
91: define(`rp',`%r26') dnl limb vector that is read, added to and written back
92: define(`up',`%r25') dnl source limb vector, only read
93: define(`n',`%r24') dnl limb count
94: define(`vlimb',`%r23') dnl the single multiplier limb
95:
96: define(`climb',`%r23') dnl carry limb, shares %r23 with vlimb
97:
98: ifdef(`HAVE_ABI_2_0w',
99: ` .level 2.0W
100: ',` .level 2.0N
101: ')
102: PROLOGUE(mpn_addmul_1)
103:
104: ifdef(`HAVE_ABI_2_0w',
105: ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
106: ')
107: std,ma %r3, 0x100(%r30) C save r3 at 0(r30), then advance r30 by 0x100
108: std %r4, -0xf8(%r30) C save r4
109: std %r5, -0xf0(%r30) C save r5
110: ldo 0(%r0), climb C clear climb
111: fldd -0x138(%r30), %fr8 C put vlimb in fp register
112:
113: define(`p032a1',`%r1') dnl first mid product, loaded from -0x78
114: define(`p032a2',`%r19') dnl second mid product, loaded from -0x70
115:
116: define(`m032',`%r20') dnl sum of the two mid products
117: define(`m096',`%r21') dnl carry out of that sum
118:
119: define(`p000a',`%r22') dnl low product, loaded from -0x80
120: define(`p064a',`%r29') dnl high product, loaded from -0x68
121:
122: define(`s000',`%r31') dnl result limb being accumulated
123:
124: define(`ma000',`%r4') dnl mid sum shifted into the low result limb
125: define(`ma064',`%r20') dnl mid sum part for the carry limb, reuses the m032 register
126:
127: define(`r000',`%r3') dnl limb loaded from rp
128:
129: extrd,u n, 63, 2, %r5 C r5 = low 2 bits of n, i.e. n mod 4
130: cmpb,= %r5, %r0, L(BIG) C n mod 4 == 0: go straight to the 4-way code
131: nop
132:
133: fldd 0(up), %fr4 C load first leftover source limb
134: ldo 8(up), up C advance up by one limb
135: xmpyu %fr8R, %fr4L, %fr22
136: xmpyu %fr8L, %fr4R, %fr23
137: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
138: xmpyu %fr8R, %fr4R, %fr24
139: xmpyu %fr8L, %fr4L, %fr25
140: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
141: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
142: addib,<> -1, %r5, L(two_or_more) C decrement r5, branch if leftover limbs remain
143: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
144: L(one) C exactly one leftover limb
145: ldd -0x78(%r30), p032a1
146: ldd -0x70(%r30), p032a2
147: ldd -0x80(%r30), p000a
148: b L(0_one_out) C finish this limb in the shared wind-down code
149: ldd -0x68(%r30), p064a
150:
151: L(two_or_more) C at least two leftover limbs
152: fldd 0(up), %fr4 C load second leftover source limb
153: ldo 8(up), up
154: xmpyu %fr8R, %fr4L, %fr22
155: xmpyu %fr8L, %fr4R, %fr23
156: ldd -0x78(%r30), p032a1 C fetch previous limb mid product before the slot is overwritten
157: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
158: xmpyu %fr8R, %fr4R, %fr24
159: xmpyu %fr8L, %fr4L, %fr25
160: ldd -0x70(%r30), p032a2
161: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
162: ldd -0x80(%r30), p000a
163: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
164: ldd -0x68(%r30), p064a
165: addib,<> -1, %r5, L(three_or_more) C decrement r5, branch if a third leftover limb remains
166: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
167: L(two) C exactly two leftover limbs
168: add p032a1, p032a2, m032 C sum the two mid products of the first limb
169: add,dc %r0, %r0, m096 C m096 = carry out of that sum
170: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved to the upper 32 bits
171: extrd,u m032, 31, 32, ma064 C ma064 = upper 32 bits of m032
172: ldd 0(rp), r000
173: b L(0_two_out) C join the wind-down code with two limbs in flight
174: depd m096, 31, 32, ma064 C put the mid sum carry above it in ma064
175:
176: L(three_or_more) C r5 was n mod 4, so this path handles three leftover limbs
177: fldd 0(up), %fr4 C load third leftover source limb
178: add p032a1, p032a2, m032 C sum the two mid products of the first limb
179: add,dc %r0, %r0, m096 C m096 = carry out of that sum
180: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved to the upper 32 bits
181: extrd,u m032, 31, 32, ma064 C ma064 = upper 32 bits of m032
182: ldd 0(rp), r000
183: dnl addib,= -1, %r5, L(0_out)
184: depd m096, 31, 32, ma064 C put the mid sum carry above it in ma064
185: L(oop0) C the loop body below is disabled with dnl, execution falls through to 0_out
186: dnl xmpyu %fr8R, %fr4L, %fr22
187: dnl xmpyu %fr8L, %fr4R, %fr23
188: dnl ldd -0x78(%r30), p032a1
189: dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
190: dnl
191: dnl xmpyu %fr8R, %fr4R, %fr24
192: dnl xmpyu %fr8L, %fr4L, %fr25
193: dnl ldd -0x70(%r30), p032a2
194: dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
195: dnl
196: dnl ldo 8(rp), rp
197: dnl add climb, p000a, s000
198: dnl ldd -0x80(%r30), p000a
199: dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
200: dnl
201: dnl add,dc p064a, %r0, climb
202: dnl ldo 8(up), up
203: dnl ldd -0x68(%r30), p064a
204: dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
205: dnl
206: dnl add ma000, s000, s000
207: dnl add,dc ma064, climb, climb
208: dnl fldd 0(up), %fr4
209: dnl
210: dnl add r000, s000, s000
211: dnl add,dc %r0, climb, climb
212: dnl std s000, -8(rp)
213: dnl
214: dnl add p032a1, p032a2, m032
215: dnl add,dc %r0, %r0, m096
216: dnl
217: dnl depd,z m032, 31, 32, ma000
218: dnl extrd,u m032, 31, 32, ma064
219: dnl ldd 0(rp), r000
220: dnl addib,<> -1, %r5, L(oop0)
221: dnl depd m096, 31, 32, ma064
222: L(0_out) C multiply the third limb while finishing the first
223: ldo 8(up), up
224: xmpyu %fr8R, %fr4L, %fr22
225: xmpyu %fr8L, %fr4R, %fr23
226: ldd -0x78(%r30), p032a1 C fetch second limb mid product before the slot is overwritten
227: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
228: xmpyu %fr8R, %fr4R, %fr24
229: xmpyu %fr8L, %fr4L, %fr25
230: ldd -0x70(%r30), p032a2
231: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
232: ldo 8(rp), rp C advance rp, the first result limb is stored at -8(rp) below
233: add climb, p000a, s000 C s000 = carry limb + low product of first limb
234: ldd -0x80(%r30), p000a
235: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
236: add,dc p064a, %r0, climb C climb = high product + carry
237: ldd -0x68(%r30), p064a
238: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
239: add ma000, s000, s000 C add the mid sum part that lands in this limb
240: add,dc ma064, climb, climb C add the mid sum part that lands in the carry limb
241: add r000, s000, s000 C add the limb read from rp
242: add,dc %r0, climb, climb
243: std s000, -8(rp) C store first result limb
244: add p032a1, p032a2, m032 C same mid sum preparation, now for the second limb
245: add,dc %r0, %r0, m096
246: depd,z m032, 31, 32, ma000
247: extrd,u m032, 31, 32, ma064
248: ldd 0(rp), r000
249: depd m096, 31, 32, ma064
250: L(0_two_out) C two limbs still in flight: finish one, then fall into 0_one_out
251: ldd -0x78(%r30), p032a1 C mid products of the last limb
252: ldd -0x70(%r30), p032a2
253: ldo 8(rp), rp C advance rp, this result limb is stored at -8(rp) below
254: add climb, p000a, s000 C s000 = carry limb + low product
255: ldd -0x80(%r30), p000a C reload p000a with the low product of the last limb
256: add,dc p064a, %r0, climb C climb = high product + carry
257: ldd -0x68(%r30), p064a C reload p064a with the high product of the last limb
258: add ma000, s000, s000 C add the mid sum part that lands in this limb
259: add,dc ma064, climb, climb C add the mid sum part that lands in the carry limb
260: add r000, s000, s000 C add the limb read from rp
261: add,dc %r0, climb, climb
262: std s000, -8(rp)
263: L(0_one_out) C one limb in flight, its four products are in integer registers
264: add p032a1, p032a2, m032 C sum the two mid products
265: add,dc %r0, %r0, m096 C m096 = carry out of that sum
266: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved to the upper 32 bits
267: extrd,u m032, 31, 32, ma064 C ma064 = upper 32 bits of m032
268: ldd 0(rp), r000
269: depd m096, 31, 32, ma064 C put the mid sum carry above it in ma064
270:
271: add climb, p000a, s000 C s000 = carry limb + low product
272: add,dc p064a, %r0, climb C climb = high product + carry
273: add ma000, s000, s000
274: add,dc ma064, climb, climb
275: add r000, s000, s000 C add the limb read from rp
276: add,dc %r0, climb, climb
277: std s000, 0(rp) C store last leftover result limb
278:
279: cmpib,>= 4, n, L(done) C branch to done when 4 >= n, n mod 4 is nonzero on this path
280: ldo 8(rp), rp C advance rp past the limb just stored
281:
282: dnl 4-way unrolled code.
283:
284: L(BIG) C 4-way code, the register names are redefined for it below
285:
286: define(`p032a1',`%r1') dnl mid product pairs for the four limbs a, b, c, d
287: define(`p032a2',`%r19') dnl
288: define(`p096b1',`%r20') dnl
289: define(`p096b2',`%r21') dnl
290: define(`p160c1',`%r22') dnl
291: define(`p160c2',`%r29') dnl
292: define(`p224d1',`%r31') dnl
293: define(`p224d2',`%r3') dnl
294: dnl sums of each mid product pair, m288 receives the final carry
295: define(`m032',`%r4') dnl
296: define(`m096',`%r5') dnl
297: define(`m160',`%r6') dnl
298: define(`m224',`%r7') dnl
299: define(`m288',`%r8') dnl
300: dnl low and high products, these reuse the mid product registers
301: define(`p000a',`%r1') dnl
302: define(`p064a',`%r19') dnl
303: define(`p064b',`%r20') dnl
304: define(`p128b',`%r21') dnl
305: define(`p128c',`%r22') dnl
306: define(`p192c',`%r29') dnl
307: define(`p192d',`%r31') dnl
308: define(`p256d',`%r3') dnl
309: dnl the four result limbs being accumulated
310: define(`s000',`%r10') dnl
311: define(`s064',`%r11') dnl
312: define(`s128',`%r12') dnl
313: define(`s192',`%r13') dnl
314: dnl mid sums realigned by 32 bits, ma064..ma256 reuse the m032..m224 registers
315: define(`ma000',`%r9') dnl
316: define(`ma064',`%r4') dnl
317: define(`ma128',`%r5') dnl
318: define(`ma192',`%r6') dnl
319: define(`ma256',`%r7') dnl
320: dnl limbs loaded from rp
321: define(`r000',`%r1') dnl
322: define(`r064',`%r19') dnl
323: define(`r128',`%r20') dnl
324: define(`r192',`%r21') dnl
325:
326: std %r6, -0xe8(%r30) C save r6..r13, they are reloaded before done
327: std %r7, -0xe0(%r30)
328: std %r8, -0xd8(%r30)
329: std %r9, -0xd0(%r30)
330: std %r10, -0xc8(%r30)
331: std %r11, -0xc0(%r30)
332: std %r12, -0xb8(%r30)
333: std %r13, -0xb0(%r30)
334:
335: ifdef(`HAVE_ABI_2_0w',
336: ` extrd,u n, 61, 62, n C right shift 2
337: ',` extrd,u n, 61, 30, n C right shift 2, zero extend
338: ')
339:
340: L(4_or_more) C n now counts groups of 4 limbs
341: fldd 0(up), %fr4 C load the first group of four source limbs
342: fldd 8(up), %fr5
343: fldd 16(up), %fr6
344: fldd 24(up), %fr7
345: xmpyu %fr8R, %fr4L, %fr22
346: xmpyu %fr8L, %fr4R, %fr23
347: xmpyu %fr8R, %fr5L, %fr24
348: xmpyu %fr8L, %fr5R, %fr25
349: xmpyu %fr8R, %fr6L, %fr26
350: xmpyu %fr8L, %fr6R, %fr27
351: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
352: xmpyu %fr8R, %fr7L, %fr28
353: xmpyu %fr8L, %fr7R, %fr29
354: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
355: xmpyu %fr8R, %fr4R, %fr30
356: xmpyu %fr8L, %fr4L, %fr31
357: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
358: xmpyu %fr8R, %fr5R, %fr22
359: xmpyu %fr8L, %fr5L, %fr23
360: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
361: xmpyu %fr8R, %fr6R, %fr24
362: xmpyu %fr8L, %fr6L, %fr25
363: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
364: xmpyu %fr8R, %fr7R, %fr26
365: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
366: addib,<> -1, n, L(8_or_more) C decrement group count, branch if more groups remain
367: xmpyu %fr8L, %fr7L, %fr27
368: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
369: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
370: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
371: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
372: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
373: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
374: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
375: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
376: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
377: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
378: ldd -0x78(%r30), p032a1 C reload the eight mid products into integer registers
379: ldd -0x70(%r30), p032a2
380: ldd -0x38(%r30), p096b1
381: ldd -0x30(%r30), p096b2
382: ldd -0x58(%r30), p160c1
383: ldd -0x50(%r30), p160c2
384: ldd -0x18(%r30), p224d1
385: ldd -0x10(%r30), p224d2
386: b L(end1) C single group: go to the final wind-down
387: nop
388:
389: L(8_or_more) C at least two groups: store first group products, start the second group
390: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
391: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
392: ldo 32(up), up C advance up by four limbs
393: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
394: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
395: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
396: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
397: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
398: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
399: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
400: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
401: fldd 0(up), %fr4 C load the second group of four source limbs
402: fldd 8(up), %fr5
403: fldd 16(up), %fr6
404: fldd 24(up), %fr7
405: xmpyu %fr8R, %fr4L, %fr22
406: ldd -0x78(%r30), p032a1 C first group mid product, read before the slot is overwritten
407: xmpyu %fr8L, %fr4R, %fr23
408: xmpyu %fr8R, %fr5L, %fr24
409: ldd -0x70(%r30), p032a2
410: xmpyu %fr8L, %fr5R, %fr25
411: xmpyu %fr8R, %fr6L, %fr26
412: ldd -0x38(%r30), p096b1
413: xmpyu %fr8L, %fr6R, %fr27
414: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
415: xmpyu %fr8R, %fr7L, %fr28
416: ldd -0x30(%r30), p096b2
417: xmpyu %fr8L, %fr7R, %fr29
418: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
419: xmpyu %fr8R, %fr4R, %fr30
420: ldd -0x58(%r30), p160c1
421: xmpyu %fr8L, %fr4L, %fr31
422: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
423: xmpyu %fr8R, %fr5R, %fr22
424: ldd -0x50(%r30), p160c2
425: xmpyu %fr8L, %fr5L, %fr23
426: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
427: xmpyu %fr8R, %fr6R, %fr24
428: ldd -0x18(%r30), p224d1
429: xmpyu %fr8L, %fr6L, %fr25
430: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
431: xmpyu %fr8R, %fr7R, %fr26
432: ldd -0x10(%r30), p224d2
433: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
434: addib,= -1, n, L(end2) C decrement group count, branch to end2 if no further group remains
435: xmpyu %fr8L, %fr7L, %fr27
436: L(oop) C main loop: finish four limbs while storing the next products and multiplying new limbs
437: add p032a1, p032a2, m032 C sum the mid product pair of limb a
438: ldd -0x80(%r30), p000a
439: add,dc p096b1, p096b2, m096 C mid sums of limbs b, c, d, each taking the previous carry
440: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
441:
442: add,dc p160c1, p160c2, m160
443: ldd -0x68(%r30), p064a
444: add,dc p224d1, p224d2, m224
445: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
446:
447: add,dc %r0, %r0, m288 C m288 = final carry of the mid sums
448: ldd -0x40(%r30), p064b
449: ldo 32(up), up C advance up by four limbs
450: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
451:
452: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved to the upper 32 bits
453: ldd -0x28(%r30), p128b
454: extrd,u m032, 31, 32, ma064 C ma064 = upper 32 bits of m032
455: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
456:
457: depd m096, 31, 32, ma064 C low half of m096 goes above it, same pattern for ma128..ma256
458: ldd -0x60(%r30), p128c
459: extrd,u m096, 31, 32, ma128
460: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
461:
462: depd m160, 31, 32, ma128
463: ldd -0x48(%r30), p192c
464: extrd,u m160, 31, 32, ma192
465: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
466:
467: depd m224, 31, 32, ma192
468: ldd -0x20(%r30), p192d
469: extrd,u m224, 31, 32, ma256
470: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
471:
472: depd m288, 31, 32, ma256
473: ldd -0x88(%r30), p256d
474: add climb, p000a, s000 C s000 = incoming carry limb + low product of limb a
475: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
476:
477: add,dc p064a, p064b, s064 C each result limb = high product of one limb + low product of the next
478: ldd 0(rp), r000
479: add,dc p128b, p128c, s128
480: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
481:
482: add,dc p192c, p192d, s192
483: ldd 8(rp), r064
484: add,dc p256d, %r0, climb C new carry limb = high product of limb d + carry
485: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
486:
487: ldd 16(rp), r128
488: add ma000, s000, s000 C accum mid 0
489: ldd 24(rp), r192
490: add,dc ma064, s064, s064 C accum mid 1
491:
492: add,dc ma128, s128, s128 C accum mid 2
493: fldd 0(up), %fr4
494: add,dc ma192, s192, s192 C accum mid 3
495: fldd 8(up), %fr5
496:
497: add,dc ma256, climb, climb C top part of the mid sums goes into the carry limb
498: fldd 16(up), %fr6
499: add r000, s000, s000 C accum rlimb 0
500: fldd 24(up), %fr7
501:
502: add,dc r064, s064, s064 C accum rlimb 1
503: add,dc r128, s128, s128 C accum rlimb 2
504: std s000, 0(rp)
505:
506: add,dc r192, s192, s192 C accum rlimb 3
507: add,dc %r0, climb, climb C fold the last carry into the carry limb
508: std s064, 8(rp)
509:
510: xmpyu %fr8R, %fr4L, %fr22 C start the products for the four limbs just loaded
511: ldd -0x78(%r30), p032a1 C mid products for the next iteration, read before the slots are overwritten
512: xmpyu %fr8L, %fr4R, %fr23
513: std s128, 16(rp)
514:
515: xmpyu %fr8R, %fr5L, %fr24
516: ldd -0x70(%r30), p032a2
517: xmpyu %fr8L, %fr5R, %fr25
518: std s192, 24(rp)
519:
520: xmpyu %fr8R, %fr6L, %fr26
521: ldd -0x38(%r30), p096b1
522: xmpyu %fr8L, %fr6R, %fr27
523: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
524:
525: xmpyu %fr8R, %fr7L, %fr28
526: ldd -0x30(%r30), p096b2
527: xmpyu %fr8L, %fr7R, %fr29
528: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
529:
530: xmpyu %fr8R, %fr4R, %fr30
531: ldd -0x58(%r30), p160c1
532: xmpyu %fr8L, %fr4L, %fr31
533: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
534:
535: xmpyu %fr8R, %fr5R, %fr22
536: ldd -0x50(%r30), p160c2
537: xmpyu %fr8L, %fr5L, %fr23
538: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
539:
540: xmpyu %fr8R, %fr6R, %fr24
541: ldd -0x18(%r30), p224d1
542: xmpyu %fr8L, %fr6L, %fr25
543: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
544:
545: xmpyu %fr8R, %fr7R, %fr26
546: ldd -0x10(%r30), p224d2
547: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
548: xmpyu %fr8L, %fr7L, %fr27
549:
550: addib,<> -1, n, L(oop) C decrement group count, loop while groups remain
551: ldo 32(rp), rp C advance rp by four limbs
552:
553: L(end2) C wind-down with two groups in flight: same steps as the loop, but no new limbs are loaded
554: add p032a1, p032a2, m032 C sum the mid product pair of limb a
555: ldd -0x80(%r30), p000a
556: add,dc p096b1, p096b2, m096
557: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
558: add,dc p160c1, p160c2, m160
559: ldd -0x68(%r30), p064a
560: add,dc p224d1, p224d2, m224
561: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
562: add,dc %r0, %r0, m288 C m288 = final carry of the mid sums
563: ldd -0x40(%r30), p064b
564: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
565: depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits into ma000..ma256
566: ldd -0x28(%r30), p128b
567: extrd,u m032, 31, 32, ma064
568: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
569: depd m096, 31, 32, ma064
570: ldd -0x60(%r30), p128c
571: extrd,u m096, 31, 32, ma128
572: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
573: depd m160, 31, 32, ma128
574: ldd -0x48(%r30), p192c
575: extrd,u m160, 31, 32, ma192
576: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
577: depd m224, 31, 32, ma192
578: ldd -0x20(%r30), p192d
579: extrd,u m224, 31, 32, ma256
580: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
581: depd m288, 31, 32, ma256
582: ldd -0x88(%r30), p256d
583: add climb, p000a, s000 C s000 = incoming carry limb + low product of limb a
584: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
585: add,dc p064a, p064b, s064
586: ldd 0(rp), r000
587: add,dc p128b, p128c, s128
588: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
589: add,dc p192c, p192d, s192
590: ldd 8(rp), r064
591: add,dc p256d, %r0, climb C new carry limb = high product of limb d + carry
592: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
593: ldd 16(rp), r128
594: add ma000, s000, s000 C accum mid 0
595: ldd 24(rp), r192
596: add,dc ma064, s064, s064 C accum mid 1
597: add,dc ma128, s128, s128 C accum mid 2
598: add,dc ma192, s192, s192 C accum mid 3
599: add,dc ma256, climb, climb
600: add r000, s000, s000 C accum rlimb 0
601: add,dc r064, s064, s064 C accum rlimb 1
602: add,dc r128, s128, s128 C accum rlimb 2
603: std s000, 0(rp)
604: add,dc r192, s192, s192 C accum rlimb 3
605: add,dc %r0, climb, climb C fold the last carry into the carry limb
606: std s064, 8(rp)
607: ldd -0x78(%r30), p032a1 C reload the mid products of the last group for end1
608: std s128, 16(rp)
609: ldd -0x70(%r30), p032a2
610: std s192, 24(rp)
611: ldd -0x38(%r30), p096b1
612: ldd -0x30(%r30), p096b2
613: ldd -0x58(%r30), p160c1
614: ldd -0x50(%r30), p160c2
615: ldd -0x18(%r30), p224d1
616: ldd -0x10(%r30), p224d2
617: ldo 32(rp), rp C advance rp by four limbs
618:
619: L(end1) C final wind-down: one group left, all its products are already on the stack
620: add p032a1, p032a2, m032 C sum the mid product pair of limb a
621: ldd -0x80(%r30), p000a
622: add,dc p096b1, p096b2, m096
623: add,dc p160c1, p160c2, m160
624: ldd -0x68(%r30), p064a
625: add,dc p224d1, p224d2, m224
626: add,dc %r0, %r0, m288 C m288 = final carry of the mid sums
627: ldd -0x40(%r30), p064b
628: depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits into ma000..ma256
629: ldd -0x28(%r30), p128b
630: extrd,u m032, 31, 32, ma064
631: depd m096, 31, 32, ma064
632: ldd -0x60(%r30), p128c
633: extrd,u m096, 31, 32, ma128
634: depd m160, 31, 32, ma128
635: ldd -0x48(%r30), p192c
636: extrd,u m160, 31, 32, ma192
637: depd m224, 31, 32, ma192
638: ldd -0x20(%r30), p192d
639: extrd,u m224, 31, 32, ma256
640: depd m288, 31, 32, ma256
641: ldd -0x88(%r30), p256d
642: add climb, p000a, s000 C s000 = incoming carry limb + low product of limb a
643: add,dc p064a, p064b, s064
644: ldd 0(rp), r000
645: add,dc p128b, p128c, s128
646: add,dc p192c, p192d, s192
647: ldd 8(rp), r064
648: add,dc p256d, %r0, climb C carry limb = high product of limb d + carry
649: ldd 16(rp), r128
650: add ma000, s000, s000 C accum mid 0
651: ldd 24(rp), r192
652: add,dc ma064, s064, s064 C accum mid 1
653: add,dc ma128, s128, s128 C accum mid 2
654: add,dc ma192, s192, s192 C accum mid 3
655: add,dc ma256, climb, climb
656: add r000, s000, s000 C accum rlimb 0
657: add,dc r064, s064, s064 C accum rlimb 1
658: add,dc r128, s128, s128 C accum rlimb 2
659: std s000, 0(rp)
660: add,dc r192, s192, s192 C accum rlimb 3
661: add,dc %r0, climb, climb C climb now holds the carry limb that done returns
662: std s064, 8(rp)
663: std s128, 16(rp)
664: std s192, 24(rp)
665:
666: ldd -0xb0(%r30), %r13 C restore r13..r6 saved at BIG
667: ldd -0xb8(%r30), %r12
668: ldd -0xc0(%r30), %r11
669: ldd -0xc8(%r30), %r10
670: ldd -0xd0(%r30), %r9
671: ldd -0xd8(%r30), %r8
672: ldd -0xe0(%r30), %r7
673: ldd -0xe8(%r30), %r6
674: L(done) C common exit, also reached directly when only the feed-in code ran
675: ifdef(`HAVE_ABI_2_0w',
676: ` copy climb, %r28 C carry limb to r28
677: ',` extrd,u climb, 63, 32, %r29 C low 32 bits of the carry limb to r29
678: extrd,u climb, 31, 32, %r28 C high 32 bits of the carry limb to r28
679: ')
680: ldd -0xf0(%r30), %r5 C restore r5
681: ldd -0xf8(%r30), %r4 C restore r4
682: bve (%r2) C branch to the address in r2
683: ldd,mb -0x100(%r30), %r3 C step r30 back by 0x100 and restore r3 from there
684: EPILOGUE(mpn_addmul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>