Annotation of OpenXM_contrib/gmp/mpn/pa64/mul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2: dnl the result in a second limb vector.
3:
4: dnl Copyright 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23:
24: dnl This approaches ?? cycles/limb on PA8000 and 5.625 cycles/limb on PA8500
25: dnl for huge operands. These numbers are close to optimal.
26:
27: dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
28: dnl could be saved there per call.
29:
30: dnl DESCRIPTION:
31: dnl The main loop "BIG" is 4-way unrolled, mainly to allow
32: dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
33: dnl registers to the IU registers, have demanded a deep software pipeline, and
34: dnl a lot of stack slots for partial products in flight.
35: dnl
36: dnl CODE STRUCTURE:
37: dnl save-some-registers
38: dnl do 0, 1, 2, or 3 limbs
39: dnl if done, restore-some-regs and return
40: dnl save-many-regs
41: dnl do 4, 8, ... limbs
42: dnl restore-all-regs
43:
44: dnl STACK LAYOUT:
45: dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46: dnl slots marked FREE, as well as some slots in the caller's "frame marker".
47: dnl
48: dnl -00 <- r30
49: dnl -08 FREE
50: dnl -10 tmp
51: dnl -18 tmp
52: dnl -20 tmp
53: dnl -28 tmp
54: dnl -30 tmp
55: dnl -38 tmp
56: dnl -40 tmp
57: dnl -48 tmp
58: dnl -50 tmp
59: dnl -58 tmp
60: dnl -60 tmp
61: dnl -68 tmp
62: dnl -70 tmp
63: dnl -78 tmp
64: dnl -80 tmp
65: dnl -88 tmp
66: dnl -90 FREE
67: dnl -98 FREE
68: dnl -a0 FREE
69: dnl -a8 FREE
70: dnl -b0 r13
71: dnl -b8 r12
72: dnl -c0 r11
73: dnl -c8 r10
74: dnl -d0 r9
75: dnl -d8 r8
76: dnl -e0 r7
77: dnl -e8 r6
78: dnl -f0 r5
79: dnl -f8 r4
80: dnl -100 r3
81: dnl Previous frame:
82: dnl [unused area]
83: dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
84:
85:
86: include(`../config.m4')
87:
88: dnl INPUT PARAMETERS (argument registers on entry):
89: define(`rp',`%r26') dnl destination pointer; result limbs are stored through it
90: define(`up',`%r25') dnl source pointer; limbs are loaded from it with fldd
91: define(`n',`%r24') dnl limb count
92: define(`vlimb',`%r23') dnl the single multiplier limb
93:
94: define(`climb',`%r23') dnl carry limb; shares r23 with vlimb and is cleared in the prologue
95: C Pick the assembler level: 2.0W when HAVE_ABI_2_0w is defined, otherwise 2.0N.
96: ifdef(`HAVE_ABI_2_0w',
97: ` .level 2.0W
98: ',` .level 2.0N
99: ')
100: PROLOGUE(mpn_mul_1)
101: C Multiply the n limbs at up by vlimb, store the product limbs at rp, and return the carry limb (climb).
102: ifdef(`HAVE_ABI_2_0w',
103: ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
104: ')
105: std,ma %r3, 0x100(%r30) C save r3 at the old r30, then advance r30 by 0x100 (the frame)
106: std %r4, -0xf8(%r30)
107: std %r5, -0xf0(%r30)
108: ldo 0(%r0), climb C clear climb
109: fldd -0x138(%r30), %fr8 C put vlimb in fp register (home slot is at -0x138 now that r30 moved)
110: C Register names for the one-limb-at-a-time code that handles n mod 4 limbs.
111: define(`p032a1',`%r1') dnl first mid product, reloaded from -0x78
112: define(`p032a2',`%r19') dnl second mid product, reloaded from -0x70
113:
114: define(`m032',`%r20') dnl sum of the two mid products
115: define(`m096',`%r21') dnl carry out of that sum
116:
117: define(`p000a',`%r22') dnl low product, reloaded from -0x80
118: define(`p064a',`%r29') dnl high product, reloaded from -0x68
119:
120: define(`s000',`%r31') dnl result limb being assembled
121:
122: define(`ma000',`%r4') dnl low 32 bits of m032 placed in the upper half
123: define(`ma064',`%r20') dnl upper 32 bits of m032 with m096 deposited above; same register as m032
124:
125: C define(`r000',`%r3') dnl FIXME don't save r3 for n < 4.
126:
127: extrd,u n, 63, 2, %r5 C r5 = low two bits of n = limbs to handle before the 4-way code
128: cmpb,= %r5, %r0, L(BIG) C n is a multiple of 4: go straight to the 4-way code
129: nop C branch delay slot
130: C First leftover limb: form its four 32x32 partial products and park them in stack slots.
131: fldd 0(up), %fr4 C fr4 = source limb
132: ldo 8(up), up
133: xmpyu %fr8R, %fr4L, %fr22
134: xmpyu %fr8L, %fr4R, %fr23
135: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136: xmpyu %fr8R, %fr4R, %fr24
137: xmpyu %fr8L, %fr4L, %fr25
138: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
140: addib,<> -1, %r5, L(two_or_more) C r5 -= 1; branch if more leftover limbs remain
141: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 (branch delay slot)
142: L(one)
143: ldd -0x78(%r30), p032a1 C exactly one leftover limb: reload its products into integer registers
144: ldd -0x70(%r30), p032a2
145: ldd -0x80(%r30), p000a
146: b L(0_one_out)
147: ldd -0x68(%r30), p064a C branch delay slot
148: C Second leftover limb: compute its products while reloading the first limb products from the same slots.
149: L(two_or_more)
150: fldd 0(up), %fr4
151: ldo 8(up), up
152: xmpyu %fr8R, %fr4L, %fr22
153: xmpyu %fr8L, %fr4R, %fr23
154: ldd -0x78(%r30), p032a1 C read the previous mid product before the slot is overwritten below
155: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156: xmpyu %fr8R, %fr4R, %fr24
157: xmpyu %fr8L, %fr4L, %fr25
158: ldd -0x70(%r30), p032a2
159: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160: ldd -0x80(%r30), p000a
161: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162: ldd -0x68(%r30), p064a
163: addib,<> -1, %r5, L(three_or_more) C r5 -= 1; branch if a third leftover limb remains
164: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 (branch delay slot)
165: L(two)
166: add p032a1, p032a2, m032 C m032 = sum of the two mid products of the first limb
167: add,dc %r0, %r0, m096 C m096 = carry out of that sum
168: depd,z m032, 31, 32, ma000 C ma000 = low 32 bits of m032 in the upper half, rest zero
169: extrd,u m032, 31, 32, ma064 C ma064 = upper 32 bits of m032, right justified
170: b L(0_two_out)
171: depd m096, 31, 32, ma064 C branch delay slot: deposit the carry into the upper half of ma064
172: C Third leftover limb: load it and combine the mid products of the first limb.
173: L(three_or_more)
174: fldd 0(up), %fr4
175: add p032a1, p032a2, m032
176: add,dc %r0, %r0, m096
177: depd,z m032, 31, 32, ma000
178: extrd,u m032, 31, 32, ma064
179: dnl addib,= -1, %r5, L(0_out)
180: depd m096, 31, 32, ma064
181: L(oop0)
182: dnl xmpyu %fr8R, %fr4L, %fr22
183: dnl xmpyu %fr8L, %fr4R, %fr23
184: dnl ldd -0x78(%r30), p032a1
185: dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
186: dnl
187: dnl xmpyu %fr8R, %fr4R, %fr24
188: dnl xmpyu %fr8L, %fr4L, %fr25
189: dnl ldd -0x70(%r30), p032a2
190: dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
191: dnl
192: dnl ldo 8(rp), rp
193: dnl add climb, p000a, s000
194: dnl ldd -0x80(%r30), p000a
195: dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
196: dnl
197: dnl add,dc p064a, %r0, climb
198: dnl ldo 8(up), up
199: dnl ldd -0x68(%r30), p064a
200: dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
201: dnl
202: dnl add ma000, s000, s000
203: dnl add,dc ma064, climb, climb
204: dnl fldd 0(up), %fr4
205: dnl
206: dnl std s000, -8(rp)
207: dnl
208: dnl add p032a1, p032a2, m032
209: dnl add,dc %r0, %r0, m096
210: dnl
211: dnl depd,z m032, 31, 32, ma000
212: dnl extrd,u m032, 31, 32, ma064
213: dnl addib,<> -1, %r5, L(oop0)
214: dnl depd m096, 31, 32, ma064
215: L(0_out)
216: ldo 8(up), up C the loop body above is disabled with dnl; presumably unneeded since r5 is at most 3 (TODO confirm)
217: xmpyu %fr8R, %fr4L, %fr22
218: xmpyu %fr8L, %fr4R, %fr23
219: ldd -0x78(%r30), p032a1 C products of the second limb, read before the slots are reused
220: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
221: xmpyu %fr8R, %fr4R, %fr24
222: xmpyu %fr8L, %fr4L, %fr25
223: ldd -0x70(%r30), p032a2
224: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
225: ldo 8(rp), rp
226: add climb, p000a, s000 C s000 = incoming carry limb + low product of the first limb
227: ldd -0x80(%r30), p000a
228: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
229: add,dc p064a, %r0, climb C climb = high product + carry
230: ldd -0x68(%r30), p064a
231: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
232: add ma000, s000, s000 C add the mid-product contributions
233: add,dc ma064, climb, climb
234: std s000, -8(rp) C first result limb (rp was already advanced by 8)
235: add p032a1, p032a2, m032 C now combine the mid products of the second limb
236: add,dc %r0, %r0, m096
237: depd,z m032, 31, 32, ma000
238: extrd,u m032, 31, 32, ma064
239: depd m096, 31, 32, ma064
240: L(0_two_out)
241: ldd -0x78(%r30), p032a1 C wind-down: reload the products of the last leftover limb
242: ldd -0x70(%r30), p032a2
243: ldo 8(rp), rp
244: add climb, p000a, s000 C finish the previous limb: carry limb + low product
245: ldd -0x80(%r30), p000a
246: add,dc p064a, %r0, climb C climb = high product + carry
247: ldd -0x68(%r30), p064a
248: add ma000, s000, s000 C add the mid-product contributions
249: add,dc ma064, climb, climb
250: std s000, -8(rp)
251: L(0_one_out)
252: add p032a1, p032a2, m032 C last leftover limb: sum its two mid products
253: add,dc %r0, %r0, m096 C m096 = carry out of that sum
254: depd,z m032, 31, 32, ma000
255: extrd,u m032, 31, 32, ma064
256: depd m096, 31, 32, ma064
257:
258: add climb, p000a, s000
259: add,dc p064a, %r0, climb
260: add ma000, s000, s000
261: add,dc ma064, climb, climb
262: std s000, 0(rp)
263:
264: cmpib,>= 4, n, L(done) C branch when 4 >= n: no group of four limbs remains
265: ldo 8(rp), rp C branch delay slot: step rp past the limb just stored
266:
267: dnl 4-way unrolled code.
268: C The register names are redefined below for the 4-way code; several names share one register.
269: L(BIG)
270:
271: define(`p032a1',`%r1') dnl mid products of limb a, reloaded from -0x78
272: define(`p032a2',`%r19') dnl and -0x70
273: define(`p096b1',`%r20') dnl mid products of limb b, reloaded from -0x38
274: define(`p096b2',`%r21') dnl and -0x30
275: define(`p160c1',`%r22') dnl mid products of limb c, reloaded from -0x58
276: define(`p160c2',`%r29') dnl and -0x50
277: define(`p224d1',`%r31') dnl mid products of limb d, reloaded from -0x18
278: define(`p224d2',`%r3') dnl and -0x10
279: dnl
280: define(`m032',`%r4') dnl sums of each mid-product pair, chained with add,dc
281: define(`m096',`%r5') dnl
282: define(`m160',`%r6') dnl
283: define(`m224',`%r7') dnl
284: define(`m288',`%r8') dnl carry out of the last mid-product sum
285: dnl
286: define(`p000a',`%r1') dnl low product of limb a, reloaded from -0x80
287: define(`p064a',`%r19') dnl high product of limb a, reloaded from -0x68
288: define(`p064b',`%r20') dnl low product of limb b, reloaded from -0x40
289: define(`p128b',`%r21') dnl high product of limb b, reloaded from -0x28
290: define(`p128c',`%r22') dnl low product of limb c, reloaded from -0x60
291: define(`p192c',`%r29') dnl high product of limb c, reloaded from -0x48
292: define(`p192d',`%r31') dnl low product of limb d, reloaded from -0x20
293: define(`p256d',`%r3') dnl high product of limb d, reloaded from -0x88
294: dnl
295: define(`s000',`%r10') dnl the four result limbs of one group
296: define(`s064',`%r11') dnl
297: define(`s128',`%r12') dnl
298: define(`s192',`%r13') dnl
299: dnl
300: define(`ma000',`%r9') dnl mid-product sums realigned by 32 bits, added to s000..s192 and climb
301: define(`ma064',`%r4') dnl
302: define(`ma128',`%r5') dnl
303: define(`ma192',`%r6') dnl
304: define(`ma256',`%r7') dnl
305: C Save r6-r13; they are written only by the 4-way code and are reloaded after L(end1).
306: std %r6, -0xe8(%r30)
307: std %r7, -0xe0(%r30)
308: std %r8, -0xd8(%r30)
309: std %r9, -0xd0(%r30)
310: std %r10, -0xc8(%r30)
311: std %r11, -0xc0(%r30)
312: std %r12, -0xb8(%r30)
313: std %r13, -0xb0(%r30)
314: C From here on n counts groups of four limbs.
315: ifdef(`HAVE_ABI_2_0w',
316: ` extrd,u n, 61, 62, n C right shift 2
317: ',` extrd,u n, 61, 30, n C right shift 2, zero extend
318: ')
319: C Feed-in stage 1: load four limbs and compute their sixteen 32x32 partial products.
320: L(4_or_more)
321: fldd 0(up), %fr4
322: fldd 8(up), %fr5
323: fldd 16(up), %fr6
324: fldd 24(up), %fr7
325: xmpyu %fr8R, %fr4L, %fr22
326: xmpyu %fr8L, %fr4R, %fr23
327: xmpyu %fr8R, %fr5L, %fr24
328: xmpyu %fr8L, %fr5R, %fr25
329: xmpyu %fr8R, %fr6L, %fr26
330: xmpyu %fr8L, %fr6R, %fr27
331: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
332: xmpyu %fr8R, %fr7L, %fr28
333: xmpyu %fr8L, %fr7R, %fr29
334: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
335: xmpyu %fr8R, %fr4R, %fr30
336: xmpyu %fr8L, %fr4L, %fr31
337: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
338: xmpyu %fr8R, %fr5R, %fr22 C fr22..fr27 are reused once their mid products have been stored
339: xmpyu %fr8L, %fr5L, %fr23
340: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
341: xmpyu %fr8R, %fr6R, %fr24
342: xmpyu %fr8L, %fr6L, %fr25
343: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
344: xmpyu %fr8R, %fr7R, %fr26
345: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
346: addib,<> -1, n, L(8_or_more) C n -= 1; branch if at least one more group of four follows
347: xmpyu %fr8L, %fr7L, %fr27 C branch delay slot
348: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
349: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
350: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
351: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
352: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
353: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
354: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
355: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
356: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
357: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
358: ldd -0x78(%r30), p032a1 C single group only: reload the eight mid products as integers
359: ldd -0x70(%r30), p032a2
360: ldd -0x38(%r30), p096b1
361: ldd -0x30(%r30), p096b2
362: ldd -0x58(%r30), p160c1
363: ldd -0x50(%r30), p160c2
364: ldd -0x18(%r30), p224d1
365: ldd -0x10(%r30), p224d2
366: b L(end1) C the final wind-down finishes this group
367: nop C branch delay slot
368: C Feed-in stage 2: store the rest of the first group products, then start the second group.
369: L(8_or_more)
370: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
372: ldo 32(up), up
373: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
374: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
375: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
376: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
377: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
378: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
379: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
380: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
381: fldd 0(up), %fr4 C load the next four limbs
382: fldd 8(up), %fr5
383: fldd 16(up), %fr6
384: fldd 24(up), %fr7
385: xmpyu %fr8R, %fr4L, %fr22
386: ldd -0x78(%r30), p032a1 C each mid product of the first group is read before its slot is overwritten
387: xmpyu %fr8L, %fr4R, %fr23
388: xmpyu %fr8R, %fr5L, %fr24
389: ldd -0x70(%r30), p032a2
390: xmpyu %fr8L, %fr5R, %fr25
391: xmpyu %fr8R, %fr6L, %fr26
392: ldd -0x38(%r30), p096b1
393: xmpyu %fr8L, %fr6R, %fr27
394: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
395: xmpyu %fr8R, %fr7L, %fr28
396: ldd -0x30(%r30), p096b2
397: xmpyu %fr8L, %fr7R, %fr29
398: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
399: xmpyu %fr8R, %fr4R, %fr30
400: ldd -0x58(%r30), p160c1
401: xmpyu %fr8L, %fr4L, %fr31
402: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
403: xmpyu %fr8R, %fr5R, %fr22
404: ldd -0x50(%r30), p160c2
405: xmpyu %fr8L, %fr5L, %fr23
406: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
407: xmpyu %fr8R, %fr6R, %fr24
408: ldd -0x18(%r30), p224d1
409: xmpyu %fr8L, %fr6L, %fr25
410: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
411: xmpyu %fr8R, %fr7R, %fr26
412: ldd -0x10(%r30), p224d2
413: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
414: addib,= -1, n, L(end2) C n -= 1; skip the loop if no further group follows
415: xmpyu %fr8L, %fr7L, %fr27 C branch delay slot
416: L(oop)
417: add p032a1, p032a2, m032 C main loop: sum the mid-product pairs of the group whose products are already on the stack
418: ldd -0x80(%r30), p000a
419: add,dc p096b1, p096b2, m096
420: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
421:
422: add,dc p160c1, p160c2, m160
423: ldd -0x68(%r30), p064a
424: add,dc p224d1, p224d2, m224
425: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
426:
427: add,dc %r0, %r0, m288 C m288 = carry out of the mid-product sums
428: ldd -0x40(%r30), p064b
429: ldo 32(up), up
430: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
431:
432: depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits into ma000..ma256
433: ldd -0x28(%r30), p128b
434: extrd,u m032, 31, 32, ma064
435: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
436:
437: depd m096, 31, 32, ma064
438: ldd -0x60(%r30), p128c
439: extrd,u m096, 31, 32, ma128
440: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
441:
442: depd m160, 31, 32, ma128
443: ldd -0x48(%r30), p192c
444: extrd,u m160, 31, 32, ma192
445: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
446:
447: depd m224, 31, 32, ma192
448: ldd -0x20(%r30), p192d
449: extrd,u m224, 31, 32, ma256
450: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
451:
452: depd m288, 31, 32, ma256
453: ldd -0x88(%r30), p256d
454: add climb, p000a, s000 C start a new carry chain: incoming carry limb + low product of limb a
455: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
456:
457: add,dc p064a, p064b, s064 C each high product is added to the low product of the next limb
458: add,dc p128b, p128c, s128
459: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
460:
461: add,dc p192c, p192d, s192
462: add,dc p256d, %r0, climb C climb = high product of limb d + carry
463: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
464:
465: add ma000, s000, s000 C accum mid 0
466: fldd 0(up), %fr4
467: add,dc ma064, s064, s064 C accum mid 1
468: std s000, 0(rp)
469:
470: add,dc ma128, s128, s128 C accum mid 2
471: fldd 8(up), %fr5
472: add,dc ma192, s192, s192 C accum mid 3
473: std s064, 8(rp)
474:
475: add,dc ma256, climb, climb C fold the top mid contribution and carry into climb
476: fldd 16(up), %fr6
477: std s128, 16(rp)
478:
479: xmpyu %fr8R, %fr4L, %fr22 C start the partial products of the next four limbs
480: ldd -0x78(%r30), p032a1
481: xmpyu %fr8L, %fr4R, %fr23
482: fldd 24(up), %fr7
483:
484: xmpyu %fr8R, %fr5L, %fr24
485: ldd -0x70(%r30), p032a2
486: xmpyu %fr8L, %fr5R, %fr25
487: std s192, 24(rp)
488:
489: xmpyu %fr8R, %fr6L, %fr26
490: ldd -0x38(%r30), p096b1
491: xmpyu %fr8L, %fr6R, %fr27
492: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
493:
494: xmpyu %fr8R, %fr7L, %fr28
495: ldd -0x30(%r30), p096b2
496: xmpyu %fr8L, %fr7R, %fr29
497: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
498:
499: xmpyu %fr8R, %fr4R, %fr30
500: ldd -0x58(%r30), p160c1
501: xmpyu %fr8L, %fr4L, %fr31
502: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
503:
504: xmpyu %fr8R, %fr5R, %fr22
505: ldd -0x50(%r30), p160c2
506: xmpyu %fr8L, %fr5L, %fr23
507: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
508:
509: xmpyu %fr8R, %fr6R, %fr24
510: ldd -0x18(%r30), p224d1
511: xmpyu %fr8L, %fr6L, %fr25
512: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
513:
514: xmpyu %fr8R, %fr7R, %fr26
515: ldd -0x10(%r30), p224d2
516: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
517: xmpyu %fr8L, %fr7L, %fr27
518:
519: addib,<> -1, n, L(oop) C n -= 1; loop while groups remain
520: ldo 32(rp), rp C branch delay slot: advance rp past the four limbs just stored
521: C Wind-down 1: same arithmetic as the loop body, with no further source loads or multiplies.
522: L(end2)
523: add p032a1, p032a2, m032
524: ldd -0x80(%r30), p000a
525: add,dc p096b1, p096b2, m096
526: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
527: add,dc p160c1, p160c2, m160
528: ldd -0x68(%r30), p064a
529: add,dc p224d1, p224d2, m224
530: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
531: add,dc %r0, %r0, m288
532: ldd -0x40(%r30), p064b
533: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
534: depd,z m032, 31, 32, ma000
535: ldd -0x28(%r30), p128b
536: extrd,u m032, 31, 32, ma064
537: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
538: depd m096, 31, 32, ma064
539: ldd -0x60(%r30), p128c
540: extrd,u m096, 31, 32, ma128
541: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
542: depd m160, 31, 32, ma128
543: ldd -0x48(%r30), p192c
544: extrd,u m160, 31, 32, ma192
545: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
546: depd m224, 31, 32, ma192
547: ldd -0x20(%r30), p192d
548: extrd,u m224, 31, 32, ma256
549: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
550: depd m288, 31, 32, ma256
551: ldd -0x88(%r30), p256d
552: add climb, p000a, s000
553: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
554: add,dc p064a, p064b, s064
555: add,dc p128b, p128c, s128
556: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
557: add,dc p192c, p192d, s192
558: add,dc p256d, %r0, climb
559: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
560: add ma000, s000, s000 C accum mid 0
561: add,dc ma064, s064, s064 C accum mid 1
562: add,dc ma128, s128, s128 C accum mid 2
563: add,dc ma192, s192, s192 C accum mid 3
564: add,dc ma256, climb, climb
565: std s000, 0(rp)
566: std s064, 8(rp)
567: ldd -0x78(%r30), p032a1 C reload the mid products of the final group for L(end1)
568: std s128, 16(rp)
569: ldd -0x70(%r30), p032a2
570: std s192, 24(rp)
571: ldd -0x38(%r30), p096b1
572: ldd -0x30(%r30), p096b2
573: ldd -0x58(%r30), p160c1
574: ldd -0x50(%r30), p160c2
575: ldd -0x18(%r30), p224d1
576: ldd -0x10(%r30), p224d2
577: ldo 32(rp), rp
578: C Wind-down 2: combine and store the final group of four limbs, then reload r6-r13.
579: L(end1)
580: add p032a1, p032a2, m032
581: ldd -0x80(%r30), p000a
582: add,dc p096b1, p096b2, m096
583: add,dc p160c1, p160c2, m160
584: ldd -0x68(%r30), p064a
585: add,dc p224d1, p224d2, m224
586: add,dc %r0, %r0, m288
587: ldd -0x40(%r30), p064b
588: depd,z m032, 31, 32, ma000
589: ldd -0x28(%r30), p128b
590: extrd,u m032, 31, 32, ma064
591: depd m096, 31, 32, ma064
592: ldd -0x60(%r30), p128c
593: extrd,u m096, 31, 32, ma128
594: depd m160, 31, 32, ma128
595: ldd -0x48(%r30), p192c
596: extrd,u m160, 31, 32, ma192
597: depd m224, 31, 32, ma192
598: ldd -0x20(%r30), p192d
599: extrd,u m224, 31, 32, ma256
600: depd m288, 31, 32, ma256
601: ldd -0x88(%r30), p256d
602: add climb, p000a, s000
603: add,dc p064a, p064b, s064
604: add,dc p128b, p128c, s128
605: add,dc p192c, p192d, s192
606: add,dc p256d, %r0, climb
607: add ma000, s000, s000 C accum mid 0
608: add,dc ma064, s064, s064 C accum mid 1
609: add,dc ma128, s128, s128 C accum mid 2
610: add,dc ma192, s192, s192 C accum mid 3
611: add,dc ma256, climb, climb C climb now holds the final carry limb
612: std s000, 0(rp)
613: std s064, 8(rp)
614: std s128, 16(rp)
615: std s192, 24(rp)
616: C Reload r6-r13 from the slots written at L(BIG).
617: ldd -0xb0(%r30), %r13
618: ldd -0xb8(%r30), %r12
619: ldd -0xc0(%r30), %r11
620: ldd -0xc8(%r30), %r10
621: ldd -0xd0(%r30), %r9
622: ldd -0xd8(%r30), %r8
623: ldd -0xe0(%r30), %r7
624: ldd -0xe8(%r30), %r6
625: L(done)
626: ifdef(`HAVE_ABI_2_0w',
627: ` copy climb, %r28 C carry limb goes to r28
628: ',` extrd,u climb, 63, 32, %r29 C low 32 bits of the carry limb go to r29
629: extrd,u climb, 31, 32, %r28 C high 32 bits of the carry limb go to r28
630: ')
631: ldd -0xf0(%r30), %r5 C reload r5 and r4 saved in the prologue
632: ldd -0xf8(%r30), %r4
633: bve (%r2) C return through r2
634: ldd,mb -0x100(%r30), %r3 C branch delay slot: move r30 back by 0x100, then reload r3
635: EPILOGUE(mpn_mul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>