Annotation of OpenXM_contrib/gmp/mpn/pa64/submul_1.asm, Revision 1.1.1.1
1.1 ohara 1: dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2: dnl subtract the result from a second limb vector.
3:
4: dnl Copyright 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23:
24: dnl This approaches ?? cycles/limb on PA8000 and 6.75 cycles/limb on PA8500
25: dnl for huge operands.
26:
27: dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
28: dnl could be saved there per call.
29:
30: dnl DESCRIPTION:
31: dnl The main loop "BIG" is 4-way unrolled, mainly to allow
32: dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
33: dnl registers to the IU registers have demanded a deep software pipeline, and
34: dnl a lot of stack slots for partial products in flight.
35: dnl
36: dnl CODE STRUCTURE:
37: dnl save-some-registers
38: dnl do 0, 1, 2, or 3 limbs
39: dnl if done, restore-some-regs and return
40: dnl save-many-regs
41: dnl do 4, 8, ... limbs
42: dnl restore-all-regs
43:
44: dnl STACK LAYOUT:
45: dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46: dnl slots marked FREE, as well as some slots in the caller's "frame marker".
47: dnl
48: dnl -00 <- r30
49: dnl -08 FREE
50: dnl -10 tmp
51: dnl -18 tmp
52: dnl -20 tmp
53: dnl -28 tmp
54: dnl -30 tmp
55: dnl -38 tmp
56: dnl -40 tmp
57: dnl -48 tmp
58: dnl -50 tmp
59: dnl -58 tmp
60: dnl -60 tmp
61: dnl -68 tmp
62: dnl -70 tmp
63: dnl -78 tmp
64: dnl -80 tmp
65: dnl -88 tmp
66: dnl -90 FREE
67: dnl -98 FREE
68: dnl -a0 FREE
69: dnl -a8 FREE
70: dnl -b0 r13
71: dnl -b8 r12
72: dnl -c0 r11
73: dnl -c8 r10
74: dnl -d0 r9
75: dnl -d8 r8
76: dnl -e0 r7
77: dnl -e8 r6
78: dnl -f0 r5
79: dnl -f8 r4
80: dnl -100 r3
81: dnl Previous frame:
82: dnl [unused area]
83: dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
84:
85:
dnl Pull in the shared m4 configuration by relative path.
86: include(`../config.m4')
87:
88: dnl INPUT PARAMETERS:
dnl The four arguments get symbolic names for the registers r26 down to r23.
89: define(`rp',`%r26') dnl
90: define(`up',`%r25') dnl
91: define(`n',`%r24') dnl
92: define(`vlimb',`%r23') dnl
93:
dnl climb names the same register as vlimb.  The function prologue copies the
dnl multiplier into %fr8 and then clears this register for use as climb.
94: define(`climb',`%r23') dnl
95:
dnl Select the assembler level: 2.0W when HAVE_ABI_2_0w is defined, else 2.0N.
96: ifdef(`HAVE_ABI_2_0w',
97: ` .level 2.0W
98: ',` .level 2.0N
99: ')
dnl Entry point.  Per the file header this routine multiplies the limb vector
dnl at up (n limbs) by vlimb and subtracts the product from the vector at rp.
dnl The final value of climb is moved to the return register(s) at L(done).
100: PROLOGUE(mpn_submul_1)
101:
dnl For the 2.0W level vlimb is stored to the -0x38 home slot first, so that the
dnl fldd below (issued after %r30 has moved by 0x100) can pick it up at -0x138.
102: ifdef(`HAVE_ABI_2_0w',
103: ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
104: ')
dnl Save %r3 and advance %r30 by 0x100 in one instruction, then save %r4 and
dnl %r5 relative to the new %r30.
105: std,ma %r3, 0x100(%r30)
106: std %r4, -0xf8(%r30)
107: std %r5, -0xf0(%r30)
108: ldo 0(%r0), climb C clear climb
109: fldd -0x138(%r30), %fr8 C put vlimb in fp register
110:
dnl Register names used by the 1..3 limb feed-in code that follows.  Most of
dnl these names are defined again, with other registers, at L(BIG).
111: define(`p032a1',`%r1') dnl
112: define(`p032a2',`%r19') dnl
113:
114: define(`m032',`%r20') dnl
115: define(`m096',`%r21') dnl
116:
117: define(`p000a',`%r22') dnl
118: define(`p064a',`%r29') dnl
119:
120: define(`s000',`%r31') dnl
121:
122: define(`ma000',`%r4') dnl
123: define(`ma064',`%r20') dnl
124:
125: define(`r000',`%r3') dnl
126:
dnl %r5 = the two low bits of n.  When they are zero there are no odd limbs and
dnl control goes straight to L(BIG); the nop fills the branch delay slot.
127: extrd,u n, 63, 2, %r5
128: cmpb,= %r5, %r0, L(BIG)
129: nop
130:
dnl First odd limb: form its four 32x32 partial products with vlimb and spill
dnl them to tmp slots so they can be reloaded into integer registers.
131: fldd 0(up), %fr4
132: ldo 8(up), up
133: xmpyu %fr8R, %fr4L, %fr22
134: xmpyu %fr8L, %fr4R, %fr23
135: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136: xmpyu %fr8R, %fr4R, %fr24
137: xmpyu %fr8L, %fr4L, %fr25
138: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
dnl Count down %r5; a nonzero result means at least two odd limbs.  The fstd
dnl after the branch occupies its delay slot.
140: addib,<> -1, %r5, L(two_or_more)
141: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
dnl Exactly one odd limb: reload the four partial products and join the
dnl wind-down at L(0_one_out).  The last ldd is in the branch delay slot.
142: L(one)
143: ldd -0x78(%r30), p032a1
144: ldd -0x70(%r30), p032a2
145: ldd -0x80(%r30), p000a
146: b L(0_one_out)
147: ldd -0x68(%r30), p064a
148:
dnl Second odd limb.  Each ldd reloads a partial product of the first limb from
dnl a tmp slot just before the fstd that overwrites that slot with the
dnl corresponding product of the second limb.
149: L(two_or_more)
150: fldd 0(up), %fr4
151: ldo 8(up), up
152: xmpyu %fr8R, %fr4L, %fr22
153: xmpyu %fr8L, %fr4R, %fr23
154: ldd -0x78(%r30), p032a1
155: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156: xmpyu %fr8R, %fr4R, %fr24
157: xmpyu %fr8L, %fr4L, %fr25
158: ldd -0x70(%r30), p032a2
159: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160: ldd -0x80(%r30), p000a
161: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162: ldd -0x68(%r30), p064a
dnl Count down %r5 again; nonzero means three odd limbs.  The fstd is in the
dnl delay slot.
163: addib,<> -1, %r5, L(three_or_more)
164: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
dnl Exactly two odd limbs.  Sum the two mid products of the first limb (m032,
dnl with the carry captured in m096) and split the sum into ma000 (low 32 bits
dnl moved to the upper half, lower half zero) and ma064 (upper 32 bits, with
dnl the carry deposited above them).  Then join L(0_two_out); the final depd
dnl is in the branch delay slot.
165: L(two)
166: add p032a1, p032a2, m032
167: add,dc %r0, %r0, m096
168: depd,z m032, 31, 32, ma000
169: extrd,u m032, 31, 32, ma064
170: ldd 0(rp), r000
171: b L(0_two_out)
172: depd m096, 31, 32, ma064
173:
dnl Three odd limbs (%r5 holds only two bits of n, so three is the maximum).
dnl Load the third limb and combine the mid products of the first limb exactly
dnl as at L(two).
174: L(three_or_more)
175: fldd 0(up), %fr4
176: add p032a1, p032a2, m032
177: add,dc %r0, %r0, m096
178: depd,z m032, 31, 32, ma000
179: extrd,u m032, 31, 32, ma064
180: ldd 0(rp), r000
181: dnl addib,= -1, %r5, L(0_out)
182: depd m096, 31, 32, ma064
dnl The loop body under this label is entirely disabled with dnl, so control
dnl simply falls through to L(0_out).
183: L(oop0)
184: dnl xmpyu %fr8R, %fr4L, %fr22
185: dnl xmpyu %fr8L, %fr4R, %fr23
186: dnl ldd -0x78(%r30), p032a1
187: dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
188: dnl
189: dnl xmpyu %fr8R, %fr4R, %fr24
190: dnl xmpyu %fr8L, %fr4L, %fr25
191: dnl ldd -0x70(%r30), p032a2
192: dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
193: dnl
194: dnl ldo 8(rp), rp
195: dnl add climb, p000a, s000
196: dnl ldd -0x80(%r30), p000a
197: dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
198: dnl
199: dnl add,dc p064a, %r0, climb
200: dnl ldo 8(up), up
201: dnl ldd -0x68(%r30), p064a
202: dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
203: dnl
204: dnl add ma000, s000, s000
205: dnl add,dc ma064, climb, climb
206: dnl fldd 0(up), %fr4
207: dnl
208: dnl sub r000, s000, s000
209: dnl sub,db %r0, climb, climb
210: dnl sub %r0, climb, climb
211: dnl std s000, -8(rp)
212: dnl
213: dnl add p032a1, p032a2, m032
214: dnl add,dc %r0, %r0, m096
215: dnl
216: dnl depd,z m032, 31, 32, ma000
217: dnl extrd,u m032, 31, 32, ma064
218: dnl ldd 0(rp), r000
219: dnl addib,<> -1, %r5, L(oop0)
220: dnl depd m096, 31, 32, ma064
dnl Multiply the third limb while finishing the first one: reload the second
dnl limb products from the tmp slots (each slot is then overwritten with a
dnl third limb product), add the low/high/mid parts of the first limb into
dnl s000 and climb, subtract s000 from the rp limb and store it at -8(rp).
dnl The sub,db / sub pair negates climb twice, the first time also taking the
dnl borrow, which leaves climb plus the borrow.  Finally prepare ma000/ma064
dnl for the second limb and fall into L(0_two_out).
221: L(0_out)
222: ldo 8(up), up
223: xmpyu %fr8R, %fr4L, %fr22
224: xmpyu %fr8L, %fr4R, %fr23
225: ldd -0x78(%r30), p032a1
226: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
227: xmpyu %fr8R, %fr4R, %fr24
228: xmpyu %fr8L, %fr4L, %fr25
229: ldd -0x70(%r30), p032a2
230: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
231: ldo 8(rp), rp
232: add climb, p000a, s000
233: ldd -0x80(%r30), p000a
234: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
235: add,dc p064a, %r0, climb
236: ldd -0x68(%r30), p064a
237: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
238: add ma000, s000, s000
239: add,dc ma064, climb, climb
240: sub r000, s000, s000
241: sub,db %r0, climb, climb
242: sub %r0, climb, climb
243: std s000, -8(rp)
244: add p032a1, p032a2, m032
245: add,dc %r0, %r0, m096
246: depd,z m032, 31, 32, ma000
247: extrd,u m032, 31, 32, ma064
248: ldd 0(rp), r000
249: depd m096, 31, 32, ma064
dnl Two limbs still in flight.  Reload the partial products of the last limb
dnl from the tmp slots (no new multiplies are started), then finish the limb
dnl whose ma000/ma064 and r000 are already in registers: add climb and the
dnl low/high/mid parts into s000 and climb, subtract from the rp limb, fold the
dnl borrow into climb, and store the result at -8(rp) after rp has advanced.
250: L(0_two_out)
251: ldd -0x78(%r30), p032a1
252: ldd -0x70(%r30), p032a2
253: ldo 8(rp), rp
254: add climb, p000a, s000
255: ldd -0x80(%r30), p000a
256: add,dc p064a, %r0, climb
257: ldd -0x68(%r30), p064a
258: add ma000, s000, s000
259: add,dc ma064, climb, climb
260: sub r000, s000, s000
261: sub,db %r0, climb, climb
262: sub %r0, climb, climb
263: std s000, -8(rp)
dnl Last odd limb.  Sum its two mid products (carry captured in m096), split
dnl the sum into ma000/ma064, and load the rp limb.
264: L(0_one_out)
265: add p032a1, p032a2, m032
266: add,dc %r0, %r0, m096
267: depd,z m032, 31, 32, ma000
268: extrd,u m032, 31, 32, ma064
269: ldd 0(rp), r000
270: depd m096, 31, 32, ma064
271:
dnl Accumulate climb plus the low, high and mid parts, subtract from the rp
dnl limb, and leave the next carry-out (including the borrow) in climb.
272: add climb, p000a, s000
273: add,dc p064a, %r0, climb
274: add ma000, s000, s000
275: add,dc ma064, climb, climb
276: sub r000, s000, s000
277: sub,db %r0, climb, climb
278: sub %r0, climb, climb
279: std s000, 0(rp)
280:
dnl Compare the immediate 4 against n and branch to L(done) when no group of
dnl four limbs remains; otherwise fall into L(BIG).  The ldo advancing rp sits
dnl in the delay slot of the branch.
281: cmpib,>= 4, n, L(done)
282: ldo 8(rp), rp
283:
284: dnl 4-way unrolled code.
285:
dnl Entry to the 4-way unrolled code, reached either directly from the
dnl prologue or after the odd limbs have been handled.
286: L(BIG)
287:
dnl Register names for the 4-way code.  Many physical registers appear under
dnl several names (for example %r1 is p032a1, p000a and r000), so each name is
dnl only meaningful at the point in the schedule where it is used.
dnl Mid partial products, two per limb (limbs a, b, c, d):
288: define(`p032a1',`%r1') dnl
289: define(`p032a2',`%r19') dnl
290: define(`p096b1',`%r20') dnl
291: define(`p096b2',`%r21') dnl
292: define(`p160c1',`%r22') dnl
293: define(`p160c2',`%r29') dnl
294: define(`p224d1',`%r31') dnl
295: define(`p224d2',`%r3') dnl
296: dnl
dnl Sums of the mid product pairs; m288 receives the final carry:
297: define(`m032',`%r4') dnl
298: define(`m096',`%r5') dnl
299: define(`m160',`%r6') dnl
300: define(`m224',`%r7') dnl
301: define(`m288',`%r8') dnl
302: dnl
dnl Low and high partial products of the four limbs:
303: define(`p000a',`%r1') dnl
304: define(`p064a',`%r19') dnl
305: define(`p064b',`%r20') dnl
306: define(`p128b',`%r21') dnl
307: define(`p128c',`%r22') dnl
308: define(`p192c',`%r29') dnl
309: define(`p192d',`%r31') dnl
310: define(`p256d',`%r3') dnl
311: dnl
dnl Running sums for the four result limbs:
312: define(`s000',`%r10') dnl
313: define(`s064',`%r11') dnl
314: define(`s128',`%r12') dnl
315: define(`s192',`%r13') dnl
316: dnl
dnl Mid sums realigned to 64-bit limb boundaries:
317: define(`ma000',`%r9') dnl
318: define(`ma064',`%r4') dnl
319: define(`ma128',`%r5') dnl
320: define(`ma192',`%r6') dnl
321: define(`ma256',`%r7') dnl
322: dnl
dnl The four limbs loaded from rp:
323: define(`r000',`%r1') dnl
324: define(`r064',`%r19') dnl
325: define(`r128',`%r20') dnl
326: define(`r192',`%r21') dnl
327:
dnl Save %r6..%r13, which the 4-way code uses; they are reloaded from the same
dnl offsets just before L(done).
328: std %r6, -0xe8(%r30)
329: std %r7, -0xe0(%r30)
330: std %r8, -0xd8(%r30)
331: std %r9, -0xd0(%r30)
332: std %r10, -0xc8(%r30)
333: std %r11, -0xc0(%r30)
334: std %r12, -0xb8(%r30)
335: std %r13, -0xb0(%r30)
336:
dnl Turn n into a count of 4-limb groups.  The 2.0N variant extracts only 30
dnl bits, which also zero extends the count.
337: ifdef(`HAVE_ABI_2_0w',
338: ` extrd,u n, 61, 62, n C right shift 2
339: ',` extrd,u n, 61, 30, n C right shift 2, zero extend
340: ')
341:
dnl Feed-in for the first group of four limbs: load them, form all sixteen
dnl 32x32 partial products, and start spilling the products to the tmp slots.
dnl %fr22..%fr27 are used twice, first for mid products and then, once those
dnl have been stored or are about to be, for low/high products.
342: L(4_or_more)
343: fldd 0(up), %fr4
344: fldd 8(up), %fr5
345: fldd 16(up), %fr6
346: fldd 24(up), %fr7
347: xmpyu %fr8R, %fr4L, %fr22
348: xmpyu %fr8L, %fr4R, %fr23
349: xmpyu %fr8R, %fr5L, %fr24
350: xmpyu %fr8L, %fr5R, %fr25
351: xmpyu %fr8R, %fr6L, %fr26
352: xmpyu %fr8L, %fr6R, %fr27
353: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
354: xmpyu %fr8R, %fr7L, %fr28
355: xmpyu %fr8L, %fr7R, %fr29
356: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
357: xmpyu %fr8R, %fr4R, %fr30
358: xmpyu %fr8L, %fr4L, %fr31
359: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
360: xmpyu %fr8R, %fr5R, %fr22
361: xmpyu %fr8L, %fr5L, %fr23
362: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
363: xmpyu %fr8R, %fr6R, %fr24
364: xmpyu %fr8L, %fr6L, %fr25
365: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
366: xmpyu %fr8R, %fr7R, %fr26
367: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
dnl Count down the group counter; nonzero means at least two groups.  The
dnl last xmpyu is in the branch delay slot.
368: addib,<> -1, n, L(8_or_more)
369: xmpyu %fr8L, %fr7L, %fr27
dnl Exactly one group: spill the remaining products, reload the eight mid
dnl products into integer registers and go to the final wind-down at L(end1).
370: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
372: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
373: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
374: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
375: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
376: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
377: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
378: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
379: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
380: ldd -0x78(%r30), p032a1
381: ldd -0x70(%r30), p032a2
382: ldd -0x38(%r30), p096b1
383: ldd -0x30(%r30), p096b2
384: ldd -0x58(%r30), p160c1
385: ldd -0x50(%r30), p160c2
386: ldd -0x18(%r30), p224d1
387: ldd -0x10(%r30), p224d2
388: b L(end1)
389: nop
390:
dnl At least two groups.  Spill the rest of the first group products, advance
dnl up by four limbs and load the second group.
391: L(8_or_more)
392: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
393: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
394: ldo 32(up), up
395: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
396: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
397: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
398: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
399: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
400: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
401: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
402: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
403: fldd 0(up), %fr4
404: fldd 8(up), %fr5
405: fldd 16(up), %fr6
406: fldd 24(up), %fr7
dnl Multiply the second group.  Each ldd pulls a first group mid product out
dnl of a tmp slot before the fstd that reuses that slot for the second group.
407: xmpyu %fr8R, %fr4L, %fr22
408: ldd -0x78(%r30), p032a1
409: xmpyu %fr8L, %fr4R, %fr23
410: xmpyu %fr8R, %fr5L, %fr24
411: ldd -0x70(%r30), p032a2
412: xmpyu %fr8L, %fr5R, %fr25
413: xmpyu %fr8R, %fr6L, %fr26
414: ldd -0x38(%r30), p096b1
415: xmpyu %fr8L, %fr6R, %fr27
416: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
417: xmpyu %fr8R, %fr7L, %fr28
418: ldd -0x30(%r30), p096b2
419: xmpyu %fr8L, %fr7R, %fr29
420: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
421: xmpyu %fr8R, %fr4R, %fr30
422: ldd -0x58(%r30), p160c1
423: xmpyu %fr8L, %fr4L, %fr31
424: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
425: xmpyu %fr8R, %fr5R, %fr22
426: ldd -0x50(%r30), p160c2
427: xmpyu %fr8L, %fr5L, %fr23
428: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
429: xmpyu %fr8R, %fr6R, %fr24
430: ldd -0x18(%r30), p224d1
431: xmpyu %fr8L, %fr6L, %fr25
432: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
433: xmpyu %fr8R, %fr7R, %fr26
434: ldd -0x10(%r30), p224d2
435: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
dnl Count down the group counter; zero means exactly two groups, so the main
dnl loop is skipped and L(end2) takes over.  The last xmpyu is in the branch
dnl delay slot.
436: addib,= -1, n, L(end2)
437: xmpyu %fr8L, %fr7L, %fr27
dnl Main software-pipelined loop, one group of four limbs per pass.  On entry
dnl the mid products of the group being finished are in p032a1..p224d2, its
dnl low/high products are in the tmp slots, and the products of the following
dnl group are still in %fr22..%fr31.
dnl
dnl First part: sum the mid product pairs with a carry chain (m032..m224, final
dnl carry in m288) while reloading the low/high products and spilling the FP
dnl registers of the following group into the slots just read.
438: L(oop)
439: add p032a1, p032a2, m032
440: ldd -0x80(%r30), p000a
441: add,dc p096b1, p096b2, m096
442: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
443:
444: add,dc p160c1, p160c2, m160
445: ldd -0x68(%r30), p064a
446: add,dc p224d1, p224d2, m224
447: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
448:
449: add,dc %r0, %r0, m288
450: ldd -0x40(%r30), p064b
451: ldo 32(up), up
452: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
453:
dnl Realign the mid sums from 32-bit offsets to limb boundaries: each maNNN
dnl takes the upper 32 bits of one mid sum in its lower half and the low 32
dnl bits of the next mid sum in its upper half.
454: depd,z m032, 31, 32, ma000
455: ldd -0x28(%r30), p128b
456: extrd,u m032, 31, 32, ma064
457: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
458:
459: depd m096, 31, 32, ma064
460: ldd -0x60(%r30), p128c
461: extrd,u m096, 31, 32, ma128
462: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
463:
464: depd m160, 31, 32, ma128
465: ldd -0x48(%r30), p192c
466: extrd,u m160, 31, 32, ma192
467: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
468:
469: depd m224, 31, 32, ma192
470: ldd -0x20(%r30), p192d
471: extrd,u m224, 31, 32, ma256
472: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
473:
dnl Add the incoming climb and the low/high products into s000..s192, with the
dnl carry out collected in climb.
474: depd m288, 31, 32, ma256
475: ldd -0x88(%r30), p256d
476: add climb, p000a, s000
477: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
478:
479: add,dc p064a, p064b, s064
480: ldd 0(rp), r000
481: add,dc p128b, p128c, s128
482: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
483:
484: add,dc p192c, p192d, s192
485: ldd 8(rp), r064
486: add,dc p256d, %r0, climb
487: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
488:
dnl Add the realigned mid sums, and start loading the limbs of the group two
dnl ahead of the one being finished.
489: ldd 16(rp), r128
490: add ma000, s000, s000 C accum mid 0
491: ldd 24(rp), r192
492: add,dc ma064, s064, s064 C accum mid 1
493:
494: add,dc ma128, s128, s128 C accum mid 2
495: fldd 0(up), %fr4
496: add,dc ma192, s192, s192 C accum mid 3
497: fldd 8(up), %fr5
498:
499: add,dc ma256, climb, climb
500: fldd 16(up), %fr6
501: sub r000, s000, s000 C accum rlimb 0
502: fldd 24(up), %fr7
503:
dnl Subtract the product limbs from the rp limbs with a borrow chain.  The
dnl sub,db / sub pair on climb negates it twice, the first time also taking the
dnl borrow, which leaves climb plus the borrow.
504: sub,db r064, s064, s064 C accum rlimb 1
505: sub,db r128, s128, s128 C accum rlimb 2
506: std s000, 0(rp)
507:
508: sub,db r192, s192, s192 C accum rlimb 3
509: sub,db %r0, climb, climb
510: sub %r0, climb, climb
511: std s064, 8(rp)
512:
dnl Multiply the newly loaded group while storing the remaining result limbs
dnl and reloading the mid products spilled at the top of this pass; each ldd
dnl comes before the fstd that reuses its slot.
513: xmpyu %fr8R, %fr4L, %fr22
514: ldd -0x78(%r30), p032a1
515: xmpyu %fr8L, %fr4R, %fr23
516: std s128, 16(rp)
517:
518: xmpyu %fr8R, %fr5L, %fr24
519: ldd -0x70(%r30), p032a2
520: xmpyu %fr8L, %fr5R, %fr25
521: std s192, 24(rp)
522:
523: xmpyu %fr8R, %fr6L, %fr26
524: ldd -0x38(%r30), p096b1
525: xmpyu %fr8L, %fr6R, %fr27
526: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
527:
528: xmpyu %fr8R, %fr7L, %fr28
529: ldd -0x30(%r30), p096b2
530: xmpyu %fr8L, %fr7R, %fr29
531: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
532:
533: xmpyu %fr8R, %fr4R, %fr30
534: ldd -0x58(%r30), p160c1
535: xmpyu %fr8L, %fr4L, %fr31
536: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
537:
538: xmpyu %fr8R, %fr5R, %fr22
539: ldd -0x50(%r30), p160c2
540: xmpyu %fr8L, %fr5L, %fr23
541: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
542:
543: xmpyu %fr8R, %fr6R, %fr24
544: ldd -0x18(%r30), p224d1
545: xmpyu %fr8L, %fr6L, %fr25
546: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
547:
548: xmpyu %fr8R, %fr7R, %fr26
549: ldd -0x10(%r30), p224d2
550: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
551: xmpyu %fr8L, %fr7L, %fr27
552:
dnl Loop while the group counter is nonzero; rp is advanced by four limbs in
dnl the branch delay slot.
553: addib,<> -1, n, L(oop)
554: ldo 32(rp), rp
555:
dnl Wind-down with two groups still in flight.  This follows the first part of
dnl the loop body (mid sums, realignment, accumulation, subtraction, stores)
dnl for the older group, and spills the FP products of the last group, but it
dnl starts no new loads from up and no new multiplies.
556: L(end2)
557: add p032a1, p032a2, m032
558: ldd -0x80(%r30), p000a
559: add,dc p096b1, p096b2, m096
560: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
561: add,dc p160c1, p160c2, m160
562: ldd -0x68(%r30), p064a
563: add,dc p224d1, p224d2, m224
564: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
565: add,dc %r0, %r0, m288
566: ldd -0x40(%r30), p064b
567: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
568: depd,z m032, 31, 32, ma000
569: ldd -0x28(%r30), p128b
570: extrd,u m032, 31, 32, ma064
571: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
572: depd m096, 31, 32, ma064
573: ldd -0x60(%r30), p128c
574: extrd,u m096, 31, 32, ma128
575: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
576: depd m160, 31, 32, ma128
577: ldd -0x48(%r30), p192c
578: extrd,u m160, 31, 32, ma192
579: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
580: depd m224, 31, 32, ma192
581: ldd -0x20(%r30), p192d
582: extrd,u m224, 31, 32, ma256
583: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
584: depd m288, 31, 32, ma256
585: ldd -0x88(%r30), p256d
586: add climb, p000a, s000
587: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
588: add,dc p064a, p064b, s064
589: ldd 0(rp), r000
590: add,dc p128b, p128c, s128
591: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
592: add,dc p192c, p192d, s192
593: ldd 8(rp), r064
594: add,dc p256d, %r0, climb
595: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
596: ldd 16(rp), r128
597: add ma000, s000, s000 C accum mid 0
598: ldd 24(rp), r192
599: add,dc ma064, s064, s064 C accum mid 1
600: add,dc ma128, s128, s128 C accum mid 2
601: add,dc ma192, s192, s192 C accum mid 3
602: add,dc ma256, climb, climb
603: sub r000, s000, s000 C accum rlimb 0
604: sub,db r064, s064, s064 C accum rlimb 1
605: sub,db r128, s128, s128 C accum rlimb 2
606: std s000, 0(rp)
607: sub,db r192, s192, s192 C accum rlimb 3
608: sub,db %r0, climb, climb
609: sub %r0, climb, climb
610: std s064, 8(rp)
dnl Reload the mid products of the last group, interleaved with the remaining
dnl result stores, then advance rp and fall into L(end1).
611: ldd -0x78(%r30), p032a1
612: std s128, 16(rp)
613: ldd -0x70(%r30), p032a2
614: std s192, 24(rp)
615: ldd -0x38(%r30), p096b1
616: ldd -0x30(%r30), p096b2
617: ldd -0x58(%r30), p160c1
618: ldd -0x50(%r30), p160c2
619: ldd -0x18(%r30), p224d1
620: ldd -0x10(%r30), p224d2
621: ldo 32(rp), rp
622:
dnl Final group.  Its mid products are in registers and everything else is in
dnl the tmp slots; nothing more is spilled, loaded from up, or multiplied.
dnl Sum the mid product pairs and realign them to limb boundaries.
623: L(end1)
624: add p032a1, p032a2, m032
625: ldd -0x80(%r30), p000a
626: add,dc p096b1, p096b2, m096
627: add,dc p160c1, p160c2, m160
628: ldd -0x68(%r30), p064a
629: add,dc p224d1, p224d2, m224
630: add,dc %r0, %r0, m288
631: ldd -0x40(%r30), p064b
632: depd,z m032, 31, 32, ma000
633: ldd -0x28(%r30), p128b
634: extrd,u m032, 31, 32, ma064
635: depd m096, 31, 32, ma064
636: ldd -0x60(%r30), p128c
637: extrd,u m096, 31, 32, ma128
638: depd m160, 31, 32, ma128
639: ldd -0x48(%r30), p192c
640: extrd,u m160, 31, 32, ma192
641: depd m224, 31, 32, ma192
642: ldd -0x20(%r30), p192d
643: extrd,u m224, 31, 32, ma256
644: depd m288, 31, 32, ma256
645: ldd -0x88(%r30), p256d
dnl Accumulate climb, the low/high products and the mid sums, subtract from the
dnl rp limbs with a borrow chain, and leave the carry-out plus borrow in climb.
646: add climb, p000a, s000
647: add,dc p064a, p064b, s064
648: ldd 0(rp), r000
649: add,dc p128b, p128c, s128
650: add,dc p192c, p192d, s192
651: ldd 8(rp), r064
652: add,dc p256d, %r0, climb
653: ldd 16(rp), r128
654: add ma000, s000, s000 C accum mid 0
655: ldd 24(rp), r192
656: add,dc ma064, s064, s064 C accum mid 1
657: add,dc ma128, s128, s128 C accum mid 2
658: add,dc ma192, s192, s192 C accum mid 3
659: add,dc ma256, climb, climb
660: sub r000, s000, s000 C accum rlimb 0
661: sub,db r064, s064, s064 C accum rlimb 1
662: sub,db r128, s128, s128 C accum rlimb 2
663: std s000, 0(rp)
664: sub,db r192, s192, s192 C accum rlimb 3
665: sub,db %r0, climb, climb
666: sub %r0, climb, climb
667: std s064, 8(rp)
668: std s128, 16(rp)
669: std s192, 24(rp)
670:
dnl Reload %r6..%r13 from the offsets they were saved to at L(BIG).  The short
dnl exit through L(done) skips this, since that path never saved them.
671: ldd -0xb0(%r30), %r13
672: ldd -0xb8(%r30), %r12
673: ldd -0xc0(%r30), %r11
674: ldd -0xc8(%r30), %r10
675: ldd -0xd0(%r30), %r9
676: ldd -0xd8(%r30), %r8
677: ldd -0xe0(%r30), %r7
678: ldd -0xe8(%r30), %r6
dnl Common exit.  Hand climb back as the return value: copied whole to %r28
dnl for 2.0W, or split for 2.0N with the upper 32 bits in %r28 and the lower
dnl 32 bits in %r29.
679: L(done)
680: ifdef(`HAVE_ABI_2_0w',
681: ` copy climb, %r28
682: ',` extrd,u climb, 63, 32, %r29
683: extrd,u climb, 31, 32, %r28
684: ')
dnl Reload %r5 and %r4, then return through %r2.  The ldd,mb in the branch
dnl delay slot moves %r30 back by 0x100 and reloads %r3 from there, undoing
dnl the std,ma at function entry.
685: ldd -0xf0(%r30), %r5
686: ldd -0xf8(%r30), %r4
687: bve (%r2)
688: ldd,mb -0x100(%r30), %r3
689: EPILOGUE(mpn_submul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>