Annotation of OpenXM_contrib/gmp/mpn/pa64/addmul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
! 2: dnl add the result to a second limb vector.
! 3:
! 4: dnl Copyright 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
! 5:
! 6: dnl This file is part of the GNU MP Library.
! 7:
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 9: dnl it under the terms of the GNU Lesser General Public License as published
! 10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 11: dnl your option) any later version.
! 12:
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: dnl License for more details.
! 17:
! 18: dnl You should have received a copy of the GNU Lesser General Public License
! 19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: dnl MA 02111-1307, USA.
! 22:
! 23:
! 24: dnl This approaches 7.0 cycles/limb on PA8000 and 6.375 cycles/limb on PA8500
! 25: dnl for huge operands. It should be possible to do 6.0 cycles/limb with the
! 26: dnl current instructions and unrolling level. It is unknown why the code runs
! 27: dnl somewhat slower.
! 28:
! 29: dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
! 30: dnl could be saved there per call.
! 31:
! 32: dnl DESCRIPTION:
! 33: dnl The main loop "BIG" is 4-way unrolled, mainly to allow
! 34: dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
! 35: dnl registers to the IU registers have demanded a deep software pipeline, and
! 36: dnl a lot of stack slots for partial products in flight.
! 37: dnl
! 38: dnl CODE STRUCTURE:
! 39: dnl save-some-registers
! 40: dnl do 0, 1, 2, or 3 limbs
! 41: dnl if done, restore-some-regs and return
! 42: dnl save-many-regs
! 43: dnl do 4, 8, ... limbs
! 44: dnl restore-all-regs
! 45:
! 46: dnl STACK LAYOUT:
! 47: dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
! 48: dnl slots marked FREE, as well as some slots in the caller's "frame marker".
! 49: dnl
! 50: dnl -00 <- r30
! 51: dnl -08 FREE
! 52: dnl -10 tmp
! 53: dnl -18 tmp
! 54: dnl -20 tmp
! 55: dnl -28 tmp
! 56: dnl -30 tmp
! 57: dnl -38 tmp
! 58: dnl -40 tmp
! 59: dnl -48 tmp
! 60: dnl -50 tmp
! 61: dnl -58 tmp
! 62: dnl -60 tmp
! 63: dnl -68 tmp
! 64: dnl -70 tmp
! 65: dnl -78 tmp
! 66: dnl -80 tmp
! 67: dnl -88 tmp
! 68: dnl -90 FREE
! 69: dnl -98 FREE
! 70: dnl -a0 FREE
! 71: dnl -a8 FREE
! 72: dnl -b0 r13
! 73: dnl -b8 r12
! 74: dnl -c0 r11
! 75: dnl -c8 r10
! 75: dnl -d0 r9
! 77: dnl -d8 r8
! 78: dnl -e0 r7
! 79: dnl -e8 r6
! 80: dnl -f0 r5
! 81: dnl -f8 r4
! 82: dnl -100 r3
! 83: dnl Previous frame:
! 84: dnl [unused area]
! 85: dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
! 86:
! 87:
! 88: include(`../config.m4')
! 89:
! 90: dnl INPUT PARAMETERS:
! 91: define(`rp',`%r26') dnl result vector pointer; limbs are loaded, accumulated into and stored back
! 92: define(`up',`%r25') dnl source vector pointer; limbs are loaded from here
! 93: define(`n',`%r24') dnl limb count
! 94: define(`vlimb',`%r23') dnl the single multiplier limb
! 95:
! 96: define(`climb',`%r23') dnl running carry limb; shares %r23 with vlimb
! 97:
! 98: ifdef(`HAVE_ABI_2_0w',
! 99: ` .level 2.0W
! 100: ',` .level 2.0N
! 101: ')
! 102: PROLOGUE(mpn_addmul_1)
! 103:
! 104: ifdef(`HAVE_ABI_2_0w',
! 105: ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
! 106: ')
! 107: std,ma %r3, 0x100(%r30) C save r3 at the old sp, then advance sp by 0x100 (post-modify)
! 108: std %r4, -0xf8(%r30) C save r4
! 109: std %r5, -0xf0(%r30) C save r5
! 110: ldo 0(%r0), climb C clear climb (vlimb is no longer needed in %r23 from here on)
! 111: fldd -0x138(%r30), %fr8 C put vlimb in fp register (home slot is at -0x138 after the sp bump)
! 112:
! 113: define(`p032a1',`%r1') dnl first 32x32 mid partial product
! 114: define(`p032a2',`%r19') dnl second 32x32 mid partial product
! 115:
! 116: define(`m032',`%r20') dnl sum of the two mid products
! 117: define(`m096',`%r21') dnl carry out of that sum
! 118:
! 119: define(`p000a',`%r22') dnl low partial product
! 120: define(`p064a',`%r29') dnl high partial product
! 121:
! 122: define(`s000',`%r31') dnl result limb being accumulated
! 123:
! 124: define(`ma000',`%r4') dnl mid sum aligned into the current result limb
! 125: define(`ma064',`%r20') dnl mid sum part belonging to the next limb; same register as m032
! 126:
! 127: define(`r000',`%r3') dnl limb loaded from rp
! 128:
! 129: extrd,u n, 63, 2, %r5 C r5 = low two bits of n, the number of leftover limbs
! 130: cmpb,= %r5, %r0, L(BIG) C no leftover limbs: go straight to the 4-way code
! 131: nop
! 132:
! 133: fldd 0(up), %fr4 C load the first leftover limb
! 134: ldo 8(up), up C advance up by one limb
! 135: xmpyu %fr8R, %fr4L, %fr22
! 136: xmpyu %fr8L, %fr4R, %fr23
! 137: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 138: xmpyu %fr8R, %fr4R, %fr24
! 139: xmpyu %fr8L, %fr4L, %fr25
! 140: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 141: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 142: addib,<> -1, %r5, L(two_or_more) C decrement leftover count, branch if any remain
! 143: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 144: L(one) C exactly one leftover limb
! 145: ldd -0x78(%r30), p032a1 C reload the four partial products into integer registers
! 146: ldd -0x70(%r30), p032a2
! 147: ldd -0x80(%r30), p000a
! 148: b L(0_one_out) C finish in the shared one-limb wind-down
! 149: ldd -0x68(%r30), p064a C executed in the branch delay slot
! 150:
! 151: L(two_or_more) C multiply the second limb while collecting the products of the first
! 152: fldd 0(up), %fr4 C load the second leftover limb
! 153: ldo 8(up), up C advance up by one limb
! 154: xmpyu %fr8R, %fr4L, %fr22
! 155: xmpyu %fr8L, %fr4R, %fr23
! 156: ldd -0x78(%r30), p032a1 C fetch the previous mid product before its slot is overwritten
! 157: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 158: xmpyu %fr8R, %fr4R, %fr24
! 159: xmpyu %fr8L, %fr4L, %fr25
! 160: ldd -0x70(%r30), p032a2
! 161: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 162: ldd -0x80(%r30), p000a
! 163: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 164: ldd -0x68(%r30), p064a
! 165: addib,<> -1, %r5, L(three_or_more) C decrement leftover count, branch if a third limb remains
! 166: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 167: L(two) C exactly two leftover limbs
! 168: add p032a1, p032a2, m032 C sum the two mid products
! 169: add,dc %r0, %r0, m096 C m096 = carry out of that sum
! 170: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved into the upper 32 bits
! 171: extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
! 172: ldd 0(rp), r000 C fetch the rp limb to be accumulated into
! 173: b L(0_two_out) C finish in the shared two-limb wind-down
! 174: depd m096, 31, 32, ma064 C delay slot: place the mid carry in the upper half of ma064
! 175:
! 176: L(three_or_more) C a third leftover limb remains
! 177: fldd 0(up), %fr4 C load it; up itself is advanced at L(0_out)
! 178: add p032a1, p032a2, m032 C sum the two mid products of the first limb
! 179: add,dc %r0, %r0, m096 C m096 = carry out of that sum
! 180: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved into the upper 32 bits
! 181: extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
! 182: ldd 0(rp), r000 C fetch the rp limb to be accumulated into
! 183: dnl addib,= -1, %r5, L(0_out)
! 184: depd m096, 31, 32, ma064 C place the mid carry in the upper half of ma064
! 185: L(oop0) C the loop body that follows is disabled with dnl, so control falls through to L(0_out)
! 186: dnl xmpyu %fr8R, %fr4L, %fr22
! 187: dnl xmpyu %fr8L, %fr4R, %fr23
! 188: dnl ldd -0x78(%r30), p032a1
! 189: dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 190: dnl
! 191: dnl xmpyu %fr8R, %fr4R, %fr24
! 192: dnl xmpyu %fr8L, %fr4L, %fr25
! 193: dnl ldd -0x70(%r30), p032a2
! 194: dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 195: dnl
! 196: dnl ldo 8(rp), rp
! 197: dnl add climb, p000a, s000
! 198: dnl ldd -0x80(%r30), p000a
! 199: dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 200: dnl
! 201: dnl add,dc p064a, %r0, climb
! 202: dnl ldo 8(up), up
! 203: dnl ldd -0x68(%r30), p064a
! 204: dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 205: dnl
! 206: dnl add ma000, s000, s000
! 207: dnl add,dc ma064, climb, climb
! 208: dnl fldd 0(up), %fr4
! 209: dnl
! 210: dnl add r000, s000, s000
! 211: dnl add,dc %r0, climb, climb
! 212: dnl std s000, -8(rp)
! 213: dnl
! 214: dnl add p032a1, p032a2, m032
! 215: dnl add,dc %r0, %r0, m096
! 216: dnl
! 217: dnl depd,z m032, 31, 32, ma000
! 218: dnl extrd,u m032, 31, 32, ma064
! 219: dnl ldd 0(rp), r000
! 220: dnl addib,<> -1, %r5, L(oop0)
! 221: dnl depd m096, 31, 32, ma064
! 222: L(0_out) C multiply the third limb and retire the first
! 223: ldo 8(up), up C advance up past the limb already loaded into fr4
! 224: xmpyu %fr8R, %fr4L, %fr22
! 225: xmpyu %fr8L, %fr4R, %fr23
! 226: ldd -0x78(%r30), p032a1 C fetch the previous mid product before its slot is overwritten
! 227: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 228: xmpyu %fr8R, %fr4R, %fr24
! 229: xmpyu %fr8L, %fr4L, %fr25
! 230: ldd -0x70(%r30), p032a2
! 231: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 232: ldo 8(rp), rp C advance rp; the finished limb is stored at -8(rp) below
! 233: add climb, p000a, s000 C s000 = carry limb + low product
! 234: ldd -0x80(%r30), p000a
! 235: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 236: add,dc p064a, %r0, climb C climb = high product + carry
! 237: ldd -0x68(%r30), p064a
! 238: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 239: add ma000, s000, s000 C add the aligned mid sum into the result limb
! 240: add,dc ma064, climb, climb C and its upper part, with carry, into climb
! 241: add r000, s000, s000 C add the existing rp limb
! 242: add,dc %r0, climb, climb C propagate that carry into climb
! 243: std s000, -8(rp) C store the finished result limb
! 244: add p032a1, p032a2, m032 C mid sum for the next limb
! 245: add,dc %r0, %r0, m096 C m096 = carry out of that sum
! 246: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved into the upper 32 bits
! 247: extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
! 248: ldd 0(rp), r000 C fetch the next rp limb
! 249: depd m096, 31, 32, ma064 C place the mid carry in the upper half of ma064
! 250: L(0_two_out) C wind-down with two limbs still in flight: retire the older one
! 251: ldd -0x78(%r30), p032a1 C reload the mid products of the last limb
! 252: ldd -0x70(%r30), p032a2
! 253: ldo 8(rp), rp C advance rp; the finished limb is stored at -8(rp) below
! 254: add climb, p000a, s000 C s000 = carry limb + low product
! 255: ldd -0x80(%r30), p000a
! 256: add,dc p064a, %r0, climb C climb = high product + carry
! 257: ldd -0x68(%r30), p064a
! 258: add ma000, s000, s000 C add the aligned mid sum into the result limb
! 259: add,dc ma064, climb, climb C and its upper part, with carry, into climb
! 260: add r000, s000, s000 C add the existing rp limb
! 261: add,dc %r0, climb, climb C propagate that carry into climb
! 262: std s000, -8(rp) C store the finished result limb
! 263: L(0_one_out) C wind-down for the last leftover limb
! 264: add p032a1, p032a2, m032 C sum the two mid products
! 265: add,dc %r0, %r0, m096 C m096 = carry out of that sum
! 266: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 moved into the upper 32 bits
! 267: extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
! 268: ldd 0(rp), r000 C fetch the rp limb to be accumulated into
! 269: depd m096, 31, 32, ma064 C place the mid carry in the upper half of ma064
! 270:
! 271: add climb, p000a, s000 C s000 = carry limb + low product
! 272: add,dc p064a, %r0, climb C climb = high product + carry
! 273: add ma000, s000, s000 C add the aligned mid sum into the result limb
! 274: add,dc ma064, climb, climb C and its upper part, with carry, into climb
! 275: add r000, s000, s000 C add the existing rp limb
! 276: add,dc %r0, climb, climb C propagate that carry into climb
! 277: std s000, 0(rp) C store the last leftover limb
! 278:
! 279: cmpib,>= 4, n, L(done) C return if 4 >= n; otherwise fall through into the 4-way code
! 280: ldo 8(rp), rp C delay slot: step rp past the limb just stored
! 281:
! 282: dnl 4-way unrolled code.
! 283:
! 284: L(BIG) C 4-way unrolled path; the register names are redefined for it below
! 285:
! 286: define(`p032a1',`%r1') dnl mid partial products, two per limb (limbs a, b, c, d)
! 287: define(`p032a2',`%r19') dnl
! 288: define(`p096b1',`%r20') dnl
! 289: define(`p096b2',`%r21') dnl
! 290: define(`p160c1',`%r22') dnl
! 291: define(`p160c2',`%r29') dnl
! 292: define(`p224d1',`%r31') dnl
! 293: define(`p224d2',`%r3') dnl
! 294: dnl
! 295: define(`m032',`%r4') dnl sums of the mid product pairs, chained with carry
! 296: define(`m096',`%r5') dnl
! 297: define(`m160',`%r6') dnl
! 298: define(`m224',`%r7') dnl
! 299: define(`m288',`%r8') dnl final carry out of the mid sum chain
! 300: dnl
! 301: define(`p000a',`%r1') dnl low and high partial products; these reuse the mid product registers
! 302: define(`p064a',`%r19') dnl
! 303: define(`p064b',`%r20') dnl
! 304: define(`p128b',`%r21') dnl
! 305: define(`p128c',`%r22') dnl
! 306: define(`p192c',`%r29') dnl
! 307: define(`p192d',`%r31') dnl
! 308: define(`p256d',`%r3') dnl
! 309: dnl
! 310: define(`s000',`%r10') dnl the four result limbs being accumulated
! 311: define(`s064',`%r11') dnl
! 312: define(`s128',`%r12') dnl
! 313: define(`s192',`%r13') dnl
! 314: dnl
! 315: define(`ma000',`%r9') dnl mid sums realigned by 32 bits; ma064..ma256 reuse the m032..m224 registers
! 316: define(`ma064',`%r4') dnl
! 317: define(`ma128',`%r5') dnl
! 318: define(`ma192',`%r6') dnl
! 319: define(`ma256',`%r7') dnl
! 320: dnl
! 321: define(`r000',`%r1') dnl the four limbs loaded from rp; these reuse %r1 and %r19-%r21
! 322: define(`r064',`%r19') dnl
! 323: define(`r128',`%r20') dnl
! 324: define(`r192',`%r21') dnl
! 325:
! 326: std %r6, -0xe8(%r30) C save r6-r13, which the unrolled code uses
! 327: std %r7, -0xe0(%r30)
! 328: std %r8, -0xd8(%r30)
! 329: std %r9, -0xd0(%r30)
! 330: std %r10, -0xc8(%r30)
! 331: std %r11, -0xc0(%r30)
! 332: std %r12, -0xb8(%r30)
! 333: std %r13, -0xb0(%r30)
! 334:
! 335: ifdef(`HAVE_ABI_2_0w',
! 336: ` extrd,u n, 61, 62, n C right shift 2
! 337: ',` extrd,u n, 61, 30, n C right shift 2, zero extend
! 338: ')
! 339:
! 340: L(4_or_more) C feed-in: load the first four limbs and form their 16 partial products
! 341: fldd 0(up), %fr4
! 342: fldd 8(up), %fr5
! 343: fldd 16(up), %fr6
! 344: fldd 24(up), %fr7
! 345: xmpyu %fr8R, %fr4L, %fr22
! 346: xmpyu %fr8L, %fr4R, %fr23
! 347: xmpyu %fr8R, %fr5L, %fr24
! 348: xmpyu %fr8L, %fr5R, %fr25
! 349: xmpyu %fr8R, %fr6L, %fr26
! 350: xmpyu %fr8L, %fr6R, %fr27
! 351: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 352: xmpyu %fr8R, %fr7L, %fr28
! 353: xmpyu %fr8L, %fr7R, %fr29
! 354: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 355: xmpyu %fr8R, %fr4R, %fr30
! 356: xmpyu %fr8L, %fr4L, %fr31
! 357: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 358: xmpyu %fr8R, %fr5R, %fr22
! 359: xmpyu %fr8L, %fr5L, %fr23
! 360: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 361: xmpyu %fr8R, %fr6R, %fr24
! 362: xmpyu %fr8L, %fr6L, %fr25
! 363: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 364: xmpyu %fr8R, %fr7R, %fr26
! 365: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 366: addib,<> -1, n, L(8_or_more) C decrement the group count, branch if more groups follow
! 367: xmpyu %fr8L, %fr7L, %fr27 C executed in the branch delay slot
! 368: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 369: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 370: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 371: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 372: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 373: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 374: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 375: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 376: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 377: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 378: ldd -0x78(%r30), p032a1 C reload the eight mid products into integer registers
! 379: ldd -0x70(%r30), p032a2
! 380: ldd -0x38(%r30), p096b1
! 381: ldd -0x30(%r30), p096b2
! 382: ldd -0x58(%r30), p160c1
! 383: ldd -0x50(%r30), p160c2
! 384: ldd -0x18(%r30), p224d1
! 385: ldd -0x10(%r30), p224d2
! 386: b L(end1) C only one group: finish in the final wind-down
! 387: nop
! 388:
! 389: L(8_or_more) C at least two groups: store the rest of group one, start multiplying group two
! 390: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 391: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 392: ldo 32(up), up C advance up by four limbs
! 393: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 394: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 395: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 396: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 397: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 398: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 399: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 400: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 401: fldd 0(up), %fr4 C load the next four limbs
! 402: fldd 8(up), %fr5
! 403: fldd 16(up), %fr6
! 404: fldd 24(up), %fr7
! 405: xmpyu %fr8R, %fr4L, %fr22
! 406: ldd -0x78(%r30), p032a1 C each mid product slot is read back before it is overwritten below
! 407: xmpyu %fr8L, %fr4R, %fr23
! 408: xmpyu %fr8R, %fr5L, %fr24
! 409: ldd -0x70(%r30), p032a2
! 410: xmpyu %fr8L, %fr5R, %fr25
! 411: xmpyu %fr8R, %fr6L, %fr26
! 412: ldd -0x38(%r30), p096b1
! 413: xmpyu %fr8L, %fr6R, %fr27
! 414: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 415: xmpyu %fr8R, %fr7L, %fr28
! 416: ldd -0x30(%r30), p096b2
! 417: xmpyu %fr8L, %fr7R, %fr29
! 418: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 419: xmpyu %fr8R, %fr4R, %fr30
! 420: ldd -0x58(%r30), p160c1
! 421: xmpyu %fr8L, %fr4L, %fr31
! 422: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 423: xmpyu %fr8R, %fr5R, %fr22
! 424: ldd -0x50(%r30), p160c2
! 425: xmpyu %fr8L, %fr5L, %fr23
! 426: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 427: xmpyu %fr8R, %fr6R, %fr24
! 428: ldd -0x18(%r30), p224d1
! 429: xmpyu %fr8L, %fr6L, %fr25
! 430: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 431: xmpyu %fr8R, %fr7R, %fr26
! 432: ldd -0x10(%r30), p224d2
! 433: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 434: addib,= -1, n, L(end2) C decrement the group count, skip the loop if this was the last group
! 435: xmpyu %fr8L, %fr7L, %fr27 C executed in the branch delay slot
! 436: L(oop) C main loop: retire one group of four limbs while multiplying the next group
! 437: add p032a1, p032a2, m032 C begin the carry chain over the four mid product sums
! 438: ldd -0x80(%r30), p000a
! 439: add,dc p096b1, p096b2, m096
! 440: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 441:
! 442: add,dc p160c1, p160c2, m160
! 443: ldd -0x68(%r30), p064a
! 444: add,dc p224d1, p224d2, m224
! 445: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 446:
! 447: add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
! 448: ldd -0x40(%r30), p064b
! 449: ldo 32(up), up C advance up by four limbs
! 450: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 451:
! 452: depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits: low half of each goes to the upper half of one word
! 453: ldd -0x28(%r30), p128b
! 454: extrd,u m032, 31, 32, ma064 C and the upper half goes to the lower half of the next word
! 455: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 456:
! 457: depd m096, 31, 32, ma064
! 458: ldd -0x60(%r30), p128c
! 459: extrd,u m096, 31, 32, ma128
! 460: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 461:
! 462: depd m160, 31, 32, ma128
! 463: ldd -0x48(%r30), p192c
! 464: extrd,u m160, 31, 32, ma192
! 465: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 466:
! 467: depd m224, 31, 32, ma192
! 468: ldd -0x20(%r30), p192d
! 469: extrd,u m224, 31, 32, ma256
! 470: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 471:
! 472: depd m288, 31, 32, ma256
! 473: ldd -0x88(%r30), p256d
! 474: add climb, p000a, s000 C start the low/high product chain with the incoming carry limb
! 475: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 476:
! 477: add,dc p064a, p064b, s064
! 478: ldd 0(rp), r000
! 479: add,dc p128b, p128c, s128
! 480: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 481:
! 482: add,dc p192c, p192d, s192
! 483: ldd 8(rp), r064
! 484: add,dc p256d, %r0, climb C climb = top high product + carry
! 485: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 486:
! 487: ldd 16(rp), r128
! 488: add ma000, s000, s000 C accum mid 0
! 489: ldd 24(rp), r192
! 490: add,dc ma064, s064, s064 C accum mid 1
! 491:
! 492: add,dc ma128, s128, s128 C accum mid 2
! 493: fldd 0(up), %fr4 C start loading the next four limbs
! 494: add,dc ma192, s192, s192 C accum mid 3
! 495: fldd 8(up), %fr5
! 496:
! 497: add,dc ma256, climb, climb C top of the mid sums, with carry, goes into climb
! 498: fldd 16(up), %fr6
! 499: add r000, s000, s000 C accum rlimb 0
! 500: fldd 24(up), %fr7
! 501:
! 502: add,dc r064, s064, s064 C accum rlimb 1
! 503: add,dc r128, s128, s128 C accum rlimb 2
! 504: std s000, 0(rp)
! 505:
! 506: add,dc r192, s192, s192 C accum rlimb 3
! 507: add,dc %r0, climb, climb C fold the last carry into climb
! 508: std s064, 8(rp)
! 509:
! 510: xmpyu %fr8R, %fr4L, %fr22 C begin the partial products for the limbs just loaded
! 511: ldd -0x78(%r30), p032a1 C each mid product slot is read back before it is overwritten below
! 512: xmpyu %fr8L, %fr4R, %fr23
! 513: std s128, 16(rp)
! 514:
! 515: xmpyu %fr8R, %fr5L, %fr24
! 516: ldd -0x70(%r30), p032a2
! 517: xmpyu %fr8L, %fr5R, %fr25
! 518: std s192, 24(rp)
! 519:
! 520: xmpyu %fr8R, %fr6L, %fr26
! 521: ldd -0x38(%r30), p096b1
! 522: xmpyu %fr8L, %fr6R, %fr27
! 523: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 524:
! 525: xmpyu %fr8R, %fr7L, %fr28
! 526: ldd -0x30(%r30), p096b2
! 527: xmpyu %fr8L, %fr7R, %fr29
! 528: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 529:
! 530: xmpyu %fr8R, %fr4R, %fr30
! 531: ldd -0x58(%r30), p160c1
! 532: xmpyu %fr8L, %fr4L, %fr31
! 533: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 534:
! 535: xmpyu %fr8R, %fr5R, %fr22
! 536: ldd -0x50(%r30), p160c2
! 537: xmpyu %fr8L, %fr5L, %fr23
! 538: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 539:
! 540: xmpyu %fr8R, %fr6R, %fr24
! 541: ldd -0x18(%r30), p224d1
! 542: xmpyu %fr8L, %fr6L, %fr25
! 543: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 544:
! 545: xmpyu %fr8R, %fr7R, %fr26
! 546: ldd -0x10(%r30), p224d2
! 547: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 548: xmpyu %fr8L, %fr7L, %fr27
! 549:
! 550: addib,<> -1, n, L(oop) C decrement the group count, loop while groups remain
! 551: ldo 32(rp), rp C delay slot: advance rp by four limbs
! 552:
! 553: L(end2) C wind-down: retire the group whose products are on the stack, store the final group from the FP registers
! 554: add p032a1, p032a2, m032 C begin the carry chain over the four mid product sums
! 555: ldd -0x80(%r30), p000a
! 556: add,dc p096b1, p096b2, m096
! 557: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 558: add,dc p160c1, p160c2, m160
! 559: ldd -0x68(%r30), p064a
! 560: add,dc p224d1, p224d2, m224
! 561: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 562: add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
! 563: ldd -0x40(%r30), p064b
! 564: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 565: depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits across word boundaries
! 566: ldd -0x28(%r30), p128b
! 567: extrd,u m032, 31, 32, ma064
! 568: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 569: depd m096, 31, 32, ma064
! 570: ldd -0x60(%r30), p128c
! 571: extrd,u m096, 31, 32, ma128
! 572: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 573: depd m160, 31, 32, ma128
! 574: ldd -0x48(%r30), p192c
! 575: extrd,u m160, 31, 32, ma192
! 576: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 577: depd m224, 31, 32, ma192
! 578: ldd -0x20(%r30), p192d
! 579: extrd,u m224, 31, 32, ma256
! 580: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 581: depd m288, 31, 32, ma256
! 582: ldd -0x88(%r30), p256d
! 583: add climb, p000a, s000 C start the low/high product chain with the incoming carry limb
! 584: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 585: add,dc p064a, p064b, s064
! 586: ldd 0(rp), r000
! 587: add,dc p128b, p128c, s128
! 588: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 589: add,dc p192c, p192d, s192
! 590: ldd 8(rp), r064
! 591: add,dc p256d, %r0, climb C climb = top high product + carry
! 592: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 593: ldd 16(rp), r128
! 594: add ma000, s000, s000 C accum mid 0
! 595: ldd 24(rp), r192
! 596: add,dc ma064, s064, s064 C accum mid 1
! 597: add,dc ma128, s128, s128 C accum mid 2
! 598: add,dc ma192, s192, s192 C accum mid 3
! 599: add,dc ma256, climb, climb C top of the mid sums, with carry, goes into climb
! 600: add r000, s000, s000 C accum rlimb 0
! 601: add,dc r064, s064, s064 C accum rlimb 1
! 602: add,dc r128, s128, s128 C accum rlimb 2
! 603: std s000, 0(rp)
! 604: add,dc r192, s192, s192 C accum rlimb 3
! 605: add,dc %r0, climb, climb C fold the last carry into climb
! 606: std s064, 8(rp)
! 607: ldd -0x78(%r30), p032a1 C reload the final group mid products for L(end1)
! 608: std s128, 16(rp)
! 609: ldd -0x70(%r30), p032a2
! 610: std s192, 24(rp)
! 611: ldd -0x38(%r30), p096b1
! 612: ldd -0x30(%r30), p096b2
! 613: ldd -0x58(%r30), p160c1
! 614: ldd -0x50(%r30), p160c2
! 615: ldd -0x18(%r30), p224d1
! 616: ldd -0x10(%r30), p224d2
! 617: ldo 32(rp), rp C advance rp by four limbs for the final group
! 618:
! 619: L(end1) C final wind-down: retire the last group of four limbs
! 620: add p032a1, p032a2, m032 C begin the carry chain over the four mid product sums
! 621: ldd -0x80(%r30), p000a
! 622: add,dc p096b1, p096b2, m096
! 623: add,dc p160c1, p160c2, m160
! 624: ldd -0x68(%r30), p064a
! 625: add,dc p224d1, p224d2, m224
! 626: add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
! 627: ldd -0x40(%r30), p064b
! 628: depd,z m032, 31, 32, ma000 C realign the mid sums by 32 bits across word boundaries
! 629: ldd -0x28(%r30), p128b
! 630: extrd,u m032, 31, 32, ma064
! 631: depd m096, 31, 32, ma064
! 632: ldd -0x60(%r30), p128c
! 633: extrd,u m096, 31, 32, ma128
! 634: depd m160, 31, 32, ma128
! 635: ldd -0x48(%r30), p192c
! 636: extrd,u m160, 31, 32, ma192
! 637: depd m224, 31, 32, ma192
! 638: ldd -0x20(%r30), p192d
! 639: extrd,u m224, 31, 32, ma256
! 640: depd m288, 31, 32, ma256
! 641: ldd -0x88(%r30), p256d
! 642: add climb, p000a, s000 C start the low/high product chain with the incoming carry limb
! 643: add,dc p064a, p064b, s064
! 644: ldd 0(rp), r000
! 645: add,dc p128b, p128c, s128
! 646: add,dc p192c, p192d, s192
! 647: ldd 8(rp), r064
! 648: add,dc p256d, %r0, climb C climb = top high product + carry
! 649: ldd 16(rp), r128
! 650: add ma000, s000, s000 C accum mid 0
! 651: ldd 24(rp), r192
! 652: add,dc ma064, s064, s064 C accum mid 1
! 653: add,dc ma128, s128, s128 C accum mid 2
! 654: add,dc ma192, s192, s192 C accum mid 3
! 655: add,dc ma256, climb, climb C top of the mid sums, with carry, goes into climb
! 656: add r000, s000, s000 C accum rlimb 0
! 657: add,dc r064, s064, s064 C accum rlimb 1
! 658: add,dc r128, s128, s128 C accum rlimb 2
! 659: std s000, 0(rp)
! 660: add,dc r192, s192, s192 C accum rlimb 3
! 661: add,dc %r0, climb, climb C fold the last carry into climb; this is the final carry-out limb
! 662: std s064, 8(rp)
! 663: std s128, 16(rp)
! 664: std s192, 24(rp)
! 665:
! 666: ldd -0xb0(%r30), %r13 C restore r6-r13, saved at L(BIG)
! 667: ldd -0xb8(%r30), %r12
! 668: ldd -0xc0(%r30), %r11
! 669: ldd -0xc8(%r30), %r10
! 670: ldd -0xd0(%r30), %r9
! 671: ldd -0xd8(%r30), %r8
! 672: ldd -0xe0(%r30), %r7
! 673: ldd -0xe8(%r30), %r6
! 674: L(done) C common exit; climb holds the carry-out limb
! 675: ifdef(`HAVE_ABI_2_0w',
! 676: ` copy climb, %r28 C carry limb to r28, presumably the return value register
! 677: ',` extrd,u climb, 63, 32, %r29 C low 32 bits of the carry limb to r29
! 678: extrd,u climb, 31, 32, %r28 C high 32 bits of the carry limb to r28
! 679: ')
! 680: ldd -0xf0(%r30), %r5 C restore r5
! 681: ldd -0xf8(%r30), %r4 C restore r4
! 682: bve (%r2) C return through r2
! 683: ldd,mb -0x100(%r30), %r3 C delay slot: drop sp by 0x100 (pre-modify) and restore r3
! 684: EPILOGUE(mpn_addmul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>