Annotation of OpenXM_contrib/gmp/mpn/pa64/mul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
! 2: dnl the result in a second limb vector.
! 3:
! 4: dnl Copyright 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
! 5:
! 6: dnl This file is part of the GNU MP Library.
! 7:
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 9: dnl it under the terms of the GNU Lesser General Public License as published
! 10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 11: dnl your option) any later version.
! 12:
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: dnl License for more details.
! 17:
! 18: dnl You should have received a copy of the GNU Lesser General Public License
! 19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: dnl MA 02111-1307, USA.
! 22:
! 23:
! 24: dnl This approaches ?? cycles/limb on PA8000 and 5.625 cycles/limb on PA8500
! 25: dnl for huge operands. These numbers are close to optimal.
! 26:
! 27: dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
! 28: dnl could be saved there per call.
! 29:
! 30: dnl DESCRIPTION:
! 31: dnl The main loop "BIG" is 4-way unrolled, mainly to allow
! 32: dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
! 33: dnl registers to the IU registers have demanded a deep software pipeline, and
! 34: dnl a lot of stack slots for partial products in flight.
! 35: dnl
! 36: dnl CODE STRUCTURE:
! 37: dnl save-some-registers
! 38: dnl do 0, 1, 2, or 3 limbs
! 39: dnl if done, restore-some-regs and return
! 40: dnl save-many-regs
! 41: dnl do 4, 8, ... limbs
! 42: dnl restore-all-regs
! 43:
! 44: dnl STACK LAYOUT:
! 45: dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
! 46: dnl slots marked FREE, as well as some slots in the caller's "frame marker".
! 47: dnl
! 48: dnl -00 <- r30
! 49: dnl -08 FREE
! 50: dnl -10 tmp
! 51: dnl -18 tmp
! 52: dnl -20 tmp
! 53: dnl -28 tmp
! 54: dnl -30 tmp
! 55: dnl -38 tmp
! 56: dnl -40 tmp
! 57: dnl -48 tmp
! 58: dnl -50 tmp
! 59: dnl -58 tmp
! 60: dnl -60 tmp
! 61: dnl -68 tmp
! 62: dnl -70 tmp
! 63: dnl -78 tmp
! 64: dnl -80 tmp
! 65: dnl -88 tmp
! 66: dnl -90 FREE
! 67: dnl -98 FREE
! 68: dnl -a0 FREE
! 69: dnl -a8 FREE
! 70: dnl -b0 r13
! 71: dnl -b8 r12
! 72: dnl -c0 r11
! 73: dnl -c8 r10
! 74: dnl -d0 r9
! 75: dnl -d8 r8
! 76: dnl -e0 r7
! 77: dnl -e8 r6
! 78: dnl -f0 r5
! 79: dnl -f8 r4
! 80: dnl -100 r3
! 81: dnl Previous frame:
! 82: dnl [unused area]
! 83: dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
! 84:
! 85:
! 86: include(`../config.m4')
! 87:
! 88: dnl INPUT PARAMETERS:
! 89: define(`rp',`%r26') dnl destination vector; result limbs are stored through it
! 90: define(`up',`%r25') dnl source vector; limbs are loaded from it
! 91: define(`n',`%r24') dnl limb count
! 92: define(`vlimb',`%r23') dnl multiplier limb
! 93:
! 94: define(`climb',`%r23') dnl carry limb; shares %r23 with vlimb and is what gets returned
! 95:
! 96: ifdef(`HAVE_ABI_2_0w',
! 97: ` .level 2.0W
! 98: ',` .level 2.0N
! 99: ')
! 100: PROLOGUE(mpn_mul_1) C multiply the n limbs at up by vlimb, store at rp, return the carry limb
! 101:
! 102: ifdef(`HAVE_ABI_2_0w',
! 103: ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
! 104: ')
! 105: std,ma %r3, 0x100(%r30) C save r3 at the old r30, then advance r30 by 0x100
! 106: std %r4, -0xf8(%r30) C save r4
! 107: std %r5, -0xf0(%r30) C save r5
! 108: ldo 0(%r0), climb C clear climb
! 109: fldd -0x138(%r30), %fr8 C put vlimb in fp register (home slot -0x38 is now at -0x138)
! 110:
! 111: define(`p032a1',`%r1') dnl first 32x32 mid (cross) product of the current limb
! 112: define(`p032a2',`%r19') dnl second mid product
! 113:
! 114: define(`m032',`%r20') dnl sum of the two mid products
! 115: define(`m096',`%r21') dnl carry out of that sum
! 116:
! 117: define(`p000a',`%r22') dnl low product
! 118: define(`p064a',`%r29') dnl high product
! 119:
! 120: define(`s000',`%r31') dnl result limb being assembled
! 121:
! 122: define(`ma000',`%r4') dnl part of the mid sum that is added to the result limb
! 123: define(`ma064',`%r20') dnl part of the mid sum that is added to the carry; reuses the m032 register
! 124:
! 125: C define(`r000',`%r3') dnl FIXME don't save r3 for n < 4.
! 126:
! 127: extrd,u n, 63, 2, %r5 C r5 = low two bits of n, the limbs to do before the 4-way code
! 128: cmpb,= %r5, %r0, L(BIG) C none left over: go straight to the 4-way code
! 129: nop C branch delay slot
! 130:
! 131: fldd 0(up), %fr4 C load the first leftover limb
! 132: ldo 8(up), up C step up past it
! 133: xmpyu %fr8R, %fr4L, %fr22 C four 32x32 partial products of this limb and vlimb
! 134: xmpyu %fr8L, %fr4R, %fr23
! 135: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 136: xmpyu %fr8R, %fr4R, %fr24
! 137: xmpyu %fr8L, %fr4L, %fr25
! 138: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 139: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 140: addib,<> -1, %r5, L(two_or_more) C r5 -= 1; branch if more leftover limbs remain
! 141: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 (delay slot)
! 142: L(one) C exactly one leftover limb: fetch its products into integer registers
! 143: ldd -0x78(%r30), p032a1
! 144: ldd -0x70(%r30), p032a2
! 145: ldd -0x80(%r30), p000a
! 146: b L(0_one_out) C finish in the shared wind-down code
! 147: ldd -0x68(%r30), p064a C (delay slot)
! 148:
! 149: L(two_or_more) C at least two leftover limbs
! 150: fldd 0(up), %fr4 C load the second limb
! 151: ldo 8(up), up
! 152: xmpyu %fr8R, %fr4L, %fr22
! 153: xmpyu %fr8L, %fr4R, %fr23
! 154: ldd -0x78(%r30), p032a1 C fetch the first limb's mid product before its slot is overwritten
! 155: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 156: xmpyu %fr8R, %fr4R, %fr24
! 157: xmpyu %fr8L, %fr4L, %fr25
! 158: ldd -0x70(%r30), p032a2
! 159: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 160: ldd -0x80(%r30), p000a
! 161: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 162: ldd -0x68(%r30), p064a
! 163: addib,<> -1, %r5, L(three_or_more) C r5 -= 1; branch if a third limb remains
! 164: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 (delay slot)
! 165: L(two) C exactly two leftover limbs
! 166: add p032a1, p032a2, m032 C sum the two mid products of the first limb
! 167: add,dc %r0, %r0, m096 C m096 = carry out of that sum
! 168: depd,z m032, 31, 32, ma000 C ma000 = low half of m032 placed in the upper 32 bits
! 169: extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032
! 170: b L(0_two_out)
! 171: depd m096, 31, 32, ma064 C place the carry above it (delay slot)
! 172:
! 173: L(three_or_more) C three leftover limbs
! 174: fldd 0(up), %fr4 C load the third limb
! 175: add p032a1, p032a2, m032 C combine the first limb's mid products
! 176: add,dc %r0, %r0, m096
! 177: depd,z m032, 31, 32, ma000
! 178: extrd,u m032, 31, 32, ma064
! 179: dnl addib,= -1, %r5, L(0_out)
! 180: depd m096, 31, 32, ma064
! 181: L(oop0) C the loop body below is disabled with dnl; control falls through to 0_out
! 182: dnl xmpyu %fr8R, %fr4L, %fr22
! 183: dnl xmpyu %fr8L, %fr4R, %fr23
! 184: dnl ldd -0x78(%r30), p032a1
! 185: dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 186: dnl
! 187: dnl xmpyu %fr8R, %fr4R, %fr24
! 188: dnl xmpyu %fr8L, %fr4L, %fr25
! 189: dnl ldd -0x70(%r30), p032a2
! 190: dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 191: dnl
! 192: dnl ldo 8(rp), rp
! 193: dnl add climb, p000a, s000
! 194: dnl ldd -0x80(%r30), p000a
! 195: dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 196: dnl
! 197: dnl add,dc p064a, %r0, climb
! 198: dnl ldo 8(up), up
! 199: dnl ldd -0x68(%r30), p064a
! 200: dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 201: dnl
! 202: dnl add ma000, s000, s000
! 203: dnl add,dc ma064, climb, climb
! 204: dnl fldd 0(up), %fr4
! 205: dnl
! 206: dnl std s000, -8(rp)
! 207: dnl
! 208: dnl add p032a1, p032a2, m032
! 209: dnl add,dc %r0, %r0, m096
! 210: dnl
! 211: dnl depd,z m032, 31, 32, ma000
! 212: dnl extrd,u m032, 31, 32, ma064
! 213: dnl addib,<> -1, %r5, L(oop0)
! 214: dnl depd m096, 31, 32, ma064
! 215: L(0_out) C multiply the limb in fr4 while finishing the oldest limb in flight
! 216: ldo 8(up), up
! 217: xmpyu %fr8R, %fr4L, %fr22
! 218: xmpyu %fr8L, %fr4R, %fr23
! 219: ldd -0x78(%r30), p032a1
! 220: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 221: xmpyu %fr8R, %fr4R, %fr24
! 222: xmpyu %fr8L, %fr4L, %fr25
! 223: ldd -0x70(%r30), p032a2
! 224: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 225: ldo 8(rp), rp C advance rp; the store below uses -8(rp)
! 226: add climb, p000a, s000 C result limb = incoming carry + low product
! 227: ldd -0x80(%r30), p000a
! 228: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 229: add,dc p064a, %r0, climb C new carry = high product + carry bit
! 230: ldd -0x68(%r30), p064a
! 231: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 232: add ma000, s000, s000 C add in the mid product contribution
! 233: add,dc ma064, climb, climb
! 234: std s000, -8(rp) C store the finished limb
! 235: add p032a1, p032a2, m032 C combine the next limb's mid products
! 236: add,dc %r0, %r0, m096
! 237: depd,z m032, 31, 32, ma000
! 238: extrd,u m032, 31, 32, ma064
! 239: depd m096, 31, 32, ma064
! 240: L(0_two_out) C two limbs still in flight: finish the older one
! 241: ldd -0x78(%r30), p032a1 C fetch the last limb's mid products
! 242: ldd -0x70(%r30), p032a2
! 243: ldo 8(rp), rp C advance rp; the store below uses -8(rp)
! 244: add climb, p000a, s000
! 245: ldd -0x80(%r30), p000a
! 246: add,dc p064a, %r0, climb
! 247: ldd -0x68(%r30), p064a
! 248: add ma000, s000, s000
! 249: add,dc ma064, climb, climb
! 250: std s000, -8(rp) C store the finished limb
! 251: L(0_one_out) C one limb in flight: combine its four products and store it
! 252: add p032a1, p032a2, m032
! 253: add,dc %r0, %r0, m096
! 254: depd,z m032, 31, 32, ma000
! 255: extrd,u m032, 31, 32, ma064
! 256: depd m096, 31, 32, ma064
! 257:
! 258: add climb, p000a, s000
! 259: add,dc p064a, %r0, climb
! 260: add ma000, s000, s000
! 261: add,dc ma064, climb, climb
! 262: std s000, 0(rp)
! 263:
! 264: cmpib,>= 4, n, L(done) C if 4 >= n no 4-way work remains
! 265: ldo 8(rp), rp C step past the limb just stored (delay slot)
! 266:
! 267: dnl 4-way unrolled code.
! 268:
! 269: L(BIG) C set up for the 4-way unrolled code: rename registers, save r6..r13, n = n / 4
! 270:
! 271: define(`p032a1',`%r1') dnl mid (cross) products, two for each of the limbs a..d
! 272: define(`p032a2',`%r19') dnl
! 273: define(`p096b1',`%r20') dnl
! 274: define(`p096b2',`%r21') dnl
! 275: define(`p160c1',`%r22') dnl
! 276: define(`p160c2',`%r29') dnl
! 277: define(`p224d1',`%r31') dnl
! 278: define(`p224d2',`%r3') dnl
! 279: dnl sums of the mid product pairs; m288 receives the final carry
! 280: define(`m032',`%r4') dnl
! 281: define(`m096',`%r5') dnl
! 282: define(`m160',`%r6') dnl
! 283: define(`m224',`%r7') dnl
! 284: define(`m288',`%r8') dnl
! 285: dnl low and high products; these reuse the mid product registers
! 286: define(`p000a',`%r1') dnl
! 287: define(`p064a',`%r19') dnl
! 288: define(`p064b',`%r20') dnl
! 289: define(`p128b',`%r21') dnl
! 290: define(`p128c',`%r22') dnl
! 291: define(`p192c',`%r29') dnl
! 292: define(`p192d',`%r31') dnl
! 293: define(`p256d',`%r3') dnl
! 294: dnl result limbs
! 295: define(`s000',`%r10') dnl
! 296: define(`s064',`%r11') dnl
! 297: define(`s128',`%r12') dnl
! 298: define(`s192',`%r13') dnl
! 299: dnl mid sums realigned to limb boundaries; ma064..ma256 reuse the m032..m224 registers
! 300: define(`ma000',`%r9') dnl
! 301: define(`ma064',`%r4') dnl
! 302: define(`ma128',`%r5') dnl
! 303: define(`ma192',`%r6') dnl
! 304: define(`ma256',`%r7') dnl
! 305:
! 306: std %r6, -0xe8(%r30) C save r6..r13, which the 4-way code uses
! 307: std %r7, -0xe0(%r30)
! 308: std %r8, -0xd8(%r30)
! 309: std %r9, -0xd0(%r30)
! 310: std %r10, -0xc8(%r30)
! 311: std %r11, -0xc0(%r30)
! 312: std %r12, -0xb8(%r30)
! 313: std %r13, -0xb0(%r30)
! 314:
! 315: ifdef(`HAVE_ABI_2_0w',
! 316: ` extrd,u n, 61, 62, n C right shift 2
! 317: ',` extrd,u n, 61, 30, n C right shift 2, zero extend
! 318: ')
! 319:
! 320: L(4_or_more) C n now counts groups of four limbs
! 321: fldd 0(up), %fr4 C load four source limbs
! 322: fldd 8(up), %fr5
! 323: fldd 16(up), %fr6
! 324: fldd 24(up), %fr7
! 325: xmpyu %fr8R, %fr4L, %fr22 C mid products of all four limbs first
! 326: xmpyu %fr8L, %fr4R, %fr23
! 327: xmpyu %fr8R, %fr5L, %fr24
! 328: xmpyu %fr8L, %fr5R, %fr25
! 329: xmpyu %fr8R, %fr6L, %fr26
! 330: xmpyu %fr8L, %fr6R, %fr27
! 331: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 332: xmpyu %fr8R, %fr7L, %fr28
! 333: xmpyu %fr8L, %fr7R, %fr29
! 334: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 335: xmpyu %fr8R, %fr4R, %fr30 C then the low and high products
! 336: xmpyu %fr8L, %fr4L, %fr31
! 337: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 338: xmpyu %fr8R, %fr5R, %fr22
! 339: xmpyu %fr8L, %fr5L, %fr23
! 340: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 341: xmpyu %fr8R, %fr6R, %fr24
! 342: xmpyu %fr8L, %fr6L, %fr25
! 343: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 344: xmpyu %fr8R, %fr7R, %fr26
! 345: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 346: addib,<> -1, n, L(8_or_more) C n -= 1; branch if another group follows
! 347: xmpyu %fr8L, %fr7L, %fr27 C (delay slot)
! 348: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 349: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 350: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 351: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 352: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 353: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 354: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 355: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 356: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 357: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 358: ldd -0x78(%r30), p032a1 C single group: reload the mid products for the final wind-down
! 359: ldd -0x70(%r30), p032a2
! 360: ldd -0x38(%r30), p096b1
! 361: ldd -0x30(%r30), p096b2
! 362: ldd -0x58(%r30), p160c1
! 363: ldd -0x50(%r30), p160c2
! 364: ldd -0x18(%r30), p224d1
! 365: ldd -0x10(%r30), p224d2
! 366: b L(end1)
! 367: nop C branch delay slot
! 368:
! 369: L(8_or_more) C at least two groups: spill the first group and start the second
! 370: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 371: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 372: ldo 32(up), up C advance up to the next group of four limbs
! 373: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 374: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 375: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 376: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 377: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 378: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 379: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 380: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 381: fldd 0(up), %fr4 C load the second group
! 382: fldd 8(up), %fr5
! 383: fldd 16(up), %fr6
! 384: fldd 24(up), %fr7
! 385: xmpyu %fr8R, %fr4L, %fr22
! 386: ldd -0x78(%r30), p032a1 C each first-group mid product is fetched before its slot is overwritten
! 387: xmpyu %fr8L, %fr4R, %fr23
! 388: xmpyu %fr8R, %fr5L, %fr24
! 389: ldd -0x70(%r30), p032a2
! 390: xmpyu %fr8L, %fr5R, %fr25
! 391: xmpyu %fr8R, %fr6L, %fr26
! 392: ldd -0x38(%r30), p096b1
! 393: xmpyu %fr8L, %fr6R, %fr27
! 394: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 395: xmpyu %fr8R, %fr7L, %fr28
! 396: ldd -0x30(%r30), p096b2
! 397: xmpyu %fr8L, %fr7R, %fr29
! 398: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 399: xmpyu %fr8R, %fr4R, %fr30
! 400: ldd -0x58(%r30), p160c1
! 401: xmpyu %fr8L, %fr4L, %fr31
! 402: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 403: xmpyu %fr8R, %fr5R, %fr22
! 404: ldd -0x50(%r30), p160c2
! 405: xmpyu %fr8L, %fr5L, %fr23
! 406: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 407: xmpyu %fr8R, %fr6R, %fr24
! 408: ldd -0x18(%r30), p224d1
! 409: xmpyu %fr8L, %fr6L, %fr25
! 410: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 411: xmpyu %fr8R, %fr7R, %fr26
! 412: ldd -0x10(%r30), p224d2
! 413: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 414: addib,= -1, n, L(end2) C n -= 1; skip the loop if this was the last group
! 415: xmpyu %fr8L, %fr7L, %fr27 C (delay slot)
! 416: L(oop) C main loop: finish one group of four limbs while multiplying the next
! 417: add p032a1, p032a2, m032 C carry-chained sums of the mid product pairs
! 418: ldd -0x80(%r30), p000a
! 419: add,dc p096b1, p096b2, m096
! 420: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 421:
! 422: add,dc p160c1, p160c2, m160
! 423: ldd -0x68(%r30), p064a
! 424: add,dc p224d1, p224d2, m224
! 425: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 426:
! 427: add,dc %r0, %r0, m288 C m288 = carry out of the mid sums
! 428: ldd -0x40(%r30), p064b
! 429: ldo 32(up), up C advance up to the group after the one in flight
! 430: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 431:
! 432: depd,z m032, 31, 32, ma000 C realign the mid sums to limb boundaries
! 433: ldd -0x28(%r30), p128b
! 434: extrd,u m032, 31, 32, ma064
! 435: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 436:
! 437: depd m096, 31, 32, ma064
! 438: ldd -0x60(%r30), p128c
! 439: extrd,u m096, 31, 32, ma128
! 440: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 441:
! 442: depd m160, 31, 32, ma128
! 443: ldd -0x48(%r30), p192c
! 444: extrd,u m160, 31, 32, ma192
! 445: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 446:
! 447: depd m224, 31, 32, ma192
! 448: ldd -0x20(%r30), p192d
! 449: extrd,u m224, 31, 32, ma256
! 450: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 451:
! 452: depd m288, 31, 32, ma256
! 453: ldd -0x88(%r30), p256d
! 454: add climb, p000a, s000 C start the carry chain with the incoming carry limb
! 455: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 456:
! 457: add,dc p064a, p064b, s064
! 458: add,dc p128b, p128c, s128
! 459: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 460:
! 461: add,dc p192c, p192d, s192
! 462: add,dc p256d, %r0, climb C top high product plus carry becomes the new carry limb
! 463: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 464:
! 465: add ma000, s000, s000 C accum mid 0
! 466: fldd 0(up), %fr4
! 467: add,dc ma064, s064, s064 C accum mid 1
! 468: std s000, 0(rp)
! 469:
! 470: add,dc ma128, s128, s128 C accum mid 2
! 471: fldd 8(up), %fr5
! 472: add,dc ma192, s192, s192 C accum mid 3
! 473: std s064, 8(rp)
! 474:
! 475: add,dc ma256, climb, climb
! 476: fldd 16(up), %fr6
! 477: std s128, 16(rp)
! 478:
! 479: xmpyu %fr8R, %fr4L, %fr22 C begin the products of the newly loaded group
! 480: ldd -0x78(%r30), p032a1
! 481: xmpyu %fr8L, %fr4R, %fr23
! 482: fldd 24(up), %fr7
! 483:
! 484: xmpyu %fr8R, %fr5L, %fr24
! 485: ldd -0x70(%r30), p032a2
! 486: xmpyu %fr8L, %fr5R, %fr25
! 487: std s192, 24(rp)
! 488:
! 489: xmpyu %fr8R, %fr6L, %fr26
! 490: ldd -0x38(%r30), p096b1
! 491: xmpyu %fr8L, %fr6R, %fr27
! 492: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 493:
! 494: xmpyu %fr8R, %fr7L, %fr28
! 495: ldd -0x30(%r30), p096b2
! 496: xmpyu %fr8L, %fr7R, %fr29
! 497: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 498:
! 499: xmpyu %fr8R, %fr4R, %fr30
! 500: ldd -0x58(%r30), p160c1
! 501: xmpyu %fr8L, %fr4L, %fr31
! 502: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 503:
! 504: xmpyu %fr8R, %fr5R, %fr22
! 505: ldd -0x50(%r30), p160c2
! 506: xmpyu %fr8L, %fr5L, %fr23
! 507: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 508:
! 509: xmpyu %fr8R, %fr6R, %fr24
! 510: ldd -0x18(%r30), p224d1
! 511: xmpyu %fr8L, %fr6L, %fr25
! 512: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 513:
! 514: xmpyu %fr8R, %fr7R, %fr26
! 515: ldd -0x10(%r30), p224d2
! 516: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 517: xmpyu %fr8L, %fr7L, %fr27
! 518:
! 519: addib,<> -1, n, L(oop) C n -= 1; loop while groups remain
! 520: ldo 32(rp), rp C step rp past the four limbs just stored (delay slot)
! 521:
! 522: L(end2) C wind-down 1: finish the older group and spill the last one, no new loads from up
! 523: add p032a1, p032a2, m032
! 524: ldd -0x80(%r30), p000a
! 525: add,dc p096b1, p096b2, m096
! 526: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 527: add,dc p160c1, p160c2, m160
! 528: ldd -0x68(%r30), p064a
! 529: add,dc p224d1, p224d2, m224
! 530: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 531: add,dc %r0, %r0, m288
! 532: ldd -0x40(%r30), p064b
! 533: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 534: depd,z m032, 31, 32, ma000
! 535: ldd -0x28(%r30), p128b
! 536: extrd,u m032, 31, 32, ma064
! 537: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 538: depd m096, 31, 32, ma064
! 539: ldd -0x60(%r30), p128c
! 540: extrd,u m096, 31, 32, ma128
! 541: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 542: depd m160, 31, 32, ma128
! 543: ldd -0x48(%r30), p192c
! 544: extrd,u m160, 31, 32, ma192
! 545: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 546: depd m224, 31, 32, ma192
! 547: ldd -0x20(%r30), p192d
! 548: extrd,u m224, 31, 32, ma256
! 549: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 550: depd m288, 31, 32, ma256
! 551: ldd -0x88(%r30), p256d
! 552: add climb, p000a, s000
! 553: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 554: add,dc p064a, p064b, s064
! 555: add,dc p128b, p128c, s128
! 556: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 557: add,dc p192c, p192d, s192
! 558: add,dc p256d, %r0, climb
! 559: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 560: add ma000, s000, s000 C accum mid 0
! 561: add,dc ma064, s064, s064 C accum mid 1
! 562: add,dc ma128, s128, s128 C accum mid 2
! 563: add,dc ma192, s192, s192 C accum mid 3
! 564: add,dc ma256, climb, climb
! 565: std s000, 0(rp)
! 566: std s064, 8(rp)
! 567: ldd -0x78(%r30), p032a1 C reload the last group's mid products for end1
! 568: std s128, 16(rp)
! 569: ldd -0x70(%r30), p032a2
! 570: std s192, 24(rp)
! 571: ldd -0x38(%r30), p096b1
! 572: ldd -0x30(%r30), p096b2
! 573: ldd -0x58(%r30), p160c1
! 574: ldd -0x50(%r30), p160c2
! 575: ldd -0x18(%r30), p224d1
! 576: ldd -0x10(%r30), p224d2
! 577: ldo 32(rp), rp C step rp past the four limbs just stored
! 578:
! 579: L(end1) C wind-down 2: combine and store the final group of four limbs
! 580: add p032a1, p032a2, m032
! 581: ldd -0x80(%r30), p000a
! 582: add,dc p096b1, p096b2, m096
! 583: add,dc p160c1, p160c2, m160
! 584: ldd -0x68(%r30), p064a
! 585: add,dc p224d1, p224d2, m224
! 586: add,dc %r0, %r0, m288
! 587: ldd -0x40(%r30), p064b
! 588: depd,z m032, 31, 32, ma000
! 589: ldd -0x28(%r30), p128b
! 590: extrd,u m032, 31, 32, ma064
! 591: depd m096, 31, 32, ma064
! 592: ldd -0x60(%r30), p128c
! 593: extrd,u m096, 31, 32, ma128
! 594: depd m160, 31, 32, ma128
! 595: ldd -0x48(%r30), p192c
! 596: extrd,u m160, 31, 32, ma192
! 597: depd m224, 31, 32, ma192
! 598: ldd -0x20(%r30), p192d
! 599: extrd,u m224, 31, 32, ma256
! 600: depd m288, 31, 32, ma256
! 601: ldd -0x88(%r30), p256d
! 602: add climb, p000a, s000
! 603: add,dc p064a, p064b, s064
! 604: add,dc p128b, p128c, s128
! 605: add,dc p192c, p192d, s192
! 606: add,dc p256d, %r0, climb C climb now holds the carry limb that will be returned
! 607: add ma000, s000, s000 C accum mid 0
! 608: add,dc ma064, s064, s064 C accum mid 1
! 609: add,dc ma128, s128, s128 C accum mid 2
! 610: add,dc ma192, s192, s192 C accum mid 3
! 611: add,dc ma256, climb, climb
! 612: std s000, 0(rp)
! 613: std s064, 8(rp)
! 614: std s128, 16(rp)
! 615: std s192, 24(rp)
! 616:
! 617: ldd -0xb0(%r30), %r13 C restore r6..r13, saved at BIG
! 618: ldd -0xb8(%r30), %r12
! 619: ldd -0xc0(%r30), %r11
! 620: ldd -0xc8(%r30), %r10
! 621: ldd -0xd0(%r30), %r9
! 622: ldd -0xd8(%r30), %r8
! 623: ldd -0xe0(%r30), %r7
! 624: ldd -0xe8(%r30), %r6
! 625: L(done) C common exit: hand back climb, restore r3..r5, release the frame
! 626: ifdef(`HAVE_ABI_2_0w',
! 627: ` copy climb, %r28
! 628: ',` extrd,u climb, 63, 32, %r29
! 629: extrd,u climb, 31, 32, %r28
! 630: ')
! 631: ldd -0xf0(%r30), %r5 C restore r5
! 632: ldd -0xf8(%r30), %r4 C restore r4
! 633: bve (%r2) C return through r2
! 634: ldd,mb -0x100(%r30), %r3 C move r30 back by 0x100, then reload r3 (delay slot)
! 635: EPILOGUE(mpn_mul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>