Annotation of OpenXM_contrib/gmp/mpn/pa64/submul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
! 2: dnl subtract the result from a second limb vector.
! 3:
! 4: dnl Copyright 1998, 1999, 2000, 2002 Free Software Foundation, Inc.
! 5:
! 6: dnl This file is part of the GNU MP Library.
! 7:
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 9: dnl it under the terms of the GNU Lesser General Public License as published
! 10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 11: dnl your option) any later version.
! 12:
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 16: dnl License for more details.
! 17:
! 18: dnl You should have received a copy of the GNU Lesser General Public License
! 19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 21: dnl MA 02111-1307, USA.
! 22:
! 23:
! 24: dnl This approaches ?? cycles/limb on PA8000 and 6.75 cycles/limb on PA8500
! 25: dnl for huge operands.
! 26:
! 27: dnl The feed-in and wind-down code has not yet been scheduled. Many cycles
! 28: dnl could be saved there per call.
! 29:
! 30: dnl DESCRIPTION:
! 31: dnl The main loop "BIG" is 4-way unrolled, mainly to allow
! 32: dnl effective use of ADD,DC. Delays in moving data via the cache from the FP
! 33: dnl registers to the IU registers, have demanded a deep software pipeline, and
! 34: dnl a lot of stack slots for partial products in flight.
! 35: dnl
! 36: dnl CODE STRUCTURE:
! 37: dnl save-some-registers
! 38: dnl do 0, 1, 2, or 3 limbs
! 39: dnl if done, restore-some-regs and return
! 40: dnl save-many-regs
! 41: dnl do 4, 8, ... limbs
! 42: dnl restore-all-regs
! 43:
! 44: dnl STACK LAYOUT:
! 45: dnl HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
! 46: dnl slots marked FREE, as well as some slots in the caller's "frame marker".
! 47: dnl
! 48: dnl -00 <- r30
! 49: dnl -08 FREE
! 50: dnl -10 tmp
! 51: dnl -18 tmp
! 52: dnl -20 tmp
! 53: dnl -28 tmp
! 54: dnl -30 tmp
! 55: dnl -38 tmp
! 56: dnl -40 tmp
! 57: dnl -48 tmp
! 58: dnl -50 tmp
! 59: dnl -58 tmp
! 60: dnl -60 tmp
! 61: dnl -68 tmp
! 62: dnl -70 tmp
! 63: dnl -78 tmp
! 64: dnl -80 tmp
! 65: dnl -88 tmp
! 66: dnl -90 FREE
! 67: dnl -98 FREE
! 68: dnl -a0 FREE
! 69: dnl -a8 FREE
! 70: dnl -b0 r13
! 71: dnl -b8 r12
! 72: dnl -c0 r11
! 73: dnl -c8 r10
! 74: dnl -d0 r9
! 75: dnl -d8 r8
! 76: dnl -e0 r7
! 77: dnl -e8 r6
! 78: dnl -f0 r5
! 79: dnl -f8 r4
! 80: dnl -100 r3
! 81: dnl Previous frame:
! 82: dnl [unused area]
! 83: dnl -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
! 84:
! 85:
! 86: include(`../config.m4')
! 87:
! 88: dnl INPUT PARAMETERS:
! 89: define(`rp',`%r26') dnl pointer to the limb vector that is read, reduced and written back
! 90: define(`up',`%r25') dnl pointer to the limb vector that is multiplied by vlimb
! 91: define(`n',`%r24') dnl limb count
! 92: define(`vlimb',`%r23') dnl multiplier limb
! 93: dnl The carry limb shares %r23 with vlimb; the code loads vlimb from its stack home slot instead.
! 94: define(`climb',`%r23') dnl
! 95:
! 96: ifdef(`HAVE_ABI_2_0w',
! 97: ` .level 2.0W
! 98: ',` .level 2.0N
! 99: ')
! 100: PROLOGUE(mpn_submul_1)
! 101: dnl Entry: save r3-r5, zero the carry limb and load vlimb into %fr8.
! 102: ifdef(`HAVE_ABI_2_0w',
! 103: ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
! 104: ')
! 105: std,ma %r3, 0x100(%r30) C store r3 at the old r30, then advance r30 by 0x100
! 106: std %r4, -0xf8(%r30)
! 107: std %r5, -0xf0(%r30)
! 108: ldo 0(%r0), climb C clear climb
! 109: fldd -0x138(%r30), %fr8 C put vlimb in fp register
! 110: dnl Register names for the code handling 1 to 3 leftover limbs. m032 and ma064 share %r20.
! 111: define(`p032a1',`%r1') dnl
! 112: define(`p032a2',`%r19') dnl
! 113:
! 114: define(`m032',`%r20') dnl
! 115: define(`m096',`%r21') dnl
! 116:
! 117: define(`p000a',`%r22') dnl
! 118: define(`p064a',`%r29') dnl
! 119:
! 120: define(`s000',`%r31') dnl
! 121:
! 122: define(`ma000',`%r4') dnl
! 123: define(`ma064',`%r20') dnl
! 124:
! 125: define(`r000',`%r3') dnl
! 126:
! 127: extrd,u n, 63, 2, %r5 C r5 = n mod 4, the number of leftover limbs
! 128: cmpb,= %r5, %r0, L(BIG) C none: go straight to the 4-way code
! 129: nop C branch delay slot
! 130: dnl Leftover limbs: each 64x64 product is built from four 32x32 xmpyu products that pass through stack temporaries.
! 131: fldd 0(up), %fr4
! 132: ldo 8(up), up
! 133: xmpyu %fr8R, %fr4L, %fr22
! 134: xmpyu %fr8L, %fr4R, %fr23
! 135: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 136: xmpyu %fr8R, %fr4R, %fr24
! 137: xmpyu %fr8L, %fr4L, %fr25
! 138: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 139: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 140: addib,<> -1, %r5, L(two_or_more) C r5 -= 1; branch if more leftover limbs remain
! 141: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 142: L(one)
! 143: ldd -0x78(%r30), p032a1
! 144: ldd -0x70(%r30), p032a2
! 145: ldd -0x80(%r30), p000a
! 146: b L(0_one_out)
! 147: ldd -0x68(%r30), p064a C loaded in the branch delay slot
! 148: dnl Second leftover limb: start its products while fetching the previous limb's products from the stack.
! 149: L(two_or_more)
! 150: fldd 0(up), %fr4
! 151: ldo 8(up), up
! 152: xmpyu %fr8R, %fr4L, %fr22
! 153: xmpyu %fr8L, %fr4R, %fr23
! 154: ldd -0x78(%r30), p032a1
! 155: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 156: xmpyu %fr8R, %fr4R, %fr24
! 157: xmpyu %fr8L, %fr4L, %fr25
! 158: ldd -0x70(%r30), p032a2
! 159: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 160: ldd -0x80(%r30), p000a
! 161: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 162: ldd -0x68(%r30), p064a
! 163: addib,<> -1, %r5, L(three_or_more) C r5 -= 1; branch if a third leftover limb remains
! 164: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 165: L(two)
! 166: add p032a1, p032a2, m032 C m032 = sum of the two mid products
! 167: add,dc %r0, %r0, m096 C m096 = carry out of that sum
! 168: depd,z m032, 31, 32, ma000 C ma000 = m032 shifted left 32
! 169: extrd,u m032, 31, 32, ma064 C ma064 = upper 32 bits of m032
! 170: ldd 0(rp), r000
! 171: b L(0_two_out)
! 172: depd m096, 31, 32, ma064 C delay slot: put m096 into the upper 32 bits of ma064
! 173: dnl Third leftover limb. The loop at L(oop0) is disabled with dnl; L(0_out) does one pass of that work in straight-line code.
! 174: L(three_or_more)
! 175: fldd 0(up), %fr4
! 176: add p032a1, p032a2, m032
! 177: add,dc %r0, %r0, m096
! 178: depd,z m032, 31, 32, ma000
! 179: extrd,u m032, 31, 32, ma064
! 180: ldd 0(rp), r000
! 181: dnl addib,= -1, %r5, L(0_out)
! 182: depd m096, 31, 32, ma064
! 183: L(oop0)
! 184: dnl xmpyu %fr8R, %fr4L, %fr22
! 185: dnl xmpyu %fr8L, %fr4R, %fr23
! 186: dnl ldd -0x78(%r30), p032a1
! 187: dnl fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 188: dnl
! 189: dnl xmpyu %fr8R, %fr4R, %fr24
! 190: dnl xmpyu %fr8L, %fr4L, %fr25
! 191: dnl ldd -0x70(%r30), p032a2
! 192: dnl fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 193: dnl
! 194: dnl ldo 8(rp), rp
! 195: dnl add climb, p000a, s000
! 196: dnl ldd -0x80(%r30), p000a
! 197: dnl fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 198: dnl
! 199: dnl add,dc p064a, %r0, climb
! 200: dnl ldo 8(up), up
! 201: dnl ldd -0x68(%r30), p064a
! 202: dnl fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 203: dnl
! 204: dnl add ma000, s000, s000
! 205: dnl add,dc ma064, climb, climb
! 206: dnl fldd 0(up), %fr4
! 207: dnl
! 208: dnl sub r000, s000, s000
! 209: dnl sub,db %r0, climb, climb
! 210: dnl sub %r0, climb, climb
! 211: dnl std s000, -8(rp)
! 212: dnl
! 213: dnl add p032a1, p032a2, m032
! 214: dnl add,dc %r0, %r0, m096
! 215: dnl
! 216: dnl depd,z m032, 31, 32, ma000
! 217: dnl extrd,u m032, 31, 32, ma064
! 218: dnl ldd 0(rp), r000
! 219: dnl addib,<> -1, %r5, L(oop0)
! 220: dnl depd m096, 31, 32, ma064
! 221: L(0_out)
! 222: ldo 8(up), up
! 223: xmpyu %fr8R, %fr4L, %fr22
! 224: xmpyu %fr8L, %fr4R, %fr23
! 225: ldd -0x78(%r30), p032a1
! 226: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 227: xmpyu %fr8R, %fr4R, %fr24
! 228: xmpyu %fr8L, %fr4L, %fr25
! 229: ldd -0x70(%r30), p032a2
! 230: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 231: ldo 8(rp), rp
! 232: add climb, p000a, s000 C s000 = low product + incoming carry limb
! 233: ldd -0x80(%r30), p000a
! 234: fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
! 235: add,dc p064a, %r0, climb C climb = high product + carry
! 236: ldd -0x68(%r30), p064a
! 237: fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
! 238: add ma000, s000, s000 C add the low part of the mid sum
! 239: add,dc ma064, climb, climb C add the high part of the mid sum, with carry
! 240: sub r000, s000, s000 C s000 = limb loaded from rp minus the product limb
! 241: sub,db %r0, climb, climb C climb = -(climb + borrow)
! 242: sub %r0, climb, climb C negate back: climb = climb + borrow
! 243: std s000, -8(rp) C store the result limb; rp was already advanced
! 244: add p032a1, p032a2, m032
! 245: add,dc %r0, %r0, m096
! 246: depd,z m032, 31, 32, ma000
! 247: extrd,u m032, 31, 32, ma064
! 248: ldd 0(rp), r000
! 249: depd m096, 31, 32, ma064
! 250: L(0_two_out)
! 251: ldd -0x78(%r30), p032a1
! 252: ldd -0x70(%r30), p032a2
! 253: ldo 8(rp), rp
! 254: add climb, p000a, s000
! 255: ldd -0x80(%r30), p000a
! 256: add,dc p064a, %r0, climb
! 257: ldd -0x68(%r30), p064a
! 258: add ma000, s000, s000
! 259: add,dc ma064, climb, climb
! 260: sub r000, s000, s000
! 261: sub,db %r0, climb, climb C climb = -(climb + borrow)
! 262: sub %r0, climb, climb C negate back: climb = climb + borrow
! 263: std s000, -8(rp)
! 264: L(0_one_out)
! 265: add p032a1, p032a2, m032
! 266: add,dc %r0, %r0, m096
! 267: depd,z m032, 31, 32, ma000
! 268: extrd,u m032, 31, 32, ma064
! 269: ldd 0(rp), r000
! 270: depd m096, 31, 32, ma064
! 271: dnl Last leftover limb: add the carry limb and product parts, subtract from the limb at rp, store.
! 272: add climb, p000a, s000
! 273: add,dc p064a, %r0, climb
! 274: add ma000, s000, s000
! 275: add,dc ma064, climb, climb
! 276: sub r000, s000, s000
! 277: sub,db %r0, climb, climb
! 278: sub %r0, climb, climb
! 279: std s000, 0(rp)
! 280: dnl Branch to L(done) when 4 >= n, that is when only leftover limbs existed; otherwise fall into the 4-way code.
! 281: cmpib,>= 4, n, L(done)
! 282: ldo 8(rp), rp C delay slot: advance rp past the limb just stored
! 283:
! 284: dnl 4-way unrolled code.
! 285:
! 286: L(BIG)
! 287: dnl Register names for the 4-way code; these replace the earlier definitions. Several names share a register, for example p032a1, p000a and r000 are all %r1.
! 288: define(`p032a1',`%r1') dnl
! 289: define(`p032a2',`%r19') dnl
! 290: define(`p096b1',`%r20') dnl
! 291: define(`p096b2',`%r21') dnl
! 292: define(`p160c1',`%r22') dnl
! 293: define(`p160c2',`%r29') dnl
! 294: define(`p224d1',`%r31') dnl
! 295: define(`p224d2',`%r3') dnl
! 296: dnl
! 297: define(`m032',`%r4') dnl
! 298: define(`m096',`%r5') dnl
! 299: define(`m160',`%r6') dnl
! 300: define(`m224',`%r7') dnl
! 301: define(`m288',`%r8') dnl
! 302: dnl
! 303: define(`p000a',`%r1') dnl
! 304: define(`p064a',`%r19') dnl
! 305: define(`p064b',`%r20') dnl
! 306: define(`p128b',`%r21') dnl
! 307: define(`p128c',`%r22') dnl
! 308: define(`p192c',`%r29') dnl
! 309: define(`p192d',`%r31') dnl
! 310: define(`p256d',`%r3') dnl
! 311: dnl
! 312: define(`s000',`%r10') dnl
! 313: define(`s064',`%r11') dnl
! 314: define(`s128',`%r12') dnl
! 315: define(`s192',`%r13') dnl
! 316: dnl
! 317: define(`ma000',`%r9') dnl
! 318: define(`ma064',`%r4') dnl
! 319: define(`ma128',`%r5') dnl
! 320: define(`ma192',`%r6') dnl
! 321: define(`ma256',`%r7') dnl
! 322: dnl
! 323: define(`r000',`%r1') dnl
! 324: define(`r064',`%r19') dnl
! 325: define(`r128',`%r20') dnl
! 326: define(`r192',`%r21') dnl
! 327: dnl Save r6-r13 in the frame; they are reloaded from the same offsets before L(done).
! 328: std %r6, -0xe8(%r30)
! 329: std %r7, -0xe0(%r30)
! 330: std %r8, -0xd8(%r30)
! 331: std %r9, -0xd0(%r30)
! 332: std %r10, -0xc8(%r30)
! 333: std %r11, -0xc0(%r30)
! 334: std %r12, -0xb8(%r30)
! 335: std %r13, -0xb0(%r30)
! 336: dnl From here on n counts blocks of four limbs.
! 337: ifdef(`HAVE_ABI_2_0w',
! 338: ` extrd,u n, 61, 62, n C right shift 2
! 339: ',` extrd,u n, 61, 30, n C right shift 2, zero extend
! 340: ')
! 341: dnl First block of four limbs: form all 16 partial products. If it is the only block, store them, reload the mid products and go to L(end1).
! 342: L(4_or_more)
! 343: fldd 0(up), %fr4
! 344: fldd 8(up), %fr5
! 345: fldd 16(up), %fr6
! 346: fldd 24(up), %fr7
! 347: xmpyu %fr8R, %fr4L, %fr22
! 348: xmpyu %fr8L, %fr4R, %fr23
! 349: xmpyu %fr8R, %fr5L, %fr24
! 350: xmpyu %fr8L, %fr5R, %fr25
! 351: xmpyu %fr8R, %fr6L, %fr26
! 352: xmpyu %fr8L, %fr6R, %fr27
! 353: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 354: xmpyu %fr8R, %fr7L, %fr28
! 355: xmpyu %fr8L, %fr7R, %fr29
! 356: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 357: xmpyu %fr8R, %fr4R, %fr30
! 358: xmpyu %fr8L, %fr4L, %fr31
! 359: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 360: xmpyu %fr8R, %fr5R, %fr22
! 361: xmpyu %fr8L, %fr5L, %fr23
! 362: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 363: xmpyu %fr8R, %fr6R, %fr24
! 364: xmpyu %fr8L, %fr6L, %fr25
! 365: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 366: xmpyu %fr8R, %fr7R, %fr26
! 367: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 368: addib,<> -1, n, L(8_or_more) C n -= 1; branch if more blocks remain
! 369: xmpyu %fr8L, %fr7L, %fr27
! 370: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 371: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 372: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 373: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 374: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 375: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 376: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 377: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 378: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 379: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 380: ldd -0x78(%r30), p032a1
! 381: ldd -0x70(%r30), p032a2
! 382: ldd -0x38(%r30), p096b1
! 383: ldd -0x30(%r30), p096b2
! 384: ldd -0x58(%r30), p160c1
! 385: ldd -0x50(%r30), p160c2
! 386: ldd -0x18(%r30), p224d1
! 387: ldd -0x10(%r30), p224d2
! 388: b L(end1)
! 389: nop C branch delay slot
! 390: dnl Two or more blocks: store the first block's products, then start the second block's multiplies while reloading the first block's mid products.
! 391: L(8_or_more)
! 392: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 393: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 394: ldo 32(up), up
! 395: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 396: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 397: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 398: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 399: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 400: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 401: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 402: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 403: fldd 0(up), %fr4
! 404: fldd 8(up), %fr5
! 405: fldd 16(up), %fr6
! 406: fldd 24(up), %fr7
! 407: xmpyu %fr8R, %fr4L, %fr22
! 408: ldd -0x78(%r30), p032a1
! 409: xmpyu %fr8L, %fr4R, %fr23
! 410: xmpyu %fr8R, %fr5L, %fr24
! 411: ldd -0x70(%r30), p032a2
! 412: xmpyu %fr8L, %fr5R, %fr25
! 413: xmpyu %fr8R, %fr6L, %fr26
! 414: ldd -0x38(%r30), p096b1
! 415: xmpyu %fr8L, %fr6R, %fr27
! 416: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 417: xmpyu %fr8R, %fr7L, %fr28
! 418: ldd -0x30(%r30), p096b2
! 419: xmpyu %fr8L, %fr7R, %fr29
! 420: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 421: xmpyu %fr8R, %fr4R, %fr30
! 422: ldd -0x58(%r30), p160c1
! 423: xmpyu %fr8L, %fr4L, %fr31
! 424: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 425: xmpyu %fr8R, %fr5R, %fr22
! 426: ldd -0x50(%r30), p160c2
! 427: xmpyu %fr8L, %fr5L, %fr23
! 428: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 429: xmpyu %fr8R, %fr6R, %fr24
! 430: ldd -0x18(%r30), p224d1
! 431: xmpyu %fr8L, %fr6L, %fr25
! 432: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 433: xmpyu %fr8R, %fr7R, %fr26
! 434: ldd -0x10(%r30), p224d2
! 435: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 436: addib,= -1, n, L(end2) C n -= 1; branch if this was the last block
! 437: xmpyu %fr8L, %fr7L, %fr27
! 438: L(oop)
! 439: add p032a1, p032a2, m032
! 440: ldd -0x80(%r30), p000a
! 441: add,dc p096b1, p096b2, m096
! 442: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 443: dnl Loop body, one pass per 4-limb block: the integer instructions finish the previous block while the FP instructions store and multiply the next one.
! 444: add,dc p160c1, p160c2, m160
! 445: ldd -0x68(%r30), p064a
! 446: add,dc p224d1, p224d2, m224
! 447: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 448:
! 449: add,dc %r0, %r0, m288
! 450: ldd -0x40(%r30), p064b
! 451: ldo 32(up), up
! 452: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 453:
! 454: depd,z m032, 31, 32, ma000
! 455: ldd -0x28(%r30), p128b
! 456: extrd,u m032, 31, 32, ma064
! 457: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 458:
! 459: depd m096, 31, 32, ma064
! 460: ldd -0x60(%r30), p128c
! 461: extrd,u m096, 31, 32, ma128
! 462: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 463:
! 464: depd m160, 31, 32, ma128
! 465: ldd -0x48(%r30), p192c
! 466: extrd,u m160, 31, 32, ma192
! 467: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 468:
! 469: depd m224, 31, 32, ma192
! 470: ldd -0x20(%r30), p192d
! 471: extrd,u m224, 31, 32, ma256
! 472: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 473:
! 474: depd m288, 31, 32, ma256
! 475: ldd -0x88(%r30), p256d
! 476: add climb, p000a, s000
! 477: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 478:
! 479: add,dc p064a, p064b, s064
! 480: ldd 0(rp), r000
! 481: add,dc p128b, p128c, s128
! 482: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 483:
! 484: add,dc p192c, p192d, s192
! 485: ldd 8(rp), r064
! 486: add,dc p256d, %r0, climb
! 487: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 488:
! 489: ldd 16(rp), r128
! 490: add ma000, s000, s000 C accum mid 0
! 491: ldd 24(rp), r192
! 492: add,dc ma064, s064, s064 C accum mid 1
! 493:
! 494: add,dc ma128, s128, s128 C accum mid 2
! 495: fldd 0(up), %fr4
! 496: add,dc ma192, s192, s192 C accum mid 3
! 497: fldd 8(up), %fr5
! 498:
! 499: add,dc ma256, climb, climb
! 500: fldd 16(up), %fr6
! 501: sub r000, s000, s000 C accum rlimb 0
! 502: fldd 24(up), %fr7
! 503:
! 504: sub,db r064, s064, s064 C accum rlimb 1
! 505: sub,db r128, s128, s128 C accum rlimb 2
! 506: std s000, 0(rp)
! 507:
! 508: sub,db r192, s192, s192 C accum rlimb 3
! 509: sub,db %r0, climb, climb C climb = -(climb + borrow)
! 510: sub %r0, climb, climb C negate back: climb = climb + borrow
! 511: std s064, 8(rp)
! 512:
! 513: xmpyu %fr8R, %fr4L, %fr22
! 514: ldd -0x78(%r30), p032a1
! 515: xmpyu %fr8L, %fr4R, %fr23
! 516: std s128, 16(rp)
! 517:
! 518: xmpyu %fr8R, %fr5L, %fr24
! 519: ldd -0x70(%r30), p032a2
! 520: xmpyu %fr8L, %fr5R, %fr25
! 521: std s192, 24(rp)
! 522:
! 523: xmpyu %fr8R, %fr6L, %fr26
! 524: ldd -0x38(%r30), p096b1
! 525: xmpyu %fr8L, %fr6R, %fr27
! 526: fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
! 527:
! 528: xmpyu %fr8R, %fr7L, %fr28
! 529: ldd -0x30(%r30), p096b2
! 530: xmpyu %fr8L, %fr7R, %fr29
! 531: fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
! 532:
! 533: xmpyu %fr8R, %fr4R, %fr30
! 534: ldd -0x58(%r30), p160c1
! 535: xmpyu %fr8L, %fr4L, %fr31
! 536: fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
! 537:
! 538: xmpyu %fr8R, %fr5R, %fr22
! 539: ldd -0x50(%r30), p160c2
! 540: xmpyu %fr8L, %fr5L, %fr23
! 541: fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
! 542:
! 543: xmpyu %fr8R, %fr6R, %fr24
! 544: ldd -0x18(%r30), p224d1
! 545: xmpyu %fr8L, %fr6L, %fr25
! 546: fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
! 547:
! 548: xmpyu %fr8R, %fr7R, %fr26
! 549: ldd -0x10(%r30), p224d2
! 550: fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
! 551: xmpyu %fr8L, %fr7L, %fr27
! 552:
! 553: addib,<> -1, n, L(oop) C n -= 1; loop while blocks remain
! 554: ldo 32(rp), rp C delay slot: advance rp by four limbs
! 555: dnl Wind-down, part 1: finish the second-to-last block and store the last block's products; no further loads from up and no multiplies.
! 556: L(end2)
! 557: add p032a1, p032a2, m032
! 558: ldd -0x80(%r30), p000a
! 559: add,dc p096b1, p096b2, m096
! 560: fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
! 561: add,dc p160c1, p160c2, m160
! 562: ldd -0x68(%r30), p064a
! 563: add,dc p224d1, p224d2, m224
! 564: fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
! 565: add,dc %r0, %r0, m288
! 566: ldd -0x40(%r30), p064b
! 567: fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
! 568: depd,z m032, 31, 32, ma000
! 569: ldd -0x28(%r30), p128b
! 570: extrd,u m032, 31, 32, ma064
! 571: fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
! 572: depd m096, 31, 32, ma064
! 573: ldd -0x60(%r30), p128c
! 574: extrd,u m096, 31, 32, ma128
! 575: fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
! 576: depd m160, 31, 32, ma128
! 577: ldd -0x48(%r30), p192c
! 578: extrd,u m160, 31, 32, ma192
! 579: fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
! 580: depd m224, 31, 32, ma192
! 581: ldd -0x20(%r30), p192d
! 582: extrd,u m224, 31, 32, ma256
! 583: fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
! 584: depd m288, 31, 32, ma256
! 585: ldd -0x88(%r30), p256d
! 586: add climb, p000a, s000
! 587: fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
! 588: add,dc p064a, p064b, s064
! 589: ldd 0(rp), r000
! 590: add,dc p128b, p128c, s128
! 591: fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
! 592: add,dc p192c, p192d, s192
! 593: ldd 8(rp), r064
! 594: add,dc p256d, %r0, climb
! 595: fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
! 596: ldd 16(rp), r128
! 597: add ma000, s000, s000 C accum mid 0
! 598: ldd 24(rp), r192
! 599: add,dc ma064, s064, s064 C accum mid 1
! 600: add,dc ma128, s128, s128 C accum mid 2
! 601: add,dc ma192, s192, s192 C accum mid 3
! 602: add,dc ma256, climb, climb
! 603: sub r000, s000, s000 C accum rlimb 0
! 604: sub,db r064, s064, s064 C accum rlimb 1
! 605: sub,db r128, s128, s128 C accum rlimb 2
! 606: std s000, 0(rp)
! 607: sub,db r192, s192, s192 C accum rlimb 3
! 608: sub,db %r0, climb, climb C climb = -(climb + borrow)
! 609: sub %r0, climb, climb C negate back: climb = climb + borrow
! 610: std s064, 8(rp)
! 611: ldd -0x78(%r30), p032a1
! 612: std s128, 16(rp)
! 613: ldd -0x70(%r30), p032a2
! 614: std s192, 24(rp)
! 615: ldd -0x38(%r30), p096b1
! 616: ldd -0x30(%r30), p096b2
! 617: ldd -0x58(%r30), p160c1
! 618: ldd -0x50(%r30), p160c2
! 619: ldd -0x18(%r30), p224d1
! 620: ldd -0x10(%r30), p224d2
! 621: ldo 32(rp), rp C advance rp by four limbs for the last block
! 622: dnl Wind-down, part 2: finish the last block, restore registers and return the carry limb.
! 623: L(end1)
! 624: add p032a1, p032a2, m032
! 625: ldd -0x80(%r30), p000a
! 626: add,dc p096b1, p096b2, m096
! 627: add,dc p160c1, p160c2, m160
! 628: ldd -0x68(%r30), p064a
! 629: add,dc p224d1, p224d2, m224
! 630: add,dc %r0, %r0, m288
! 631: ldd -0x40(%r30), p064b
! 632: depd,z m032, 31, 32, ma000
! 633: ldd -0x28(%r30), p128b
! 634: extrd,u m032, 31, 32, ma064
! 635: depd m096, 31, 32, ma064
! 636: ldd -0x60(%r30), p128c
! 637: extrd,u m096, 31, 32, ma128
! 638: depd m160, 31, 32, ma128
! 639: ldd -0x48(%r30), p192c
! 640: extrd,u m160, 31, 32, ma192
! 641: depd m224, 31, 32, ma192
! 642: ldd -0x20(%r30), p192d
! 643: extrd,u m224, 31, 32, ma256
! 644: depd m288, 31, 32, ma256
! 645: ldd -0x88(%r30), p256d
! 646: add climb, p000a, s000
! 647: add,dc p064a, p064b, s064
! 648: ldd 0(rp), r000
! 649: add,dc p128b, p128c, s128
! 650: add,dc p192c, p192d, s192
! 651: ldd 8(rp), r064
! 652: add,dc p256d, %r0, climb
! 653: ldd 16(rp), r128
! 654: add ma000, s000, s000 C accum mid 0
! 655: ldd 24(rp), r192
! 656: add,dc ma064, s064, s064 C accum mid 1
! 657: add,dc ma128, s128, s128 C accum mid 2
! 658: add,dc ma192, s192, s192 C accum mid 3
! 659: add,dc ma256, climb, climb
! 660: sub r000, s000, s000 C accum rlimb 0
! 661: sub,db r064, s064, s064 C accum rlimb 1
! 662: sub,db r128, s128, s128 C accum rlimb 2
! 663: std s000, 0(rp)
! 664: sub,db r192, s192, s192 C accum rlimb 3
! 665: sub,db %r0, climb, climb C climb = -(climb + borrow)
! 666: sub %r0, climb, climb C negate back: climb = climb + borrow
! 667: std s064, 8(rp)
! 668: std s128, 16(rp)
! 669: std s192, 24(rp)
! 670: dnl Restore r6-r13, which are saved only on the 4-way path.
! 671: ldd -0xb0(%r30), %r13
! 672: ldd -0xb8(%r30), %r12
! 673: ldd -0xc0(%r30), %r11
! 674: ldd -0xc8(%r30), %r10
! 675: ldd -0xd0(%r30), %r9
! 676: ldd -0xd8(%r30), %r8
! 677: ldd -0xe0(%r30), %r7
! 678: ldd -0xe8(%r30), %r6
! 679: L(done)
! 680: ifdef(`HAVE_ABI_2_0w',
! 681: ` copy climb, %r28 C return the carry limb in r28
! 682: ',` extrd,u climb, 63, 32, %r29 C low 32 bits of the carry limb
! 683: extrd,u climb, 31, 32, %r28 C high 32 bits of the carry limb
! 684: ')
! 685: ldd -0xf0(%r30), %r5
! 686: ldd -0xf8(%r30), %r4
! 687: bve (%r2) C return through r2
! 688: ldd,mb -0x100(%r30), %r3 C delay slot: move r30 back by 0x100, then reload r3
! 689: EPILOGUE(mpn_submul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>