Annotation of OpenXM_contrib/gmp/mpn/sparc64/addmul_1.asm, Revision 1.1.1.2
1.1.1.2 ! ohara 1: dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
! 2: dnl the result to a second limb vector.
1.1 maekawa 3:
1.1.1.2 ! ohara 4: dnl Copyright 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
1.1.1.2 ! ohara 25: C Algorithm: We use eight floating-point multiplies per limb product, with the
! 26: C invariant v operand split into four 16-bit pieces, and the s1 operand split
! 27: C into 32-bit pieces. We sum pairs of 48-bit partial products using
! 28: C floating-point add, then convert the four 49-bit product-sums and transfer
! 29: C them to the integer unit.
! 30:
! 31: C Possible optimizations:
! 32: C 1. Align the stack area where we transfer the four 49-bit product-sums
! 33: C to a 32-byte boundary. That would minimize cache collisions.
! 34: C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
! 35: C be to align the area to map to the area immediately before s1?)
! 36: C 2. Figure out a better way for summing the 49-bit quantities.
! 37: C 3. Unrolling. Questionable if it is worth the code expansion, given that
! 38: C it could only save 1 cycle/limb.
! 39: C 4. Specialize for particular v values. If its upper 32 bits are zero, we
! 40: C could save many operations, in the FPU (fmuld), but more so in the IEU
! 41: C since we'll be summing 48-bit quantities, which is much simpler.
! 42: C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
! 43: C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
! 44: C not be greater than needed for L2 cache latency, and also not so great
! 45: C that i16 needs to be copied.
! 46: C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
! 47: C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
! 48: C ops.)
! 49:
! 50: C Instruction classification (as per UltraSPARC-1/2 functional units):
! 51: C 8 FM
! 52: C 10 FA
! 53: C 12 MEM
! 54: C 10 ISHIFT + 14 IADDLOG
! 55: C 1 BRANCH
! 56: C 55 insns in total (plus one mov insn that should be optimized out)
! 57:
! 58: C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
! 59: C sustain the peak execution rate of 4 instructions/cycle. While it may be
! 60: C possible to save one or two instructions, it seems unlikely we can save
! 61: C enough to shave off any more cycles.
! 62:
1.1 maekawa 63: C INPUT PARAMETERS
1.1.1.2 ! ohara 64: C rp i0
! 65: C up i1
! 66: C n i2
! 67: C v i3
1.1 maekawa 68:
69: ASM_START()
1.1.1.2 ! ohara 70: REGISTER(%g2,#scratch)
! 71: REGISTER(%g3,#scratch)
! 72:
! 73: define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') C products u00 * v00, v16, v32, v48
! 74: define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') C products u32 * v00, v16, v32, v48
! 75: define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') C the four 16-bit pieces of v, held as doubles
! 76: define(`u00',`%f32') define(`u32', `%f34') C low and high 32-bit halves of up[i], held as doubles
! 77: define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') C product sums; a00 and a16 also take r64 and r80 of the previous limb
! 78: define(`cy',`%g1') C carry passed from one limb to the next, also the return value
! 79: define(`rlimb',`%g3') C limb read from rp[i]
! 80: define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') C integer copies of a00..a48, reloaded from the stack
! 81: define(`xffffffff',`%l7') C 32-bit mask, set up in the prologue
! 82: define(`xffff',`%o0') C 16-bit mask, set up in the prologue
1.1 maekawa 83:
84: PROLOGUE(mpn_addmul_1)
85:
1.1.1.2 ! ohara 86: C Initialization. (1) Split v operand into four 16-bit chunks and store them
! 87: C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
! 88: C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
! 89:
! 90: save %sp, -256, %sp C new register window plus a 256-byte frame used as scratch below
! 91: mov -1, %g4 C all ones, source for both masks
! 92: srlx %g4, 48, xffff C store mask in register `xffff'
! 93: and %i3, xffff, %g2 C v bits 0..15
! 94: stx %g2, [%sp+2223+0] C NOTE(review): 2223 is presumably the v9 stack bias 2047 plus 176 reserved bytes -- confirm against the ABI
! 95: srlx %i3, 16, %g3
! 96: and %g3, xffff, %g3 C v bits 16..31
! 97: stx %g3, [%sp+2223+8]
! 98: srlx %i3, 32, %g2
! 99: and %g2, xffff, %g2 C v bits 32..47
! 100: stx %g2, [%sp+2223+16]
! 101: srlx %i3, 48, %g3 C v bits 48..63, no mask needed after the shift
! 102: stx %g3, [%sp+2223+24]
! 103: srlx %g4, 32, xffffffff C store mask in register `xffffffff'
! 104:
! 105: sllx %i2, 3, %i2 C limb count n -> byte count 8*n
! 106: mov 0, cy C clear cy
! 107: add %i0, %i2, %i0 C rp + 8*n
! 108: add %i1, %i2, %i1 C up + 8*n
! 109: neg %i2 C index = -8*n, stepped by 8 until it reaches zero
! 110: add %i1, 4, %i5 C base for the loads of the low 32 bits of up[i]
! 111: add %i0, -32, %i4 C base for the result stores, biased by -32
! 112: add %i0, -16, %i0 C base for the rp[i] loads, biased by -16
! 113:
! 114: ldd [%sp+2223+0], v00 C reload the four chunks into fp registers as raw integer bits
! 115: ldd [%sp+2223+8], v16
! 116: ldd [%sp+2223+16], v32
! 117: ldd [%sp+2223+24], v48
! 118: ld [%sp+2223+0],%f2 C zero f2
! 119: ld [%sp+2223+0],%f4 C zero f4
! 120: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 121: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 122: fxtod v00, v00 C integer chunk -> double, in place
! 123: fxtod v16, v16
! 124: fxtod v32, v32
! 125: fxtod v48, v48
! 126:
! 127: C Start real work. (We sneakingly read f3 and f5 above...)
! 128: C The software pipeline is very deep, requiring 4 feed-in stages.
! 129:
! 130: fxtod %f2, u00 C u00 = low half of the first limb as a double
! 131: fxtod %f4, u32 C u32 = high half of the first limb as a double
! 132: fmuld u00, v00, a00 C first limb: written straight to a00, no earlier r64 to add
! 133: fmuld u00, v16, a16 C first limb: written straight to a16, no earlier r80 to add
! 134: fmuld u00, v32, p32
! 135: fmuld u32, v00, r32
! 136: fmuld u00, v48, p48
! 137: addcc %i2, 8, %i2 C advance index; zero means the limb just read was the last one
! 138: bnz,pt %icc, .L_two_or_more C at least one more limb to read
! 139: fmuld u32, v16, r48 C (delay slot)
! 140:
! 141: .L_one: C n == 1: nothing more to read from up, just drain the pipeline
! 142: fmuld u32, v32, r64 C FIXME not urgent
! 143: faddd p32, r32, a32
! 144: fdtox a00, a00 C double -> 64-bit integer, in place
! 145: faddd p48, r48, a48
! 146: fmuld u32, v48, r80 C FIXME not urgent
! 147: fdtox a16, a16
! 148: fdtox a32, a32
! 149: fdtox a48, a48
! 150: std a00, [%sp+2223+0] C fp -> integer transfer goes through the stack scratch slots
! 151: std a16, [%sp+2223+8]
! 152: std a32, [%sp+2223+16]
! 153: std a48, [%sp+2223+24]
! 154: addcc %i2, 8, %i2 C index keeps advancing for the biased rp bases; no branch on this path tests the flags
! 155:
! 156: fdtox r64, a00 C the two top products of the last limb become the final a00
! 157: ldx [%i0+%i2], rlimb C read rp[i]
! 158: fdtox r80, a16 C ... and the final a16
! 159: ldx [%sp+2223+0], i00
! 160: ldx [%sp+2223+8], i16
! 161: ldx [%sp+2223+16], i32
! 162: ldx [%sp+2223+24], i48
! 163: std a00, [%sp+2223+0]
! 164: std a16, [%sp+2223+8]
! 165: addcc %i2, 8, %i2
! 166:
! 167: srlx rlimb, 32, %g4 C HI(rlimb)
! 168: and rlimb, xffffffff, %g5 C LO(rlimb)
! 169: add i00, %g5, %g5 C i00+ now in g5
! 170: ldx [%sp+2223+0], i00
! 171: srlx i16, 48, %l4 C (i16 >> 48)
! 172: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 173: ldx [%sp+2223+8], i16
! 174: srlx i48, 16, %l5 C (i48 >> 16)
! 175: add i32, %g4, %g4 C i32+ now in g4
! 176: sllx i48, 32, %l6 C (i48 << 32)
! 177: srlx %g4, 32, %o3 C (i32 >> 32)
! 178: add %l5, %l4, %o1 C hi64- in %o1
! 179: std a00, [%sp+2223+0]
! 180: sllx %g4, 16, %o2 C (i32 << 16)
! 181: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 182: std a16, [%sp+2223+8]
! 183: sllx %o1, 48, %o3 C (hi64 << 48)
! 184: add %g2, %o2, %o2 C mi64- in %o2
! 185: add %l6, %o2, %o2 C mi64- in %o2
! 186: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 187: add cy, %g5, %o4 C x = prev(i00) + cy
! 188: addcc %i2, 8, %i2
! 189: b,a .L_out_1 C annulled branch always; join the wind-down with one limb left to store
! 190:
! 191: .L_two_or_more: C n >= 2: read the second limb while the first one is summed and converted
! 192: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 193: fmuld u32, v32, r64 C FIXME not urgent
! 194: faddd p32, r32, a32
! 195: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 196: fdtox a00, a00 C double -> 64-bit integer, in place
! 197: faddd p48, r48, a48
! 198: fmuld u32, v48, r80 C FIXME not urgent
! 199: fdtox a16, a16
! 200: fdtox a32, a32
! 201: fxtod %f2, u00 C u00 = low half of the limb just read
! 202: fxtod %f4, u32 C u32 = high half of the limb just read
! 203: fdtox a48, a48
! 204: std a00, [%sp+2223+0] C fp -> integer transfer goes through the stack scratch slots
! 205: fmuld u00, v00, p00
! 206: std a16, [%sp+2223+8]
! 207: fmuld u00, v16, p16
! 208: std a32, [%sp+2223+16]
! 209: fmuld u00, v32, p32
! 210: std a48, [%sp+2223+24]
! 211: faddd p00, r64, a00 C r64 still holds the product of the previous limb
! 212: fmuld u32, v00, r32
! 213: faddd p16, r80, a16 C r80 still holds the product of the previous limb
! 214: fmuld u00, v48, p48
! 215: addcc %i2, 8, %i2 C advance index; zero means the limb just read was the last one
! 216: bnz,pt %icc, .L_three_or_more C at least one more limb to read
! 217: fmuld u32, v16, r48 C (delay slot)
! 218:
! 219: .L_two: C n == 2: nothing more to read from up, drain the pipeline
! 220: fmuld u32, v32, r64 C FIXME not urgent
! 221: faddd p32, r32, a32
! 222: fdtox a00, a00
! 223: ldx [%i0+%i2], rlimb C read rp[i]
! 224: faddd p48, r48, a48
! 225: fmuld u32, v48, r80 C FIXME not urgent
! 226: fdtox a16, a16
! 227: ldx [%sp+2223+0], i00 C pick up the sums stored for the previous limb
! 228: fdtox a32, a32
! 229: ldx [%sp+2223+8], i16
! 230: ldx [%sp+2223+16], i32
! 231: ldx [%sp+2223+24], i48
! 232: fdtox a48, a48
! 233: std a00, [%sp+2223+0] C then overwrite the slots with the sums of the current limb
! 234: std a16, [%sp+2223+8]
! 235: std a32, [%sp+2223+16]
! 236: std a48, [%sp+2223+24]
! 237: addcc %i2, 8, %i2 C index keeps advancing for the biased rp bases; no branch on this path tests the flags
! 238:
! 239: fdtox r64, a00 C the two top products of the last limb become the final a00
! 240: srlx rlimb, 32, %g4 C HI(rlimb)
! 241: and rlimb, xffffffff, %g5 C LO(rlimb)
! 242: ldx [%i0+%i2], rlimb C read rp[i]
! 243: add i00, %g5, %g5 C i00+ now in g5
! 244: fdtox r80, a16 C ... and the final a16
! 245: ldx [%sp+2223+0], i00
! 246: srlx i16, 48, %l4 C (i16 >> 48)
! 247: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 248: ldx [%sp+2223+8], i16
! 249: srlx i48, 16, %l5 C (i48 >> 16)
! 250: add i32, %g4, %g4 C i32+ now in g4
! 251: ldx [%sp+2223+16], i32
! 252: sllx i48, 32, %l6 C (i48 << 32)
! 253: ldx [%sp+2223+24], i48
! 254: srlx %g4, 32, %o3 C (i32 >> 32)
! 255: add %l5, %l4, %o1 C hi64- in %o1
! 256: std a00, [%sp+2223+0]
! 257: sllx %g4, 16, %o2 C (i32 << 16)
! 258: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 259: std a16, [%sp+2223+8]
! 260: sllx %o1, 48, %o3 C (hi64 << 48)
! 261: add %g2, %o2, %o2 C mi64- in %o2
! 262: add %l6, %o2, %o2 C mi64- in %o2
! 263: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 264: add cy, %g5, %o4 C x = prev(i00) + cy
! 265: addcc %i2, 8, %i2
! 266: b,a .L_out_2 C annulled branch always; join the wind-down with two limbs left to store
! 267:
! 268: .L_three_or_more: C n >= 3: read the third limb; the first rp limb and first integer sums are picked up here
! 269: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 270: fmuld u32, v32, r64 C FIXME not urgent
! 271: faddd p32, r32, a32
! 272: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 273: fdtox a00, a00
! 274: ldx [%i0+%i2], rlimb C read rp[i]
! 275: faddd p48, r48, a48
! 276: fmuld u32, v48, r80 C FIXME not urgent
! 277: fdtox a16, a16
! 278: ldx [%sp+2223+0], i00 C pick up the sums stored for the previous limb
! 279: fdtox a32, a32
! 280: ldx [%sp+2223+8], i16
! 281: fxtod %f2, u00 C u00 = low half of the limb just read
! 282: ldx [%sp+2223+16], i32
! 283: fxtod %f4, u32 C u32 = high half of the limb just read
! 284: ldx [%sp+2223+24], i48
! 285: fdtox a48, a48
! 286: std a00, [%sp+2223+0] C then overwrite the slots with the sums of the current limb
! 287: fmuld u00, v00, p00
! 288: std a16, [%sp+2223+8]
! 289: fmuld u00, v16, p16
! 290: std a32, [%sp+2223+16]
! 291: fmuld u00, v32, p32
! 292: std a48, [%sp+2223+24]
! 293: faddd p00, r64, a00 C r64 still holds the product of the previous limb
! 294: fmuld u32, v00, r32
! 295: faddd p16, r80, a16 C r80 still holds the product of the previous limb
! 296: fmuld u00, v48, p48
! 297: addcc %i2, 8, %i2 C advance index; zero means the limb just read was the last one
! 298: bnz,pt %icc, .L_four_or_more C at least one more limb to read
! 299: fmuld u32, v16, r48 C (delay slot)
! 300:
! 301: .L_three: C n == 3: nothing more to read from up, drain the pipeline
! 302: fmuld u32, v32, r64 C FIXME not urgent
! 303: faddd p32, r32, a32
! 304: fdtox a00, a00
! 305: srlx rlimb, 32, %g4 C HI(rlimb)
! 306: and rlimb, xffffffff, %g5 C LO(rlimb)
! 307: ldx [%i0+%i2], rlimb C read rp[i]
! 308: faddd p48, r48, a48
! 309: add i00, %g5, %g5 C i00+ now in g5
! 310: fmuld u32, v48, r80 C FIXME not urgent
! 311: fdtox a16, a16
! 312: ldx [%sp+2223+0], i00
! 313: fdtox a32, a32
! 314: srlx i16, 48, %l4 C (i16 >> 48)
! 315: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 316: ldx [%sp+2223+8], i16
! 317: srlx i48, 16, %l5 C (i48 >> 16)
! 318: add i32, %g4, %g4 C i32+ now in g4
! 319: ldx [%sp+2223+16], i32
! 320: sllx i48, 32, %l6 C (i48 << 32)
! 321: ldx [%sp+2223+24], i48
! 322: fdtox a48, a48
! 323: srlx %g4, 32, %o3 C (i32 >> 32)
! 324: add %l5, %l4, %o1 C hi64- in %o1
! 325: std a00, [%sp+2223+0] C slots were read above, now refill them with the sums of the last limb
! 326: sllx %g4, 16, %o2 C (i32 << 16)
! 327: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 328: std a16, [%sp+2223+8]
! 329: sllx %o1, 48, %o3 C (hi64 << 48)
! 330: add %g2, %o2, %o2 C mi64- in %o2
! 331: std a32, [%sp+2223+16]
! 332: add %l6, %o2, %o2 C mi64- in %o2
! 333: std a48, [%sp+2223+24]
! 334: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 335: add cy, %g5, %o4 C x = prev(i00) + cy
! 336: addcc %i2, 8, %i2 C index keeps advancing for the biased rp bases; the following branch is unconditional
! 337: b,a .L_out_3 C annulled branch always; join the wind-down with three limbs left to store
! 338:
! 339: .L_four_or_more: C n >= 4: last feed-in stage, same work as a loop iteration minus the result store
! 340: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 341: fmuld u32, v32, r64 C FIXME not urgent
! 342: faddd p32, r32, a32
! 343: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 344: fdtox a00, a00
! 345: srlx rlimb, 32, %g4 C HI(rlimb)
! 346: and rlimb, xffffffff, %g5 C LO(rlimb)
! 347: ldx [%i0+%i2], rlimb C read rp[i]
! 348: faddd p48, r48, a48
! 349: add i00, %g5, %g5 C i00+ now in g5
! 350: fmuld u32, v48, r80 C FIXME not urgent
! 351: fdtox a16, a16
! 352: ldx [%sp+2223+0], i00
! 353: fdtox a32, a32
! 354: srlx i16, 48, %l4 C (i16 >> 48)
! 355: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 356: ldx [%sp+2223+8], i16
! 357: fxtod %f2, u00 C u00 = low half of the limb just read
! 358: srlx i48, 16, %l5 C (i48 >> 16)
! 359: add i32, %g4, %g4 C i32+ now in g4
! 360: ldx [%sp+2223+16], i32
! 361: fxtod %f4, u32 C u32 = high half of the limb just read
! 362: sllx i48, 32, %l6 C (i48 << 32)
! 363: ldx [%sp+2223+24], i48
! 364: fdtox a48, a48
! 365: srlx %g4, 32, %o3 C (i32 >> 32)
! 366: add %l5, %l4, %o1 C hi64- in %o1
! 367: std a00, [%sp+2223+0]
! 368: fmuld u00, v00, p00
! 369: sllx %g4, 16, %o2 C (i32 << 16)
! 370: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 371: std a16, [%sp+2223+8]
! 372: fmuld u00, v16, p16
! 373: sllx %o1, 48, %o3 C (hi64 << 48)
! 374: add %g2, %o2, %o2 C mi64- in %o2
! 375: std a32, [%sp+2223+16]
! 376: fmuld u00, v32, p32
! 377: add %l6, %o2, %o2 C mi64- in %o2
! 378: std a48, [%sp+2223+24]
! 379: faddd p00, r64, a00 C r64 still holds the product of the previous limb
! 380: fmuld u32, v00, r32
! 381: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 382: faddd p16, r80, a16 C r80 still holds the product of the previous limb
! 383: fmuld u00, v48, p48
! 384: add cy, %g5, %o4 C x = prev(i00) + cy
! 385: addcc %i2, 8, %i2 C advance index; zero means the limb just read was the last one
! 386: bnz,pt %icc, .Loop C n >= 5: enter the steady-state loop
! 387: fmuld u32, v16, r48 C (delay slot)
! 388:
! 389: .L_four: C n == 4: skip the loop
! 390: b,a .L_out_4 C annulled branch always; wind-down with four limbs left to store
! 391:
! 392: C BEGIN MAIN LOOP
! 393: .align 16
! 394: .Loop: C steady state: one up limb read and one result limb stored per pass; groups 00..13 are the 14 cycles
! 395: C 00
! 396: srlx %o4, 16, %o5 C (x >> 16)
! 397: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 398: fmuld u32, v32, r64 C FIXME not urgent
! 399: faddd p32, r32, a32
! 400: C 01
! 401: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 402: and %o4, xffff, %o5 C (x & 0xffff)
! 403: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 404: fdtox a00, a00
! 405: C 02
! 406: srlx rlimb, 32, %g4 C HI(rlimb)
! 407: and rlimb, xffffffff, %g5 C LO(rlimb)
! 408: ldx [%i0+%i2], rlimb C read rp[i]
! 409: faddd p48, r48, a48
! 410: C 03
! 411: srlx %o2, 48, %o7 C (mi64 >> 48)
! 412: add i00, %g5, %g5 C i00+ now in g5
! 413: fmuld u32, v48, r80 C FIXME not urgent
! 414: fdtox a16, a16
! 415: C 04
! 416: sllx %o2, 16, %i3 C (mi64 << 16)
! 417: add %o7, %o1, cy C new cy
! 418: ldx [%sp+2223+0], i00
! 419: fdtox a32, a32
! 420: C 05
! 421: srlx i16, 48, %l4 C (i16 >> 48)
! 422: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 423: ldx [%sp+2223+8], i16
! 424: fxtod %f2, u00 C u00 = low half of the limb just read
! 425: C 06
! 426: srlx i48, 16, %l5 C (i48 >> 16)
! 427: add i32, %g4, %g4 C i32+ now in g4
! 428: ldx [%sp+2223+16], i32
! 429: fxtod %f4, u32 C u32 = high half of the limb just read
! 430: C 07
! 431: sllx i48, 32, %l6 C (i48 << 32)
! 432: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 433: ldx [%sp+2223+24], i48
! 434: fdtox a48, a48
! 435: C 08
! 436: srlx %g4, 32, %o3 C (i32 >> 32)
! 437: add %l5, %l4, %o1 C hi64- in %o1
! 438: std a00, [%sp+2223+0]
! 439: fmuld u00, v00, p00
! 440: C 09
! 441: sllx %g4, 16, %o2 C (i32 << 16)
! 442: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 443: std a16, [%sp+2223+8]
! 444: fmuld u00, v16, p16
! 445: C 10
! 446: sllx %o1, 48, %o3 C (hi64 << 48)
! 447: add %g2, %o2, %o2 C mi64- in %o2
! 448: std a32, [%sp+2223+16]
! 449: fmuld u00, v32, p32
! 450: C 11
! 451: add %l6, %o2, %o2 C mi64- in %o2
! 452: std a48, [%sp+2223+24]
! 453: faddd p00, r64, a00 C r64 still holds the product of the previous limb
! 454: fmuld u32, v00, r32
! 455: C 12
! 456: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 457: stx %o5, [%i4+%i2] C store the finished result limb
! 458: faddd p16, r80, a16 C r80 still holds the product of the previous limb
! 459: fmuld u00, v48, p48
! 460: C 13
! 461: add cy, %g5, %o4 C x = prev(i00) + cy
! 462: addcc %i2, 8, %i2 C advance index; zero means the limb just read was the last one
! 463: bnz,pt %icc, .Loop
! 464: fmuld u32, v16, r48 C (delay slot)
! 465: C END MAIN LOOP
! 466:
! 467: .L_out_4: C wind-down, four result limbs left; like a loop pass but without reading up
! 468: srlx %o4, 16, %o5 C (x >> 16)
! 469: fmuld u32, v32, r64 C FIXME not urgent
! 470: faddd p32, r32, a32
! 471: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 472: and %o4, xffff, %o5 C (x & 0xffff)
! 473: fdtox a00, a00
! 474: srlx rlimb, 32, %g4 C HI(rlimb)
! 475: and rlimb, xffffffff, %g5 C LO(rlimb)
! 476: ldx [%i0+%i2], rlimb C read rp[i]
! 477: faddd p48, r48, a48
! 478: srlx %o2, 48, %o7 C (mi64 >> 48)
! 479: add i00, %g5, %g5 C i00+ now in g5
! 480: fmuld u32, v48, r80 C FIXME not urgent
! 481: fdtox a16, a16
! 482: sllx %o2, 16, %i3 C (mi64 << 16)
! 483: add %o7, %o1, cy C new cy
! 484: ldx [%sp+2223+0], i00
! 485: fdtox a32, a32
! 486: srlx i16, 48, %l4 C (i16 >> 48)
! 487: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 488: ldx [%sp+2223+8], i16
! 489: srlx i48, 16, %l5 C (i48 >> 16)
! 490: add i32, %g4, %g4 C i32+ now in g4
! 491: ldx [%sp+2223+16], i32
! 492: sllx i48, 32, %l6 C (i48 << 32)
! 493: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 494: ldx [%sp+2223+24], i48
! 495: fdtox a48, a48
! 496: srlx %g4, 32, %o3 C (i32 >> 32)
! 497: add %l5, %l4, %o1 C hi64- in %o1
! 498: std a00, [%sp+2223+0]
! 499: sllx %g4, 16, %o2 C (i32 << 16)
! 500: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 501: std a16, [%sp+2223+8]
! 502: sllx %o1, 48, %o3 C (hi64 << 48)
! 503: add %g2, %o2, %o2 C mi64- in %o2
! 504: std a32, [%sp+2223+16]
! 505: add %l6, %o2, %o2 C mi64- in %o2
! 506: std a48, [%sp+2223+24]
! 507: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 508: stx %o5, [%i4+%i2] C store the finished result limb
! 509: add cy, %g5, %o4 C x = prev(i00) + cy
! 510: add %i2, 8, %i2 C plain add: flags are no longer needed, execution falls through
! 511: .L_out_3: C wind-down, three result limbs left; only r64 and r80 remain to be converted
! 512: srlx %o4, 16, %o5 C (x >> 16)
! 513: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 514: and %o4, xffff, %o5 C (x & 0xffff)
! 515: fdtox r64, a00 C the two top products of the last limb become the final a00
! 516: srlx rlimb, 32, %g4 C HI(rlimb)
! 517: and rlimb, xffffffff, %g5 C LO(rlimb)
! 518: ldx [%i0+%i2], rlimb C read rp[i]
! 519: srlx %o2, 48, %o7 C (mi64 >> 48)
! 520: add i00, %g5, %g5 C i00+ now in g5
! 521: fdtox r80, a16 C ... and the final a16
! 522: sllx %o2, 16, %i3 C (mi64 << 16)
! 523: add %o7, %o1, cy C new cy
! 524: ldx [%sp+2223+0], i00
! 525: srlx i16, 48, %l4 C (i16 >> 48)
! 526: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 527: ldx [%sp+2223+8], i16
! 528: srlx i48, 16, %l5 C (i48 >> 16)
! 529: add i32, %g4, %g4 C i32+ now in g4
! 530: ldx [%sp+2223+16], i32
! 531: sllx i48, 32, %l6 C (i48 << 32)
! 532: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 533: ldx [%sp+2223+24], i48
! 534: srlx %g4, 32, %o3 C (i32 >> 32)
! 535: add %l5, %l4, %o1 C hi64- in %o1
! 536: std a00, [%sp+2223+0]
! 537: sllx %g4, 16, %o2 C (i32 << 16)
! 538: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 539: std a16, [%sp+2223+8]
! 540: sllx %o1, 48, %o3 C (hi64 << 48)
! 541: add %g2, %o2, %o2 C mi64- in %o2
! 542: add %l6, %o2, %o2 C mi64- in %o2
! 543: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 544: stx %o5, [%i4+%i2] C store the finished result limb
! 545: add cy, %g5, %o4 C x = prev(i00) + cy
! 546: add %i2, 8, %i2 C plain add: flags are no longer needed, execution falls through
! 547: .L_out_2: C wind-down, two result limbs left; integer work only, no fp instructions
! 548: srlx %o4, 16, %o5 C (x >> 16)
! 549: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 550: and %o4, xffff, %o5 C (x & 0xffff)
! 551: srlx rlimb, 32, %g4 C HI(rlimb)
! 552: and rlimb, xffffffff, %g5 C LO(rlimb)
! 553: srlx %o2, 48, %o7 C (mi64 >> 48)
! 554: add i00, %g5, %g5 C i00+ now in g5
! 555: sllx %o2, 16, %i3 C (mi64 << 16)
! 556: add %o7, %o1, cy C new cy
! 557: ldx [%sp+2223+0], i00 C last two sums; they are consumed after the final store
! 558: srlx i16, 48, %l4 C (i16 >> 48)
! 559: mov i16, %g2 C keep the old i16, it is reloaded on the next line
! 560: ldx [%sp+2223+8], i16
! 561: srlx i48, 16, %l5 C (i48 >> 16)
! 562: add i32, %g4, %g4 C i32+ now in g4
! 563: sllx i48, 32, %l6 C (i48 << 32)
! 564: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 565: srlx %g4, 32, %o3 C (i32 >> 32)
! 566: add %l5, %l4, %o1 C hi64- in %o1
! 567: sllx %g4, 16, %o2 C (i32 << 16)
! 568: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 569: sllx %o1, 48, %o3 C (hi64 << 48)
! 570: add %g2, %o2, %o2 C mi64- in %o2
! 571: add %l6, %o2, %o2 C mi64- in %o2
! 572: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 573: stx %o5, [%i4+%i2] C store the finished result limb
! 574: add cy, %g5, %o4 C x = prev(i00) + cy
! 575: add %i2, 8, %i2 C plain add: flags are no longer needed, execution falls through
! 576: .L_out_1: C wind-down, last result limb, then build the carry to return
! 577: srlx %o4, 16, %o5 C (x >> 16)
! 578: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 579: and %o4, xffff, %o5 C (x & 0xffff)
! 580: srlx %o2, 48, %o7 C (mi64 >> 48)
! 581: sllx %o2, 16, %i3 C (mi64 << 16)
! 582: add %o7, %o1, cy C new cy
! 583: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 584: stx %o5, [%i4+%i2] C store the last result limb
! 585:
! 586: sllx i00, 0, %g2 C shift by zero: plain copy of i00
! 587: add %g2, cy, cy
! 588: sllx i16, 16, %g3 C i16 enters with weight 2^16
! 589: add %g3, cy, cy C returned carry = cy + i00 + (i16 << 16)
1.1 maekawa 590:
1.1.1.2 ! ohara 591: return %i7+8
! 592: mov cy, %o0 C (delay slot) pass cy back as the return value
! 593: EPILOGUE(mpn_addmul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>