Annotation of OpenXM_contrib/gmp/mpn/sparc64/mul_1.asm, Revision 1.1.1.2
1.1.1.2 ! ohara 1: dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
! 2: dnl the result in a second limb vector.
1.1 maekawa 3:
1.1.1.2 ! ohara 4: dnl Copyright 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published
10: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
11: dnl your option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23: include(`../config.m4')
24:
1.1.1.2 ! ohara 25: C Algorithm: We use eight floating-point multiplies per limb product, with the
! 26: C invariant v operand split into four 16-bit pieces, and the s1 operand split
! 27: C into 32-bit pieces. We sum pairs of 48-bit partial products using
! 28: C floating-point add, then convert the four 49-bit product-sums and transfer
! 29: C them to the integer unit.
! 30:
! 31: C Possible optimizations:
! 32: C 1. Align the stack area where we transfer the four 49-bit product-sums
! 33: C to a 32-byte boundary. That would minimize the cache collision.
! 34: C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
! 35: C be to align the area to map to the area immediately before s1?)
! 36: C 2. Figure out a better way for summing the 49-bit quantities.
! 37: C 3. Unrolling. Questionable if it is worth the code expansion, given that
! 38: C it could only save 1 cycle/limb.
! 39: C 4. Specialize for particular v values. If its upper 32 bits are zero, we
! 40: C could save many operations, in the FPU (fmuld), but more so in the IEU
! 41: C since we'll be summing 48-bit quantities, which is much simpler.
! 42: C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
! 43: C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
! 44: C not be greater than needed for L2 cache latency, and also not so great
! 45: C that i16 needs to be copied.
! 46: C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
! 47: C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
! 48: C ops.)
! 49:
! 50: C Instruction classification (as per UltraSPARC-1/2 functional units):
! 51: C 8 FM
! 52: C 10 FA
! 53: C 11 MEM
! 54: C 9 ISHIFT + 10? IADDLOG
! 55: C 1 BRANCH
! 56: C 49 insns in total (plus three mov insns that should be optimized out)
! 57:
! 58: C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
! 59: C sustain 3.79 instructions/cycle. It would not be terribly hard to save a
! 60: C cycle/loop.
! 61:
1.1 maekawa 62: C INPUT PARAMETERS
1.1.1.2 ! ohara 63: C rp i0
! 64: C up i1
! 65: C n i2
! 66: C v i3
1.1 maekawa 67:
68: ASM_START()
1.1.1.2 ! ohara 69: REGISTER(%g2,#scratch)
! 70: REGISTER(%g3,#scratch)
! 71:
! 72: define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') C products of u00 with the four v pieces
! 73: define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') C products of u32 with the four v pieces
! 74: define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') C 16-bit pieces of v, held as doubles
! 75: define(`u00',`%f32') define(`u32', `%f34') C low and high 32-bit halves of up[i], held as doubles
! 76: define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') C product-sums that get converted and spilled to the stack
! 77: define(`cy',`%g1') C running carry; copied to %o0 at exit
! 78: define(`rlimb',`%g3') C NOTE(review): alias not referenced in the visible code
! 79: define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') C integer copies of a00..a48 reloaded from the stack
! 80: define(`xffffffff',`%l7') C 32-bit mask; set during init, not read again in the visible code
! 81: define(`xffff',`%o0') C 16-bit mask used to split v and to extract (x & 0xffff)
1.1 maekawa 82:
83: PROLOGUE(mpn_mul_1) C multiply the n-limb vector at up by limb v, store at rp, return the carry limb
84:
1.1.1.2 ! ohara 85: C Initialization. (1) Split v operand into four 16-bit chunks and store them
! 86: C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
! 87: C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
! 88:
! 89: save %sp, -256, %sp C new register window, 256-byte frame
! 90: mov -1, %g4 C all-ones, source for both masks
! 91: srlx %g4, 48, xffff C store mask in register `xffff'
! 92: and %i3, xffff, %g2 C v bits 0..15
! 93: stx %g2, [%sp+2223+0] C scratch slot 0
! 94: srlx %i3, 16, %g3
! 95: and %g3, xffff, %g3 C v bits 16..31
! 96: stx %g3, [%sp+2223+8] C scratch slot 1
! 97: srlx %i3, 32, %g2
! 98: and %g2, xffff, %g2 C v bits 32..47
! 99: stx %g2, [%sp+2223+16] C scratch slot 2
! 100: srlx %i3, 48, %g3 C v bits 48..63 (no mask needed after the shift)
! 101: stx %g3, [%sp+2223+24] C scratch slot 3
! 102: srlx %g4, 32, xffffffff C store mask in register `xffffffff'
! 103:
! 104: sllx %i2, 3, %i2 C n limbs -> n*8 bytes
! 105: mov 0, cy C clear cy
! 106: add %i0, %i2, %i0 C rp + n*8
! 107: add %i1, %i2, %i1 C up + n*8
! 108: neg %i2 C negative byte index, stepped by 8 up to zero
! 109: add %i1, 4, %i5 C base for the low-half loads [%i5+%i2]
! 110: add %i0, -32, %i4 C base for the result stores [%i4+%i2]
! 111: add %i0, -16, %i0
! 112:
! 113: ldd [%sp+2223+0], v00 C reload the four v pieces into fp registers
! 114: ldd [%sp+2223+8], v16
! 115: ldd [%sp+2223+16], v32
! 116: ldd [%sp+2223+24], v48
! 117: ld [%sp+2223+0],%f2 C zero f2
! 118: ld [%sp+2223+0],%f4 C zero f4
! 119: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 120: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 121: fxtod v00, v00 C integer -> double, in place
! 122: fxtod v16, v16
! 123: fxtod v32, v32
! 124: fxtod v48, v48
! 125:
! 126: C Start real work. (We sneakingly read f3 and f5 above...)
! 127: C The software pipeline is very deep, requiring 4 feed-in stages.
! 128:
! 129: fxtod %f2, u00 C f2:f3 pair = zero-extended low half
! 130: fxtod %f4, u32 C f4:f5 pair = zero-extended high half
! 131: fmuld u00, v00, a00 C first limb: no earlier r64/r80 to add, so write a00/a16 directly
! 132: fmuld u00, v16, a16
! 133: fmuld u00, v32, p32
! 134: fmuld u32, v00, r32
! 135: fmuld u00, v48, p48
! 136: addcc %i2, 8, %i2 C advance index; Z set when the limb just read was the last
! 137: bnz,pt %icc, .L_two_or_more C more limbs remain
! 138: fmuld u32, v16, r48 C (delay slot)
! 139:
! 140: .L_one: C n == 1: finish the single limb product, no further loads from up
! 141: fmuld u32, v32, r64 C FIXME not urgent
! 142: faddd p32, r32, a32
! 143: fdtox a00, a00
! 144: faddd p48, r48, a48
! 145: fmuld u32, v48, r80 C FIXME not urgent
! 146: fdtox a16, a16
! 147: fdtox a32, a32
! 148: fdtox a48, a48
! 149: std a00, [%sp+2223+0] C spill the four converted product-sums for the integer unit
! 150: std a16, [%sp+2223+8]
! 151: std a32, [%sp+2223+16]
! 152: std a48, [%sp+2223+24]
! 153: addcc %i2, 8, %i2 C index bookkeeping only; no branch tests these flags
! 154:
! 155: fdtox r64, a00 C high partial products are converted alone (no following limb to add)
! 156: fdtox r80, a16
! 157: ldx [%sp+2223+0], i00
! 158: ldx [%sp+2223+8], i16
! 159: ldx [%sp+2223+16], i32
! 160: ldx [%sp+2223+24], i48
! 161: std a00, [%sp+2223+0]
! 162: std a16, [%sp+2223+8]
! 163: addcc %i2, 8, %i2
! 164:
! 165: mov i00, %g5 C i00+ now in g5
! 166: ldx [%sp+2223+0], i00
! 167: srlx i16, 48, %l4 C (i16 >> 48)
! 168: mov i16, %g2 C keep old i16 before the reload below
! 169: ldx [%sp+2223+8], i16
! 170: srlx i48, 16, %l5 C (i48 >> 16)
! 171: mov i32, %g4 C i32+ now in g4
! 172: sllx i48, 32, %l6 C (i48 << 32)
! 173: srlx %g4, 32, %o3 C (i32 >> 32)
! 174: add %l5, %l4, %o1 C hi64- in %o1
! 175: std a00, [%sp+2223+0]
! 176: sllx %g4, 16, %o2 C (i32 << 16)
! 177: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 178: std a16, [%sp+2223+8]
! 179: sllx %o1, 48, %o3 C (hi64 << 48)
! 180: add %g2, %o2, %o2 C mi64- in %o2
! 181: add %l6, %o2, %o2 C mi64- in %o2
! 182: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 183: add cy, %g5, %o4 C x = prev(i00) + cy
! 184: addcc %i2, 8, %i2
! 185: b,a .L_out_1 C unconditional with annul bit: no delay slot instruction runs
! 186:
! 187: .L_two_or_more: C second feed-in stage: load the next up[i], spill the first product-sums
! 188: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 189: fmuld u32, v32, r64 C FIXME not urgent
! 190: faddd p32, r32, a32
! 191: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 192: fdtox a00, a00
! 193: faddd p48, r48, a48
! 194: fmuld u32, v48, r80 C FIXME not urgent
! 195: fdtox a16, a16
! 196: fdtox a32, a32
! 197: fxtod %f2, u00 C convert the newly loaded limb halves
! 198: fxtod %f4, u32
! 199: fdtox a48, a48
! 200: std a00, [%sp+2223+0]
! 201: fmuld u00, v00, p00
! 202: std a16, [%sp+2223+8]
! 203: fmuld u00, v16, p16
! 204: std a32, [%sp+2223+16]
! 205: fmuld u00, v32, p32
! 206: std a48, [%sp+2223+24]
! 207: faddd p00, r64, a00 C new p00 plus the r64 computed above from the previous u32
! 208: fmuld u32, v00, r32
! 209: faddd p16, r80, a16 C new p16 plus the r80 computed above from the previous u32
! 210: fmuld u00, v48, p48
! 211: addcc %i2, 8, %i2 C advance index; Z set when the limb just read was the last
! 212: bnz,pt %icc, .L_three_or_more
! 213: fmuld u32, v16, r48 C (delay slot)
! 214:
! 215: .L_two: C n == 2: no more loads from up; drain, then join the tail at .L_out_2
! 216: fmuld u32, v32, r64 C FIXME not urgent
! 217: faddd p32, r32, a32
! 218: fdtox a00, a00
! 219: faddd p48, r48, a48
! 220: fmuld u32, v48, r80 C FIXME not urgent
! 221: fdtox a16, a16
! 222: ldx [%sp+2223+0], i00 C reload the sums spilled by the previous stage
! 223: fdtox a32, a32
! 224: ldx [%sp+2223+8], i16
! 225: ldx [%sp+2223+16], i32
! 226: ldx [%sp+2223+24], i48
! 227: fdtox a48, a48
! 228: std a00, [%sp+2223+0]
! 229: std a16, [%sp+2223+8]
! 230: std a32, [%sp+2223+16]
! 231: std a48, [%sp+2223+24]
! 232: addcc %i2, 8, %i2 C index bookkeeping only; no branch tests these flags
! 233:
! 234: fdtox r64, a00 C high partial products are converted alone (no following limb to add)
! 235: mov i00, %g5 C i00+ now in g5
! 236: fdtox r80, a16
! 237: ldx [%sp+2223+0], i00
! 238: srlx i16, 48, %l4 C (i16 >> 48)
! 239: mov i16, %g2 C keep old i16 before the reload below
! 240: ldx [%sp+2223+8], i16
! 241: srlx i48, 16, %l5 C (i48 >> 16)
! 242: mov i32, %g4 C i32+ now in g4
! 243: ldx [%sp+2223+16], i32
! 244: sllx i48, 32, %l6 C (i48 << 32)
! 245: ldx [%sp+2223+24], i48
! 246: srlx %g4, 32, %o3 C (i32 >> 32)
! 247: add %l5, %l4, %o1 C hi64- in %o1
! 248: std a00, [%sp+2223+0]
! 249: sllx %g4, 16, %o2 C (i32 << 16)
! 250: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 251: std a16, [%sp+2223+8]
! 252: sllx %o1, 48, %o3 C (hi64 << 48)
! 253: add %g2, %o2, %o2 C mi64- in %o2
! 254: add %l6, %o2, %o2 C mi64- in %o2
! 255: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 256: add cy, %g5, %o4 C x = prev(i00) + cy
! 257: addcc %i2, 8, %i2
! 258: b,a .L_out_2 C unconditional with annul bit: no delay slot instruction runs
! 259:
! 260: .L_three_or_more: C third feed-in stage: adds the integer reloads of the spilled sums
! 261: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 262: fmuld u32, v32, r64 C FIXME not urgent
! 263: faddd p32, r32, a32
! 264: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 265: fdtox a00, a00
! 266: faddd p48, r48, a48
! 267: fmuld u32, v48, r80 C FIXME not urgent
! 268: fdtox a16, a16
! 269: ldx [%sp+2223+0], i00 C reload the sums spilled by the previous stage
! 270: fdtox a32, a32
! 271: ldx [%sp+2223+8], i16
! 272: fxtod %f2, u00
! 273: ldx [%sp+2223+16], i32
! 274: fxtod %f4, u32
! 275: ldx [%sp+2223+24], i48
! 276: fdtox a48, a48
! 277: std a00, [%sp+2223+0] C slots are overwritten only after the reloads above
! 278: fmuld u00, v00, p00
! 279: std a16, [%sp+2223+8]
! 280: fmuld u00, v16, p16
! 281: std a32, [%sp+2223+16]
! 282: fmuld u00, v32, p32
! 283: std a48, [%sp+2223+24]
! 284: faddd p00, r64, a00 C new p00 plus the r64 computed above from the previous u32
! 285: fmuld u32, v00, r32
! 286: faddd p16, r80, a16 C new p16 plus the r80 computed above from the previous u32
! 287: fmuld u00, v48, p48
! 288: addcc %i2, 8, %i2 C advance index; Z set when the limb just read was the last
! 289: bnz,pt %icc, .L_four_or_more
! 290: fmuld u32, v16, r48 C (delay slot)
! 291:
! 292: .L_three: C n == 3: no more loads from up; drain, then join the tail at .L_out_3
! 293: fmuld u32, v32, r64 C FIXME not urgent
! 294: faddd p32, r32, a32
! 295: fdtox a00, a00
! 296: faddd p48, r48, a48
! 297: mov i00, %g5 C i00+ now in g5
! 298: fmuld u32, v48, r80 C FIXME not urgent
! 299: fdtox a16, a16
! 300: ldx [%sp+2223+0], i00
! 301: fdtox a32, a32
! 302: srlx i16, 48, %l4 C (i16 >> 48)
! 303: mov i16, %g2 C keep old i16 before the reload below
! 304: ldx [%sp+2223+8], i16
! 305: srlx i48, 16, %l5 C (i48 >> 16)
! 306: mov i32, %g4 C i32+ now in g4
! 307: ldx [%sp+2223+16], i32
! 308: sllx i48, 32, %l6 C (i48 << 32)
! 309: ldx [%sp+2223+24], i48
! 310: fdtox a48, a48
! 311: srlx %g4, 32, %o3 C (i32 >> 32)
! 312: add %l5, %l4, %o1 C hi64- in %o1
! 313: std a00, [%sp+2223+0]
! 314: sllx %g4, 16, %o2 C (i32 << 16)
! 315: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 316: std a16, [%sp+2223+8]
! 317: sllx %o1, 48, %o3 C (hi64 << 48)
! 318: add %g2, %o2, %o2 C mi64- in %o2
! 319: std a32, [%sp+2223+16]
! 320: add %l6, %o2, %o2 C mi64- in %o2
! 321: std a48, [%sp+2223+24]
! 322: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 323: add cy, %g5, %o4 C x = prev(i00) + cy
! 324: addcc %i2, 8, %i2 C index bookkeeping only; the branch below is unconditional
! 325: b,a .L_out_3 C unconditional with annul bit: no delay slot instruction runs
! 326:
! 327: .L_four_or_more: C fourth feed-in stage: same as a loop iteration minus the result store
! 328: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 329: fmuld u32, v32, r64 C FIXME not urgent
! 330: faddd p32, r32, a32
! 331: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 332: fdtox a00, a00
! 333: faddd p48, r48, a48
! 334: mov i00, %g5 C i00+ now in g5
! 335: fmuld u32, v48, r80 C FIXME not urgent
! 336: fdtox a16, a16
! 337: ldx [%sp+2223+0], i00
! 338: fdtox a32, a32
! 339: srlx i16, 48, %l4 C (i16 >> 48)
! 340: mov i16, %g2 C keep old i16 before the reload below
! 341: ldx [%sp+2223+8], i16
! 342: fxtod %f2, u00
! 343: srlx i48, 16, %l5 C (i48 >> 16)
! 344: mov i32, %g4 C i32+ now in g4
! 345: ldx [%sp+2223+16], i32
! 346: fxtod %f4, u32
! 347: sllx i48, 32, %l6 C (i48 << 32)
! 348: ldx [%sp+2223+24], i48
! 349: fdtox a48, a48
! 350: srlx %g4, 32, %o3 C (i32 >> 32)
! 351: add %l5, %l4, %o1 C hi64- in %o1
! 352: std a00, [%sp+2223+0]
! 353: fmuld u00, v00, p00
! 354: sllx %g4, 16, %o2 C (i32 << 16)
! 355: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 356: std a16, [%sp+2223+8]
! 357: fmuld u00, v16, p16
! 358: sllx %o1, 48, %o3 C (hi64 << 48)
! 359: add %g2, %o2, %o2 C mi64- in %o2
! 360: std a32, [%sp+2223+16]
! 361: fmuld u00, v32, p32
! 362: add %l6, %o2, %o2 C mi64- in %o2
! 363: std a48, [%sp+2223+24]
! 364: faddd p00, r64, a00
! 365: fmuld u32, v00, r32
! 366: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 367: faddd p16, r80, a16
! 368: fmuld u00, v48, p48
! 369: add cy, %g5, %o4 C x = prev(i00) + cy
! 370: addcc %i2, 8, %i2 C advance index; Z set when the limb just read was the last
! 371: bnz,pt %icc, .Loop C five or more limbs: enter the steady-state loop
! 372: fmuld u32, v16, r48 C (delay slot)
! 373:
! 374: .L_four: C n == 4: skip the loop and go straight to the drain code
! 375: b,a .L_out_4 C unconditional with annul bit: no delay slot instruction runs
! 376:
! 377: C BEGIN MAIN LOOP
! 378: .align 16
! 379: .Loop: C one limb per pass; %i2 is a negative byte index stepped by 8 until it reaches zero
! 380: C 00
! 381: srlx %o4, 16, %o5 C (x >> 16)
! 382: ld [%i5+%i2], %f3 C read low 32 bits of up[i]
! 383: fmuld u32, v32, r64 C FIXME not urgent
! 384: faddd p32, r32, a32
! 385: C 01
! 386: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 387: and %o4, xffff, %o5 C (x & 0xffff)
! 388: ld [%i1+%i2], %f5 C read high 32 bits of up[i]
! 389: fdtox a00, a00
! 390: C 02
! 391: faddd p48, r48, a48
! 392: C 03
! 393: srlx %o2, 48, %o7 C (mi64 >> 48)
! 394: mov i00, %g5 C i00+ now in g5
! 395: fmuld u32, v48, r80 C FIXME not urgent
! 396: fdtox a16, a16
! 397: C 04
! 398: sllx %o2, 16, %i3 C (mi64 << 16)
! 399: add %o7, %o1, cy C new cy
! 400: ldx [%sp+2223+0], i00
! 401: fdtox a32, a32
! 402: C 05
! 403: srlx i16, 48, %l4 C (i16 >> 48)
! 404: mov i16, %g2 C keep old i16 before the reload below
! 405: ldx [%sp+2223+8], i16
! 406: fxtod %f2, u00
! 407: C 06
! 408: srlx i48, 16, %l5 C (i48 >> 16)
! 409: mov i32, %g4 C i32+ now in g4
! 410: ldx [%sp+2223+16], i32
! 411: fxtod %f4, u32
! 412: C 07
! 413: sllx i48, 32, %l6 C (i48 << 32)
! 414: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 415: ldx [%sp+2223+24], i48
! 416: fdtox a48, a48
! 417: C 08
! 418: srlx %g4, 32, %o3 C (i32 >> 32)
! 419: add %l5, %l4, %o1 C hi64- in %o1
! 420: std a00, [%sp+2223+0]
! 421: fmuld u00, v00, p00
! 422: C 09
! 423: sllx %g4, 16, %o2 C (i32 << 16)
! 424: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 425: std a16, [%sp+2223+8]
! 426: fmuld u00, v16, p16
! 427: C 10
! 428: sllx %o1, 48, %o3 C (hi64 << 48)
! 429: add %g2, %o2, %o2 C mi64- in %o2
! 430: std a32, [%sp+2223+16]
! 431: fmuld u00, v32, p32
! 432: C 11
! 433: add %l6, %o2, %o2 C mi64- in %o2
! 434: std a48, [%sp+2223+24]
! 435: faddd p00, r64, a00
! 436: fmuld u32, v00, r32
! 437: C 12
! 438: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 439: stx %o5, [%i4+%i2] C store the finished result limb
! 440: faddd p16, r80, a16
! 441: fmuld u00, v48, p48
! 442: C 13
! 443: add cy, %g5, %o4 C x = prev(i00) + cy
! 444: addcc %i2, 8, %i2 C advance index; Z set when the limb just read was the last
! 445: bnz,pt %icc, .Loop
! 446: fmuld u32, v16, r48 C (delay slot)
! 447: C END MAIN LOOP
! 448:
! 449: .L_out_4: C drain stage 1: like a loop pass, but no new limb is loaded or multiplied
! 450: srlx %o4, 16, %o5 C (x >> 16)
! 451: fmuld u32, v32, r64 C FIXME not urgent
! 452: faddd p32, r32, a32
! 453: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 454: and %o4, xffff, %o5 C (x & 0xffff)
! 455: fdtox a00, a00
! 456: faddd p48, r48, a48
! 457: srlx %o2, 48, %o7 C (mi64 >> 48)
! 458: mov i00, %g5 C i00+ now in g5
! 459: fmuld u32, v48, r80 C FIXME not urgent
! 460: fdtox a16, a16
! 461: sllx %o2, 16, %i3 C (mi64 << 16)
! 462: add %o7, %o1, cy C new cy
! 463: ldx [%sp+2223+0], i00
! 464: fdtox a32, a32
! 465: srlx i16, 48, %l4 C (i16 >> 48)
! 466: mov i16, %g2 C keep old i16 before the reload below
! 467: ldx [%sp+2223+8], i16
! 468: srlx i48, 16, %l5 C (i48 >> 16)
! 469: mov i32, %g4 C i32+ now in g4
! 470: ldx [%sp+2223+16], i32
! 471: sllx i48, 32, %l6 C (i48 << 32)
! 472: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 473: ldx [%sp+2223+24], i48
! 474: fdtox a48, a48
! 475: srlx %g4, 32, %o3 C (i32 >> 32)
! 476: add %l5, %l4, %o1 C hi64- in %o1
! 477: std a00, [%sp+2223+0]
! 478: sllx %g4, 16, %o2 C (i32 << 16)
! 479: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 480: std a16, [%sp+2223+8]
! 481: sllx %o1, 48, %o3 C (hi64 << 48)
! 482: add %g2, %o2, %o2 C mi64- in %o2
! 483: std a32, [%sp+2223+16]
! 484: add %l6, %o2, %o2 C mi64- in %o2
! 485: std a48, [%sp+2223+24]
! 486: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 487: stx %o5, [%i4+%i2] C store the finished result limb
! 488: add cy, %g5, %o4 C x = prev(i00) + cy
! 489: add %i2, 8, %i2 C plain add: flags are no longer needed in the drain code
! 490: .L_out_3: C drain stage 2: the last r64/r80 are converted alone; entered from .L_three or by fall-through
! 491: srlx %o4, 16, %o5 C (x >> 16)
! 492: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 493: and %o4, xffff, %o5 C (x & 0xffff)
! 494: fdtox r64, a00
! 495: srlx %o2, 48, %o7 C (mi64 >> 48)
! 496: mov i00, %g5 C i00+ now in g5
! 497: fdtox r80, a16
! 498: sllx %o2, 16, %i3 C (mi64 << 16)
! 499: add %o7, %o1, cy C new cy
! 500: ldx [%sp+2223+0], i00
! 501: srlx i16, 48, %l4 C (i16 >> 48)
! 502: mov i16, %g2 C keep old i16 before the reload below
! 503: ldx [%sp+2223+8], i16
! 504: srlx i48, 16, %l5 C (i48 >> 16)
! 505: mov i32, %g4 C i32+ now in g4
! 506: ldx [%sp+2223+16], i32
! 507: sllx i48, 32, %l6 C (i48 << 32)
! 508: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 509: ldx [%sp+2223+24], i48
! 510: srlx %g4, 32, %o3 C (i32 >> 32)
! 511: add %l5, %l4, %o1 C hi64- in %o1
! 512: std a00, [%sp+2223+0]
! 513: sllx %g4, 16, %o2 C (i32 << 16)
! 514: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 515: std a16, [%sp+2223+8]
! 516: sllx %o1, 48, %o3 C (hi64 << 48)
! 517: add %g2, %o2, %o2 C mi64- in %o2
! 518: add %l6, %o2, %o2 C mi64- in %o2
! 519: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 520: stx %o5, [%i4+%i2] C store the finished result limb
! 521: add cy, %g5, %o4 C x = prev(i00) + cy
! 522: add %i2, 8, %i2
! 523: .L_out_2: C drain stage 3: integer work only; entered from .L_two or by fall-through
! 524: srlx %o4, 16, %o5 C (x >> 16)
! 525: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 526: and %o4, xffff, %o5 C (x & 0xffff)
! 527: srlx %o2, 48, %o7 C (mi64 >> 48)
! 528: mov i00, %g5 C i00+ now in g5
! 529: sllx %o2, 16, %i3 C (mi64 << 16)
! 530: add %o7, %o1, cy C new cy
! 531: ldx [%sp+2223+0], i00 C last reload of slot 0; consumed by the final carry computation
! 532: srlx i16, 48, %l4 C (i16 >> 48)
! 533: mov i16, %g2 C keep old i16 before the reload below
! 534: ldx [%sp+2223+8], i16 C last reload of slot 1; consumed by the final carry computation
! 535: srlx i48, 16, %l5 C (i48 >> 16)
! 536: mov i32, %g4 C i32+ now in g4
! 537: sllx i48, 32, %l6 C (i48 << 32)
! 538: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 539: srlx %g4, 32, %o3 C (i32 >> 32)
! 540: add %l5, %l4, %o1 C hi64- in %o1
! 541: sllx %g4, 16, %o2 C (i32 << 16)
! 542: add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
! 543: sllx %o1, 48, %o3 C (hi64 << 48)
! 544: add %g2, %o2, %o2 C mi64- in %o2
! 545: add %l6, %o2, %o2 C mi64- in %o2
! 546: sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
! 547: stx %o5, [%i4+%i2] C store the finished result limb
! 548: add cy, %g5, %o4 C x = prev(i00) + cy
! 549: add %i2, 8, %i2
! 550: .L_out_1: C final stage: store the last result limb and form the return value
! 551: srlx %o4, 16, %o5 C (x >> 16)
! 552: add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
! 553: and %o4, xffff, %o5 C (x & 0xffff)
! 554: srlx %o2, 48, %o7 C (mi64 >> 48)
! 555: sllx %o2, 16, %i3 C (mi64 << 16)
! 556: add %o7, %o1, cy C new cy
! 557: or %i3, %o5, %o5 C result limb = (mi64 << 16) | (x & 0xffff)
! 558: stx %o5, [%i4+%i2] C store the last result limb
! 559:
! 560: sllx i00, 0, %g2 C shift by 0: plain copy of i00
! 561: add %g2, cy, cy
! 562: sllx i16, 16, %g3
! 563: add %g3, cy, cy C cy = cy + i00 + (i16 << 16)
1.1 maekawa 564:
1.1.1.2 ! ohara 565: return %i7+8
! 566: mov cy, %o0 C (delay slot) carry limb becomes the return value
! 567: EPILOGUE(mpn_mul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>