===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/sparc64/Attic/addmul_1.asm,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.1 -r1.1.1.2
--- OpenXM_contrib/gmp/mpn/sparc64/Attic/addmul_1.asm	2000/09/09 14:12:41	1.1.1.1
+++ OpenXM_contrib/gmp/mpn/sparc64/Attic/addmul_1.asm	2003/08/25 16:06:26	1.1.1.2
@@ -1,7 +1,7 @@
-dnl  SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
-dnl  add the result to a second limb vector.
+dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl  the result to a second limb vector.
 
-dnl  Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+dnl  Copyright 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -22,93 +22,572 @@ dnl  MA 02111-1307, USA.
 
 include(`../config.m4')
 
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the s1 operand split
+C into 32-bit pieces.  We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
+C Possible optimizations:
+C   1. Align the stack area where we transfer the four 49-bit product-sums
+C      to a 32-byte boundary.  That would minimize cache collisions.
+C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
+C      be to align the area to map to the area immediately before s1?)
+C   2. Figure out a better way of summing the 49-bit quantities.
+C   3. Unrolling.  Questionable if it is worth the code expansion, given that
+C      it could only save 1 cycle/limb.
+C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
+C      could save many operations, in the FPU (fmuld), but more so in the IEU,
+C      since we'll be summing 48-bit quantities, which is much simpler.
+C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies further
+C      apart, and the i00,i16,i32,i48 RAW dependencies closer together.  The
+C      latter distance should not be greater than needed for L2 cache latency,
+C      and also not so great that i16 needs to be copied.
+C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
+C      ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C    8 FM
+C   10 FA
+C   12 MEM
+C   10 ISHIFT + 14 IADDLOG
+C    1 BRANCH
+C   55 insns in total (plus one mov insn that should be optimized out)
+
+C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
+C sustain the peak execution rate of 4 instructions/cycle.  While it may be
+C possible to save one or two instructions, it seems unlikely we can save
+C enough to shave off any more cycles.
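The splitting scheme the comment block describes can be checked with a short C model (illustrative only, not part of the patch; the function and variable names are ours).  Each partial product is below 2^48 and each pair-sum below 2^49, so in the real code every fmuld/faddd result is exactly representable in the 53-bit mantissa of an IEEE double:

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* One limb product u*v via the 16x32-bit splitting.  The six partial
       sums carry bit weights 0, 16, 32, 48, 64 and 80; in the pipelined
       assembly, r64/r80 are paired with the NEXT limb's p00/p16.  */
    static u128 limb_mul(uint64_t u, uint64_t v)
    {
        uint64_t v00 = v & 0xffff,         v16 = (v >> 16) & 0xffff;
        uint64_t v32 = (v >> 32) & 0xffff, v48 = v >> 48;
        uint64_t u00 = u & 0xffffffff,     u32 = u >> 32;

        uint64_t a00 = u00 * v00;              /* < 2^48 */
        uint64_t a16 = u00 * v16;              /* < 2^48 */
        uint64_t a32 = u00 * v32 + u32 * v00;  /* < 2^49 */
        uint64_t a48 = u00 * v48 + u32 * v16;  /* < 2^49 */
        uint64_t r64 = u32 * v32;              /* < 2^48 */
        uint64_t r80 = u32 * v48;              /* < 2^48 */

        return (u128)a00 + ((u128)a16 << 16) + ((u128)a32 << 32)
             + ((u128)a48 << 48) + ((u128)r64 << 64) + ((u128)r80 << 80);
    }

    int main(void)
    {
        uint64_t u = 0x123456789abcdef0, v = 0xfedcba9876543210;
        assert(limb_mul(u, v) == (u128)u * v);
        return 0;
    }

(Compiles with gcc or clang on a 64-bit target; __int128 is used only for the check, not by the technique itself.)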
+
 C INPUT PARAMETERS
-C res_ptr	i0
-C s1_ptr	i1
-C size		i2
-C s2_limb	i3
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
 
 ASM_START()
-	.register	%g2,#scratch
-	.register	%g3,#scratch
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+define(`p00', `%f8')  define(`p16',`%f10')  define(`p32',`%f12')  define(`p48',`%f14')
+define(`r32',`%f16')  define(`r48',`%f18')  define(`r64',`%f20')  define(`r80',`%f22')
+define(`v00',`%f24')  define(`v16',`%f26')  define(`v32',`%f28')  define(`v48',`%f30')
+define(`u00',`%f32')  define(`u32',`%f34')
+define(`a00',`%f36')  define(`a16',`%f38')  define(`a32',`%f40')  define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0')  define(`i16',`%l1')  define(`i32',`%l2')  define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
 PROLOGUE(mpn_addmul_1)
-	save	%sp,-256,%sp
-C We store 0.0 in f10 and keep it invariant accross thw two
-C function calls below.  Note that this is not ABI conformant,
-C but since the functions are local, that's acceptable.
-ifdef(`PIC',
-`L(pc):	rd	%pc,%o7
-	ld	[%o7+L(noll)-L(pc)],%f10',
-`	sethi	%hh(L(noll)),%g2
-	sethi	%lm(L(noll)),%g1
-	or	%g2,%hm(L(noll)),%g2
-	or	%g1,%lo(L(noll)),%g1
-	sllx	%g2,32,%g2
-	ld	[%g1+%g2],%f10')
+C Initialization.  (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
+C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
 
-	sub	%i1,%i0,%g1
-	srlx	%g1,3,%g1
-	cmp	%g1,%i2
-	bcc,pt	%xcc,L(nooverlap)
-	nop
+	save	%sp, -256, %sp
+	mov	-1, %g4
+	srlx	%g4, 48, xffff		C store mask in register `xffff'
+	and	%i3, xffff, %g2
+	stx	%g2, [%sp+2223+0]
+	srlx	%i3, 16, %g3
+	and	%g3, xffff, %g3
+	stx	%g3, [%sp+2223+8]
+	srlx	%i3, 32, %g2
+	and	%g2, xffff, %g2
+	stx	%g2, [%sp+2223+16]
+	srlx	%i3, 48, %g3
+	stx	%g3, [%sp+2223+24]
+	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
 
-	sllx	%i2,3,%g2	C compute stack allocation byte count
-	add	%g2,15,%o0
-	and	%o0,-16,%o0
-	sub	%sp,%o0,%sp
-	add	%sp,2223,%o0
+	sllx	%i2, 3, %i2
+	mov	0, cy			C clear cy
+	add	%i0, %i2, %i0
+	add	%i1, %i2, %i1
+	neg	%i2
+	add	%i1, 4, %i5
+	add	%i0, -32, %i4
+	add	%i0, -16, %i0
 
-	mov	%i1,%o1		C copy s1_ptr to mpn_copyi's srcp
-	call	mpn_copyi
-	mov	%i2,%o2	C copy n to mpn_copyi's count parameter
+	ldd	[%sp+2223+0], v00
+	ldd	[%sp+2223+8], v16
+	ldd	[%sp+2223+16], v32
+	ldd	[%sp+2223+24], v48
+	ld	[%sp+2223+0], %f2	C zero f2
+	ld	[%sp+2223+0], %f4	C zero f4
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fxtod	v00, v00
+	fxtod	v16, v16
+	fxtod	v32, v32
+	fxtod	v48, v48
 
-	add	%sp,2223,%i1
+C Start real work.  (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
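All four pipeline stages that follow share one integer recombination step, documented inline by the `hi64'/`mi64'/`1st/2nd ASSIGNMENT' comments.  Here is a de-pipelined C model of that step (illustrative only, not part of the patch; combine() is our name, and the variables follow the register aliases above).  i00..i48 are the four product-sums of at most 49 bits, with bit weights 0, 16, 32 and 48:

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Fold rp[i] and the carry into the four product-sums; return the
       64-bit limb to store and leave the next carry in *cy.  */
    static uint64_t combine(uint64_t i00, uint64_t i16, uint64_t i32,
                            uint64_t i48, uint64_t rlimb, uint64_t *cy)
    {
        uint64_t lo   = i00 + (rlimb & 0xffffffff);        /* i00+ */
        uint64_t mid  = i32 + (rlimb >> 32);               /* i32+ */
        uint64_t hi64 = (i16 >> 48) + (i48 >> 16) + (mid >> 32);
        uint64_t mi64 = i16 + (mid << 16) + (i48 << 32) - (hi64 << 48);
        uint64_t x    = lo + *cy;                          /* x = i00+ + cy */
        uint64_t m    = mi64 + (x >> 16);                  /* 2nd ASSIGNMENT */
        *cy = hi64 + (m >> 48);                            /* new cy */
        return (m << 16) | (x & 0xffff);
    }

    int main(void)
    {
        uint64_t u = 0xfedcba9876543210, v = 0x123456789abcdef0;
        uint64_t rl = 0xdeadbeefcafebabe, cy = 0x1234;

        /* Build the four product-sums the way the fp code does.  */
        uint64_t u00 = u & 0xffffffff,         u32 = u >> 32;
        uint64_t v00 = v & 0xffff,             v16 = (v >> 16) & 0xffff;
        uint64_t v32 = (v >> 32) & 0xffff,     v48 = v >> 48;
        uint64_t s00 = u00 * v00,              s16 = u00 * v16;
        uint64_t s32 = u00 * v32 + u32 * v00,  s48 = u00 * v48 + u32 * v16;

        uint64_t limb = combine(s00, s16, s32, s48, rl, &cy);
        u128 t = (u128)u * v + rl + 0x1234;
        assert(limb == (uint64_t)t);
        /* The weight-64/80 parts (u32*v32, u32*v48) feed the next stage's
           i00/i16 in the pipelined code; fold them in to check the carry.  */
        assert(cy + u32 * v32 + ((u32 * v48) << 16) == (uint64_t)(t >> 64));
        return 0;
    }

The trick worth noting: mi64 can mathematically exceed 64 bits, but subtracting hi64 << 48 (already split off into the high word) leaves a value below 2^50, so the wrap-around in the 64-bit adds is harmless.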
-L(nooverlap):
-C First multiply-add with low 32 bits of s2_limb
-	mov	%i0,%o0
-	mov	%i1,%o1
-	add	%i2,%i2,%o2
-	call	addmull
-	srl	%i3,0,%o3
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fmuld	u00, v00, a00
+	fmuld	u00, v16, a16
+	fmuld	u00, v32, p32
+	fmuld	u32, v00, r32
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%icc, .L_two_or_more
+	fmuld	u32, v16, r48
 
-	mov	%o0,%l0		C keep carry-out from accmull
+.L_one:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	addcc	%i2, 8, %i2
 
-C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
-	srlx	%i3,32,%o3
-	brz,a,pn	%o3,L(small)
-	mov	%o0,%i0
-	mov	%i1,%o1
-	add	%i2,%i2,%o2
-	call	addmulu
-	add	%i0,4,%o0
+	fdtox	r64, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	addcc	%i2, 8, %i2
 
-	add	%l0,%o0,%i0
-L(small):
-	ret
-	restore	%g0,%g0,%g0
-EPILOGUE(mpn_addmul_1)
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	add	i00, %g5, %g5		C i00+ now in g5
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	b,a	.L_out_1
 
-C Put a zero in the text segment to allow us to t the address
-C quickly when compiling for PIC
-	TEXT
-	ALIGN(4)
-L(noll):
-	.word	0
+.L_two_or_more:
+	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%icc, .L_three_or_more
+	fmuld	u32, v16, r48
 
-define(`LO',`(+4)')
-define(`HI',`(-4)')
+.L_two:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	addcc	%i2, 8, %i2
 
-define(`DLO',`(+4)')
-define(`DHI',`(-4)')
-define(`LOWPART')
-define(`E',`L(l.$1)')
-include_mpn(`sparc64/addmul1h.asm')
+	fdtox	r64, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	add	i00, %g5, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	b,a	.L_out_2
 
-define(`DLO',`(-4)')
-define(`DHI',`(+4)')
-undefine(`LOWPART')
-define(`E',`L(u.$1)')
-include_mpn(`sparc64/addmul1h.asm')
+.L_three_or_more:
+	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
+	fdtox	a00, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%icc, .L_four_or_more
+	fmuld	u32, v16, r48
+
+.L_three:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	b,a	.L_out_3
+
+.L_four_or_more:
+	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%icc, .Loop
+	fmuld	u32, v16, r48
+
+.L_four:
+	b,a	.L_out_4
+
+C BEGIN MAIN LOOP
+	.align	16
+.Loop:
+C 00
+	srlx	%o4, 16, %o5		C (x >> 16)
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+C 01
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+C 02
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+C 03
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+C 04
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+C 05
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+C 06
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+C 07
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+C 08
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+C 09
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+C 10
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+C 11
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+C 12
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+C 13
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%icc, .Loop
+	fmuld	u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_3:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	r64, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_2:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_1:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	or	%i3, %o5, %o5
+	stx	%o5, [%i4+%i2]
+
+	sllx	i00, 0, %g2
+	add	%g2, cy, cy
+	sllx	i16, 16, %g3
+	add	%g3, cy, cy
+
+	return	%i7+8
+	mov	cy, %o0
+EPILOGUE(mpn_addmul_1)
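For reference, all of the above computes the standard mpn_addmul_1 contract: rp[0..n-1] += up[0..n-1] * v, returning the final carry limb, which the `return %i7+8' / `mov cy, %o0' pair hands back in %o0.  A plain-C reference of that contract (a sketch assuming 64-bit limbs and a compiler with __int128; refmpn_addmul_1 is our illustrative name, not an exported GMP symbol):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* rp[0..n-1] += up[0..n-1] * v; return the carry out of the top limb.  */
    mp_limb_t refmpn_addmul_1(mp_limb_t *rp, const mp_limb_t *up,
                              size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128)up[i] * v + rp[i] + cy;
            rp[i] = (mp_limb_t)t;        /* low 64 bits back into rp[] */
            cy = (mp_limb_t)(t >> 64);   /* high 64 bits carry onward */
        }
        return cy;
    }

The assembly reaches the same result without any 64x64 integer multiply, which UltraSPARC-1/2 lack in usable form; hence the floating-point route.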