version 1.1.1.1, 2000/09/09 14:12:41 |
version 1.1.1.2, 2003/08/25 16:06:26 |
|
|
dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and |
dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store |
dnl store the result to a second limb vector. |
dnl the result in a second limb vector. |
|
|
dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. |
dnl Copyright 1998, 2000, 2001, 2002 Free Software Foundation, Inc. |
|
|
dnl This file is part of the GNU MP Library. |
dnl This file is part of the GNU MP Library. |
|
|
Line 22 dnl MA 02111-1307, USA. |
|
Line 22 dnl MA 02111-1307, USA. |
|
|
|
include(`../config.m4') |
include(`../config.m4') |
|
|
|
C Algorithm: We use eight floating-point multiplies per limb product, with the |
|
C invariant v operand split into four 16-bit pieces, and the s1 operand split |
|
C into 32-bit pieces. We sum pairs of 48-bit partial products using |
|
C floating-point add, then convert the four 49-bit product-sums and transfer |
|
C them to the integer unit. |
|
|
|
C Possible optimizations: |
|
C 1. Align the stack area where we transfer the four 49-bit product-sums |
|
C to a 32-byte boundary. That would minimize the cache collision. |
|
C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would |
|
C be to align the area to map to the area immediately before s1?) |
|
C 2. Figure out a better way for summing the 49-bit quantities. |
|
C 3. Unrolling. Questionable if it is worth the code expansion, given that |
|
C it could only save 1 cycle/limb. |
|
C 4. Specialize for particular v values. If its upper 32 bits are zero, we |
|
C could save many operations, in the FPU (fmuld), but more so in the IEU |
|
C since we'll be summing 48-bit quantities, which is much simpler. |
|
C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and |
|
C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should |
|
C not be greater than needed for L2 cache latency, and also not so great |
|
C that i16 needs to be copied. |
|
C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want |
|
C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU |
|
C ops.) |
|
|
|
C Instruction classification (as per UltraSPARC-1/2 functional units): |
|
C 8 FM |
|
C 10 FA |
|
C 11 MEM |
|
C 9 ISHIFT + 10? IADDLOG |
|
C 1 BRANCH |
|
C 49 insns totally (plus three mov insns that should be optimized out) |
|
|
|
C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we |
|
C sustain 3.79 instructions/cycle. It would not be terribly hard to save a |
|
C cycle/loop. |
|
|
C INPUT PARAMETERS |
C INPUT PARAMETERS |
C res_ptr i0 |
C rp i0 |
C s1_ptr i1 |
C up i1 |
C size i2 |
C n i2 |
C s2_limb i3 |
C v i3 |
|
|
ASM_START() |
ASM_START() |
.register %g2,#scratch |
REGISTER(%g2,#scratch) |
.register %g3,#scratch |
REGISTER(%g3,#scratch) |
|
|
|
define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') |
|
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') |
|
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') |
|
define(`u00',`%f32') define(`u32', `%f34') |
|
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') |
|
define(`cy',`%g1') |
|
define(`rlimb',`%g3') |
|
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') |
|
define(`xffffffff',`%l7') |
|
define(`xffff',`%o0') |
|
|
PROLOGUE(mpn_mul_1) |
PROLOGUE(mpn_mul_1) |
save %sp,-256,%sp |
|
|
|
C We store 0.0 in f10 and keep it invariant across the two |
C Initialization. (1) Split v operand into four 16-bit chunks and store them |
C function calls below. Note that this is not ABI conformant, |
C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs |
C but since the functions are local, that's acceptable. |
C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. |
ifdef(`PIC', |
|
`L(pc): rd %pc,%o7 |
|
ld [%o7+L(noll)-L(pc)],%f10', |
|
` sethi %hh(L(noll)),%g2 |
|
sethi %lm(L(noll)),%g1 |
|
or %g2,%hm(L(noll)),%g2 |
|
or %g1,%lo(L(noll)),%g1 |
|
sllx %g2,32,%g2 |
|
ld [%g1+%g2],%f10') |
|
|
|
sub %i1,%i0,%g1 |
save %sp, -256, %sp |
srlx %g1,3,%g1 |
mov -1, %g4 |
cmp %g1,%i2 |
srlx %g4, 48, xffff C store mask in register `xffff' |
bcc,pt %xcc,L(nooverlap) |
and %i3, xffff, %g2 |
nop |
stx %g2, [%sp+2223+0] |
|
srlx %i3, 16, %g3 |
|
and %g3, xffff, %g3 |
|
stx %g3, [%sp+2223+8] |
|
srlx %i3, 32, %g2 |
|
and %g2, xffff, %g2 |
|
stx %g2, [%sp+2223+16] |
|
srlx %i3, 48, %g3 |
|
stx %g3, [%sp+2223+24] |
|
srlx %g4, 32, xffffffff C store mask in register `xffffffff' |
|
|
sllx %i2,3,%g2 C compute stack allocation byte count |
sllx %i2, 3, %i2 |
add %g2,15,%o0 |
mov 0, cy C clear cy |
and %o0,-16,%o0 |
add %i0, %i2, %i0 |
sub %sp,%o0,%sp |
add %i1, %i2, %i1 |
add %sp,2223,%o0 |
neg %i2 |
|
add %i1, 4, %i5 |
|
add %i0, -32, %i4 |
|
add %i0, -16, %i0 |
|
|
mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp |
ldd [%sp+2223+0], v00 |
call mpn_copyi |
ldd [%sp+2223+8], v16 |
mov %i2,%o2 C copy n to mpn_copyi's count parameter |
ldd [%sp+2223+16], v32 |
|
ldd [%sp+2223+24], v48 |
|
ld [%sp+2223+0],%f2 C zero f2 |
|
ld [%sp+2223+0],%f4 C zero f4 |
|
ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
|
ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
|
fxtod v00, v00 |
|
fxtod v16, v16 |
|
fxtod v32, v32 |
|
fxtod v48, v48 |
|
|
add %sp,2223,%i1 |
C Start real work. (We sneakily read f3 and f5 above...) |
|
C The software pipeline is very deep, requiring 4 feed-in stages. |
|
|
L(nooverlap): |
fxtod %f2, u00 |
C First multiply-add with low 32 bits of s2_limb |
fxtod %f4, u32 |
mov %i0,%o0 |
fmuld u00, v00, a00 |
mov %i1,%o1 |
fmuld u00, v16, a16 |
add %i2,%i2,%o2 |
fmuld u00, v32, p32 |
call mull |
fmuld u32, v00, r32 |
srl %i3,0,%o3 |
fmuld u00, v48, p48 |
|
addcc %i2, 8, %i2 |
|
bnz,pt %icc, .L_two_or_more |
|
fmuld u32, v16, r48 |
|
|
mov %o0,%l0 C keep carry-out from accmull |
.L_one: |
|
fmuld u32, v32, r64 C FIXME not urgent |
|
faddd p32, r32, a32 |
|
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
fdtox a32, a32 |
|
fdtox a48, a48 |
|
std a00, [%sp+2223+0] |
|
std a16, [%sp+2223+8] |
|
std a32, [%sp+2223+16] |
|
std a48, [%sp+2223+24] |
|
addcc %i2, 8, %i2 |
|
|
C Now multiply-add with high 32 bits of s2_limb, unless it is zero. |
fdtox r64, a00 |
srlx %i3,32,%o3 |
fdtox r80, a16 |
brz,a,pn %o3,L(small) |
ldx [%sp+2223+0], i00 |
mov %o0,%i0 |
ldx [%sp+2223+8], i16 |
mov %i1,%o1 |
ldx [%sp+2223+16], i32 |
add %i2,%i2,%o2 |
ldx [%sp+2223+24], i48 |
call addmulu |
std a00, [%sp+2223+0] |
add %i0,4,%o0 |
std a16, [%sp+2223+8] |
|
addcc %i2, 8, %i2 |
|
|
add %l0,%o0,%i0 |
mov i00, %g5 C i00+ now in g5 |
L(small): |
ldx [%sp+2223+0], i00 |
ret |
srlx i16, 48, %l4 C (i16 >> 48) |
restore %g0,%g0,%g0 |
mov i16, %g2 |
EPILOGUE(mpn_mul_1) |
ldx [%sp+2223+8], i16 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
addcc %i2, 8, %i2 |
|
b,a .L_out_1 |
|
|
C Put a zero in the text segment to allow us to get the address |
.L_two_or_more: |
C quickly when compiling for PIC |
ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
TEXT |
fmuld u32, v32, r64 C FIXME not urgent |
ALIGN(4) |
faddd p32, r32, a32 |
L(noll): |
ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
.word 0 |
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
fdtox a32, a32 |
|
fxtod %f2, u00 |
|
fxtod %f4, u32 |
|
fdtox a48, a48 |
|
std a00, [%sp+2223+0] |
|
fmuld u00, v00, p00 |
|
std a16, [%sp+2223+8] |
|
fmuld u00, v16, p16 |
|
std a32, [%sp+2223+16] |
|
fmuld u00, v32, p32 |
|
std a48, [%sp+2223+24] |
|
faddd p00, r64, a00 |
|
fmuld u32, v00, r32 |
|
faddd p16, r80, a16 |
|
fmuld u00, v48, p48 |
|
addcc %i2, 8, %i2 |
|
bnz,pt %icc, .L_three_or_more |
|
fmuld u32, v16, r48 |
|
|
define(`LO',`(+4)') |
.L_two: |
define(`HI',`(-4)') |
fmuld u32, v32, r64 C FIXME not urgent |
|
faddd p32, r32, a32 |
|
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
ldx [%sp+2223+0], i00 |
|
fdtox a32, a32 |
|
ldx [%sp+2223+8], i16 |
|
ldx [%sp+2223+16], i32 |
|
ldx [%sp+2223+24], i48 |
|
fdtox a48, a48 |
|
std a00, [%sp+2223+0] |
|
std a16, [%sp+2223+8] |
|
std a32, [%sp+2223+16] |
|
std a48, [%sp+2223+24] |
|
addcc %i2, 8, %i2 |
|
|
define(`DLO',`(+4)') |
fdtox r64, a00 |
define(`DHI',`(-4)') |
mov i00, %g5 C i00+ now in g5 |
define(`E',`L($1)') |
fdtox r80, a16 |
include_mpn(`sparc64/mul_1h.asm') |
ldx [%sp+2223+0], i00 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
ldx [%sp+2223+16], i32 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
ldx [%sp+2223+24], i48 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
addcc %i2, 8, %i2 |
|
b,a .L_out_2 |
|
|
define(`DLO',`(-4)') |
.L_three_or_more: |
define(`DHI',`(+4)') |
ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
undefine(`LOWPART') |
fmuld u32, v32, r64 C FIXME not urgent |
define(`E',`L(u.$1)') |
faddd p32, r32, a32 |
include_mpn(`sparc64/addmul1h.asm') |
ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
|
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
ldx [%sp+2223+0], i00 |
|
fdtox a32, a32 |
|
ldx [%sp+2223+8], i16 |
|
fxtod %f2, u00 |
|
ldx [%sp+2223+16], i32 |
|
fxtod %f4, u32 |
|
ldx [%sp+2223+24], i48 |
|
fdtox a48, a48 |
|
std a00, [%sp+2223+0] |
|
fmuld u00, v00, p00 |
|
std a16, [%sp+2223+8] |
|
fmuld u00, v16, p16 |
|
std a32, [%sp+2223+16] |
|
fmuld u00, v32, p32 |
|
std a48, [%sp+2223+24] |
|
faddd p00, r64, a00 |
|
fmuld u32, v00, r32 |
|
faddd p16, r80, a16 |
|
fmuld u00, v48, p48 |
|
addcc %i2, 8, %i2 |
|
bnz,pt %icc, .L_four_or_more |
|
fmuld u32, v16, r48 |
|
|
|
C .L_three: wind-down path entered by falling through the bnz at the end of |
C .L_three_or_more, i.e. when the addcc on %i2 there produced zero. |
C No further up[] words are loaded here.  The block finishes the pending FP |
C products and sums, reloads i00..i48 from the scratch slots at |
C [%sp+2223+0..24], spills the new a00..a48 to the same slots, and starts the |
C integer recombination (hi64 in %o1, mi64 in %o2, x in %o4) that .L_out_3 |
C continues. |
.L_three: |
|
fmuld u32, v32, r64 C FIXME not urgent |
|
faddd p32, r32, a32 |
|
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
mov i00, %g5 C i00+ now in g5 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
ldx [%sp+2223+0], i00 |
|
fdtox a32, a32 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
ldx [%sp+2223+16], i32 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
ldx [%sp+2223+24], i48 |
|
fdtox a48, a48 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
std a32, [%sp+2223+16] |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
std a48, [%sp+2223+24] |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
addcc %i2, 8, %i2 |
|
C b,a is an annulled unconditional branch: no delay-slot instruction runs. |
b,a .L_out_3 |
|
|
|
C .L_four_or_more: last feed-in stage before the main loop, reached when the |
C bnz at the end of .L_three_or_more is taken.  It loads the next up[] words |
C into %f3/%f5, converts and multiplies the words loaded earlier, moves the |
C previous four integer product-sums through the scratch slots at |
C [%sp+2223+0..24], and begins recombining them (hi64 in %o1, mi64 in %o2, |
C x in %o4).  Unlike .Loop it does not yet store a result limb. |
.L_four_or_more: |
|
ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
|
fmuld u32, v32, r64 C FIXME not urgent |
|
faddd p32, r32, a32 |
|
ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
|
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
mov i00, %g5 C i00+ now in g5 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
ldx [%sp+2223+0], i00 |
|
fdtox a32, a32 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
fxtod %f2, u00 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
ldx [%sp+2223+16], i32 |
|
fxtod %f4, u32 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
ldx [%sp+2223+24], i48 |
|
fdtox a48, a48 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
fmuld u00, v00, p00 |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
fmuld u00, v16, p16 |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
std a32, [%sp+2223+16] |
|
fmuld u00, v32, p32 |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
std a48, [%sp+2223+24] |
|
faddd p00, r64, a00 |
|
fmuld u32, v00, r32 |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
faddd p16, r80, a16 |
|
fmuld u00, v48, p48 |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
addcc %i2, 8, %i2 |
|
C Enter the main loop while the offset in %i2 is nonzero; the fmuld below |
C sits in the branch delay slot and runs whether or not the branch is taken. |
bnz,pt %icc, .Loop |
|
fmuld u32, v16, r48 |
|
|
|
C .L_four: fall-through when no further limb remains to be loaded; the |
C annulled branch goes straight to the four-stage wind-down. |
.L_four: |
|
b,a .L_out_4 |
|
|
|
C BEGIN MAIN LOOP |
C Steady state of the software pipeline.  Each pass: |
C   - loads the two 32-bit halves of the next up[] limb into %f3/%f5, |
C   - converts the previously loaded halves (fxtod) and forms the eight |
C     partial products with v00..v48 (fmuld), summing pairs with faddd, |
C   - converts the four product-sums to integers (fdtox) and passes them to |
C     the integer unit through the scratch slots at [%sp+2223+0..24], |
C   - recombines the four sums of an earlier limb with the running carry cy |
C     and stores the finished 64-bit result limb to [%i4+%i2]. |
C %i2 advances by 8 per pass and the loop ends when the addcc result is zero. |
C The "C NN" markers number the instruction groups 00..13; presumably these |
C are the 14 issue cycles quoted in the file header -- TODO confirm. |
C NOTE(review): the loop branch tests %icc, i.e. only the low 32 bits of the |
C addcc result on the 64-bit offset in %i2 -- confirm that operand sizes of |
C 2^29 limbs or more cannot occur. |
|
.align 16 |
|
.Loop: |
|
C 00 |
|
srlx %o4, 16, %o5 C (x >> 16) |
|
ld [%i5+%i2], %f3 C read low 32 bits of up[i] |
|
fmuld u32, v32, r64 C FIXME not urgent |
|
faddd p32, r32, a32 |
|
C 01 |
|
add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
|
and %o4, xffff, %o5 C (x & 0xffff) |
|
ld [%i1+%i2], %f5 C read high 32 bits of up[i] |
|
fdtox a00, a00 |
|
C 02 |
|
faddd p48, r48, a48 |
|
C 03 |
|
srlx %o2, 48, %o7 C (mi64 >> 48) |
|
mov i00, %g5 C i00+ now in g5 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
C 04 |
|
sllx %o2, 16, %i3 C (mi64 << 16) |
|
add %o7, %o1, cy C new cy |
|
ldx [%sp+2223+0], i00 |
|
fdtox a32, a32 |
|
C 05 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
fxtod %f2, u00 |
|
C 06 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
ldx [%sp+2223+16], i32 |
|
fxtod %f4, u32 |
|
C 07 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
C %o5 = (mi64 << 16) | (x & 0xffff): the finished result limb. |
or %i3, %o5, %o5 |
|
ldx [%sp+2223+24], i48 |
|
fdtox a48, a48 |
|
C 08 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
fmuld u00, v00, p00 |
|
C 09 |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
fmuld u00, v16, p16 |
|
C 10 |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
std a32, [%sp+2223+16] |
|
fmuld u00, v32, p32 |
|
C 11 |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
std a48, [%sp+2223+24] |
|
faddd p00, r64, a00 |
|
fmuld u32, v00, r32 |
|
C 12 |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
C Write the result limb assembled in group 07. |
stx %o5, [%i4+%i2] |
|
faddd p16, r80, a16 |
|
fmuld u00, v48, p48 |
|
C 13 |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
addcc %i2, 8, %i2 |
|
C The fmuld after the branch is its delay slot and executes on every pass. |
bnz,pt %icc, .Loop |
|
fmuld u32, v16, r48 |
|
C END MAIN LOOP |
|
|
|
C .L_out_4: first wind-down stage, entered by falling out of .Loop or via the |
C branch in .L_four.  It matches a loop pass without the up[] loads and |
C without the fxtod/fmuld work for a new limb: it finishes the FP sums |
C already in flight, moves the product-sums through the scratch slots at |
C [%sp+2223+0..24], stores one result limb to [%i4+%i2], and advances %i2 by |
C 8 before falling through into .L_out_3. |
.L_out_4: |
|
srlx %o4, 16, %o5 C (x >> 16) |
|
fmuld u32, v32, r64 C FIXME not urgent |
|
faddd p32, r32, a32 |
|
add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
|
and %o4, xffff, %o5 C (x & 0xffff) |
|
fdtox a00, a00 |
|
faddd p48, r48, a48 |
|
srlx %o2, 48, %o7 C (mi64 >> 48) |
|
mov i00, %g5 C i00+ now in g5 |
|
fmuld u32, v48, r80 C FIXME not urgent |
|
fdtox a16, a16 |
|
sllx %o2, 16, %i3 C (mi64 << 16) |
|
add %o7, %o1, cy C new cy |
|
ldx [%sp+2223+0], i00 |
|
fdtox a32, a32 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
ldx [%sp+2223+16], i32 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
or %i3, %o5, %o5 |
|
ldx [%sp+2223+24], i48 |
|
fdtox a48, a48 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
std a32, [%sp+2223+16] |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
std a48, [%sp+2223+24] |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
stx %o5, [%i4+%i2] |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
add %i2, 8, %i2 |
|
C .L_out_3: second wind-down stage, entered from .L_out_4 by fall-through or |
C from .L_three by branch.  The only FP work left is converting the two |
C remaining products r64 and r80 to integers (into a00/a16), which are |
C spilled to the first two scratch slots.  All four slots are reloaded as |
C i00..i48, one result limb is stored to [%i4+%i2], and %i2 advances by 8 |
C before falling through into .L_out_2. |
.L_out_3: |
|
srlx %o4, 16, %o5 C (x >> 16) |
|
add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
|
and %o4, xffff, %o5 C (x & 0xffff) |
|
fdtox r64, a00 |
|
srlx %o2, 48, %o7 C (mi64 >> 48) |
|
mov i00, %g5 C i00+ now in g5 |
|
fdtox r80, a16 |
|
sllx %o2, 16, %i3 C (mi64 << 16) |
|
add %o7, %o1, cy C new cy |
|
ldx [%sp+2223+0], i00 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
ldx [%sp+2223+16], i32 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
or %i3, %o5, %o5 |
|
ldx [%sp+2223+24], i48 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
std a00, [%sp+2223+0] |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
std a16, [%sp+2223+8] |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
stx %o5, [%i4+%i2] |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
add %i2, 8, %i2 |
|
C .L_out_2: third wind-down stage, integer-only.  It reloads just the first |
C two scratch slots (i00, i16), which .L_out_3 and the .L_two path fill with |
C the converted r64/r80 products.  It recombines the i00..i48 values already |
C in registers with cy, stores one result limb to [%i4+%i2], and advances |
C %i2 by 8 before falling through into .L_out_1. |
.L_out_2: |
|
srlx %o4, 16, %o5 C (x >> 16) |
|
add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
|
and %o4, xffff, %o5 C (x & 0xffff) |
|
srlx %o2, 48, %o7 C (mi64 >> 48) |
|
mov i00, %g5 C i00+ now in g5 |
|
sllx %o2, 16, %i3 C (mi64 << 16) |
|
add %o7, %o1, cy C new cy |
|
ldx [%sp+2223+0], i00 |
|
srlx i16, 48, %l4 C (i16 >> 48) |
|
mov i16, %g2 |
|
ldx [%sp+2223+8], i16 |
|
srlx i48, 16, %l5 C (i48 >> 16) |
|
mov i32, %g4 C i32+ now in g4 |
|
sllx i48, 32, %l6 C (i48 << 32) |
|
or %i3, %o5, %o5 |
|
srlx %g4, 32, %o3 C (i32 >> 32) |
|
add %l5, %l4, %o1 C hi64- in %o1 |
|
sllx %g4, 16, %o2 C (i32 << 16) |
|
add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT |
|
sllx %o1, 48, %o3 C (hi64 << 48) |
|
add %g2, %o2, %o2 C mi64- in %o2 |
|
add %l6, %o2, %o2 C mi64- in %o2 |
|
sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT |
|
stx %o5, [%i4+%i2] |
|
add cy, %g5, %o4 C x = prev(i00) + cy |
|
add %i2, 8, %i2 |
|
C .L_out_1: final wind-down stage.  It completes the last result limb from |
C x (%o4), mi64 (%o2) and hi64 (%o1), stores it to [%i4+%i2], then forms the |
C function's return value as cy + i00 + (i16 << 16) and returns it in %o0. |
.L_out_1: |
|
srlx %o4, 16, %o5 C (x >> 16) |
|
add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT |
|
and %o4, xffff, %o5 C (x & 0xffff) |
|
srlx %o2, 48, %o7 C (mi64 >> 48) |
|
sllx %o2, 16, %i3 C (mi64 << 16) |
|
add %o7, %o1, cy C new cy |
|
or %i3, %o5, %o5 |
|
stx %o5, [%i4+%i2] |
|
|
|
C Fold the two remaining product-sums into the carry.  The shift by 0 is a |
C plain copy of i00 into %g2. |
sllx i00, 0, %g2 |
|
add %g2, cy, cy |
|
sllx i16, 16, %g3 |
|
add %g3, cy, cy |
|
|
|
C `return' restores the caller's register window, so the mov in its delay |
C slot writes the caller's %o0.  cy is a global register (%g1) and is |
C unaffected by the window switch. |
return %i7+8 |
|
mov cy, %o0 |
|
EPILOGUE(mpn_mul_1) |