dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.

dnl  Copyright 1998, 2000, 2001 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
dnl  MA 02111-1307, USA.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C Speed: 7 cycles/limb on UltraSPARC-1/2.
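
C As a rough C sketch of the same computation (illustrative only, not part of
C the build; the names p16, p0, x and cy match the temporaries used below):
C
C	mp_limb_t cy = 0;				/* borrow */
C	for (i = 0; i < n; i++)
C	  {
C	    uint64_t p16 = (uint64_t) (v >> 16) * up[i];    /* via fmuld/fdtox */
C	    uint64_t p0  = (uint64_t) (v & 0xffff) * up[i];  /* via fmuld/fdtox */
C	    uint64_t x = (uint64_t) rp[i] - (p0 + (p16 << 16) + cy);
C	    rp[i] = x;					/* low 32 bits */
C	    cy = -(uint32_t) (x >> 32);	/* product high limb plus borrow */
C	  }
C	return cy;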

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling, which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Do some cross-jumping to save about 1/2 the code size.
C   5. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_submul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6
	fxtod	%f8, %f8
	ld	[%sp+104], %f10		C zero f10
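C Since %g1 fits in 32 bits, the word at [%sp+104] (the high half of the stx)
C is zero; loading it clears %f10, the upper half of the %f10/%f11 pair, so
C each `ld [%o1], %f11' below lets fxtod %f10, %f2 convert the 64-bit
C integer {0, up[i]} to a double.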

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
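C The 32-byte area at %o5 holds two 16-byte temp slots, [%o5+0..15] and
C [%o5+16..31]; the code alternates between them (`xor %o5, 16, %o5') so
C that the std:s of one limb and the ldx:s of another hit different slots.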

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2

.L_1:	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0]
	srlx	%g4, 32, %g3		C new cy
	sub	%g0, %g3, %o0
	retl
	sub	%sp, -FSIZE, %sp
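
C Note on the carry convention: srlx x,32 leaves -cy (mod 2^32) in %g3, so
C the borrow is kept negated; `sub %g0, %g3, %g3' and `srl %g3, 0, %g3'
C recover cy before it enters the next product, and `sub %g0, %g3, %o0'
C forms the return value.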

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

.L_2:	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]
	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	sub	%g0, %g3, %o0
	retl
	sub	%sp, -FSIZE, %sp

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

.L_3:	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]
	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+16], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+24], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]
	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	sub	%g0, %g3, %o0
	retl
	sub	%sp, -FSIZE, %sp

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

.L_4:	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	std	%f14, [%o5+0]
	sub	%g5, %g4, %g4		C x = rp[i] - p
	std	%f12, [%o5+8]
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]
	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+16], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+24], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]
	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]
	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	sub	%g0, %g3, %o0
	retl
	sub	%sp, -FSIZE, %sp

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L_out_5
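
C The `C -- n' lines below mark the intended issue cycle of each instruction
C group on UltraSPARC-1/2, seven cycles per limb in all.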

C BEGIN MAIN LOOP
	.align	16
C -- 0
.Loop:	sub	%g0, %g3, %g3
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	nop
	sub	%g5, %g4, %g4		C x = rp[i] - p
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C -- 6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP
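
C Each iteration starts the FP multiplies and conversions for one limb while
C the integer unit combines, subtracts and stores a limb started in earlier
C iterations, with the two alternating scratch slots carrying products
C between the floating-point and integer units.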
sllx %g2,16,%g2 C align p16 |
|
st %l2,[%i0-4] |
|
addx %g3,0,%g3 |
|
fdtox %f16,%f14 |
|
add %g2,%g1,%g1 C add p16 to p0 (ADD1) |
|
std %f14,[%fp-40] |
|
fdtox %f4,%f12 |
|
std %f12,[%fp-32] |
|
subcc %i2,1,%i2 |
|
bne,pt %icc,L(loop) |
|
add %i0,4,%i0 C res_ptr++ |
|
C END LOOP |
|
|
|
fxtod %f10,%f2 |
.L_out_5: |
add %g3,%g1,%g4 C p += cy |
sub %g0, %g3, %g3 |
subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) |
fdtox %f16, %f14 |
ld [%i0],%g5 |
sllx %g2, 16, %g4 C (p16 << 16) |
srlx %g4,32,%g3 |
ldx [%o5+0], %g2 C p16 |
ldx [%fp-24],%g2 C p16 |
fdtox %f4, %f12 |
fmuld %f2,%f8,%f16 |
srl %g3, 0, %g3 C zero most significant 32 bits |
ldx [%fp-16],%g1 C p0 |
add %g1, %g4, %g4 C p = p0 + (p16 << 16) |
fmuld %f2,%f6,%f4 |
ldx [%o5+8], %g1 C p0 |
sllx %g2,16,%g2 C align p16 |
add %g4, %g3, %g4 C p += cy |
st %l2,[%i0-4] |
std %f14, [%o5+0] |
b,a L(xxx) |
fmuld %f2, %f8, %f16 |
L(loope): |
sub %g5, %g4, %g4 C p += rp[i] |
L(end4): |
std %f12, [%o5+8] |
fxtod %f10,%f2 |
fmuld %f2, %f6, %f4 |
add %g3,%g1,%g4 C p += cy |
xor %o5, 16, %o5 |
subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) |
stw %g4, [%o0+0] |
ld [%i0],%g5 |
srlx %g4, 32, %g3 C new cy |
srlx %g4,32,%g3 |
lduw [%o0+4], %g5 C read rp[i] |

	sub	%g0, %g3, %g3
	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	sub	%g5, %g4, %g4		C x = rp[i] - p
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

	sub	%g0, %g3, %g3
	sllx	%g2, 16, %g4		C (p16 << 16)
	srl	%g3, 0, %g3		C zero most significant 32 bits
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	sub	%g5, %g4, %g4		C x = rp[i] - p
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy
	sub	%g0, %g3, %o0
	retl
	sub	%sp, -FSIZE, %sp
EPILOGUE(mpn_submul_1)