version 1.1.1.1, 2000/09/09 14:12:41 |
version 1.1.1.2, 2003/08/25 16:06:26 |
|
|
! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and |
dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and |
! store difference in a third limb vector. |
dnl store difference in a third limb vector. |
|
|
! Copyright (C) 1999, 2000 Free Software Foundation, Inc. |
dnl Copyright 2001, 2002 Free Software Foundation, Inc. |
|
|
! This file is part of the GNU MP Library. |
dnl This file is part of the GNU MP Library. |
|
|
! The GNU MP Library is free software; you can redistribute it and/or modify |
dnl The GNU MP Library is free software; you can redistribute it and/or modify |
! it under the terms of the GNU Lesser General Public License as published by |
dnl it under the terms of the GNU Lesser General Public License as published |
! the Free Software Foundation; either version 2.1 of the License, or (at your |
dnl by the Free Software Foundation; either version 2.1 of the License, or (at |
! option) any later version. |
dnl your option) any later version. |
|
|
! The GNU MP Library is distributed in the hope that it will be useful, but |
dnl The GNU MP Library is distributed in the hope that it will be useful, but |
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
! License for more details. |
dnl License for more details. |
|
|
! You should have received a copy of the GNU Lesser General Public License |
dnl You should have received a copy of the GNU Lesser General Public License |
! along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to |
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
! MA 02111-1307, USA. |
dnl MA 02111-1307, USA. |
|
|
|
include(`../config.m4') |
|
|
! INPUT PARAMETERS |
C Compute carry-out from the most significant bits of u,v, and r, where |
! res_ptr %o0 |
C r=u-v-carry_in, using logic operations. |
! s1_ptr %o1 |
|
! s2_ptr %o2 |
|
! size %o3 |
|
|
|
include(`../config.m4') |
C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4 insn |
|
C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated. |
|
C Therefore, it seems futile to try to optimize this any further... |
|
|
ASM_START() |
C INPUT PARAMETERS |
.register %g2,#scratch |
define(`rp',`%i0') |
.register %g3,#scratch |
define(`up',`%i1') |
PROLOGUE(mpn_sub_n) |
define(`vp',`%i2') |
|
define(`n',`%i3') |
|
|
! 12 mem ops >= 12 cycles |
define(`u0',`%l0') |
! 8 shift insn >= 8 cycles |
define(`u1',`%l2') |
! 8 addccc, executing alone, +8 cycles |
define(`u2',`%l4') |
! Unrolling not mandatory...perhaps 2-way is best? |
define(`u3',`%l6') |
! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl |
define(`v0',`%l1') |
! All in all, it runs at 5 cycles/limb |
define(`v1',`%l3') |
|
define(`v2',`%l5') |
|
define(`v3',`%l7') |
|
|
save %sp,-160,%sp |
define(`cy',`%i4') |
|
|
addcc %g0,%g0,%g0 |
define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe |
|
define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe |
|
|
add %i3,-4,%i3 |
ASM_START() |
brlz,pn %i3,L(there) |
REGISTER(%g2,#scratch) |
nop |
REGISTER(%g3,#scratch) |
|
PROLOGUE(mpn_sub_n) |
|
save %sp,-160,%sp |
|
|
ldx [%i1+0],%l0 |
fitod %f0,%f0 C make sure f0 contains small, quiet number |
ldx [%i2+0],%l4 |
subcc n,4,%g0 |
ldx [%i1+8],%l1 |
bl,pn %icc,.Loop0 |
ldx [%i2+8],%l5 |
mov 0,cy |
ldx [%i1+16],%l2 |
|
ldx [%i2+16],%l6 |
|
ldx [%i1+24],%l3 |
|
ldx [%i2+24],%l7 |
|
add %i1,32,%i1 |
|
add %i2,32,%i2 |
|
|
|
add %i3,-4,%i3 |
ldx [up+0],u0 |
brlz,pn %i3,L(skip) |
ldx [vp+0],v0 |
nop |
add up,32,up |
b L(loop1) ! jump instead of executing many NOPs |
ldx [up-24],u1 |
nop |
ldx [vp+8],v1 |
ALIGN(32) |
add vp,32,vp |
!--------- Start main loop --------- |
ldx [up-16],u2 |
L(loop1): |
ldx [vp-16],v2 |
subccc %l0,%l4,%g1 |
ldx [up-8],u3 |
!- |
ldx [vp-8],v3 |
srlx %l0,32,%o0 |
subcc n,8,n |
ldx [%i1+0],%l0 |
sub u0,v0,%g1 C main sub |
!- |
sub %g1,cy,%g4 C carry sub |
srlx %l4,32,%o4 |
orn u0,v0,%g2 |
ldx [%i2+0],%l4 |
bl,pn %icc,.Lend4567 |
!- |
fanop |
subccc %o0,%o4,%g0 |
b,a .Loop |
!- |
|
subccc %l1,%l5,%g2 |
|
!- |
|
srlx %l1,32,%o1 |
|
ldx [%i1+8],%l1 |
|
!- |
|
srlx %l5,32,%o5 |
|
ldx [%i2+8],%l5 |
|
!- |
|
subccc %o1,%o5,%g0 |
|
!- |
|
subccc %l2,%l6,%g3 |
|
!- |
|
srlx %l2,32,%o2 |
|
ldx [%i1+16],%l2 |
|
!- |
|
srlx %l6,32,%g5 ! asymmetry |
|
ldx [%i2+16],%l6 |
|
!- |
|
subccc %o2,%g5,%g0 |
|
!- |
|
subccc %l3,%l7,%g4 |
|
!- |
|
srlx %l3,32,%o3 |
|
ldx [%i1+24],%l3 |
|
add %i1,32,%i1 |
|
!- |
|
srlx %l7,32,%o7 |
|
ldx [%i2+24],%l7 |
|
add %i2,32,%i2 |
|
!- |
|
subccc %o3,%o7,%g0 |
|
!- |
|
stx %g1,[%i0+0] |
|
!- |
|
stx %g2,[%i0+8] |
|
!- |
|
stx %g3,[%i0+16] |
|
add %i3,-4,%i3 |
|
!- |
|
stx %g4,[%i0+24] |
|
add %i0,32,%i0 |
|
|
|
brgez,pt %i3,L(loop1) |
.align 16 |
nop |
C START MAIN LOOP |
!--------- End main loop --------- |
.Loop: orn %g4,%g2,%g2 |
L(skip): |
andn u0,v0,%g3 |
subccc %l0,%l4,%g1 |
ldx [up+0],u0 |
srlx %l0,32,%o0 |
fanop |
srlx %l4,32,%o4 |
C -- |
subccc %o0,%o4,%g0 |
andn %g2,%g3,%g2 |
subccc %l1,%l5,%g2 |
ldx [vp+0],v0 |
srlx %l1,32,%o1 |
add up,32,up |
srlx %l5,32,%o5 |
fanop |
subccc %o1,%o5,%g0 |
C -- |
subccc %l2,%l6,%g3 |
srlx %g2,63,cy |
srlx %l2,32,%o2 |
sub u1,v1,%g1 |
srlx %l6,32,%g5 ! asymmetry |
stx %g4,[rp+0] |
subccc %o2,%g5,%g0 |
fanop |
subccc %l3,%l7,%g4 |
C -- |
srlx %l3,32,%o3 |
sub %g1,cy,%g4 |
srlx %l7,32,%o7 |
orn u1,v1,%g2 |
subccc %o3,%o7,%g0 |
fmnop |
stx %g1,[%i0+0] |
fanop |
stx %g2,[%i0+8] |
C -- |
stx %g3,[%i0+16] |
orn %g4,%g2,%g2 |
stx %g4,[%i0+24] |
andn u1,v1,%g3 |
add %i0,32,%i0 |
ldx [up-24],u1 |
|
fanop |
|
C -- |
|
andn %g2,%g3,%g2 |
|
ldx [vp+8],v1 |
|
add vp,32,vp |
|
fanop |
|
C -- |
|
srlx %g2,63,cy |
|
sub u2,v2,%g1 |
|
stx %g4,[rp+8] |
|
fanop |
|
C -- |
|
sub %g1,cy,%g4 |
|
orn u2,v2,%g2 |
|
fmnop |
|
fanop |
|
C -- |
|
orn %g4,%g2,%g2 |
|
andn u2,v2,%g3 |
|
ldx [up-16],u2 |
|
fanop |
|
C -- |
|
andn %g2,%g3,%g2 |
|
ldx [vp-16],v2 |
|
add rp,32,rp |
|
fanop |
|
C -- |
|
srlx %g2,63,cy |
|
sub u3,v3,%g1 |
|
stx %g4,[rp-16] |
|
fanop |
|
C -- |
|
sub %g1,cy,%g4 |
|
orn u3,v3,%g2 |
|
fmnop |
|
fanop |
|
C -- |
|
orn %g4,%g2,%g2 |
|
andn u3,v3,%g3 |
|
ldx [up-8],u3 |
|
fanop |
|
C -- |
|
andn %g2,%g3,%g2 |
|
subcc n,4,n |
|
ldx [vp-8],v3 |
|
fanop |
|
C -- |
|
srlx %g2,63,cy |
|
sub u0,v0,%g1 |
|
stx %g4,[rp-8] |
|
fanop |
|
C -- |
|
sub %g1,cy,%g4 |
|
orn u0,v0,%g2 |
|
bge,pt %icc,.Loop |
|
fanop |
|
C END MAIN LOOP |
|
.Lend4567: |
|
orn %g4,%g2,%g2 |
|
andn u0,v0,%g3 |
|
andn %g2,%g3,%g2 |
|
srlx %g2,63,cy |
|
sub u1,v1,%g1 |
|
stx %g4,[rp+0] |
|
sub %g1,cy,%g4 |
|
orn u1,v1,%g2 |
|
orn %g4,%g2,%g2 |
|
andn u1,v1,%g3 |
|
andn %g2,%g3,%g2 |
|
srlx %g2,63,cy |
|
sub u2,v2,%g1 |
|
stx %g4,[rp+8] |
|
sub %g1,cy,%g4 |
|
orn u2,v2,%g2 |
|
orn %g4,%g2,%g2 |
|
andn u2,v2,%g3 |
|
andn %g2,%g3,%g2 |
|
add rp,32,rp |
|
srlx %g2,63,cy |
|
sub u3,v3,%g1 |
|
stx %g4,[rp-16] |
|
sub %g1,cy,%g4 |
|
orn u3,v3,%g2 |
|
orn %g4,%g2,%g2 |
|
andn u3,v3,%g3 |
|
andn %g2,%g3,%g2 |
|
srlx %g2,63,cy |
|
stx %g4,[rp-8] |
|
|
L(there): |
addcc n,4,n |
add %i3,4,%i3 |
bz,pn %icc,.Lret |
brz,pt %i3,L(end) |
fanop |
nop |
|
|
|
L(loop2): |
.Loop0: ldx [up],u0 |
ldx [%i1+0],%l0 |
add up,8,up |
add %i1,8,%i1 |
ldx [vp],v0 |
ldx [%i2+0],%l4 |
add vp,8,vp |
add %i2,8,%i2 |
add rp,8,rp |
srlx %l0,32,%g2 |
subcc n,1,n |
srlx %l4,32,%g3 |
sub u0,v0,%g1 |
subccc %l0,%l4,%g1 |
orn u0,v0,%g2 |
subccc %g2,%g3,%g0 |
sub %g1,cy,%g4 |
stx %g1,[%i0+0] |
andn u0,v0,%g3 |
add %i0,8,%i0 |
orn %g4,%g2,%g2 |
add %i3,-1,%i3 |
stx %g4,[rp-8] |
brgz,pt %i3,L(loop2) |
andn %g2,%g3,%g2 |
nop |
bnz,pt %icc,.Loop0 |
|
srlx %g2,63,cy |
|
|
L(end): addc %g0,%g0,%i0 |
.Lret: mov cy,%i0 |
ret |
ret |
restore |
restore |
EPILOGUE(mpn_sub_n) |
EPILOGUE(mpn_sub_n) |