===================================================================
RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/sparc64/Attic/sub_n.asm,v
retrieving revision 1.1
retrieving revision 1.1.1.2
diff -u -p -r1.1 -r1.1.1.2
--- OpenXM_contrib/gmp/mpn/sparc64/Attic/sub_n.asm	2000/09/09 14:12:41	1.1
+++ OpenXM_contrib/gmp/mpn/sparc64/Attic/sub_n.asm	2003/08/25 16:06:26	1.1.1.2
@@ -1,172 +1,218 @@
-! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
-! store difference in a third limb vector.
+dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.

-! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.

-! This file is part of the GNU MP Library.
+dnl This file is part of the GNU MP Library.

-! The GNU MP Library is free software; you can redistribute it and/or modify
-! it under the terms of the GNU Lesser General Public License as published by
-! the Free Software Foundation; either version 2.1 of the License, or (at your
-! option) any later version.
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.

-! The GNU MP Library is distributed in the hope that it will be useful, but
-! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-! License for more details.
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl License for more details.

-! You should have received a copy of the GNU Lesser General Public License
-! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
-! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-! MA 02111-1307, USA.
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.

+include(`../config.m4')

-! INPUT PARAMETERS
-! res_ptr	%o0
-! s1_ptr	%o1
-! s2_ptr	%o2
-! size		%o3
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r = u - v - carry_in, using logic operations.

-include(`../config.m4')
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...

-ASM_START()
-	.register	%g2,#scratch
-	.register	%g3,#scratch
-PROLOGUE(mpn_sub_n)
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')

-! 12 mem ops >= 12 cycles
-! 8 shift insn >= 8 cycles
-! 8 addccc, executing alone, +8 cycles
-! Unrolling not mandatory...perhaps 2-way is best?
-! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
-! All in all, it runs at 5 cycles/limb
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')

-	save	%sp,-160,%sp
+define(`cy',`%i4')

-	addcc	%g0,%g0,%g0
+define(`fanop',`fitod %f0,%f2')	dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4')	dnl A quasi nop running in the FM pipe

-	add	%i3,-4,%i3
-	brlz,pn	%i3,L(there)
-	nop
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_n)
+	save	%sp,-160,%sp

-	ldx	[%i1+0],%l0
-	ldx	[%i2+0],%l4
-	ldx	[%i1+8],%l1
-	ldx	[%i2+8],%l5
-	ldx	[%i1+16],%l2
-	ldx	[%i2+16],%l6
-	ldx	[%i1+24],%l3
-	ldx	[%i2+24],%l7
-	add	%i1,32,%i1
-	add	%i2,32,%i2
+	fitod	%f0,%f0		C make sure f0 contains a small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%icc,.Loop0
+	mov	0,cy

-	add	%i3,-4,%i3
-	brlz,pn	%i3,L(skip)
-	nop
-	b	L(loop1)	! jump instead of executing many NOPs
-	nop
-	ALIGN(32)
-!--------- Start main loop ---------
-L(loop1):
-	subccc	%l0,%l4,%g1
-!-
-	srlx	%l0,32,%o0
-	ldx	[%i1+0],%l0
-!-
-	srlx	%l4,32,%o4
-	ldx	[%i2+0],%l4
-!-
-	subccc	%o0,%o4,%g0
-!-
-	subccc	%l1,%l5,%g2
-!-
-	srlx	%l1,32,%o1
-	ldx	[%i1+8],%l1
-!-
-	srlx	%l5,32,%o5
-	ldx	[%i2+8],%l5
-!-
-	subccc	%o1,%o5,%g0
-!-
-	subccc	%l2,%l6,%g3
-!-
-	srlx	%l2,32,%o2
-	ldx	[%i1+16],%l2
-!-
-	srlx	%l6,32,%g5	! asymmetry
-	ldx	[%i2+16],%l6
-!-
-	subccc	%o2,%g5,%g0
-!-
-	subccc	%l3,%l7,%g4
-!-
-	srlx	%l3,32,%o3
-	ldx	[%i1+24],%l3
-	add	%i1,32,%i1
-!-
-	srlx	%l7,32,%o7
-	ldx	[%i2+24],%l7
-	add	%i2,32,%i2
-!-
-	subccc	%o3,%o7,%g0
-!-
-	stx	%g1,[%i0+0]
-!-
-	stx	%g2,[%i0+8]
-!-
-	stx	%g3,[%i0+16]
-	add	%i3,-4,%i3
-!-
-	stx	%g4,[%i0+24]
-	add	%i0,32,%i0
+	ldx	[up+0],u0
+	ldx	[vp+0],v0
+	add	up,32,up
+	ldx	[up-24],u1
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	ldx	[up-16],u2
+	ldx	[vp-16],v2
+	ldx	[up-8],u3
+	ldx	[vp-8],v3
+	subcc	n,8,n
+	sub	u0,v0,%g1	C main sub
+	sub	%g1,cy,%g4	C carry sub
+	orn	u0,v0,%g2
+	bl,pn	%icc,.Lend4567
+	fanop
+	b,a	.Loop

-	brgez,pt	%i3,L(loop1)
-	nop
-!--------- End main loop ---------
-L(skip):
-	subccc	%l0,%l4,%g1
-	srlx	%l0,32,%o0
-	srlx	%l4,32,%o4
-	subccc	%o0,%o4,%g0
-	subccc	%l1,%l5,%g2
-	srlx	%l1,32,%o1
-	srlx	%l5,32,%o5
-	subccc	%o1,%o5,%g0
-	subccc	%l2,%l6,%g3
-	srlx	%l2,32,%o2
-	srlx	%l6,32,%g5	! asymmetry
-	subccc	%o2,%g5,%g0
-	subccc	%l3,%l7,%g4
-	srlx	%l3,32,%o3
-	srlx	%l7,32,%o7
-	subccc	%o3,%o7,%g0
-	stx	%g1,[%i0+0]
-	stx	%g2,[%i0+8]
-	stx	%g3,[%i0+16]
-	stx	%g4,[%i0+24]
-	add	%i0,32,%i0
+	.align	16
+C START MAIN LOOP
+.Loop:	orn	%g4,%g2,%g2
+	andn	u0,v0,%g3
+	ldx	[up+0],u0
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+0],v0
+	add	up,32,up
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u1,v1,%g1
+	stx	%g4,[rp+0]
+	fanop
+C --
+	sub	%g1,cy,%g4
+	orn	u1,v1,%g2
+	fmnop
+	fanop
+C --
+	orn	%g4,%g2,%g2
+	andn	u1,v1,%g3
+	ldx	[up-24],u1
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g4,[rp+8]
+	fanop
+C --
+	sub	%g1,cy,%g4
+	orn	u2,v2,%g2
+	fmnop
+	fanop
+C --
+	orn	%g4,%g2,%g2
+	andn	u2,v2,%g3
+	ldx	[up-16],u2
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp-16],v2
+	add	rp,32,rp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g4,[rp-16]
+	fanop
+C --
+	sub	%g1,cy,%g4
+	orn	u3,v3,%g2
+	fmnop
+	fanop
+C --
+	orn	%g4,%g2,%g2
+	andn	u3,v3,%g3
+	ldx	[up-8],u3
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	subcc	n,4,n
+	ldx	[vp-8],v3
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u0,v0,%g1
+	stx	%g4,[rp-8]
+	fanop
+C --
+	sub	%g1,cy,%g4
+	orn	u0,v0,%g2
+	bge,pt	%icc,.Loop
+	fanop
+C END MAIN LOOP
+.Lend4567:
+	orn	%g4,%g2,%g2
+	andn	u0,v0,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	sub	u1,v1,%g1
+	stx	%g4,[rp+0]
+	sub	%g1,cy,%g4
+	orn	u1,v1,%g2
+	orn	%g4,%g2,%g2
+	andn	u1,v1,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g4,[rp+8]
+	sub	%g1,cy,%g4
+	orn	u2,v2,%g2
+	orn	%g4,%g2,%g2
+	andn	u2,v2,%g3
+	andn	%g2,%g3,%g2
+	add	rp,32,rp
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g4,[rp-16]
+	sub	%g1,cy,%g4
+	orn	u3,v3,%g2
+	orn	%g4,%g2,%g2
+	andn	u3,v3,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	stx	%g4,[rp-8]

-L(there):
-	add	%i3,4,%i3
-	brz,pt	%i3,L(end)
-	nop
+	addcc	n,4,n
+	bz,pn	%icc,.Lret
+	fanop

-L(loop2):
-	ldx	[%i1+0],%l0
-	add	%i1,8,%i1
-	ldx	[%i2+0],%l4
-	add	%i2,8,%i2
-	srlx	%l0,32,%g2
-	srlx	%l4,32,%g3
-	subccc	%l0,%l4,%g1
-	subccc	%g2,%g3,%g0
-	stx	%g1,[%i0+0]
-	add	%i0,8,%i0
-	add	%i3,-1,%i3
-	brgz,pt	%i3,L(loop2)
-	nop
+.Loop0:	ldx	[up],u0
+	add	up,8,up
+	ldx	[vp],v0
+	add	vp,8,vp
+	add	rp,8,rp
+	subcc	n,1,n
+	sub	u0,v0,%g1
+	orn	u0,v0,%g2
+	sub	%g1,cy,%g4
+	andn	u0,v0,%g3
+	orn	%g4,%g2,%g2
+	stx	%g4,[rp-8]
+	andn	%g2,%g3,%g2
+	bnz,pt	%icc,.Loop0
+	srlx	%g2,63,cy

-L(end):	addc	%g0,%g0,%i0
+.Lret:	mov	cy,%i0
 	ret
 	restore
 EPILOGUE(mpn_sub_n)
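
The core of the new code is a branch-free borrow computation: the borrow-out of r = u - v - cy is recovered from the most significant bits of u, v, and r with plain logic instructions, so no condition codes have to survive across the interleaved limbs the way they did through the old subccc chain. Below is a minimal C sketch of one limb step, mirroring the sub/orn/andn/srlx sequence of the loop; the function name sub_limb is illustrative (not GMP's), and the self-check assumes a compiler with unsigned __int128 such as GCC or Clang.

#include <assert.h>
#include <stdint.h>

/* One limb step of the loop above, in C.  The comments name the asm
   instruction each statement mirrors; cy is the running borrow (0 or 1).  */
static uint64_t
sub_limb (uint64_t u, uint64_t v, uint64_t cy, uint64_t *rp)
{
  uint64_t g1 = u - v;           /* sub  u,v,%g1      C main sub        */
  uint64_t g4 = g1 - cy;         /* sub  %g1,cy,%g4   C carry sub       */
  uint64_t g2 = u | ~v;          /* orn  u,v,%g2                        */
  g2 = g4 | ~g2;                 /* orn  %g4,%g2,%g2  = r | (~u & v)    */
  uint64_t g3 = u & ~v;          /* andn u,v,%g3                        */
  g2 = g2 & ~g3;                 /* andn %g2,%g3,%g2  = g2 & (~u | v)   */
  *rp = g4;
  return g2 >> 63;               /* srlx %g2,63,cy    borrow-out        */
}

int
main (void)
{
  /* Check MSB-relevant corner cases against widening 128-bit arithmetic. */
  const uint64_t t[] = { 0, 1, 2, (1ull << 63) - 1, 1ull << 63,
                         (1ull << 63) + 1, ~0ull - 1, ~0ull };
  const unsigned k = sizeof t / sizeof t[0];
  for (unsigned i = 0; i < k; i++)
    for (unsigned j = 0; j < k; j++)
      for (uint64_t cy = 0; cy <= 1; cy++)
        {
          uint64_t r, b = sub_limb (t[i], t[j], cy, &r);
          unsigned __int128 w = (unsigned __int128) t[i] - t[j] - cy;
          assert (r == (uint64_t) w);        /* same difference */
          assert (b == ((w >> 64) != 0));    /* same borrow-out */
        }
  return 0;
}

The identity in play: borrow = MSB of ((r | (~u & v)) & (~u | v)), i.e. a borrow occurs when v's top bit exceeds u's, or when the top bits are equal and the difference's top bit is set. Because it needs no condition codes, four of these chains can be interleaved freely, which is what buys the 4 cycles/limb.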
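For reference, the contract the whole routine implements: {rp,n} = {up,n} - {vp,n} over limbs stored least significant first, returning the borrow out of the top limb. A portable C rendering of that contract, applying the same MSB trick per limb (illustrative only; it is not GMP's generic code, and it hard-wires the 64-bit limb size of sparc64):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;   /* 64-bit limbs, as in the sparc64 code */

/* Illustrative reference for mpn_sub_n: subtract n limbs at vp from n
   limbs at up, store at rp, return the final borrow (0 or 1).  */
static mp_limb_t
ref_sub_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, size_t n)
{
  mp_limb_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      mp_limb_t u = up[i], v = vp[i], r = u - v - cy;
      cy = ((r | (~u & v)) & (~u | v)) >> 63;   /* same MSB trick */
      rp[i] = r;
    }
  return cy;
}

int
main (void)
{
  /* 2^64 minus 2: the borrow must propagate through the low limb. */
  mp_limb_t u[2] = { 0, 1 }, v[2] = { 2, 0 }, r[2];
  mp_limb_t b = ref_sub_n (r, u, v, 2);
  /* Expect r = { 2^64-2, 0 } and no borrow out of the top limb. */
  return !(r[0] == ~(mp_limb_t) 1 && r[1] == 0 && b == 0);
}

The asm computes exactly this, retiring four limbs per .Loop iteration, finishing the last four in .Lend4567, and handling any leftover n mod 4 limbs (or an entirely small n) one at a time in .Loop0.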