OpenXM_contrib/gmp/mpn/sparc64/sub_n.asm - diff

Return to sub_n.asm CVS log

Up to [local] / OpenXM_contrib / gmp / mpn / sparc64

Diff for /OpenXM_contrib/gmp/mpn/sparc64/Attic/sub_n.asm between version 1.1.1.1 and 1.1.1.2

-version 1.1.1.1, 2000/09/09 14:12:41
+version 1.1.1.2, 2003/08/25 16:06:26
 Line 1
 Line 1
 Line 1
- ! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
- ! store difference in a third limb vector.
+ dnl  store difference in a third limb vector.
- ! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+ dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
- ! This file is part of the GNU MP Library.
+ dnl  This file is part of the GNU MP Library.
- ! The GNU MP Library is free software; you can redistribute it and/or modify
+ dnl  The GNU MP Library is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU Lesser General Public License as published by
+ dnl  it under the terms of the GNU Lesser General Public License as published
- ! the Free Software Foundation; either version 2.1 of the License, or (at your
+ dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
- ! option) any later version.
+ dnl  your option) any later version.
- ! The GNU MP Library is distributed in the hope that it will be useful, but
+ dnl  The GNU MP Library is distributed in the hope that it will be useful, but
- ! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- ! or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
- ! License for more details.
+ dnl  License for more details.
- ! You should have received a copy of the GNU Lesser General Public License
+ dnl  You should have received a copy of the GNU Lesser General Public License
- ! along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
- ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- ! MA 02111-1307, USA.
+ dnl  MA 02111-1307, USA.
+ include(`../config.m4')
- ! INPUT PARAMETERS
+ C Compute carry-out from the most significant bits of u, v, and r, where
- ! res_ptr       %o0
+ C r=u-v-carry_in, using logic operations.
- ! s1_ptr        %o1
- ! s2_ptr        %o2
- ! size          %o3
- include(`../config.m4')
+ C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
+ C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
+ C Therefore, it seems futile to try to optimize this any further...
- ASM_START()
+ C INPUT PARAMETERS
-         .register       %g2,#scratch
+ define(`rp',`%i0')
-         .register       %g3,#scratch
+ define(`up',`%i1')
- PROLOGUE(mpn_sub_n)
+ define(`vp',`%i2')
+ define(`n',`%i3')
- ! 12 mem ops >= 12 cycles
+ define(`u0',`%l0')
- ! 8 shift insn >= 8 cycles
+ define(`u1',`%l2')
- ! 8 addccc, executing alone, +8 cycles
+ define(`u2',`%l4')
- ! Unrolling not mandatory...perhaps 2-way is best?
+ define(`u3',`%l6')
- ! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+ define(`v0',`%l1')
- ! All in all, it runs at 5 cycles/limb
+ define(`v1',`%l3')
+ define(`v2',`%l5')
+ define(`v3',`%l7')
-         save    %sp,-160,%sp
+ define(`cy',`%i4')
-         addcc   %g0,%g0,%g0
+ define(`fanop',`fitod %f0,%f2')         dnl  A quasi nop running in the FA pipe
+ define(`fmnop',`fmuld %f0,%f0,%f4')     dnl  A quasi nop running in the FM pipe
-         add     %i3,-4,%i3
+ ASM_START()
-         brlz,pn %i3,L(there)
+         REGISTER(%g2,#scratch)
-         nop
+         REGISTER(%g3,#scratch)
+ PROLOGUE(mpn_sub_n)
+         save    %sp,-160,%sp
-         ldx     [%i1+0],%l0
+         fitod   %f0,%f0         C make sure f0 contains small, quiet number
-         ldx     [%i2+0],%l4
+         subcc   n,4,%g0
-         ldx     [%i1+8],%l1
+         bl,pn   %icc,.Loop0
-         ldx     [%i2+8],%l5
+         mov     0,cy
-         ldx     [%i1+16],%l2
-         ldx     [%i2+16],%l6
-         ldx     [%i1+24],%l3
-         ldx     [%i2+24],%l7
-         add     %i1,32,%i1
-         add     %i2,32,%i2
-         add     %i3,-4,%i3
+         ldx     [up+0],u0
-         brlz,pn %i3,L(skip)
+         ldx     [vp+0],v0
-         nop
+         add     up,32,up
-         b       L(loop1)        ! jump instead of executing many NOPs
+         ldx     [up-24],u1
-         nop
+         ldx     [vp+8],v1
-         ALIGN(32)
+         add     vp,32,vp
- !---------  Start main loop ---------
+         ldx     [up-16],u2
- L(loop1):
+         ldx     [vp-16],v2
-         subccc  %l0,%l4,%g1
+         ldx     [up-8],u3
- !-
+         ldx     [vp-8],v3
-         srlx    %l0,32,%o0
+         subcc   n,8,n
-         ldx     [%i1+0],%l0
+         sub     u0,v0,%g1       C main sub
- !-
+         sub     %g1,cy,%g4      C carry sub
-         srlx    %l4,32,%o4
+         orn     u0,v0,%g2
-         ldx     [%i2+0],%l4
+         bl,pn   %icc,.Lend4567
- !-
+         fanop
-         subccc  %o0,%o4,%g0
+         b,a     .Loop
- !-
-         subccc  %l1,%l5,%g2
- !-
-         srlx    %l1,32,%o1
-         ldx     [%i1+8],%l1
- !-
-         srlx    %l5,32,%o5
-         ldx     [%i2+8],%l5
- !-
-         subccc  %o1,%o5,%g0
- !-
-         subccc  %l2,%l6,%g3
- !-
-         srlx    %l2,32,%o2
-         ldx     [%i1+16],%l2
- !-
-         srlx    %l6,32,%g5      ! asymmetry
-         ldx     [%i2+16],%l6
- !-
-         subccc  %o2,%g5,%g0
- !-
-         subccc  %l3,%l7,%g4
- !-
-         srlx    %l3,32,%o3
-         ldx     [%i1+24],%l3
-         add     %i1,32,%i1
- !-
-         srlx    %l7,32,%o7
-         ldx     [%i2+24],%l7
-         add     %i2,32,%i2
- !-
-         subccc  %o3,%o7,%g0
- !-
-         stx     %g1,[%i0+0]
- !-
-         stx     %g2,[%i0+8]
- !-
-         stx     %g3,[%i0+16]
-         add     %i3,-4,%i3
- !-
-         stx     %g4,[%i0+24]
-         add     %i0,32,%i0
-         brgez,pt        %i3,L(loop1)
+         .align  16
-         nop
+ C START MAIN LOOP
- !---------  End main loop ---------
+ .Loop:  orn     %g4,%g2,%g2
- L(skip):
+         andn    u0,v0,%g3
-         subccc  %l0,%l4,%g1
+         ldx     [up+0],u0
-         srlx    %l0,32,%o0
+         fanop
-         srlx    %l4,32,%o4
+ C --
-         subccc  %o0,%o4,%g0
+         andn    %g2,%g3,%g2
-         subccc  %l1,%l5,%g2
+         ldx     [vp+0],v0
-         srlx    %l1,32,%o1
+         add     up,32,up
-         srlx    %l5,32,%o5
+         fanop
-         subccc  %o1,%o5,%g0
+ C --
-         subccc  %l2,%l6,%g3
+         srlx    %g2,63,cy
-         srlx    %l2,32,%o2
+         sub     u1,v1,%g1
-         srlx    %l6,32,%g5      ! asymmetry
+         stx     %g4,[rp+0]
-         subccc  %o2,%g5,%g0
+         fanop
-         subccc  %l3,%l7,%g4
+ C --
-         srlx    %l3,32,%o3
+         sub     %g1,cy,%g4
-         srlx    %l7,32,%o7
+         orn     u1,v1,%g2
-         subccc  %o3,%o7,%g0
+         fmnop
-         stx     %g1,[%i0+0]
+         fanop
-         stx     %g2,[%i0+8]
+ C --
-         stx     %g3,[%i0+16]
+         orn     %g4,%g2,%g2
-         stx     %g4,[%i0+24]
+         andn    u1,v1,%g3
-         add     %i0,32,%i0
+         ldx     [up-24],u1
+         fanop
+ C --
+         andn    %g2,%g3,%g2
+         ldx     [vp+8],v1
+         add     vp,32,vp
+         fanop
+ C --
+         srlx    %g2,63,cy
+         sub     u2,v2,%g1
+         stx     %g4,[rp+8]
+         fanop
+ C --
+         sub     %g1,cy,%g4
+         orn     u2,v2,%g2
+         fmnop
+         fanop
+ C --
+         orn     %g4,%g2,%g2
+         andn    u2,v2,%g3
+         ldx     [up-16],u2
+         fanop
+ C --
+         andn    %g2,%g3,%g2
+         ldx     [vp-16],v2
+         add     rp,32,rp
+         fanop
+ C --
+         srlx    %g2,63,cy
+         sub     u3,v3,%g1
+         stx     %g4,[rp-16]
+         fanop
+ C --
+         sub     %g1,cy,%g4
+         orn     u3,v3,%g2
+         fmnop
+         fanop
+ C --
+         orn     %g4,%g2,%g2
+         andn    u3,v3,%g3
+         ldx     [up-8],u3
+         fanop
+ C --
+         andn    %g2,%g3,%g2
+         subcc   n,4,n
+         ldx     [vp-8],v3
+         fanop
+ C --
+         srlx    %g2,63,cy
+         sub     u0,v0,%g1
+         stx     %g4,[rp-8]
+         fanop
+ C --
+         sub     %g1,cy,%g4
+         orn     u0,v0,%g2
+         bge,pt  %icc,.Loop
+         fanop
+ C END MAIN LOOP
+ .Lend4567:
+         orn     %g4,%g2,%g2
+         andn    u0,v0,%g3
+         andn    %g2,%g3,%g2
+         srlx    %g2,63,cy
+         sub     u1,v1,%g1
+         stx     %g4,[rp+0]
+         sub     %g1,cy,%g4
+         orn     u1,v1,%g2
+         orn     %g4,%g2,%g2
+         andn    u1,v1,%g3
+         andn    %g2,%g3,%g2
+         srlx    %g2,63,cy
+         sub     u2,v2,%g1
+         stx     %g4,[rp+8]
+         sub     %g1,cy,%g4
+         orn     u2,v2,%g2
+         orn     %g4,%g2,%g2
+         andn    u2,v2,%g3
+         andn    %g2,%g3,%g2
+         add     rp,32,rp
+         srlx    %g2,63,cy
+         sub     u3,v3,%g1
+         stx     %g4,[rp-16]
+         sub     %g1,cy,%g4
+         orn     u3,v3,%g2
+         orn     %g4,%g2,%g2
+         andn    u3,v3,%g3
+         andn    %g2,%g3,%g2
+         srlx    %g2,63,cy
+         stx     %g4,[rp-8]
- L(there):
+         addcc   n,4,n
-         add     %i3,4,%i3
+         bz,pn   %icc,.Lret
-         brz,pt  %i3,L(end)
+         fanop
-         nop
- L(loop2):
+ .Loop0: ldx     [up],u0
-         ldx     [%i1+0],%l0
+         add     up,8,up
-         add     %i1,8,%i1
+         ldx     [vp],v0
-         ldx     [%i2+0],%l4
+         add     vp,8,vp
-         add     %i2,8,%i2
+         add     rp,8,rp
-         srlx    %l0,32,%g2
+         subcc   n,1,n
-         srlx    %l4,32,%g3
+         sub     u0,v0,%g1
-         subccc  %l0,%l4,%g1
+         orn     u0,v0,%g2
-         subccc  %g2,%g3,%g0
+         sub     %g1,cy,%g4
-         stx     %g1,[%i0+0]
+         andn    u0,v0,%g3
-         add     %i0,8,%i0
+         orn     %g4,%g2,%g2
-         add     %i3,-1,%i3
+         stx     %g4,[rp-8]
-         brgz,pt %i3,L(loop2)
+         andn    %g2,%g3,%g2
-         nop
+         bnz,pt  %icc,.Loop0
+         srlx    %g2,63,cy
- L(end): addc    %g0,%g0,%i0
+ .Lret:  mov     cy,%i0
          ret
          restore
  EPILOGUE(mpn_sub_n)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>