OpenXM_contrib/gmp/mpn/pa64/submul_1.S - annotate

Return to submul_1.S CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / pa64
Annotation of OpenXM_contrib/gmp/mpn/pa64/submul_1.S, Revision 1.1.1.1

1.1       maekawa     1: ; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
                      2: ; subtract the result from a second limb vector.
                      3:
                      4: ; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
                      5:
                      6: ; This file is part of the GNU MP Library.
                      7:
                      8: ; The GNU MP Library is free software; you can redistribute it and/or modify
                      9: ; it under the terms of the GNU Lesser General Public License as published by
                     10: ; the Free Software Foundation; either version 2.1 of the License, or (at your
                     11: ; option) any later version.
                     12:
                     13: ; The GNU MP Library is distributed in the hope that it will be useful, but
                     14: ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     15: ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     16: ; License for more details.
                     17:
                     18: ; You should have received a copy of the GNU Lesser General Public License
                     19: ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: ; MA 02111-1307, USA.
                     22:
                     23: ; INPUT PARAMETERS
                     24: #define rptr           %r26
                     25: #define sptr           %r25
                     26: #define size           %r24
                     27: #define s2limb         -56(%r30)
                     28: ; Note: the code below writes %r24 and -56(%r30) literally; the size and s2limb names are not referenced.
                     29: ; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
                     30: ; it faster, but the PA8000 pipeline is not publicly documented and it
                     31: ; is very complex to reverse engineer.
                     32: ; Scratch register names.  %r3, %r4 and %r6 (m1, t3, t2) are saved on entry and restored before return.
                     33: #define t1 %r19
                     34: #define rlimb %r20
                     35: #define hi %r21
                     36: #define lo %r22
                     37: #define m0 %r28
                     38: #define m1 %r3
                     39: #define cylimb %r29
                     40: #define t3 %r4
                     41: #define t2 %r6
                     42: #define t5 %r23
                     43: #define t4 %r31
                     44:        .level  2.0n                    ; the 64-bit forms used below (ldd, std, extrd, depd) need PA-RISC 2.0
                     45:        .code
                     46:        .export __gmpn_submul_1,entry
                     47: __gmpn_submul_1
                     48:        .proc                           ; multiply each 64-bit source limb by the multiplier and subtract it from the destination limb
                     49:        .callinfo frame=128,no_calls    ; the 128-byte frame holds saved %r3-%r6 and four 8-byte product slots
                     50:        .entry                          ; products for the next limb are issued while the previous limb is combined
                     51:         fldd           -56(%r30),%fr5          ; s2limb passed on stack
                     52:        ldo             128(%r30),%r30          ; allocate the 128-byte frame
                     53:        add             %r0,%r0,cylimb          ; clear cy and cylimb
                     54: ; save %r3-%r6, which are overwritten below
                     55:        std             %r3,-96(%r30)
                     56:        std             %r4,-88(%r30)
                     57:        std             %r5,-80(%r30)
                     58:        std             %r6,-72(%r30)
                     59:        depdi,z         1,31,1,%r5              ; %r5 = 1 << 32, added to hi when the middle sum carries out
                     60: ; prologue: load the first source limb and form its four 32x32 partial products
                     61:        fldd            0(sptr),%fr4            ; fr4 = first source limb
                     62:        ldo             8(sptr),sptr            ; advance source pointer by one limb
                     63:
                     64:        xmpyu           %fr5R,%fr4R,%fr6        ; fr6 = lo32(multiplier) * lo32(limb)
                     65:        fstd            %fr6,-128(%r30)         ; pass through the frame to reach a general register
                     66:        xmpyu           %fr5R,%fr4L,%fr7        ; fr7 = lo32(multiplier) * hi32(limb)
                     67:        fstd            %fr7,-120(%r30)
                     68:        xmpyu           %fr5L,%fr4R,%fr8        ; fr8 = hi32(multiplier) * lo32(limb)
                     69:        fstd            %fr8,-112(%r30)
                     70:        xmpyu           %fr5L,%fr4L,%fr9        ; fr9 = hi32(multiplier) * hi32(limb)
                     71:        fstd            %fr9,-104(%r30)
                     72:        ldd             -128(%r30),lo           ; lo = low 64 bit of product
                     73:        ldd             -120(%r30),m0           ; m0 = mid0 64 bit of product
                     74:        ldd             -112(%r30),m1           ; m1 = mid1 64 bit of product
                     75:        ldd             -104(%r30),hi           ; hi = high 64 bit of product
                     76:        addib,=         -1,%r24,L$end1          ; count -= 1; if now zero there was one limb: only the final combine remains
                     77:        nop                                     ; branch delay slot
                     78:        fldd            0(sptr),%fr4            ; fr4 = second source limb
                     79:        ldo             8(sptr),sptr
                     80:        addib,=         -1,%r24,L$end2          ; count -= 1; if now zero there were two limbs: skip the loop
                     81:        nop                                     ; branch delay slot
                     82: L$loop
                     83:        xmpyu           %fr5R,%fr4R,%fr6        ; start the partial products of the limb now in fr4
                     84:        fstd            %fr6,-128(%r30)
                     85:        xmpyu           %fr5R,%fr4L,%fr7
                     86:        fstd            %fr7,-120(%r30)
                     87:        xmpyu           %fr5L,%fr4R,%fr8
                     88:        fstd            %fr8,-112(%r30)
                     89:        xmpyu           %fr5L,%fr4L,%fr9
                     90:        fstd            %fr9,-104(%r30)
                     91:        ldd             0(rptr),rlimb           ; rlimb = current destination limb
                     92:        extrd,u         lo,31,32,t1             ; t1 = hi32(lo)
                     93:        extrd,u         lo,63,32,t4             ; t4 = lo32(lo)
                     94:        add,l           m0,t1,t1                ; t1 += m0
                     95:        add,l,*nuv      m1,t1,t1                ; t1 += m1; next insn is nullified unless this carried out
                     96:         add,l          %r5,hi,hi               ; propagate carry: hi += 1 << 32
                     97:        extrd,u         t1,31,32,t2             ; t2 = hi32(t1)
                     98:        depd,z          t1,31,32,t5             ; t5 = lo32(t1) << 32
                     99:        add,l           t5,t4,t4                ; t4 = lo32(lo) + (lo32(t1) << 32), the low product limb
                    100:        ldd             -128(%r30),lo           ; lo = low 64 bit of product
                    101:        add             cylimb,t4,t4            ; add the carry limb from the previous step; sets the carry bit
                    102:        ldd             -120(%r30),m0           ; m0 = mid0 64 bit of product
                    103:        add,dc          t2,hi,cylimb            ; cylimb = hi + hi32(t1) + carry
                    104:        ldd             -112(%r30),m1           ; m1 = mid1 64 bit of product
                    105:        sub             rlimb,t4,t3             ; t3 = rlimb - t4, the new destination limb
                    106:        add             t4,t3,%r0               ; carry is set exactly when the subtraction wrapped (borrow)
                    107:        ldd             -104(%r30),hi           ; hi = high 64 bit of product
                    108:        add,dc          %r0,cylimb,cylimb       ; fold the borrow into cylimb
                    109:        fldd            0(sptr),%fr4            ; fetch the source limb for the next pass
                    110:        ldo             8(sptr),sptr
                    111:        std             t3,0(rptr)              ; store the new destination limb
                    112:        addib,<>        -1,%r24,L$loop          ; count -= 1; loop while nonzero
                    113:        ldo             8(rptr),rptr            ; (delay slot) advance destination pointer
                    114: L$end2
                    115:        xmpyu           %fr5R,%fr4R,%fr6        ; partial products of the last source limb
                    116:        fstd            %fr6,-128(%r30)
                    117:        xmpyu           %fr5R,%fr4L,%fr7
                    118:        fstd            %fr7,-120(%r30)
                    119:        xmpyu           %fr5L,%fr4R,%fr8
                    120:        fstd            %fr8,-112(%r30)
                    121:        xmpyu           %fr5L,%fr4L,%fr9
                    122:        fstd            %fr9,-104(%r30)
                    123:        ldd             0(rptr),rlimb           ; rlimb = next-to-last destination limb
                    124:        extrd,u         lo,31,32,t1             ; t1 = hi32(lo)
                    125:        extrd,u         lo,63,32,t4             ; t4 = lo32(lo)
                    126:        add,l           m0,t1,t1                ; t1 += m0
                    127:        add,l,*nuv      m1,t1,t1                ; t1 += m1; next insn is nullified unless this carried out
                    128:         add,l          %r5,hi,hi               ; propagate carry: hi += 1 << 32
                    129:        extrd,u         t1,31,32,t2             ; t2 = hi32(t1)
                    130:        depd,z          t1,31,32,t5             ; t5 = lo32(t1) << 32
                    131:        add,l           t5,t4,t4                ; t4 = lo32(lo) + (lo32(t1) << 32), the low product limb
                    132:        ldd             -128(%r30),lo           ; lo = low 64 bit of product
                    133:        add             cylimb,t4,t4            ; add the carry limb from the previous step; sets the carry bit
                    134:        ldd             -120(%r30),m0           ; m0 = mid0 64 bit of product
                    135:        add,dc          t2,hi,cylimb            ; cylimb = hi + hi32(t1) + carry
                    136:        ldd             -112(%r30),m1           ; m1 = mid1 64 bit of product
                    137:        sub             rlimb,t4,t3             ; t3 = rlimb - t4, the new destination limb
                    138:        add             t4,t3,%r0               ; carry is set exactly when the subtraction wrapped (borrow)
                    139:        ldd             -104(%r30),hi           ; hi = high 64 bit of product
                    140:        add,dc          %r0,cylimb,cylimb       ; fold the borrow into cylimb
                    141:        std             t3,0(rptr)              ; store the new destination limb
                    142:        ldo             8(rptr),rptr            ; advance destination pointer
                    143: L$end1
                    144:        ldd             0(rptr),rlimb           ; rlimb = last destination limb
                    145:        extrd,u         lo,31,32,t1             ; t1 = hi32(lo)
                    146:        extrd,u         lo,63,32,t4             ; t4 = lo32(lo)
                    147:        add,l           m0,t1,t1                ; t1 += m0
                    148:        add,l,*nuv      m1,t1,t1                ; t1 += m1; next insn is nullified unless this carried out
                    149:         add,l          %r5,hi,hi               ; propagate carry: hi += 1 << 32
                    150:        extrd,u         t1,31,32,t2             ; t2 = hi32(t1)
                    151:        depd,z          t1,31,32,t5             ; t5 = lo32(t1) << 32
                    152:        add,l           t5,t4,t4                ; t4 = lo32(lo) + (lo32(t1) << 32), the low product limb
                    153:        add             cylimb,t4,t4            ; add the carry limb from the previous step; sets the carry bit
                    154:        add,dc          t2,hi,cylimb            ; cylimb = hi + hi32(t1) + carry
                    155:        sub             rlimb,t4,t3             ; t3 = rlimb - t4, the new destination limb
                    156:        add             t4,t3,%r0               ; carry is set exactly when the subtraction wrapped (borrow)
                    157:        add,dc          %r0,cylimb,cylimb       ; fold the borrow into cylimb
                    158:        std             t3,0(rptr)              ; store the last destination limb
                    159:        ldo             8(rptr),rptr
                    160: ; restore the registers saved on entry
                    161:        ldd             -96(%r30),%r3
                    162:        ldd             -88(%r30),%r4
                    163:        ldd             -80(%r30),%r5
                    164:        ldd             -72(%r30),%r6
                    165:
                    166:        extrd,u         cylimb,31,32,%r28       ; %r28 = hi32(cylimb); cylimb itself stays in %r29 (presumably the 64-bit return pair, confirm against the ABI)
                    167:        bve             (%r2)                   ; return through the address in %r2
                    168:        .exit
                    169:        ldo             -128(%r30),%r30         ; (delay slot) release the 128-byte frame
                    170:        .procend
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>