OpenXM_contrib/gmp/mpn/pa64w/submul_1.S - annotate

Return to submul_1.S CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / pa64w
Annotation of OpenXM_contrib/gmp/mpn/pa64w/submul_1.S, Revision 1.1.1.1

1.1       maekawa     1: ; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
                      2: ; subtract the result from a second limb vector.
                      3:
                      4: ; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
                      5:
                      6: ; This file is part of the GNU MP Library.
                      7:
                      8: ; The GNU MP Library is free software; you can redistribute it and/or modify
                      9: ; it under the terms of the GNU Lesser General Public License as published by
                     10: ; the Free Software Foundation; either version 2.1 of the License, or (at your
                     11: ; option) any later version.
                     12:
                     13: ; The GNU MP Library is distributed in the hope that it will be useful, but
                     14: ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     15: ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     16: ; License for more details.
                     17:
                     18: ; You should have received a copy of the GNU Lesser General Public License
                     19: ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: ; MA 02111-1307, USA.
                     22:
                      23: ; INPUT PARAMETERS
                          ; rptr   (%r26): limb vector that is loaded, reduced by the product and
                          ;                stored back, one 8-byte limb at a time
                          ; sptr   (%r25): limb vector supplying the multiplicand limbs
                          ; size   (%r24): limb count; the code decrements %r24 directly with addib
                          ; s2limb (%r23): multiplier limb; copied into %fr5 on entry
                      24: #define rptr           %r26
                      25: #define sptr           %r25
                      26: #define size           %r24
                      27: #define s2limb         %r23
                     28:
                     29: ; This runs at 11 cycles/limb on a PA8000.  It might be possible to make
                      30: ; it faster, but the PA8000 pipeline is not publicly documented and it
                      31: ; is very complex to reverse engineer.
                      32:
                          ; WORKING REGISTERS
                          ; lo, m0, m1 and hi receive the four 64-bit partial products of one
                          ; limb multiplication; t1..t5 are temporaries used to combine them.
                          ; rlimb holds the limb loaded through the destination pointer and
                          ; cylimb carries the high part (plus borrow) on to the next limb.
                          ; t5 shares %r23 with the multiplier argument, which is only read by
                          ; the first instruction of the procedure.
                          ; %r3, %r4 and %r6 (m1, t3, t2) are saved on entry and restored on exit.
                      33: #define t1 %r19
                      34: #define rlimb %r20
                      35: #define hi %r21
                      36: #define lo %r22
                      37: #define m0 %r28
                      38: #define m1 %r3
                      39: #define cylimb %r29
                      40: #define t3 %r4
                      41: #define t2 %r6
                      42: #define t5 %r23
                      43: #define t4 %r31
                      44:        .level  2.0w
                      45:        .code
                      46:        .export __gmpn_submul_1,entry
                      47: __gmpn_submul_1
                      48:        .proc
                      49:        .callinfo frame=128,no_calls
                      50:        .entry
                      51:        std             s2limb,-56(%r30)        ; spill the multiplier to memory ...
                      52:         fldd           -56(%r30),%fr5          ; ... and reload it into FP register %fr5
                      53:        ldo             128(%r30),%r30          ; allocate the 128-byte frame
                      54:        add             %r0,%r0,cylimb          ; clear cy and cylimb
                      55:
                      56:        std             %r3,-96(%r30)           ; save %r3 (used as m1)
                      57:        std             %r4,-88(%r30)           ; save %r4 (used as t3)
                      58:        std             %r5,-80(%r30)           ; save %r5 (carry constant below)
                      59:        std             %r6,-72(%r30)           ; save %r6 (used as t2)
                      60:        depdi,z         1,31,1,%r5              ; %r5 = 2^32: weight in hi of a carry out of m0+m1
                      61:
                      62:        fldd            0(sptr),%fr4            ; %fr4 = first source limb
                      63:        ldo             8(sptr),sptr            ; advance sptr
                      64:
                      65:        xmpyu           %fr5R,%fr4R,%fr6        ; multiplier low word  * limb low word
                      66:        fstd            %fr6,-128(%r30)         ; spill it; reloaded below as lo
                      67:        xmpyu           %fr5R,%fr4L,%fr7        ; multiplier low word  * limb high word
                      68:        fstd            %fr7,-120(%r30)         ; spill it; reloaded below as m0
                      69:        xmpyu           %fr5L,%fr4R,%fr8        ; multiplier high word * limb low word
                      70:        fstd            %fr8,-112(%r30)         ; spill it; reloaded below as m1
                      71:        xmpyu           %fr5L,%fr4L,%fr9        ; multiplier high word * limb high word
                      72:        fstd            %fr9,-104(%r30)         ; spill it; reloaded below as hi
                      73:        ldd             -128(%r30),lo           ; lo = low 64 bit of product
                      74:        ldd             -120(%r30),m0           ; m0 = mid0 64 bit of product
                      75:        ldd             -112(%r30),m1           ; m1 = mid1 64 bit of product
                      76:        ldd             -104(%r30),hi           ; hi = high 64 bit of product
                      77:        addib,=         -1,%r24,L$end1          ; count down; a single limb only needs the final stage
                      78:        nop                                     ; branch delay slot
                      79:        fldd            0(sptr),%fr4            ; %fr4 = second source limb
                      80:        ldo             8(sptr),sptr            ; advance sptr
                      81:        addib,=         -1,%r24,L$end2          ; count down; two limbs skip the loop
                      82:        nop                                     ; branch delay slot
                      83: L$loop
                      84:        xmpyu           %fr5R,%fr4R,%fr6        ; start the four partial products of the limb in %fr4
                      85:        fstd            %fr6,-128(%r30)         ; low * low   -> slot reloaded as lo
                      86:        xmpyu           %fr5R,%fr4L,%fr7
                      87:        fstd            %fr7,-120(%r30)         ; low * high  -> slot reloaded as m0
                      88:        xmpyu           %fr5L,%fr4R,%fr8
                      89:        fstd            %fr8,-112(%r30)         ; high * low  -> slot reloaded as m1
                      90:        xmpyu           %fr5L,%fr4L,%fr9
                      91:        fstd            %fr9,-104(%r30)         ; high * high -> slot reloaded as hi
                      92:        ldd             0(rptr),rlimb           ; rlimb = limb at rptr (for the previous product)
                      93:        extrd,u         lo,31,32,t1             ; t1 = hi32(lo)
                      94:        extrd,u         lo,63,32,t4             ; t4 = lo32(lo)
                      95:        add,l           m0,t1,t1                ; t1 += m0
                      96:        add,l,*nuv      m1,t1,t1                ; t1 += m1; next insn is nullified unless this overflows
                      97:         add,l          %r5,hi,hi               ; propagate carry
                      98:        extrd,u         t1,31,32,t2             ; t2 = hi32(t1)
                      99:        depd,z          t1,31,32,t5             ; t5 = lo32(t1) << 32
                     100:        add,l           t5,t4,t4                ; t4 += t5, giving the low 64 bits of the product
                     101:        ldd             -128(%r30),lo           ; lo = low 64 bit of product
                     102:        add             cylimb,t4,t4            ; add the carry-in limb; sets carry
                     103:        ldd             -120(%r30),m0           ; m0 = mid0 64 bit of product
                     104:        add,dc          t2,hi,cylimb            ; cylimb = hi + hi32(t1) + carry
                     105:        ldd             -112(%r30),m1           ; m1 = mid1 64 bit of product
                     106:        sub             rlimb,t4,t3             ; t3 = rlimb - t4
                     107:        add             t4,t3,%r0               ; result discarded; carry is set iff the subtract wrapped
                     108:        ldd             -104(%r30),hi           ; hi = high 64 bit of product
                     109:        add,dc          %r0,cylimb,cylimb       ; fold that borrow into cylimb
                     110:        fldd            0(sptr),%fr4            ; %fr4 = next source limb
                     111:        ldo             8(sptr),sptr            ; advance sptr
                     112:        std             t3,0(rptr)              ; store the result limb
                     113:        addib,<>        -1,%r24,L$loop          ; count down; loop while limbs remain
                     114:        ldo             8(rptr),rptr            ; branch delay slot: advance rptr
                     115: L$end2
                     116:        xmpyu           %fr5R,%fr4R,%fr6        ; partial products of the last source limb
                     117:        fstd            %fr6,-128(%r30)         ; low * low   -> slot reloaded as lo
                     118:        xmpyu           %fr5R,%fr4L,%fr7
                     119:        fstd            %fr7,-120(%r30)         ; low * high  -> slot reloaded as m0
                     120:        xmpyu           %fr5L,%fr4R,%fr8
                     121:        fstd            %fr8,-112(%r30)         ; high * low  -> slot reloaded as m1
                     122:        xmpyu           %fr5L,%fr4L,%fr9
                     123:        fstd            %fr9,-104(%r30)         ; high * high -> slot reloaded as hi
                     124:        ldd             0(rptr),rlimb           ; rlimb = limb at rptr (for the previous product)
                     125:        extrd,u         lo,31,32,t1             ; t1 = hi32(lo)
                     126:        extrd,u         lo,63,32,t4             ; t4 = lo32(lo)
                     127:        add,l           m0,t1,t1                ; t1 += m0
                     128:        add,l,*nuv      m1,t1,t1                ; t1 += m1; next insn is nullified unless this overflows
                     129:         add,l          %r5,hi,hi               ; propagate carry
                     130:        extrd,u         t1,31,32,t2             ; t2 = hi32(t1)
                     131:        depd,z          t1,31,32,t5             ; t5 = lo32(t1) << 32
                     132:        add,l           t5,t4,t4                ; t4 += t5, giving the low 64 bits of the product
                     133:        ldd             -128(%r30),lo           ; lo = low 64 bit of product
                     134:        add             cylimb,t4,t4            ; add the carry-in limb; sets carry
                     135:        ldd             -120(%r30),m0           ; m0 = mid0 64 bit of product
                     136:        add,dc          t2,hi,cylimb            ; cylimb = hi + hi32(t1) + carry
                     137:        ldd             -112(%r30),m1           ; m1 = mid1 64 bit of product
                     138:        sub             rlimb,t4,t3             ; t3 = rlimb - t4
                     139:        add             t4,t3,%r0               ; result discarded; carry is set iff the subtract wrapped
                     140:        ldd             -104(%r30),hi           ; hi = high 64 bit of product
                     141:        add,dc          %r0,cylimb,cylimb       ; fold that borrow into cylimb
                     142:        std             t3,0(rptr)              ; store the result limb
                     143:        ldo             8(rptr),rptr            ; advance rptr
                     144: L$end1
                     145:        ldd             0(rptr),rlimb           ; rlimb = last limb at rptr
                     146:        extrd,u         lo,31,32,t1             ; t1 = hi32(lo)
                     147:        extrd,u         lo,63,32,t4             ; t4 = lo32(lo)
                     148:        add,l           m0,t1,t1                ; t1 += m0
                     149:        add,l,*nuv      m1,t1,t1                ; t1 += m1; next insn is nullified unless this overflows
                     150:         add,l          %r5,hi,hi               ; propagate carry
                     151:        extrd,u         t1,31,32,t2             ; t2 = hi32(t1)
                     152:        depd,z          t1,31,32,t5             ; t5 = lo32(t1) << 32
                     153:        add,l           t5,t4,t4                ; t4 += t5, giving the low 64 bits of the product
                     154:        add             cylimb,t4,t4            ; add the carry-in limb; sets carry
                     155:        add,dc          t2,hi,cylimb            ; cylimb = hi + hi32(t1) + carry
                     156:        sub             rlimb,t4,t3             ; t3 = rlimb - t4
                     157:        add             t4,t3,%r0               ; result discarded; carry is set iff the subtract wrapped
                     158:        add,dc          %r0,cylimb,cylimb       ; fold that borrow into cylimb
                     159:        std             t3,0(rptr)              ; store the final result limb
                     160:        ldo             8(rptr),rptr            ; advance rptr (not read again before the return)
                     161:
                     162:        ldd             -96(%r30),%r3           ; restore the registers saved on entry
                     163:        ldd             -88(%r30),%r4
                     164:        ldd             -80(%r30),%r5
                     165:        ldd             -72(%r30),%r6
                     166:
                     167:        copy            cylimb,%r28             ; %r28 = final carry limb; NOTE(review): presumably the return value register, confirm against the ABI
                     168:        bve             (%r2)                   ; branch to the address in %r2
                     169:        .exit
                     170:        ldo             -128(%r30),%r30         ; branch delay slot: release the 128-byte frame
                     171:        .procend
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>