[BACK]Return to sub_n.s CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / m88k

Annotation of OpenXM_contrib/gmp/mpn/m88k/sub_n.s, Revision 1.1.1.3

1.1.1.3 ! ohara       1: ; mc88100 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
1.1       maekawa     2: ; store difference in a third limb vector.
                      3:
1.1.1.3 ! ohara       4: ; Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
1.1       maekawa     5:
                      6: ; This file is part of the GNU MP Library.
                      7:
                      8: ; The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.2   maekawa     9: ; it under the terms of the GNU Lesser General Public License as published by
                     10: ; the Free Software Foundation; either version 2.1 of the License, or (at your
1.1       maekawa    11: ; option) any later version.
                     12:
                     13: ; The GNU MP Library is distributed in the hope that it will be useful, but
                     14: ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.2   maekawa    15: ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
1.1       maekawa    16: ; License for more details.
                     17:
1.1.1.2   maekawa    18: ; You should have received a copy of the GNU Lesser General Public License
1.1       maekawa    19: ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: ; MA 02111-1307, USA.
                     22:
                     23:
                     24: ; INPUT PARAMETERS
                     25: ; res_ptr      r2
                     26: ; s1_ptr       r3
                     27: ; s2_ptr       r4
                     28: ; size         r5
                     29:
                     30: ; This code has been optimized to run one instruction per clock, avoiding
                     31: ; load stalls and writeback contention.  As a result, the instruction
                     32: ; order is not always natural.
                     33:
                     34: ; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
                     35: ; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
                     36:
                     ; ___gmpn_sub_n: subtract the s2 limb vector from the s1 limb vector,
                     ; propagating the borrow through the carry bit, and store each
                     ; difference limb through res_ptr.
                     ; In:   r2 = res_ptr, r3 = s1_ptr, r4 = s2_ptr, r5 = size in limbs.
                     ;       size must be > 0; size = 0 is not handled by this code.
                     ; Out:  r2 = final carry bit inverted (see the last three instructions).
                     ; Modifies r2-r10 and r12; returns through r1.
                     ; The loop body is unrolled 8 times with 4 instructions per limb.  A
                     ; computed jump relative to Lbase skips the first (-size mod 8) limb
                     ; steps, so the first pass handles size mod 8 limbs (a full 8 when size
                     ; is a multiple of 8) and every later pass handles 8.
                     37:        text
                     38:        align    16
1.1.1.2   maekawa    39:        global   ___gmpn_sub_n
                     40: ___gmpn_sub_n:
1.1       maekawa    41:        ld      r6,r3,0                 ; read first limb from s1_ptr
                     42:        extu    r10,r5,3                ; r10 = size >> 3, the loop counter tested at the bottom
                     43:        ld      r7,r4,0                 ; read first limb from s2_ptr
                     44:
                     45:        subu    r5,r0,r5                ; r5 = -size
                     46:        mak     r5,r5,3<4>              ; r5 = ((-size) mod 8) * 16: byte offset of the entry step past Lbase
                     47:        bcnd.n  eq0,r5,Lzero            ; size is a multiple of 8: enter at Lzero (next insn is the delay slot)
                     48:        subu.co r0,r0,r0                ; initialize carry (0 - 0, so no borrow is pending)
                     49:
                     50:        or      r12,r0,lo16(Lbase)      ; r12 = low 16 bits of the address of Lbase
                     51:        or.u    r12,r12,hi16(Lbase)     ; merge in the high 16 bits of the address of Lbase
                     52:        addu    r12,r12,r5              ; r12 is address for entering in loop
                     53:
                     54:        extu    r5,r5,2                 ; divide by 4: r5 = number of skipped limb steps * 4 bytes
                     55:        subu    r2,r2,r5                ; adjust res_ptr so the entry step's offset hits limb 0
                     56:        subu    r3,r3,r5                ; adjust s1_ptr likewise
                     57:        subu    r4,r4,r5                ; adjust s2_ptr likewise
                     58:
                     59:        or      r8,r6,r0                ; copy first s1 limb to r8: some entry steps subtract r8,r9 first
                     60:
                     61:        jmp.n   r12                     ; jump into the unrolled body at the computed step
                     62:         or     r9,r7,r0                ; (delay slot) copy first s2 limb to r9
                     63:
                     64: Loop:  addu    r3,r3,32                ; advance s1_ptr by 8 limbs
                     65:        st      r8,r2,28                ; store last limb of the previous pass (r2 not yet advanced)
                     66:        addu    r4,r4,32                ; advance s2_ptr by 8 limbs
                     67:        ld      r6,r3,0                 ; load s1 limb 0 of this pass
                     68:        addu    r2,r2,32                ; advance res_ptr by 8 limbs
                     69:        ld      r7,r4,0                 ; load s2 limb 0 of this pass
                     70: Lzero: subu    r10,r10,1               ; subtract 0 + 8r limbs (adj loop cnt)
                     71: Lbase: ld      r8,r3,4                 ; Lbase is the reference point for the computed entry; load s1 limb 1
                     72:        subu.cio r6,r6,r7               ; limb 0: s1 - s2 using and updating the carry bit
                     73:        ld      r9,r4,4                 ; load s2 limb 1
                     74:        st      r6,r2,0                 ; store result limb 0
                     75:        ld      r6,r3,8                 ; subtract 7 + 8r limbs (entry step when size mod 8 = 7)
                     76:        subu.cio r8,r8,r9               ; limb 1
                     77:        ld      r7,r4,8
                     78:        st      r8,r2,4
                     79:        ld      r8,r3,12                ; subtract 6 + 8r limbs (entry step when size mod 8 = 6)
                     80:        subu.cio r6,r6,r7               ; limb 2
                     81:        ld      r9,r4,12
                     82:        st      r6,r2,8
                     83:        ld      r6,r3,16                ; subtract 5 + 8r limbs (entry step when size mod 8 = 5)
                     84:        subu.cio r8,r8,r9               ; limb 3
                     85:        ld      r7,r4,16
                     86:        st      r8,r2,12
                     87:        ld      r8,r3,20                ; subtract 4 + 8r limbs (entry step when size mod 8 = 4)
                     88:        subu.cio r6,r6,r7               ; limb 4
                     89:        ld      r9,r4,20
                     90:        st      r6,r2,16
                     91:        ld      r6,r3,24                ; subtract 3 + 8r limbs (entry step when size mod 8 = 3)
                     92:        subu.cio r8,r8,r9               ; limb 5
                     93:        ld      r7,r4,24
                     94:        st      r8,r2,20
                     95:        ld      r8,r3,28                ; subtract 2 + 8r limbs (entry step when size mod 8 = 2)
                     96:        subu.cio r6,r6,r7               ; limb 6
                     97:        ld      r9,r4,28
                     98:        st      r6,r2,24
                     99:        bcnd.n  ne0,r10,Loop            ; subtract 1 + 8r limbs; loop again while r10 != 0
                    100:         subu.cio r8,r8,r9              ; (delay slot) limb 7; stored at Loop or just below
                    101:
                    102:        st      r8,r2,28                ; store most significant limb
                    103:
                    104:        addu.ci r2,r0,r0                ; r2 = 0 + 0 + carry: carry-out from the most significant limb
                    105:        jmp.n    r1                     ; return to the address in r1
                    106:         xor    r2,r2,1                 ; (delay slot) invert the carry bit to form the return value

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>