OpenXM_contrib/gmp/mpn/m88k/add_n.s - annotate

Return to add_n.s CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / m88k
Annotation of OpenXM_contrib/gmp/mpn/m88k/add_n.s, Revision 1.1.1.1

1.1       maekawa     1: ; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
                      2: ; sum in a third limb vector.
                      3:
                      4: ; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
                      5:
                      6: ; This file is part of the GNU MP Library.
                      7:
                      8: ; The GNU MP Library is free software; you can redistribute it and/or modify
                      9: ; it under the terms of the GNU Library General Public License as published by
                     10: ; the Free Software Foundation; either version 2 of the License, or (at your
                     11: ; option) any later version.
                     12:
                     13: ; The GNU MP Library is distributed in the hope that it will be useful, but
                     14: ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     15: ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
                     16: ; License for more details.
                     17:
                     18: ; You should have received a copy of the GNU Library General Public License
                     19: ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     20: ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     21: ; MA 02111-1307, USA.
                     22:
                     23:
                     24: ; INPUT PARAMETERS
                     25: ; res_ptr      r2
                     26: ; s1_ptr       r3
                     27: ; s2_ptr       r4
                     28: ; size         r5
                     29:
                     30: ; This code has been optimized to run one instruction per clock, avoiding
                     31: ; load stalls and writeback contention.  As a result, the instruction
                     32: ; order is not always natural.
                     33:
                     34: ; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
                     35: ; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
                      36: ; ___mpn_add_n: add the size-limb vectors at s1_ptr and s2_ptr, store the sums at res_ptr, return the carry-out in r2.
                      37:        text
                      38:        align    16
                      39:        global   ___mpn_add_n
                      40: ___mpn_add_n:
                      41:        ld      r6,r3,0                 ; read first limb from s1_ptr
                      42:        extu    r10,r5,3                ; r10 = size >> 3, the loop counter tested by bcnd.n at the loop bottom
                      43:        ld      r7,r4,0                 ; read first limb from s2_ptr
                      44: ; r5 = ((-size) & 7) << 4: byte offset from Lbase of the first limb step to run (each step is 4 instructions = 16 bytes).
                      45:        subu.co r5,r0,r5                ; (clear carry as side effect)
                      46:        mak     r5,r5,3<4>              ; keep the low 3 bits of -size, shifted left by 4
                      47:        bcnd    eq0,r5,Lzero            ; size is a multiple of 8: enter at Lzero for a full 8-limb pass
                      48: ; Otherwise jump into the middle of the unrolled loop so the first pass handles only the size mod 8 leftover limbs.
                      49:        or      r12,r0,lo16(Lbase)      ; r12 = low 16 bits of the address of Lbase
                      50:        or.u    r12,r12,hi16(Lbase)     ; merge in the high 16 bits
                      51:        addu    r12,r12,r5              ; r12 is address for entering in loop
                      52: ; Bias the three pointers back by the skipped limbs (4 bytes each) so the fixed offsets in the loop body line up.
                      53:        extu    r5,r5,2                 ; divide by 4
                      54:        subu    r2,r2,r5                ; adjust res_ptr
                      55:        subu    r3,r3,r5                ; adjust s1_ptr
                      56:        subu    r4,r4,r5                ; adjust s2_ptr
                      57: ; Steps alternate between the r6/r7 and r8/r9 register pairs; copy the first limbs so either kind of entry step sees them.
                      58:        or      r8,r6,r0                ; r8 = first s1 limb
                      59:
                      60:        jmp.n   r12                     ; enter the loop at the computed step; next instruction runs in the delay slot
                      61:         or     r9,r7,r0                ; (delay slot) r9 = first s2 limb
                      62: ; Loop: move on to the next group of 8 limbs; the previous group's last sum (r8) is stored on the way.
                      63: Loop:  addu    r3,r3,32                ; advance s1_ptr by 8 limbs
                      64:        st      r8,r2,28                ; store last sum of the previous group (res_ptr not yet advanced)
                      65:        addu    r4,r4,32                ; advance s2_ptr by 8 limbs
                      66:        ld      r6,r3,0                 ; first s1 limb of this group
                      67:        addu    r2,r2,32                ; advance res_ptr by 8 limbs
                      68:        ld      r7,r4,0                 ; first s2 limb of this group
                      69: Lzero: subu    r10,r10,1               ; add 0 + 8r limbs (adj loop cnt)
                      70: Lbase: ld      r8,r3,4                 ; each step: preload the next s1 limb ...
                      71:        addu.cio r6,r6,r7               ; ... add the current pair with carry-in and carry-out ...
                      72:        ld      r9,r4,4                 ; ... preload the next s2 limb ...
                      73:        st      r6,r2,0                 ; ... and store the current sum
                      74:        ld      r6,r3,8                 ; add 7 + 8r limbs
                      75:        addu.cio r8,r8,r9
                      76:        ld      r7,r4,8
                      77:        st      r8,r2,4
                      78:        ld      r8,r3,12                ; add 6 + 8r limbs
                      79:        addu.cio r6,r6,r7
                      80:        ld      r9,r4,12
                      81:        st      r6,r2,8
                      82:        ld      r6,r3,16                ; add 5 + 8r limbs
                      83:        addu.cio r8,r8,r9
                      84:        ld      r7,r4,16
                      85:        st      r8,r2,12
                      86:        ld      r8,r3,20                ; add 4 + 8r limbs
                      87:        addu.cio r6,r6,r7
                      88:        ld      r9,r4,20
                      89:        st      r6,r2,16
                      90:        ld      r6,r3,24                ; add 3 + 8r limbs
                      91:        addu.cio r8,r8,r9
                      92:        ld      r7,r4,24
                      93:        st      r8,r2,20
                      94:        ld      r8,r3,28                ; add 2 + 8r limbs
                      95:        addu.cio r6,r6,r7
                      96:        ld      r9,r4,28
                      97:        st      r6,r2,24
                      98:        bcnd.n  ne0,r10,Loop            ; add 1 + 8r limbs
                      99:         addu.cio r8,r8,r9              ; (delay slot) add the last limb pair of the group; no further preload
                     100:
                     101:        st      r8,r2,28                ; store most significant limb
                     102:
                     103:        jmp.n    r1                     ; return through r1; next instruction runs in the delay slot
                     104:         addu.ci r2,r0,r0               ; return carry-out from most sign. limb
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>