OpenXM_contrib/gmp/mpn/m88k/mul_1.s - annotate

Return to mul_1.s CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / m88k
Annotation of OpenXM_contrib/gmp/mpn/m88k/mul_1.s, Revision 1.1

1.1     ! maekawa     1: ; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and
        !             2: ; store the product in a second limb vector.
        !             3:
        !             4: ; Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc.
        !             5:
        !             6: ; This file is part of the GNU MP Library.
        !             7:
        !             8: ; The GNU MP Library is free software; you can redistribute it and/or modify
        !             9: ; it under the terms of the GNU Library General Public License as published by
        !            10: ; the Free Software Foundation; either version 2 of the License, or (at your
        !            11: ; option) any later version.
        !            12:
        !            13: ; The GNU MP Library is distributed in the hope that it will be useful, but
        !            14: ; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
        !            15: ; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
        !            16: ; License for more details.
        !            17:
        !            18: ; You should have received a copy of the GNU Library General Public License
        !            19: ; along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
        !            20: ; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
        !            21: ; MA 02111-1307, USA.
        !            22:
        !            23:
        !            24: ; INPUT PARAMETERS
        !            25: ; res_ptr      r2
        !            26: ; s1_ptr       r3
        !            27: ; size         r4
        !            28: ; s2_limb      r5
        !            29:
        !            30: ; Common overhead is about 11 cycles/invocation.
        !            31:
        !            32: ; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb.  (The
        !            33: ; pipeline stalls 2 cycles due to WB contention.)
        !            34:
        !            35: ; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb.  (The
        !            36: ; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
        !            37:
        !            38: ; To enhance speed:
        !            39: ; 1. Unroll main loop 4-8 times.
        !            40: ; 2. Schedule code to avoid WB contention.  It might be tempting to move the
        !            41: ;    ld instruction in the loops down to save 2 cycles (less WB contention),
        !            42: ;    but that loses because the ultimate value will be read from outside
        !            43: ;    the allocated space.  But if we handle the ultimate multiplication in
        !            44: ;    the tail, we can do this.
        !            45: ; 3. Make the multiplication with fewer instructions.  I think the code for
        !            46: ;    (S2_LIMB >= 0x10000) is not minimal.
        !            47: ; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
        !            48: ; fewer cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
        !            49: ; cycles/limb.  (Assuming infinite unrolling.)
        !            50:
        !            51:        text
        !            52:        align    16
        !            53:        global   ___mpn_mul_1
        !            54: ___mpn_mul_1:
        !            55: ; Returns the most significant (carry-out) limb of the product in r2.
        !            56:        ; Make S1_PTR and RES_PTR point at the end of their blocks
        !            57:        ; and negate SIZE.
        !            58:        lda      r3,r3[r4]      ; S1_PTR = end of source block
        !            59:        lda      r6,r2[r4]      ; RES_PTR in r6 since r2 is retval
        !            60:        subu     r4,r0,r4       ; r4 = -SIZE; index counts up towards 0
        !            61:
        !            62:        addu.co  r2,r0,r0       ; r2 = cy = 0
        !            63:        ld       r9,r3[r4]      ; r9 = first s1_limb
        !            64:        mask     r7,r5,0xffff   ; r7 = lo(S2_LIMB)
        !            65:        extu     r8,r5,16       ; r8 = hi(S2_LIMB)
        !            66:        bcnd.n   eq0,r8,Lsmall  ; jump if (hi(S2_LIMB) == 0)
        !            67:         subu    r6,r6,4        ; (delay slot) bias RES_PTR by one limb: stores use r4 after its increment
        !            68:
        !            69: ; General code for any value of S2_LIMB.
        !            70:
        !            71:        ; Make a stack frame and save r25 and r26
        !            72:        subu     r31,r31,16
        !            73:        st.d     r25,r31,8
        !            74:
        !            75:        ; Enter the loop in the middle
        !            76:        br.n    L1
        !            77:        addu     r4,r4,1        ; (delay slot) r4 = index of the next limb to load
        !            78:
        !            79: Loop:  ld       r9,r3[r4]      ; fetch next s1_limb
        !            80:        st       r26,r6[r4]     ; store previous result limb
        !            81: ; bcnd ne0,r0,0                ; bubble
        !            82:        addu     r4,r4,1        ; advance index
        !            83: L1:    mul      r26,r9,r5      ; low word of product   mul_1   WB ld
        !            84:        mask     r12,r9,0xffff  ; r12 = lo(s1_limb)     mask_1
        !            85:        mul      r11,r12,r7     ; r11 =  prod_0         mul_2   WB mask_1
        !            86:        mul      r10,r12,r8     ; r10 = prod_1a         mul_3
        !            87:        extu     r13,r9,16      ; r13 = hi(s1_limb)     extu_1  WB mul_1
        !            88:        mul      r12,r13,r7     ; r12 = prod_1b         mul_4   WB extu_1
        !            89:        mul      r25,r13,r8     ; r25  = prod_2         mul_5   WB mul_2
        !            90:        extu     r11,r11,16     ; r11 = hi(prod_0)      extu_2  WB mul_3
        !            91:        addu     r10,r10,r11    ;                       addu_1  WB extu_2
        !            92: ; bcnd ne0,r0,0                ; bubble                        WB addu_1
        !            93:        addu.co  r10,r10,r12    ;                               WB mul_4
        !            94:        mask.u   r10,r10,0xffff ; move the 16 most significant bits...
        !            95:        addu.ci  r10,r10,r0     ; ...to the low half of the word...
        !            96:        rot      r10,r10,16     ; ...and put carry in pos 16.
        !            97:        addu.co  r26,r26,r2     ; add old carry limb
        !            98:        bcnd.n   ne0,r4,Loop    ; loop until index reaches 0
        !            99:         addu.ci r2,r25,r10     ; compute new carry limb
        !           100:
        !           101:        st       r26,r6[r4]     ; store last result limb
        !           102:        ld.d     r25,r31,8      ; restore r25 and r26
        !           103:        jmp.n    r1             ; return (r2 = carry limb)
        !           104:         addu    r31,r31,16     ; (delay slot) release the stack frame
        !           105:
        !           106: ; Fast code for S2_LIMB < 0x10000
        !           107: Lsmall:
        !           108:        ; Enter the loop in the middle
        !           109:        br.n    SL1
        !           110:        addu     r4,r4,1        ; (delay slot) r4 = index of the next limb to load
        !           111:
        !           112: SLoop: ld       r9,r3[r4]      ; fetch next s1_limb
        !           113:        st       r8,r6[r4]      ; store previous result limb
        !           114:        addu     r4,r4,1        ; advance index
        !           115: SL1:   mul      r8,r9,r5       ; low word of product (r8 is free: hi(S2_LIMB) == 0 on this path)
        !           116:        mask     r12,r9,0xffff  ; r12 = lo(s1_limb)
        !           117:        extu     r13,r9,16      ; r13 = hi(s1_limb)
        !           118:        mul      r11,r12,r7     ; r11 =  prod_0
        !           119:        mul      r12,r13,r7     ; r12 = prod_1b
        !           120:        addu.cio r8,r8,r2       ; add old carry limb (carry bit in and out)
        !           121:        extu     r10,r11,16     ; r10 = hi(prod_0)
        !           122:        addu     r10,r10,r12    ; r10 = hi(prod_0) + prod_1b
        !           123:        bcnd.n   ne0,r4,SLoop   ; loop until index reaches 0
        !           124:        extu     r2,r10,16      ; r2 = new carry limb
        !           125: ; NOTE(review): carry bit left by the last addu.cio does not appear to be folded into r2 -- confirm
        !           126:        jmp.n    r1             ; return (r2 = carry limb)
        !           127:        st       r8,r6[r4]      ; (delay slot) store last result limb
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>