[BACK]Return to addmul_3.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / alpha / ev6 / nails

Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_3.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  Alpha ev6 nails mpn_addmul_3.
                      2:
                      3: dnl  Copyright 2002 Free Software Foundation, Inc.
                      4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24: dnl  INPUT PARAMETERS: rp = limbs read and written back, up = limbs multiplied, n = limb count, vp = the three multiplier limbs
                     25: define(`rp',`r16')
                     26: define(`up',`r17')
                     27: define(`n',`r18')
                     28: define(`vp',`r19')
                     29:
                     30: dnl  Useful register aliases: numb_mask = all ones shifted right by NAIL_BITS, ulimb = current limb from up, rlimb = current limb from rp
                     31: define(`numb_mask',`r24')
                     32: define(`ulimb',`r25')
                     33: define(`rlimb',`r27')
                     34: dnl  Multiply results for v0, v1, v2: each `a' register receives the mulq result, each `b' register the umulh result
                     35: define(`m0a',`r0')
                     36: define(`m0b',`r1')
                     37: define(`m1a',`r2')
                     38: define(`m1b',`r3')
                     39: define(`m2a',`r20')
                     40: define(`m2b',`r21')
                     41: dnl  Column sums carried from one limb to the next
                     42: define(`acc0',`r4')
                     43: define(`acc1',`r5')
                     44: define(`acc2',`r22')
                     45: dnl  Multiplier limbs loaded from vp and then shifted left by NAIL_BITS
                     46: define(`v0',`r6')
                     47: define(`v1',`r7')
                     48: define(`v2',`r23')
                     49:
                     50: dnl Used for temps: r8 r19 r28 (r19 is also vp; it is reused only after v0-v2 have been loaded)
                     51: dnl  Short local names for the GMP_NAIL_BITS and GMP_NUMB_BITS settings
                     52: define(`NAIL_BITS',`GMP_NAIL_BITS')
                     53: define(`NUMB_BITS',`GMP_NUMB_BITS')
                     54:
                     55: dnl  This declaration is munged by configure
                     56: NAILS_SUPPORT(3-63)
                     57:
                     58: dnl  Runs at 3.0 cycles/limb.  With unrolling, the ulimb load and the 3
                     59: dnl  bookkeeping increments and the `bis' that copies from r21 to r22 could be
                     60: dnl  removed and the instruction count reduced from 26 to 21.  We could
                     61: dnl  thereby probably reach 2 cycles/limb, the IMUL bandwidth.
                     62:
                     63: dnl If this is going to be a Karatsuba basecase building block, we need some
                     64: dnl of the combinations below.  That way, we won't ever hit the
                     65: dnl slower mpn_addmul_1 for any huge multiplication.
                     66: dnl
                     67: dnl    Alt 3           Alt 4           Alt 5           Alt 6
                     68: dnl    addmul_2        addmul_2        addmul_3        addmul_3
                     69: dnl    addmul_3        addmul_3        addmul_4        addmul_4
                     70: dnl                    addmul_4        addmul_5        addmul_5
                     71: dnl                                                    addmul_6
                     72:
                     73: dnl Register usage:
                     74: dnl callee-saves:      r9 r10 r11 r12 r13 r14 r15
                     75: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
                     76: dnl         r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
                     77: dnl return address: 26
                     78: dnl global pointer: 29
                     79: dnl stack pointer: 30
                     80: dnl  mpn_addmul_3: rp[0..n-1] += up[0..n-1] * vp[0..2]; two further limbs are stored at rp[n] and rp[n+1], the top limb is left in r0.  n = 0 is not handled (n is decremented before its first test).
                     81: ASM_START()
                     82: PROLOGUE(mpn_addmul_3)
                     83:        lda     numb_mask,-1(r31)               C       all ones (r31 reads as zero)
                     84:        srl     numb_mask,NAIL_BITS,numb_mask   C       clear the top NAIL_BITS bits
                     85: C Fetch the three multiplier limbs; vp (r19) is not read again after this.
                     86:        ldq     v0,     0(vp)
                     87:        ldq     v1,     8(vp)
                     88:        ldq     v2,     16(vp)
                     89: C Clear the accumulators and pre-shift each v limb left by NAIL_BITS.  Assuming the nail bits of v are zero, umulh then yields the bits of v*u from 64-NAIL_BITS upwards and mulq>>NAIL_BITS the bits below that.
                     90:        bis     r31,    r31,    acc0            C       zero acc0
                     91:        sll     v0,NAIL_BITS,   v0
                     92:        bis     r31,    r31,    acc1            C       zero acc1
                     93:        sll     v1,NAIL_BITS,   v1
                     94:        bis     r31,    r31,    acc2            C       zero acc2
                     95:        sll     v2,NAIL_BITS,   v2
                     96:        bis     r31,    r31,    r19             C       r19 (was vp) = 0: incoming nail carry
                     97: C Start the six multiplies for the first u limb before entering the loop; if that was the only limb, go straight to Lend.
                     98: C MAIN LOOP
                     99:        ldq     ulimb,  0(up)
                    100:        lda     up,     8(up)
                    101:        mulq    v0,     ulimb,  m0a             C U1
                    102:        umulh   v0,     ulimb,  m0b             C U1
                    103:        mulq    v1,     ulimb,  m1a             C U1
                    104:        umulh   v1,     ulimb,  m1b             C U1
                    105:        lda     n,      -1(n)
                    106:        mulq    v2,     ulimb,  m2a             C U1
                    107:        umulh   v2,     ulimb,  m2b             C U1
                    108:        beq     n,      Lend                    C U0
                    109:        ALIGN(16)
                    110: Loop:
                    111:        bis     r31,    r31,    r31             C       nop
                    112:        ldq     rlimb,  0(rp)
                    113:        ldq     ulimb,  0(up)
                    114:        addq    r19,    acc0,   acc0            C       propagate nail
                    115: C Low part of the previous v0 product -> r8; m0a is then reused for the new ulimb.
                    116:        lda     rp,     8(rp)
                    117:        srl     m0a,NAIL_BITS,  r8              C U0
                    118:        lda     up,     8(up)
                    119:        mulq    v0,     ulimb,  m0a             C U1
                    120: C r19 = column sum for this rp limb; acc0 takes over the next column (previous v0 high part + old acc1).
                    121:        addq    r8,     acc0,   r19
                    122:        addq    m0b,    acc1,   acc0
                    123:        umulh   v0,     ulimb,  m0b             C U1
                    124:        bis     r31,    r31,    r31             C       nop
                    125: C Add in the limb read from rp; low part of the previous v1 product -> r8.
                    126:        addq    rlimb,  r19,    r19
                    127:        srl     m1a,NAIL_BITS,  r8              C U0
                    128:        bis     r31,    r31,    r31             C       nop
                    129:        mulq    v1,     ulimb,  m1a             C U1
                    130: C Finish the new acc0; acc1 takes over the following column (previous v1 high part + old acc2).
                    131:        addq    r8,     acc0,   acc0
                    132:        addq    m1b,    acc2,   acc1
                    133:        umulh   v1,     ulimb,  m1b             C U1
                    134:        and     r19,numb_mask,  r28             C       extract numb part
                    135: C Low part of the previous v2 product -> r8; count this limb.
                    136:        bis     r31,    r31,    r31             C       nop
                    137:        srl     m2a,NAIL_BITS,  r8              C U0
                    138:        lda     n,      -1(n)
                    139:        mulq    v2,     ulimb,  m2a             C U1
                    140: C Finish the new acc1; acc2 restarts from the previous v2 high part.
                    141:        addq    r8,     acc1,   acc1
                    142:        bis     r31,    m2b,    acc2
                    143:        umulh   v2,     ulimb,  m2b             C U1
                    144:        srl     r19,NUMB_BITS,  r19             C       extract nail part
                    145: C Store the numb part at the limb just read (rp has already been advanced by 8).
                    146:        bis     r31,    r31,    r31             C       nop
                    147:        stq     r28,    -8(rp)
                    148:
                    149:        bne     n,      Loop                    C U0
                    150: C END LOOP
                    151: Lend:
                    152:        ldq     rlimb,  0(rp)                   C       drain: same steps as the loop body, with no new multiplies
                    153:        addq    r19,    acc0,   acc0            C       propagate nail
                    154:        lda     rp,     8(rp)
                    155:        srl     m0a,NAIL_BITS,  r8              C U0
                    156:        addq    r8,     acc0,   r19
                    157:        addq    m0b,    acc1,   acc0
                    158:        addq    rlimb,  r19,    r19
                    159:        srl     m1a,NAIL_BITS,  r8              C U0
                    160:        addq    r8,     acc0,   acc0
                    161:        addq    m1b,    acc2,   acc1
                    162:        and     r19,numb_mask,  r28             C extract limb
                    163:        srl     m2a,NAIL_BITS,  r8              C U0
                    164:        addq    r8,     acc1,   acc1
                    165:        bis     r31,    m2b,    acc2
                    166:        srl     r19,NUMB_BITS,  r19             C extract nail
                    167:        stq     r28,    -8(rp)
                    168: C Flush acc0 and acc1 to the next two limbs; these are stored without being read first.
                    169:        addq    r19,    acc0,   acc0            C propagate nail
                    170:        and     acc0,numb_mask, r28
                    171:        stq     r28,    0(rp)
                    172:        srl     acc0,NUMB_BITS, r19
                    173:        addq    r19,    acc1,   acc1
                    174:
                    175:        and     acc1,numb_mask, r28
                    176:        stq     r28,    8(rp)
                    177:        srl     acc1,NUMB_BITS, r19
                    178:        addq    r19,    acc2,   m0a             C       top limb -> r0 (m0a); presumably the return value
                    179:
                    180:        ret     r31,    (r26),  1               C       return through r26
                    181: EPILOGUE(mpn_addmul_3)
                    182: ASM_END()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>