[BACK]Return to addmul_2.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / alpha / ev6 / nails

Annotation of OpenXM_contrib/gmp/mpn/alpha/ev6/nails/addmul_2.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  Alpha ev6 nails mpn_addmul_2.
                      2:
                      3: dnl  Copyright 2002 Free Software Foundation, Inc.
                      4: dnl
                      5: dnl  This file is part of the GNU MP Library.
                      6: dnl
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      8: dnl  modify it under the terms of the GNU Lesser General Public License as
                      9: dnl  published by the Free Software Foundation; either version 2.1 of the
                     10: dnl  License, or (at your option) any later version.
                     11: dnl
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     13: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     14: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     15: dnl  Lesser General Public License for more details.
                     16: dnl
                     17: dnl  You should have received a copy of the GNU Lesser General Public
                     18: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     19: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     20: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24: dnl  INPUT PARAMETERS
                     25: define(`rp',`r16')               C limbs at rp are loaded, added to and stored back
                     26: define(`up',`r17')               C limbs at up are only loaded (one ulimb per step)
                     27: define(`n',`r18')                C count of limbs at up still to process; decremented once per limb
                     28: define(`vp',`r19')               C two limbs are loaded from here, at offsets 0 and 8
                     29:
                     30: dnl  Useful register aliases
                     31: define(`numb_mask',`r24')        C all-ones shifted right by NAIL_BITS
                     32: define(`ulimb',`r25')            C limb most recently loaded from up
                     33: define(`rlimb',`r27')            C limb most recently loaded from rp
                     34:
                     35: define(`m0a',`r0')               C mulq  result of v0 * ulimb
                     36: define(`m0b',`r1')               C umulh result of v0 * ulimb
                     37: define(`m1a',`r2')               C mulq  result of v1 * ulimb
                     38: define(`m1b',`r3')               C umulh result of v1 * ulimb
                     39:
                     40: define(`acc0',`r4')              C partial sum for the next result limb
                     41: define(`acc1',`r5')              C partial sum for the limb after that
                     42:
                     43: define(`v0',`r6')                C limb from 0(vp), shifted left by NAIL_BITS
                     44: define(`v1',`r7')                C limb from 8(vp), shifted left by NAIL_BITS
                     45:
                     46: dnl Used for temps: r8 r19 r28 (r19 is also vp; it is reused once v0 and v1 are loaded)
                     47:
                     48: define(`NAIL_BITS',`GMP_NAIL_BITS')      C short name used in the shifts below
                     49: define(`NUMB_BITS',`GMP_NUMB_BITS')      C short name used in the shifts below
                     50:
                     51: dnl  This declaration is munged by configure
                     52: NAILS_SUPPORT(3-63)
                     53:
                     54: dnl  Runs at 4.0 cycles/limb.  With unrolling, the ulimb load and the 3
                     55: dnl  bookkeeping increments and the `bis' that copies m1b (r3) to acc1 (r5)
                     56: dnl  could be removed and the instruction count reduced from 21 to 16.  We
                     57: dnl  could thereby reach about 2.3 cycles/limb.
                     58:
                     59: dnl If this is going to be a Karatsuba basecase building block, we need some
                     60: dnl of the combinations below.  That way, we won't ever hit the
                     61: dnl slower mpn_addmul_1 for any huge multiplication.
                     62: dnl
                     63: dnl    Alt 3           Alt 4           Alt 5           Alt 6
                     64: dnl    addmul_2        addmul_2        addmul_3        addmul_3
                     65: dnl    addmul_3        addmul_3        addmul_4        addmul_4
                     66: dnl                    addmul_4        addmul_5        addmul_5
                     67: dnl                                                    addmul_6
                     68:
                     69: dnl Register usage:
                     70: dnl callee-saves:      r9 r10 r11 r12 r13 r14 r15
                     71: dnl scratch: r0 r1 r2 r3 r4 r5 r6 r7 r8
                     72: dnl         r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r27 r28
                     73: dnl return address: 26
                     74: dnl global pointer: 29
                     75: dnl stack pointer: 30
                     76: C mpn_addmul_2: multiply the n limbs at up by the two limbs at vp and add the product into the limbs at rp.
                     77: ASM_START()
                     78: PROLOGUE(mpn_addmul_2)
                     79:        lda     numb_mask,-1(r31)               C numb_mask = all ones
                     80:        srl     numb_mask,NAIL_BITS,numb_mask   C clear the top NAIL_BITS bits of the mask
                     81: C Load the two multiplier limbs; vp (r19) is not needed after these loads and r19 is reused.
                     82:        ldq     v0,     0(vp)                   C first multiplier limb
                     83:        ldq     v1,     8(vp)                   C second multiplier limb
                     84: C With v0/v1 shifted left by NAIL_BITS, mulq >> NAIL_BITS is the low 64-NAIL_BITS bits of the unshifted product and umulh is the bits above them (assumes the top NAIL_BITS bits of the vp limbs are zero).
                     85:        bis     r31,    r31,    acc0            C       zero acc0
                     86:        sll     v0,NAIL_BITS,   v0              C pre-shift v0 (see note above)
                     87:        bis     r31,    r31,    acc1            C       zero acc1
                     88:        sll     v1,NAIL_BITS,   v1              C pre-shift v1 (see note above)
                     89:        bis     r31,    r31,    r19             C r19 = 0: carry out of the previously stored limb
                     90: C The four products for the first ulimb are issued before the loop; each pass consumes the previous products and issues the next ones.
                     91: C MAIN LOOP
                     92:        ldq     ulimb,  0(up)                   C first source limb
                     93:        lda     up,     8(up)                   C up++
                     94:        mulq    v0,     ulimb,  m0a             C U1  m0a = low 64 bits of v0*ulimb
                     95:        umulh   v0,     ulimb,  m0b             C U1  m0b = high 64 bits of v0*ulimb
                     96:        mulq    v1,     ulimb,  m1a             C U1  m1a = low 64 bits of v1*ulimb
                     97:        umulh   v1,     ulimb,  m1b             C U1  m1b = high 64 bits of v1*ulimb
                     98:        lda     n,      -1(n)                   C n--
                     99:        beq     n,      Lend                    C U0  no limbs left to load: skip the loop (n = 0 on entry is not caught by this test)
                     100:        ALIGN(16)
                     101: Loop:
                     102:        bis     r31,    r31,    r31             C       nop
                     103:        ldq     rlimb,  0(rp)                   C destination limb to add into
                     104:        ldq     ulimb,  0(up)                   C source limb for the products issued in this pass
                     105:        addq    r19,    acc0,   acc0            C       propagate nail
                     106:
                     107:        lda     rp,     8(rp)                   C rp++ (the finished limb is stored at -8(rp) below)
                     108:        srl     m0a,NAIL_BITS,r8                C U0  r8 = low part of the pending v0 product
                     109:        lda     up,     8(up)                   C up++
                     110:        mulq    v0,     ulimb,  m0a             C U1  issue next v0 product; m0a was read by the srl above
                     111:
                     112:        addq    r8,     acc0,   r19             C r19 = low part of v0 product + accumulated sum
                     113:        addq    m0b,    acc1,   acc0            C acc0 = high part of v0 product + acc1 (sum for the next limb)
                     114:        umulh   v0,     ulimb,  m0b             C U1  m0b was read by the addq above
                     115:        bis     r31,    r31,    r31             C       nop
                     116:
                     117:        addq    rlimb,  r19,    r19             C add in the destination limb
                     118:        srl     m1a,NAIL_BITS,r8                C U0  r8 = low part of the pending v1 product
                     119:        bis     r31,    r31,    r31             C       nop
                     120:        mulq    v1,     ulimb,  m1a             C U1  issue next v1 product; m1a was read by the srl above
                     121:
                     122:        addq    r8,     acc0,   acc0            C acc0 += low part of v1 product
                     123:        bis     r31,    m1b,    acc1            C acc1 = high part of v1 product
                     124:        umulh   v1,     ulimb,  m1b             C U1  m1b was copied to acc1 above
                     125:        and     r19,numb_mask,  r28             C       extract numb part
                     126:
                     127:        lda     n,      -1(n)                   C n--
                     128:        srl     r19,NUMB_BITS,  r19             C       extract nail part
                     129:        stq     r28,    -8(rp)                  C store the finished limb (rp was already advanced)
                     130:        bne     n,      Loop                    C U0
                     131: C END LOOP
                     132: Lend:                                           C same steps as the loop body, without loading or multiplying a further ulimb
                     133:        ldq     rlimb,  0(rp)                   C last destination limb that is read
                     134:        addq    r19,    acc0,   acc0            C       propagate nail
                     135:        lda     rp,     8(rp)                   C rp++
                     136:        srl     m0a,NAIL_BITS,r8                C U0
                     137:        addq    r8,     acc0,   r19             C r19 = low part of v0 product + accumulated sum
                     138:        addq    m0b,    acc1,   acc0            C acc0 = high part of v0 product + acc1
                     139:        addq    rlimb,  r19,    r19             C add in the destination limb
                     140:        srl     m1a,NAIL_BITS,r8                C U0
                     141:        addq    r8,     acc0,   acc0            C acc0 += low part of v1 product
                     142:        bis     r31,    m1b,    acc1            C acc1 = high part of v1 product
                     143:        and     r19,numb_mask,  r28             C extract limb
                     144:
                     145:        srl     r19,NUMB_BITS,  r19             C extract nail
                     146:        stq     r28,    -8(rp)                  C store the last limb that was read and updated
                     147: C Two limbs remain in acc0/acc1: the lower one is stored at 0(rp), the upper one is left in r0.
                     148:        addq    r19,    acc0,   acc0            C propagate nail
                     149:        and     acc0,numb_mask, r28             C numb part of the lower remaining limb
                     150:        stq     r28,    0(rp)                   C this location is written without having been loaded
                     151:        srl     acc0,NUMB_BITS, r19             C carry out of that limb
                     152:        addq    r19,    acc1,   r0              C r0 = carry + acc1 (presumably the return value; confirm r0 against the calling convention)
                     153:
                     154:        ret     r31,    (r26),  1               C return through r26
                     155: EPILOGUE(mpn_addmul_2)
                     156: ASM_END()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>