OpenXM_contrib/gmp/mpn/ia64/lorrshift.asm - annotate

Return to lorrshift.asm CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / ia64
Annotation of OpenXM_contrib/gmp/mpn/ia64/lorrshift.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  IA-64 mpn_Xshift.
                      2:
                      3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
                      4:
                      5: dnl  This file is part of the GNU MP Library.
                      6:
                      7: dnl  The GNU MP Library is free software; you can redistribute it and/or modify
                      8: dnl  it under the terms of the GNU Lesser General Public License as published
                      9: dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
                     10: dnl  your option) any later version.
                     11:
                     12: dnl  The GNU MP Library is distributed in the hope that it will be useful, but
                     13: dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     14: dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     15: dnl  License for more details.
                     16:
                     17: dnl  You should have received a copy of the GNU Lesser General Public License
                     18: dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     19: dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     20: dnl  MA 02111-1307, USA.
                     21:
                     22: include(`../config.m4')
                     23:
                     24: C This code runs at 2 cycles/limb for large operands on the Itanium.  It needs
                     25: C a very deep software pipeline, since shl/shr.u have a 4 cycle latency.  The
                     26: C main loop here is not great; it is overscheduled with respect to the shr.u
                     27: C instructions, and this actually turns out to give considerably more complex
                     28: C wind down code.  The code runs slowly for operands with <= 8 limbs, since we
                     29: C have a non-scheduled loop for that case.  We also have a primitive loop for
                     30: C the unrolling edge, and as a consequence of the main loop stupidity it is
                     31: C executed 1-4 steps instead of 0-3 steps.
                     32:
                     33: C By having 63 separate loops using the shrp instruction, we could easily reach
                     34: C 1 cycle/limb.  Such loops would require a less deep software pipeline, since
                     35: C shrp, unlike shl/shr.u, has a plain one cycle latency.
                     36:
                     37: C INPUT PARAMETERS
                     38: C rp = r32     address the result limbs are stored to
                     39: C sp = r33     address the source limbs are loaded from
                     40: C n = r34      number of limbs
                     41: C cnt = r35    shift count
                     42:
                         C Per-operation settings.  FSH is the shift applied to a limb with count
                         C cnt, BSH is the opposite shift applied to the neighbouring limb with
                         C count 64-cnt, UPD is the post-increment used on both pointers after each
                         C limb, and func is the name of the function being generated.
                     43: ifdef(`OPERATION_lshift',`
                     44:        define(`FSH',`shl')
                     45:        define(`BSH',`shr.u')
                     46:        define(`UPD',`-8')
                     47:        define(`func',`mpn_lshift')
                     48: ')
                     49: ifdef(`OPERATION_rshift',`
                     50:        define(`FSH',`shr.u')
                     51:        define(`BSH',`shl')
                     52:        define(`UPD',`8')
                     53:        define(`func',`mpn_rshift')
                     54: ')
                     55:
                     56: ASM_START()
                         C func is mpn_lshift or mpn_rshift, as selected by the OPERATION_* define.
                         C Source limbs are loaded one at a time through r33.  Each stored limb is
                         C the OR of FSH of one source limb by cnt and BSH of the next loaded limb
                         C by 64-cnt; the last stored limb has only the FSH part.  r8 returns BSH
                         C of the first limb loaded.  ar.lc is kept in r2 and restored before each
                         C return.
                     57: PROLOGUE(func)
                     58:        .prologue
                         C For the 32-bit ABI, widen the two pointers and the two integer arguments.
                     59: ifdef(`HAVE_ABI_32',
                     60: `      addp4   r32 = 0, r32
                     61:        addp4   r33 = 0, r33
                     62:        sxt4    r34 = r34
                     63:        zxt4    r35 = r35
                     64:        ;;
                     65: ')
                     66:        add     r34 = -1, r34           C r34 = n-1
                     67:        sub     r31 = 64, r35           C r31 = 64-cnt, shift count for BSH
                     68:        .save   ar.lc, r2
                     69:        mov     r2 = ar.lc;;            C caller ar.lc kept in r2
                     70:        .body
                     71:        cmp.leu p6, p7 = 8,r34          C p6 is set when n-1 >= 8
                         C lshift starts both pointers at the last limb; UPD then steps downwards.
                     72: ifdef(`OPERATION_lshift',`
                     73:        shladd  r33 = r34, 3, r33
                     74:        shladd  r32 = r34, 3, r32;;
                     75: ')
                     76:        ld8     r19 = [r33], UPD        ;;
                     77:        BSH     r8 = r19, r31           C function return value
                     78:    (p6) br.dptk        .Lbig
                     79:
                     80: C
                     81: C Code for small operands.  Not an optimization for the Itanium, it is here
                     82: C just to simplify the general case.
                     83: C
                     84:        mov     ar.lc = r34;;           C loop count n-1
                     85:        br.cloop.dptk .Loops            C falls through only when n-1 is zero
                     86:        FSH     r26 = r19, r35  ;;      C n = 1, the only limb has just the FSH part
                     87:        st8     [r32] = r26
                     88:        mov     ar.lc = r2              C restore caller ar.lc
                     89:        br.ret.sptk.many b0
                     90: .Loops:
                     91:        ld8     r16 = [r33], UPD        C next source limb
                     92:        FSH     r26 = r19, r35  ;;      C part from the previously loaded limb
                     93:        BSH     r27 = r16, r31  ;;      C part from the limb just loaded
                     94:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
                     95:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
                     96:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
                     97:        or      r27 = r27, r26
                     98:        mov     r19 = r16       ;;      C limb just loaded becomes the previous one
                     99:        st8     [r32] = r27, UPD
                    100:        br.cloop.dptk .Loops
                    101:        FSH     r26 = r19, r35  ;;      C final limb has only the FSH part
                    102:        st8     [r32] = r26
                    103:        mov     ar.lc = r2              C restore caller ar.lc
                    104:        br.ret.sptk.many b0
                    105:
                    106: C
                    107: C Code for operands with >8 limbs.  An edge loop and a very deep software
                    108: C pipeline.
                    109: C
                    110: .Lbig: and     r15 = 3, r34            C r15 = (n-1) mod 4
                    111:        shr.u   r14 = r34, 2    ;;      C r14 = (n-1)/4, rounded down
                    112:        mov     ar.lc = r15             C .Loop0 body runs r15+1 times
                    113: .Loop0:
                    114:        ld8     r16 = [r33], UPD        C same per-limb step as .Loops
                    115:        FSH     r26 = r19, r35  ;;
                    116:        BSH     r27 = r16, r31  ;;
                    117:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
                    118:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
                    119:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
                    120:        or      r27 = r27, r26
                    121:        mov     r19 = r16       ;;
                    122:        st8     [r32] = r27, UPD
                    123:        br.cloop.dptk .Loop0
                    124:
                    125: .Lunroll:
                    126:        add     r14 = -2, r14   ;;      C the code from .Lend2 on stores 8 limbs itself
                    127:        mov     ar.lc = r14             C so .Loop runs (n-1)/4 - 2 times
                    128:
                         C Pipeline fill.  Phase 1 starts four loads and the first shifts, phase 2
                         C starts two more loads and completes the first OR.
                    129: .Lphase1:
                    130:   { .mmi
                    131:        ld8     r16 = [r33], UPD        ;;
                    132: } { .mmi
                    133:        ld8     r17 = [r33], UPD        ;;
                    134: } { .mmi
                    135:        ld8     r18 = [r33], UPD
                    136:        FSH     r26 = r19, r35  ;;
                    137: } { .mmi
                    138:        ld8     r19 = [r33], UPD
                    139:        BSH     r27 = r16, r31  ;;
                    140: } { .mib
                    141:        FSH     r20 = r16, r35
                    142: }
                    143:
                    144: .Lphase2:
                    145:   { .mmi
                    146:        ld8     r16 = [r33], UPD
                    147:        BSH     r21 = r17, r31
                    148: } { .mib
                    149:        FSH     r22 = r17, r35  ;;
                    150: } { .mmi
                    151:        ld8     r17 = [r33], UPD
                    152:        BSH     r23 = r18, r31
                    153: } { .mib
                    154:        or      r27 = r27, r26
                    155:        FSH     r24 = r18, r35
                    156:        br.cloop.dptk .Loop             C enter .Loop unless its count is zero
                    157: }
                    158:        br.sptk .Lend2
                         C Main loop.  Each pass stores four limbs and loads four limbs.
                    159: .Loop:
                    160:   { .mmi
                    161:        st8     [r32] = r27, UPD
                    162:        ld8     r18 = [r33], UPD
                    163:        BSH     r25 = r19, r31
                    164: } { .mib
                    165:        or      r21 = r21, r20
                    166:        FSH     r26 = r19, r35  ;;
                    167: } { .mmi
                    168:        st8     [r32] = r21, UPD
                    169:        ld8     r19 = [r33], UPD
                    170:        BSH     r27 = r16, r31
                    171: } { .mib
                    172:        or      r23 = r23, r22
                    173:        FSH     r20 = r16, r35  ;;
                    174: } { .mmi
                    175:        st8     [r32] = r23, UPD
                    176:        ld8     r16 = [r33], UPD
                    177:        BSH     r21 = r17, r31
                    178: } { .mib
                    179:        or      r25 = r25, r24
                    180:        FSH     r22 = r17, r35  ;;
                    181: } { .mmi
                    182:        st8     [r32] = r25, UPD
                    183:        ld8     r17 = [r33], UPD
                    184:        BSH     r23 = r18, r31
                    185: } { .mib
                    186:        or      r27 = r27, r26
                    187:        FSH     r24 = r18, r35
                    188:        br.cloop.sptk .Loop;;
                    189: }
                         C Pipeline drain.  One last load, then the remaining eight stores.
                    190: .Lend2:
                    191:   { .mmi
                    192:        st8     [r32] = r27, UPD
                    193:        ld8     r18 = [r33], UPD
                    194:        BSH     r25 = r19, r31
                    195: } { .mib
                    196:        or      r21 = r21, r20
                    197:        FSH     r26 = r19, r35  ;;
                    198: } { .mmi
                    199:        st8     [r32] = r21, UPD
                    200:        BSH     r27 = r16, r31
                    201: } { .mib
                    202:        or      r23 = r23, r22
                    203:        FSH     r20 = r16, r35  ;;
                    204: } { .mmi
                    205:        st8     [r32] = r23, UPD
                    206:        BSH     r21 = r17, r31
                    207: } { .mib
                    208:        or      r25 = r25, r24
                    209:        FSH     r22 = r17, r35  ;;
                    210: } { .mmi
                    211:        st8     [r32] = r25, UPD
                    212:        BSH     r23 = r18, r31
                    213: } { .mib
                    214:        or      r27 = r27, r26
                    215:        FSH     r24 = r18, r35  ;;
                    216: }
                    217:
                    218:   { .mmi
                    219:        st8     [r32] = r27, UPD
                    220: } { .mib
                    221:        or      r21 = r21, r20  ;;
                    222: } { .mmi
                    223:        st8     [r32] = r21, UPD
                    224: } { .mib
                    225:        or      r23 = r23, r22  ;;
                    226: } { .mmi
                    227:        st8     [r32] = r23, UPD;;
                    228: } { .mmi
                    229:        st8     [r32] = r24             C final limb has only the FSH part
                    230: }
                    231:        mov     ar.lc = r2              C restore caller ar.lc
                    232:        br.ret.sptk.many b0
                    233: EPILOGUE(func)
                    234: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>