[BACK]Return to lorrshift.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / ia64

Annotation of OpenXM_contrib/gmp/mpn/ia64/lorrshift.asm, Revision 1.1

1.1     ! ohara       1: dnl  IA-64 mpn_Xshift.
        !             2:
        !             3: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             4:
        !             5: dnl  This file is part of the GNU MP Library.
        !             6:
        !             7: dnl  The GNU MP Library is free software; you can redistribute it and/or modify
        !             8: dnl  it under the terms of the GNU Lesser General Public License as published
        !             9: dnl  by the Free Software Foundation; either version 2.1 of the License, or (at
        !            10: dnl  your option) any later version.
        !            11:
        !            12: dnl  The GNU MP Library is distributed in the hope that it will be useful, but
        !            13: dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
        !            14: dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
        !            15: dnl  License for more details.
        !            16:
        !            17: dnl  You should have received a copy of the GNU Lesser General Public License
        !            18: dnl  along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
        !            19: dnl  the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
        !            20: dnl  MA 02111-1307, USA.
        !            21:
        !            22: include(`../config.m4')
        !            23:
        !            24: C This code runs at 2 cycles/limb for large operands on the Itanium.  It needs
        !            25: C a very deep software pipeline, since shl/shr.u have a 4 cycle latency.  The
        !            26: C main loop here is not great; it is overscheduled with respect to the shr.u
        !            27: C instructions, and this actually turns out to give considerably more complex
        !            28: C wind down code.  The code runs slowly for operands with <= 8 limbs, since we
        !            29: C have a non-scheduled loop for that case.  We also have a primitive loop for
        !            30: C the unrolling edge, and as a consequence of the main loop stupidity it is
        !            31: C executed 1-4 steps instead of 0-3 steps.
        !            32:
        !            33: C By having 63 separate loops using the shrp instruction, we could easily reach
        !            34: C 1 cycle/limb.  Such loops would require a less deep software pipeline, since
        !            35: C shrp, unlike shl/shr.u, has a plain one cycle latency.
        !            36:
        !            37: C INPUT PARAMETERS
        !            38: C rp = r32
        !            39: C sp = r33
        !            40: C n = r34
        !            41: C cnt = r35
        !            42:
        !            43: ifdef(`OPERATION_lshift',`
        !            44:        define(`FSH',`shl')              C shift that is always used with count r35 (cnt)
        !            45:        define(`BSH',`shr.u')            C complementary shift, always used with count r31 (64 - cnt)
        !            46:        define(`UPD',`-8')               C post-increment for both pointers: step down one 8-byte limb
        !            47:        define(`func',`mpn_lshift')      C name handed to PROLOGUE and EPILOGUE
        !            48: ')
        !            49: ifdef(`OPERATION_rshift',`
        !            50:        define(`FSH',`shr.u')            C shift that is always used with count r35 (cnt)
        !            51:        define(`BSH',`shl')              C complementary shift, always used with count r31 (64 - cnt)
        !            52:        define(`UPD',`8')                C post-increment for both pointers: step up one 8-byte limb
        !            53:        define(`func',`mpn_rshift')      C name handed to PROLOGUE and EPILOGUE
        !            54: ')
        !            55:
        !            56: ASM_START()
        !            57: PROLOGUE(func)
        !            58:        .prologue
        !            59: ifdef(`HAVE_ABI_32',
        !            60: `      addp4   r32 = 0, r32            C 32-bit ABI: widen rp to a 64-bit address
        !            61:        addp4   r33 = 0, r33            C 32-bit ABI: widen sp to a 64-bit address
        !            62:        sxt4    r34 = r34               C sign-extend n from 32 bits
        !            63:        zxt4    r35 = r35               C zero-extend cnt from 32 bits
        !            64:        ;;
        !            65: ')
        !            66:        add     r34 = -1, r34           C r34 = n - 1 from here on
        !            67:        sub     r31 = 64, r35           C r31 = 64 - cnt, the count used by every BSH
        !            68:        .save   ar.lc, r2
        !            69:        mov     r2 = ar.lc;;            C keep the incoming ar.lc in r2; it is restored before each return
        !            70:        .body
        !            71:        cmp.leu p6, p7 = 8,r34          C p6 = (8 <= n-1, unsigned): operand has more than 8 limbs
        !            72: ifdef(`OPERATION_lshift',`
        !            73:        shladd  r33 = r34, 3, r33       C lshift only: sp += 8*(n-1), start at the last source limb
        !            74:        shladd  r32 = r34, 3, r32;;     C lshift only: rp += 8*(n-1), start at the last result limb
        !            75: ')
        !            76:        ld8     r19 = [r33], UPD        ;;      C r19 = first source limb processed; sp steps by UPD
        !            77:        BSH     r8 = r19, r31           C function return value
        !            78:    (p6) br.dptk        .Lbig           C large operands use the pipelined code; otherwise fall through
        !            79:
        !            80: C
        !            81: C Code for small operands.  Not an optimization for the Itanium, it is here
        !            82: C just to simplify the general case.
        !            83: C
        !            84:        mov     ar.lc = r34;;           C loop count = n - 1
        !            85:        br.cloop.dptk .Loops            C taken (with ar.lc decremented) unless n - 1 is zero
        !            86:        FSH     r26 = r19, r35  ;;      C n = 1: the only result limb is the shifted source limb
        !            87:        st8     [r32] = r26
        !            88:        mov     ar.lc = r2              C restore the saved ar.lc
        !            89:        br.ret.sptk.many b0
        !            90: .Loops:
        !            91:        ld8     r16 = [r33], UPD        C r16 = next source limb
        !            92:        FSH     r26 = r19, r35  ;;      C part contributed by the previous limb (r19)
        !            93:        BSH     r27 = r16, r31  ;;      C part contributed by the new limb (r16)
        !            94:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
        !            95:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
        !            96:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
        !            97:        or      r27 = r27, r26          C merge the two parts into one result limb
        !            98:        mov     r19 = r16       ;;      C the new limb becomes the previous limb
        !            99:        st8     [r32] = r27, UPD        C store result limb; rp steps by UPD
        !           100:        br.cloop.dptk .Loops
        !           101:        FSH     r26 = r19, r35  ;;      C final limb: no further source limb to merge with
        !           102:        st8     [r32] = r26
        !           103:        mov     ar.lc = r2              C restore the saved ar.lc
        !           104:        br.ret.sptk.many b0
        !           105:
        !           106: C
        !           107: C Code for operands with >8 limbs.  An edge loop and a very deep software
        !           108: C pipeline.
        !           109: C
        !           110: .Lbig: and     r15 = 3, r34    C r15 = (n-1) mod 4
        !           111:        shr.u   r14 = r34, 2    ;;      C r14 = (n-1) / 4, number of four-limb groups left for the unrolled code
        !           112:        mov     ar.lc = r15     C edge loop below runs r15 + 1 times, i.e. 1 to 4 steps
        !           113: .Loop0:
        !           114:        ld8     r16 = [r33], UPD        C r16 = next source limb
        !           115:        FSH     r26 = r19, r35  ;;      C part contributed by the previous limb (r19)
        !           116:        BSH     r27 = r16, r31  ;;      C part contributed by the new limb (r16)
        !           117:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
        !           118:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
        !           119:        { .mib; nop.b 0;; }                     C delay to save 6 cycles...
        !           120:        or      r27 = r27, r26          C merge the two parts into one result limb
        !           121:        mov     r19 = r16       ;;      C the new limb becomes the previous limb
        !           122:        st8     [r32] = r27, UPD        C store result limb; rp steps by UPD
        !           123:        br.cloop.dptk .Loop0
        !           124:
        !           125: .Lunroll:
        !           126:        add     r14 = -2, r14   ;;      C the fill and wind down code store eight limbs, i.e. two groups
        !           127:        mov     ar.lc = r14             C loop count for .Loop
        !           128:
        !           129: .Lphase1:
        !           130:   { .mmi
        !           131:        ld8     r16 = [r33], UPD        ;;      C pipeline fill: start loading source limbs
        !           132: } { .mmi
        !           133:        ld8     r17 = [r33], UPD        ;;
        !           134: } { .mmi
        !           135:        ld8     r18 = [r33], UPD
        !           136:        FSH     r26 = r19, r35  ;;      C r19 still holds the last limb loaded by the edge loop
        !           137: } { .mmi
        !           138:        ld8     r19 = [r33], UPD        C r19 is reloaded once its old value has been shifted
        !           139:        BSH     r27 = r16, r31  ;;
        !           140: } { .mib
        !           141:        FSH     r20 = r16, r35
        !           142: }
        !           143:
        !           144: .Lphase2:
        !           145:   { .mmi
        !           146:        ld8     r16 = [r33], UPD
        !           147:        BSH     r21 = r17, r31
        !           148: } { .mib
        !           149:        FSH     r22 = r17, r35  ;;
        !           150: } { .mmi
        !           151:        ld8     r17 = [r33], UPD
        !           152:        BSH     r23 = r18, r31
        !           153: } { .mib
        !           154:        or      r27 = r27, r26          C first result limb of the unrolled code is now complete in r27
        !           155:        FSH     r24 = r18, r35
        !           156:        br.cloop.dptk .Loop             C enter the main loop if ar.lc is nonzero
        !           157: }
        !           158:        br.sptk .Lend2                  C otherwise go straight to the wind down code
        !           159: .Loop:
        !           160:   { .mmi
        !           161:        st8     [r32] = r27, UPD        C each pass stores four result limbs and loads four source limbs
        !           162:        ld8     r18 = [r33], UPD
        !           163:        BSH     r25 = r19, r31
        !           164: } { .mib
        !           165:        or      r21 = r21, r20          C r21 = BSH part of r17 merged with FSH part of r16
        !           166:        FSH     r26 = r19, r35  ;;
        !           167: } { .mmi
        !           168:        st8     [r32] = r21, UPD
        !           169:        ld8     r19 = [r33], UPD
        !           170:        BSH     r27 = r16, r31
        !           171: } { .mib
        !           172:        or      r23 = r23, r22          C r23 = BSH part of r18 merged with FSH part of r17
        !           173:        FSH     r20 = r16, r35  ;;
        !           174: } { .mmi
        !           175:        st8     [r32] = r23, UPD
        !           176:        ld8     r16 = [r33], UPD
        !           177:        BSH     r21 = r17, r31
        !           178: } { .mib
        !           179:        or      r25 = r25, r24          C r25 = BSH part of r19 merged with FSH part of r18
        !           180:        FSH     r22 = r17, r35  ;;
        !           181: } { .mmi
        !           182:        st8     [r32] = r25, UPD
        !           183:        ld8     r17 = [r33], UPD
        !           184:        BSH     r23 = r18, r31
        !           185: } { .mib
        !           186:        or      r27 = r27, r26          C r27 = BSH part of r16 merged with FSH part of r19
        !           187:        FSH     r24 = r18, r35
        !           188:        br.cloop.sptk .Loop;;           C repeat while ar.lc is nonzero
        !           189: }
        !           190: .Lend2:
        !           191:   { .mmi
        !           192:        st8     [r32] = r27, UPD        C wind down: eight stores remain from here to the return
        !           193:        ld8     r18 = [r33], UPD        C the only load in the wind down code
        !           194:        BSH     r25 = r19, r31
        !           195: } { .mib
        !           196:        or      r21 = r21, r20
        !           197:        FSH     r26 = r19, r35  ;;
        !           198: } { .mmi
        !           199:        st8     [r32] = r21, UPD
        !           200:        BSH     r27 = r16, r31
        !           201: } { .mib
        !           202:        or      r23 = r23, r22
        !           203:        FSH     r20 = r16, r35  ;;
        !           204: } { .mmi
        !           205:        st8     [r32] = r23, UPD
        !           206:        BSH     r21 = r17, r31
        !           207: } { .mib
        !           208:        or      r25 = r25, r24
        !           209:        FSH     r22 = r17, r35  ;;
        !           210: } { .mmi
        !           211:        st8     [r32] = r25, UPD
        !           212:        BSH     r23 = r18, r31
        !           213: } { .mib
        !           214:        or      r27 = r27, r26
        !           215:        FSH     r24 = r18, r35  ;;      C r24 = FSH part of the limb loaded above
        !           216: }
        !           217:
        !           218:   { .mmi
        !           219:        st8     [r32] = r27, UPD        C no loads or shifts remain; just merge and store
        !           220: } { .mib
        !           221:        or      r21 = r21, r20  ;;
        !           222: } { .mmi
        !           223:        st8     [r32] = r21, UPD
        !           224: } { .mib
        !           225:        or      r23 = r23, r22  ;;
        !           226: } { .mmi
        !           227:        st8     [r32] = r23, UPD;;
        !           228: } { .mmi
        !           229:        st8     [r32] = r24             C final store: r24 alone, nothing is merged in
        !           230: }
        !           231:        mov     ar.lc = r2              C restore the saved ar.lc
        !           232:        br.ret.sptk.many b0
        !           233: EPILOGUE(func)
        !           234: ASM_END()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>