Annotation of OpenXM_contrib/gmp/mpn/ia64/lorrshift.asm, Revision 1.1
1.1 ! ohara 1: dnl IA-64 mpn_Xshift.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4:
! 5: dnl This file is part of the GNU MP Library.
! 6:
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or modify
! 8: dnl it under the terms of the GNU Lesser General Public License as published
! 9: dnl by the Free Software Foundation; either version 2.1 of the License, or (at
! 10: dnl your option) any later version.
! 11:
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful, but
! 13: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 14: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 15: dnl License for more details.
! 16:
! 17: dnl You should have received a copy of the GNU Lesser General Public License
! 18: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 19: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
! 20: dnl MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24: C This code runs at 2 cycles/limb for large operands on the Itanium. It needs
! 25: C a very deep software pipeline, since shl/shr.u have a 4 cycle latency. The
! 26: C main loop here is not great; it is overscheduled with respect to the shr.u
! 27: C instructions, and this actually turns out to give considerably more complex
! 28: C wind down code. The code runs slowly for operands with <= 8 limbs, since we
! 29: C have a non-scheduled loop for that case. We also have a primitive loop for
! 30: C the unrolling edge, and as a consequence of the main loop stupidity it is
! 31: C executed 1-4 steps instead of 0-3 steps.
! 32:
! 33: C By having 63 separate loops using the shrp instruction, we could easily reach
! 34: C 1 cycle/limb. Such loops would require a less deep software pipeline, since
! 35: C shrp, unlike shl/shr.u, has a plain one cycle latency.
! 36:
! 37: C INPUT PARAMETERS
! 38: C rp = r32
! 39: C sp = r33
! 40: C n = r34
! 41: C cnt = r35
! 42:
! 43: ifdef(`OPERATION_lshift',`
! 44: define(`FSH',`shl') C FSH: shift applied with cnt (r35) to the current limb
! 45: define(`BSH',`shr.u') C BSH: opposite shift, applied with 64 - cnt (r31)
! 46: define(`UPD',`-8') C UPD: post-increment for ld8/st8; lshift walks from the last limb downwards
! 47: define(`func',`mpn_lshift') C func: name passed to PROLOGUE and EPILOGUE
! 48: ')
! 49: ifdef(`OPERATION_rshift',`
! 50: define(`FSH',`shr.u') C FSH: shift applied with cnt (r35) to the current limb
! 51: define(`BSH',`shl') C BSH: opposite shift, applied with 64 - cnt (r31)
! 52: define(`UPD',`8') C UPD: post-increment for ld8/st8; rshift walks from the first limb upwards
! 53: define(`func',`mpn_rshift') C func: name passed to PROLOGUE and EPILOGUE
! 54: ')
! 55:
! 56: ASM_START()
! 57: PROLOGUE(func) C func expands to mpn_lshift or mpn_rshift, depending on OPERATION_*
! 58: .prologue
! 59: ifdef(`HAVE_ABI_32',
! 60: ` addp4 r32 = 0, r32 C 32-bit ABI only: form a 64-bit address from the rp pointer
! 61: addp4 r33 = 0, r33 C same for sp
! 62: sxt4 r34 = r34 C sign-extend the 32-bit n
! 63: zxt4 r35 = r35 C zero-extend the 32-bit cnt
! 64: ;;
! 65: ')
! 66: add r34 = -1, r34 C r34 = n - 1
! 67: sub r31 = 64, r35 C r31 = 64 - cnt, the complementary shift count used by BSH
! 68: .save ar.lc, r2
! 69: mov r2 = ar.lc;; C keep the incoming ar.lc in r2; it is restored before every return
! 70: .body
! 71: cmp.leu p6, p7 = 8,r34 C p6 = (8 <= n - 1), i.e. n >= 9
! 72: ifdef(`OPERATION_lshift',`
! 73: shladd r33 = r34, 3, r33 C lshift only: sp = sp + 8*(n-1), the last source limb
! 74: shladd r32 = r34, 3, r32;; C lshift only: rp = rp + 8*(n-1), the last result limb
! 75: ')
! 76: ld8 r19 = [r33], UPD ;; C r19 = first limb to process; sp then steps by UPD
! 77: BSH r8 = r19, r31 C function return value
! 78: (p6) br.dptk .Lbig C n >= 9: use the edge loop plus software-pipelined loop
! 79:
! 80: C
! 81: C Code for small operands. Not an optimization for the Itanium, it is here
! 82: C just to simplify the general case.
! 83: C
! 84: mov ar.lc = r34;; C loop count = n - 1
! 85: br.cloop.dptk .Loops C taken unless n - 1 is zero
! 86: FSH r26 = r19, r35 ;; C n = 1: the result is just the single limb shifted by cnt
! 87: st8 [r32] = r26
! 88: mov ar.lc = r2
! 89: br.ret.sptk.many b0
! 90: .Loops: C one result limb per iteration; r19 holds the current source limb
! 91: ld8 r16 = [r33], UPD C r16 = next source limb
! 92: FSH r26 = r19, r35 ;; C current limb shifted by cnt
! 93: BSH r27 = r16, r31 ;; C bits carried in from the next limb
! 94: { .mib; nop.b 0;; } C delay to save 6 cycles...
! 95: { .mib; nop.b 0;; } C delay to save 6 cycles...
! 96: { .mib; nop.b 0;; } C delay to save 6 cycles...
! 97: or r27 = r27, r26 C combine both parts into one result limb
! 98: mov r19 = r16 ;; C the next limb becomes the current limb
! 99: st8 [r32] = r27, UPD
! 100: br.cloop.dptk .Loops
! 101: FSH r26 = r19, r35 ;; C final limb: no further limb to carry bits in from
! 102: st8 [r32] = r26
! 103: mov ar.lc = r2
! 104: br.ret.sptk.many b0
! 105:
! 106: C
! 107: C Code for operands with >8 limbs. An edge loop and a very deep software
! 108: C pipeline.
! 109: C
! 110: .Lbig: and r15 = 3, r34 C r15 = (n - 1) mod 4
! 111: shr.u r14 = r34, 2 ;; C r14 = (n - 1) / 4
! 112: mov ar.lc = r15 C the edge loop body runs r15 + 1 times, i.e. 1-4 limbs
! 113: .Loop0: C edge loop: same one-limb step as .Loops
! 114: ld8 r16 = [r33], UPD
! 115: FSH r26 = r19, r35 ;;
! 116: BSH r27 = r16, r31 ;;
! 117: { .mib; nop.b 0;; } C delay to save 6 cycles...
! 118: { .mib; nop.b 0;; } C delay to save 6 cycles...
! 119: { .mib; nop.b 0;; } C delay to save 6 cycles...
! 120: or r27 = r27, r26
! 121: mov r19 = r16 ;;
! 122: st8 [r32] = r27, UPD
! 123: br.cloop.dptk .Loop0
! 124:
! 125: .Lunroll:
! 126: add r14 = -2, r14 ;; C .Loop runs r14 - 2 times; the code from .Lend2 on stores the other 8 limbs
! 127: mov ar.lc = r14
! 128:
! 129: .Lphase1: C pipeline fill: loads and shifts are started, nothing is stored yet
! 130: { .mmi
! 131: ld8 r16 = [r33], UPD ;;
! 132: } { .mmi
! 133: ld8 r17 = [r33], UPD ;;
! 134: } { .mmi
! 135: ld8 r18 = [r33], UPD
! 136: FSH r26 = r19, r35 ;;
! 137: } { .mmi
! 138: ld8 r19 = [r33], UPD
! 139: BSH r27 = r16, r31 ;;
! 140: } { .mib
! 141: FSH r20 = r16, r35
! 142: }
! 143:
! 144: .Lphase2: C pipeline fill continued; ends with the first or of a result limb
! 145: { .mmi
! 146: ld8 r16 = [r33], UPD
! 147: BSH r21 = r17, r31
! 148: } { .mib
! 149: FSH r22 = r17, r35 ;;
! 150: } { .mmi
! 151: ld8 r17 = [r33], UPD
! 152: BSH r23 = r18, r31
! 153: } { .mib
! 154: or r27 = r27, r26
! 155: FSH r24 = r18, r35
! 156: br.cloop.dptk .Loop C enter the main loop unless ar.lc is zero
! 157: }
! 158: br.sptk .Lend2 C no main loop iterations: go straight to the wind-down
! 159: .Loop: C main loop: 4 loads and 4 stores per iteration
! 160: { .mmi
! 161: st8 [r32] = r27, UPD
! 162: ld8 r18 = [r33], UPD
! 163: BSH r25 = r19, r31
! 164: } { .mib
! 165: or r21 = r21, r20
! 166: FSH r26 = r19, r35 ;;
! 167: } { .mmi
! 168: st8 [r32] = r21, UPD
! 169: ld8 r19 = [r33], UPD
! 170: BSH r27 = r16, r31
! 171: } { .mib
! 172: or r23 = r23, r22
! 173: FSH r20 = r16, r35 ;;
! 174: } { .mmi
! 175: st8 [r32] = r23, UPD
! 176: ld8 r16 = [r33], UPD
! 177: BSH r21 = r17, r31
! 178: } { .mib
! 179: or r25 = r25, r24
! 180: FSH r22 = r17, r35 ;;
! 181: } { .mmi
! 182: st8 [r32] = r25, UPD
! 183: ld8 r17 = [r33], UPD
! 184: BSH r23 = r18, r31
! 185: } { .mib
! 186: or r27 = r27, r26
! 187: FSH r24 = r18, r35
! 188: br.cloop.sptk .Loop;;
! 189: }
! 190: .Lend2: C wind-down: one last load, then the limbs still in flight are combined and stored
! 191: { .mmi
! 192: st8 [r32] = r27, UPD
! 193: ld8 r18 = [r33], UPD C last load of the function
! 194: BSH r25 = r19, r31
! 195: } { .mib
! 196: or r21 = r21, r20
! 197: FSH r26 = r19, r35 ;;
! 198: } { .mmi
! 199: st8 [r32] = r21, UPD
! 200: BSH r27 = r16, r31
! 201: } { .mib
! 202: or r23 = r23, r22
! 203: FSH r20 = r16, r35 ;;
! 204: } { .mmi
! 205: st8 [r32] = r23, UPD
! 206: BSH r21 = r17, r31
! 207: } { .mib
! 208: or r25 = r25, r24
! 209: FSH r22 = r17, r35 ;;
! 210: } { .mmi
! 211: st8 [r32] = r25, UPD
! 212: BSH r23 = r18, r31
! 213: } { .mib
! 214: or r27 = r27, r26
! 215: FSH r24 = r18, r35 ;;
! 216: }
! 217:
! 218: { .mmi
! 219: st8 [r32] = r27, UPD
! 220: } { .mib
! 221: or r21 = r21, r20 ;;
! 222: } { .mmi
! 223: st8 [r32] = r21, UPD
! 224: } { .mib
! 225: or r23 = r23, r22 ;;
! 226: } { .mmi
! 227: st8 [r32] = r23, UPD;;
! 228: } { .mmi
! 229: st8 [r32] = r24 C final result limb: FSH of the last loaded limb only, nothing to or in
! 230: }
! 231: mov ar.lc = r2 C restore the saved ar.lc
! 232: br.ret.sptk.many b0
! 233: EPILOGUE(func)
! 234: ASM_END()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>