Annotation of OpenXM_contrib/gmp/mpn/ia64/README, Revision 1.1
1.1 ! ohara 1: Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 2:
! 3: This file is part of the GNU MP Library.
! 4:
! 5: The GNU MP Library is free software; you can redistribute it and/or modify
! 6: it under the terms of the GNU Lesser General Public License as published by
! 7: the Free Software Foundation; either version 2.1 of the License, or (at your
! 8: option) any later version.
! 9:
! 10: The GNU MP Library is distributed in the hope that it will be useful, but
! 11: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 12: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! 13: License for more details.
! 14:
! 15: You should have received a copy of the GNU Lesser General Public License
! 16: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
! 17: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
! 18: 02111-1307, USA.
! 19:
! 20:
! 21:
! 22:
! 23:
! 24:
! 25: The IA-64 ISA keeps instructions three and three in 128 bit bundles.
! 26: Programmers/compilers need to put explicit breaks `;;' when there are
! 27: WAW or RAW dependencies. Such breaks can typically just be at the end
! 28: of a bundle, with some exceptions.
! 29:
! 30: The Itanium and Itanium 2 implementations can under ideal conditions
! 31: execute two bundles per cycle. The Itanium allows 4 of these
! 32: instructions to do integer operations, while the Itanium 2 allows all
! 33: 6 to be integer operations.
! 34:
! 35: Taken cloop branches seem to insert a bubble into the pipeline most of
! 36: the time.
! 37:
! 38: Loads to the fp registers bypass the L1 cache and thus get extremely
! 39: long latencies, 9 cycles on the Itanium and 7 cycles on the Itanium 2.
! 40:
! 41: The software pipeline stuff using br.ctop instruction causes delays,
! 42: since many issue slots are taken up by instructions with zero
! 43: predicates, and since many extra instructions are needed to set
! 44: things up. These features are designed for code density, not maximum
! 45: speed.
! 46:
! 47: Misc pipeline limitations (Itanium):
! 48: * The getf.sig instruction can only execute in M0.
! 49: * At most four integer instructions/cycle.
! 50: * Nops take up resources like any plain instructions.
! 51:
! 52: ================================================================
! 53: mpn_add_n, mpn_sub_n:
! 54:
! 55: The current code runs at 3 cycles/limb. Unrolling could clearly bring
! 56: down the time to 2 cycles/limb.
! 57:
! 58: ================================================================
! 59: mpn_addmul_1:
! 60:
! 61: The current code runs at 3.7 cycles/limb, but that somewhat odd timing
! 62: is reached only for huge operands. It uses the mod-scheduled software
! 63: pipelining feature. The reason for the poor speed for small operands
! 64: is that mod-scheduled loops have a very long start-up overhead.
! 65:
! 66: For optimal speed, we need to load two 64-bit limbs with the ldfp8
! 67: instruction, and stay away from mod-scheduled loops. Since rp and up
! 68: might be mutually aligned in two ways, we will need two loop variants,
! 69: with the same basic structure:
! 70:
! 71: { .mfi getf.sig
! 72: xma.l
! 73: (p6) cmp.leu p6, p7 =
! 74: } { .mfi stf8
! 75: xma.hu
! 76: (p7) cmp.ltu p6, p7 =
! 77: ;;
! 78: } { .mib getf.sig
! 79: (p6) add 1
! 80: nop.b
! 81: } { .mib ldfp8 = [up], 16
! 82: (p7) add
! 83: nop.b
! 84: ;;
! 85: { .mfi getf.sig
! 86: xma.l
! 87: (p6) cmp.leu p6, p7 =
! 88: } { .mfi stf8
! 89: xma.hu
! 90: (p7) cmp.ltu p6, p7 =
! 91: ;;
! 92: } { .mib getf.sig
! 93: (p6) add 1
! 94: nop.b
! 95: } { .mib ldfp8 = [rp], 16
! 96: (p7) add
! 97: br.cloop
! 98: ;;
! 99: }
! 100:
! 101: 2 limbs/20 instructions
! 102: 20 insn/max 6 insn/cycle: 3.3 cycles/2limb
! 103: 8 memops/max 2 memops/cycle: 4.0 cycles/2limb
! 104: 8 intops/max 2 intops/cycle: 4.0 cycles/2limb
! 105: 4 fpops/max 2 fpops/cycle: 2.0 cycles/2limb
! 106:
! 107: ================================================================
! 108: mpn_submul_1:
! 109:
! 110: The current code just calls mpn_mul_1 and mpn_sub_n and thus needs
! 111: about 7 cycles/limb.
! 112:
! 113: We could implement this much like mpn_addmul_1, if we first complement
! 114: v. When v is complemented, the low product limb becomes complement of
! 115: true product. This should allow us to use the accumulation of xma.
! 116: Here is how it works:
! 117:
! 118:
! 119: #define umul_ppmma(ph, pl, m0, m1, a) \
! 120: do { \
! 121: UDItype __m0 = (m0), __m1 = (m1), __a = (a); \
! 122: __asm__ ("xma.hu %0 = %1, %2, %3" \
! 123: : "=f" (ph) \
! 124: : "f" (m0), "f" (m1), "f" (__a)); \
! 125: (pl) = __m0 * __m1 + __a; \
! 126: } while (0)
! 127:
! 128: mp_limb_t
! 129: mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
! 130: {
! 131: mp_limb_t cl, cy;
! 132: mp_size_t i;
! 133: mp_limb_t phi, plo;
! 134: mp_limb_t x;
! 135: mp_limb_t ul, vln;
! 136:
! 137: vln = -vl;
! 138:
! 139: cl = 0;
! 140: for (i = n; i != 0; i--)
! 141: {
! 142: ul = *up++; /* will need this in both fregs and gregs */
! 143: x = *rp;
! 144: umul_ppmma (phi, plo, ul, vln, x);
! 145:
! 146: cy = plo < cl;
! 147: plo -= cl;
! 148:
! 149: cl = ul - phi;
! 150: cl += cy;
! 151:
! 152: *rp++ = plo;
! 153: }
! 154:
! 155: return cl;
! 156: }
! 157:
! 158: ================================================================
! 159: mpn_mul_1:
! 160:
! 161: The current code runs at 3.7 cycles/limb. The code is very similar to
! 162: the mpn_addmul_1 code. See comments above.
! 163:
! 164: Faster code wouldn't be too hard to write. This is one possible
! 165: pattern:
! 166:
! 167: { .mfi getf.sig
! 168: xma.l
! 169: (p6) cmp.leu p6, p7 =
! 170: } { .mfi stf8
! 171: xma.hu
! 172: (p7) cmp.ltu p6, p7 =
! 173: ;;
! 174: } { .mib getf.sig
! 175: (p6) add 1
! 176: nop.b
! 177: } { .mib ldf8 = [up], 8
! 178: (p7) add
! 179: br.cloop
! 180: ;;
! 181: }
! 182:
! 183: 1 limb/12 instructions
! 184: 12 insn/max 6 insn/cycle: 2.0 cycles/limb
! 185: 4 memops/max 2 memops/cycle: 2.0 cycles/limb
! 186: 4 intops/max 2 intops/cycle: 2.0 cycles/limb
! 187: 2 fpops/max 2 fpops/cycle: 1.0 cycles/limb
! 188:
! 189: ================================================================
! 190: mpn_mul_8
! 191:
! 192: The add+cmp+add we use on the other codes is optimal for shortening
! 193: recurrences (2 cycles) but the sequence takes up 4 execution slots. When
! 194: recurrence depth is not critical, a more standard 3-cycle add+cmp+add is
! 195: better.
! 196:
! 197: /* First load the 8 values from v */
! 198: ldfp8 v0, v1 = [r35], 16;;
! 199: ldfp8 v2, v3 = [r35], 16;;
! 200: ldfp8 v4, v5 = [r35], 16;;
! 201: ldfp8 v6, v7 = [r35], 16;;
! 202:
! 203: /* In the inner loop, get a new U limb and store a result limb. */
! 204: mov lc = un
! 205: Loop: ldf8 u0 = [r33], 8
! 206: xma.l lp0 = v0, u0, hp0
! 207: xma.hu hp0 = v0, u0, hp0
! 208: xma.l lp1 = v1, u0, hp1
! 209: xma.hu hp1 = v1, u0, hp1
! 210: xma.l lp2 = v2, u0, hp2
! 211: xma.hu hp2 = v2, u0, hp2
! 212: xma.l lp3 = v3, u0, hp3
! 213: xma.hu hp3 = v3, u0, hp3
! 214: xma.l lp4 = v4, u0, hp4
! 215: xma.hu hp4 = v4, u0, hp4
! 216: xma.l lp5 = v5, u0, hp5
! 217: xma.hu hp5 = v5, u0, hp5
! 218: xma.l lp6 = v6, u0, hp6
! 219: xma.hu hp6 = v6, u0, hp6
! 220: xma.l lp7 = v7, u0, hp7
! 221: xma.hu hp7 = v7, u0, hp7
! 222: getf.sig l0 = lp0
! 223: getf.sig l1 = lp1
! 224: getf.sig l2 = lp2
! 225: getf.sig l3 = lp3
! 226: getf.sig l4 = lp4
! 227: getf.sig l5 = lp5
! 228: getf.sig l6 = lp6
! 229: getf.sig l7 = lp7
! 230: add+cmp+add l0, l0, h0
! 231: add+cmp+add l1, l1, h1
! 232: add+cmp+add l2, l2, h2
! 233: add+cmp+add l3, l3, h3
! 234: add+cmp+add l4, l4, h4
! 235: add+cmp+add l5, l5, h5
! 236: add+cmp+add l6, l6, h6
! 237: add+cmp+add l7, l7, h7
! 238: st8 [r32] = xx, 8
! 239: br.cloop Loop
! 240:
! 241: 50 insn at max 6 insn/cycle: 8.33 cycles/limb8
! 242: 10 memops at max 2 memops/cycle: 5 cycles/limb8
! 243: 16 fpops at max 2 fpops/cycle: 8 cycles/limb8
! 244: 24 intops at max 4 intops/cycle: 6 cycles/limb8
! 245: 10+24 memops+intops at max 4/cycle 8.5 cycles/limb8
! 246: 1.0625 cycles/limb
! 247:
! 248: ================================================================
! 249: mpn_lshift, mpn_rshift
! 250:
! 251: The current code runs at 2 cycles/limb, but has a too deep software
! 252: pipeline. The code suffers badly from the 4 cycle latency of the
! 253: variable shift instructions.
! 254:
! 255: Using 63 separate loops, we could use the double-word SHRP
! 256: instruction. That instruction has a plain single-cycle latency. We
! 257: need 63 loops since this instruction only accepts an immediate count.
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>