Annotation of OpenXM_contrib/gmp/mpn/cray/ieee/submul_1.c, Revision 1.1.1.1
1.1 ohara 1: /* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and
2: subtract the result from a second limb vector.
3:
4: Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5:
6: This file is part of the GNU MP Library.
7:
8: The GNU MP Library is free software; you can redistribute it and/or modify
9: it under the terms of the GNU Lesser General Public License as published by
10: the Free Software Foundation; either version 2.1 of the License, or (at your
11: option) any later version.
12:
13: The GNU MP Library is distributed in the hope that it will be useful, but
14: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: License for more details.
17:
18: You should have received a copy of the GNU Lesser General Public License
19: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: MA 02111-1307, USA. */
22:
23: /* This code runs at just under 9 cycles/limb on a T90. That is not perfect,
24: mainly due to vector register shortage in the main loop. Assembly code
25: should bring it down to perhaps 7 cycles/limb. */
26:
27: #include <intrinsics.h>
28: #include "gmp.h"
29: #include "gmp-impl.h"
30:
31: mp_limb_t
32: mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
33: {
34: mp_limb_t cy[n];
35: mp_limb_t a, b, r, s0, s1, c0, c1;
36: mp_size_t i;
37: int more_carries;
38:
39: if (up == rp)
40: {
41: /* The algorithm used below cannot handle overlap. Handle it here by
42: making a temporary copy of the source vector, then call ourselves. */
43: mp_limb_t xp[n];
44: MPN_COPY (xp, up, n);
45: return mpn_submul_1 (rp, xp, n, vl);
46: }
47:
48: a = up[0] * vl;
49: r = rp[0];
50: s0 = r - a;
51: rp[0] = s0;
52: c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63;
53: cy[0] = c1;
54:
55: /* Main multiply loop. Generate a raw accumulated output product in rp[]
56: and a carry vector in cy[]. */
57: #pragma _CRI ivdep
58: for (i = 1; i < n; i++)
59: {
60: a = up[i] * vl;
61: b = _int_mult_upper (up[i - 1], vl);
62: s0 = a + b;
63: c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
64: r = rp[i];
65: s1 = r - s0;
66: rp[i] = s1;
67: c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63;
68: cy[i] = c0 + c1;
69: }
70: /* Carry subtract loop. Subtract the carry vector cy[] from the raw result
71: rp[] and store the new result back to rp[]. */
72: more_carries = 0;
73: #pragma _CRI ivdep
74: for (i = 1; i < n; i++)
75: {
76: r = rp[i];
77: c0 = cy[i - 1];
78: s0 = r - c0;
79: rp[i] = s0;
80: c0 = (s0 & ~r) >> 63;
81: more_carries += c0;
82: }
83: /* If that second loop generated carry, handle that in scalar loop. */
84: if (more_carries)
85: {
86: mp_limb_t cyrec = 0;
87: /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or
88: rp[k] == ~1 and cy[k-1] == 2.
89: These are where we got a recurrency carry. */
90: for (i = 1; i < n; i++)
91: {
92: r = rp[i];
93: c0 = ~r < cy[i - 1];
94: s0 = r - cyrec;
95: rp[i] = s0;
96: c1 = (s0 & ~r) >> 63;
97: cyrec = c0 | c1;
98: }
99: return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
100: }
101:
102: return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
103: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>