Annotation of OpenXM_contrib/gmp/mpn/generic/addsub_n.c, Revision 1.1.1.2
1.1 maekawa 1: /* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
2:
1.1.1.2 ! ohara 3: Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
1.1 maekawa 4:
5: This file is part of the GNU MP Library.
6:
7: The GNU MP Library is free software; you can redistribute it and/or modify
8: it under the terms of the GNU Lesser General Public License as published by
9: the Free Software Foundation; either version 2.1 of the License, or (at your
10: option) any later version.
11:
12: The GNU MP Library is distributed in the hope that it will be useful, but
13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15: License for more details.
16:
17: You should have received a copy of the GNU Lesser General Public License
18: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20: MA 02111-1307, USA. */
21:
22: #include "gmp.h"
23: #include "gmp-impl.h"
24:
25: #ifndef L1_CACHE_SIZE
26: #define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
27: #endif
28:
29: #define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
30:
31:
32: /* mpn_addsub_n.
33: r1[] = s1[] + s2[]
34: r2[] = s1[] - s2[]
35: All operands have n limbs.
36: In-place operations allowed. */
37: mp_limb_t
38: mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
39: {
40: mp_limb_t acyn, acyo; /* carry for add */
41: mp_limb_t scyn, scyo; /* carry for subtract */
42: mp_size_t off; /* offset in operands */
43: mp_size_t this_n; /* size of current chunk */
44:
45: /* We alternatingly add and subtract in chunks that fit into the (L1)
46: cache. Since the chunks are several hundred limbs, the function call
47: overhead is insignificant, but we get much better locality. */
48:
49: /* We have three variant of the inner loop, the proper loop is chosen
50: depending on whether r1 or r2 are the same operand as s1 or s2. */
51:
52: if (r1p != s1p && r1p != s2p)
53: {
54: /* r1 is not identical to either input operand. We can therefore write
55: to r1 directly, without using temporary storage. */
56: acyo = 0;
57: scyo = 0;
58: for (off = 0; off < n; off += PART_SIZE)
59: {
60: this_n = MIN (n - off, PART_SIZE);
61: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
62: acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
63: #else
64: acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
65: acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
66: #endif
67: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
68: scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
69: #else
70: scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
71: scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
72: #endif
73: }
74: }
75: else if (r2p != s1p && r2p != s2p)
76: {
77: /* r2 is not identical to either input operand. We can therefore write
78: to r2 directly, without using temporary storage. */
79: acyo = 0;
80: scyo = 0;
81: for (off = 0; off < n; off += PART_SIZE)
82: {
83: this_n = MIN (n - off, PART_SIZE);
84: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
85: scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
86: #else
87: scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
88: scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
89: #endif
90: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
91: acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
92: #else
93: acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
94: acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
95: #endif
96: }
97: }
98: else
99: {
100: /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa)
101: Need temporary storage. */
102: mp_limb_t tp[PART_SIZE];
103: acyo = 0;
104: scyo = 0;
105: for (off = 0; off < n; off += PART_SIZE)
106: {
107: this_n = MIN (n - off, PART_SIZE);
108: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
109: acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
110: #else
111: acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
112: acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
113: #endif
114: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
115: scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
116: #else
117: scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
118: scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
119: #endif
120: MPN_COPY (r1p + off, tp, this_n);
121: }
122: }
123:
124: return 2 * acyo + scyo;
125: }
126:
127: #ifdef MAIN
128: #include <stdlib.h>
129: #include <stdio.h>
130: #include "timing.h"
131:
132: long cputime ();
133:
134: int
135: main (int argc, char **argv)
136: {
137: mp_ptr r1p, r2p, s1p, s2p;
138: double t;
139: mp_size_t n;
140:
141: n = strtol (argv[1], 0, 0);
142:
143: r1p = malloc (n * BYTES_PER_MP_LIMB);
144: r2p = malloc (n * BYTES_PER_MP_LIMB);
145: s1p = malloc (n * BYTES_PER_MP_LIMB);
146: s2p = malloc (n * BYTES_PER_MP_LIMB);
147: TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
148: printf (" separate add and sub: %.3f\n", t);
149: TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
150: printf ("combined addsub separate variables: %.3f\n", t);
151: TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
152: printf (" combined addsub r1 overlap: %.3f\n", t);
153: TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
154: printf (" combined addsub r2 overlap: %.3f\n", t);
155: TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
156: printf (" combined addsub in-place: %.3f\n", t);
157:
158: return 0;
159: }
160: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>