[BACK]Return to addsub_n.c CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / generic

Annotation of OpenXM_contrib/gmp/mpn/generic/addsub_n.c, Revision 1.1.1.1

1.1       maekawa     1: /* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
                      2:
                      3: Copyright (C) 1999, 2000 Free Software Foundation, Inc.
                      4:
                      5: This file is part of the GNU MP Library.
                      6:
                      7: The GNU MP Library is free software; you can redistribute it and/or modify
                      8: it under the terms of the GNU Lesser General Public License as published by
                      9: the Free Software Foundation; either version 2.1 of the License, or (at your
                     10: option) any later version.
                     11:
                     12: The GNU MP Library is distributed in the hope that it will be useful, but
                     13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     14: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     15: License for more details.
                     16:
                     17: You should have received a copy of the GNU Lesser General Public License
                     18: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     20: MA 02111-1307, USA. */
                     21:
                     22: #include "gmp.h"
                     23: #include "gmp-impl.h"
                     24:
                     25: #ifndef L1_CACHE_SIZE
                     26: #define L1_CACHE_SIZE 8192     /* only 68040 has less than this */
                     27: #endif
                     28:
                     29: #define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
                     30:
                     31:
                     32: /* mpn_addsub_n.
                     33:    r1[] = s1[] + s2[]
                     34:    r2[] = s1[] - s2[]
                     35:    All operands have n limbs.
                     36:    In-place operations allowed.  */
                     37: mp_limb_t
                     38: #if __STDC__
                     39: mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
                     40: #else
                     41: mpn_addsub_n (r1p, r2p, s1p, s2p, n)
                     42:      mp_ptr r1p, r2p;
                     43:      mp_srcptr s1p, s2p;
                     44:      mp_size_t n;
                     45: #endif
                     46: {
                     47:   mp_limb_t acyn, acyo;                /* carry for add */
                     48:   mp_limb_t scyn, scyo;                /* carry for subtract */
                     49:   mp_size_t off;               /* offset in operands */
                     50:   mp_size_t this_n;            /* size of current chunk */
                     51:
                     52:   /* We alternatingly add and subtract in chunks that fit into the (L1)
                     53:      cache.  Since the chunks are several hundred limbs, the function call
                     54:      overhead is insignificant, but we get much better locality.  */
                     55:
                     56:   /* We have three variant of the inner loop, the proper loop is chosen
                     57:      depending on whether r1 or r2 are the same operand as s1 or s2.  */
                     58:
                     59:   if (r1p != s1p && r1p != s2p)
                     60:     {
                     61:       /* r1 is not identical to either input operand.  We can therefore write
                     62:         to r1 directly, without using temporary storage.  */
                     63:       acyo = 0;
                     64:       scyo = 0;
                     65:       for (off = 0; off < n; off += PART_SIZE)
                     66:        {
                     67:          this_n = MIN (n - off, PART_SIZE);
                     68: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
                     69:          acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
                     70: #else
                     71:          acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
                     72:          acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
                     73: #endif
                     74: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
                     75:          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
                     76: #else
                     77:          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
                     78:          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
                     79: #endif
                     80:        }
                     81:     }
                     82:   else if (r2p != s1p && r2p != s2p)
                     83:     {
                     84:       /* r2 is not identical to either input operand.  We can therefore write
                     85:         to r2 directly, without using temporary storage.  */
                     86:       acyo = 0;
                     87:       scyo = 0;
                     88:       for (off = 0; off < n; off += PART_SIZE)
                     89:        {
                     90:          this_n = MIN (n - off, PART_SIZE);
                     91: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
                     92:          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
                     93: #else
                     94:          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
                     95:          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
                     96: #endif
                     97: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
                     98:          acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
                     99: #else
                    100:          acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
                    101:          acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
                    102: #endif
                    103:        }
                    104:     }
                    105:   else
                    106:     {
                    107:       /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa)
                    108:         Need temporary storage.  */
                    109:       mp_limb_t tp[PART_SIZE];
                    110:       acyo = 0;
                    111:       scyo = 0;
                    112:       for (off = 0; off < n; off += PART_SIZE)
                    113:        {
                    114:          this_n = MIN (n - off, PART_SIZE);
                    115: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
                    116:          acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
                    117: #else
                    118:          acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
                    119:          acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
                    120: #endif
                    121: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
                    122:          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
                    123: #else
                    124:          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
                    125:          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
                    126: #endif
                    127:          MPN_COPY (r1p + off, tp, this_n);
                    128:        }
                    129:     }
                    130:
                    131:   return 2 * acyo + scyo;
                    132: }
                    133:
                    134: #ifdef MAIN
                    135: #include <stdlib.h>
                    136: #include <stdio.h>
                    137: #include "timing.h"
                    138:
                    139: long cputime ();
                    140:
                    141: int
                    142: main (int argc, char **argv)
                    143: {
                    144:   mp_ptr r1p, r2p, s1p, s2p;
                    145:   double t;
                    146:   mp_size_t n;
                    147:
                    148:   n = strtol (argv[1], 0, 0);
                    149:
                    150:   r1p = malloc (n * BYTES_PER_MP_LIMB);
                    151:   r2p = malloc (n * BYTES_PER_MP_LIMB);
                    152:   s1p = malloc (n * BYTES_PER_MP_LIMB);
                    153:   s2p = malloc (n * BYTES_PER_MP_LIMB);
                    154:   TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
                    155:   printf ("              separate add and sub: %.3f\n", t);
                    156:   TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
                    157:   printf ("combined addsub separate variables: %.3f\n", t);
                    158:   TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
                    159:   printf ("        combined addsub r1 overlap: %.3f\n", t);
                    160:   TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
                    161:   printf ("        combined addsub r2 overlap: %.3f\n", t);
                    162:   TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
                    163:   printf ("          combined addsub in-place: %.3f\n", t);
                    164:
                    165:   return 0;
                    166: }
                    167: #endif

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>