OpenXM_contrib/gmp/mpn/generic/addsub_n.c - annotate

Return to addsub_n.c CVS log
Up to [local] / OpenXM_contrib / gmp / mpn / generic
Annotation of OpenXM_contrib/gmp/mpn/generic/addsub_n.c, Revision 1.1.1.2

1.1       maekawa     1: /* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
                      2:
1.1.1.2 ! ohara       3: Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
1.1       maekawa     4:
                      5: This file is part of the GNU MP Library.
                      6:
                      7: The GNU MP Library is free software; you can redistribute it and/or modify
                      8: it under the terms of the GNU Lesser General Public License as published by
                      9: the Free Software Foundation; either version 2.1 of the License, or (at your
                     10: option) any later version.
                     11:
                     12: The GNU MP Library is distributed in the hope that it will be useful, but
                     13: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     14: or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
                     15: License for more details.
                     16:
                     17: You should have received a copy of the GNU Lesser General Public License
                     18: along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
                     19: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
                     20: MA 02111-1307, USA. */
                     21:
                     22: #include "gmp.h"
                     23: #include "gmp-impl.h"
                     24:
                     25: #ifndef L1_CACHE_SIZE
                     26: #define L1_CACHE_SIZE 8192     /* only 68040 has less than this */
                     27: #endif
                     28:
                     29: #define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
                     30:
                     31:
                     32: /* mpn_addsub_n.
                     33:    r1[] = s1[] + s2[]
                     34:    r2[] = s1[] - s2[]
                     35:    All operands have n limbs.
                     36:    In-place operations allowed.  */
                     37: mp_limb_t
                     38: mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
                     39: {
                     40:   mp_limb_t acyn, acyo;                /* carry for add */
                     41:   mp_limb_t scyn, scyo;                /* carry for subtract */
                     42:   mp_size_t off;               /* offset in operands */
                     43:   mp_size_t this_n;            /* size of current chunk */
                     44:
                     45:   /* We alternatingly add and subtract in chunks that fit into the (L1)
                     46:      cache.  Since the chunks are several hundred limbs, the function call
                     47:      overhead is insignificant, but we get much better locality.  */
                     48:
                     49:   /* We have three variant of the inner loop, the proper loop is chosen
                     50:      depending on whether r1 or r2 are the same operand as s1 or s2.  */
                     51:
                     52:   if (r1p != s1p && r1p != s2p)
                     53:     {
                     54:       /* r1 is not identical to either input operand.  We can therefore write
                     55:         to r1 directly, without using temporary storage.  */
                     56:       acyo = 0;
                     57:       scyo = 0;
                     58:       for (off = 0; off < n; off += PART_SIZE)
                     59:        {
                     60:          this_n = MIN (n - off, PART_SIZE);
                     61: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
                     62:          acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
                     63: #else
                     64:          acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
                     65:          acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
                     66: #endif
                     67: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
                     68:          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
                     69: #else
                     70:          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
                     71:          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
                     72: #endif
                     73:        }
                     74:     }
                     75:   else if (r2p != s1p && r2p != s2p)
                     76:     {
                     77:       /* r2 is not identical to either input operand.  We can therefore write
                     78:         to r2 directly, without using temporary storage.  */
                     79:       acyo = 0;
                     80:       scyo = 0;
                     81:       for (off = 0; off < n; off += PART_SIZE)
                     82:        {
                     83:          this_n = MIN (n - off, PART_SIZE);
                     84: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
                     85:          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
                     86: #else
                     87:          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
                     88:          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
                     89: #endif
                     90: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
                     91:          acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
                     92: #else
                     93:          acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
                     94:          acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
                     95: #endif
                     96:        }
                     97:     }
                     98:   else
                     99:     {
                    100:       /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa)
                    101:         Need temporary storage.  */
                    102:       mp_limb_t tp[PART_SIZE];
                    103:       acyo = 0;
                    104:       scyo = 0;
                    105:       for (off = 0; off < n; off += PART_SIZE)
                    106:        {
                    107:          this_n = MIN (n - off, PART_SIZE);
                    108: #if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
                    109:          acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
                    110: #else
                    111:          acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
                    112:          acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
                    113: #endif
                    114: #if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
                    115:          scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
                    116: #else
                    117:          scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
                    118:          scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
                    119: #endif
                    120:          MPN_COPY (r1p + off, tp, this_n);
                    121:        }
                    122:     }
                    123:
                    124:   return 2 * acyo + scyo;
                    125: }
                    126:
                    127: #ifdef MAIN
                    128: #include <stdlib.h>
                    129: #include <stdio.h>
                    130: #include "timing.h"
                    131:
                    132: long cputime ();
                    133:
                    134: int
                    135: main (int argc, char **argv)
                    136: {
                    137:   mp_ptr r1p, r2p, s1p, s2p;
                    138:   double t;
                    139:   mp_size_t n;
                    140:
                    141:   n = strtol (argv[1], 0, 0);
                    142:
                    143:   r1p = malloc (n * BYTES_PER_MP_LIMB);
                    144:   r2p = malloc (n * BYTES_PER_MP_LIMB);
                    145:   s1p = malloc (n * BYTES_PER_MP_LIMB);
                    146:   s2p = malloc (n * BYTES_PER_MP_LIMB);
                    147:   TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
                    148:   printf ("              separate add and sub: %.3f\n", t);
                    149:   TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
                    150:   printf ("combined addsub separate variables: %.3f\n", t);
                    151:   TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
                    152:   printf ("        combined addsub r1 overlap: %.3f\n", t);
                    153:   TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
                    154:   printf ("        combined addsub r2 overlap: %.3f\n", t);
                    155:   TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
                    156:   printf ("          combined addsub in-place: %.3f\n", t);
                    157:
                    158:   return 0;
                    159: }
                    160: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>