Annotation of OpenXM_contrib/gmp/mpn/generic/mode1o.c, Revision 1.1.1.1
1.1 ohara 1: /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
2:
3: THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4: CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5: FUTURE GNU MP RELEASES.
6:
7: Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
8:
9: This file is part of the GNU MP Library.
10:
11: The GNU MP Library is free software; you can redistribute it and/or modify
12: it under the terms of the GNU Lesser General Public License as published by
13: the Free Software Foundation; either version 2.1 of the License, or (at your
14: option) any later version.
15:
16: The GNU MP Library is distributed in the hope that it will be useful, but
17: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
19: License for more details.
20:
21: You should have received a copy of the GNU Lesser General Public License
22: along with the GNU MP Library; see the file COPYING.LIB. If not, write to
23: the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24: MA 02111-1307, USA.
25: */
26:
27: #include "gmp.h"
28: #include "gmp-impl.h"
29: #include "longlong.h"
30:
31:
32: /* Calculate an r satisfying
33:
34: r*b^k + a - c == q*d
35:
36: where b=2^BITS_PER_MP_LIMB, a is {src,size}, k is either size or size-1
37: (the caller won't know which), and q is the quotient (discarded). d must
38: be odd, c can be any limb value.
39:
40: If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
41:
42: This slightly strange function suits the initial Nx1 reduction for GCDs
43: or Jacobi symbols since the factors of 2 in b^k can be ignored, leaving
44: -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
45: ignored, or for the Jacobi symbol it can be accounted for. The function
46: also suits divisibility and congruence testing since if r=0 (or r=d) is
47: obtained then a==c mod d.
48:
49:
50: r is a bit like the remainder returned by mpn_divexact_by3c, and is the
51: sort of remainder a hypothetical mpn_divexact_1 might return. Like
52: mpn_divexact_by3c, r represents a borrow, since effectively quotient
53: limbs are chosen so that subtracting that multiple of d from src at each
54: step will produce a zero limb.
55:
56: A long calculation can be done piece by piece from low to high by passing
57: the return value from one part as the carry parameter to the next part.
58: The effective final k becomes anything between size and size-n, if n
59: pieces are used.
60:
61:
62: A similar sort of routine could be constructed based on adding multiples
63: of d at each limb, much like redc in mpz_powm does. Subtracting however
64: has a small advantage that when subtracting to cancel out l there's never
65: a borrow into h, whereas using an addition would put a carry into h
66: depending whether l==0 or l!=0.
67:
68:
69: In terms of efficiency, this function is similar to a mul-by-inverse
70: mpn_mod_1. Both are essentially two multiplies and are best suited to
71: CPUs with low latency multipliers (in comparison to a divide instruction
72: at least.) But modexact has a few less supplementary operations, only
73: needs low part and high part multiplies, and has fewer working quantities
74: (helping CPUs with few registers).
75:
76:
77: In the main loop it will be noted that the new carry (call it r) is the
78: sum of the high product h and any borrow from l=s-c. If c<d then we will
79: have r<d too, for the following reasons. Let q=l*inverse be the quotient
80: limb, so that q*d = b*h + l. Now if h=d-1 then
81:
82: l = q*d - b*(d-1) <= (b-1)*d - b*(d-1) = b-d
83:
84: But if l=s-c produces a borrow when c<d, then l>=b-d+1 and hence will
85: never have h=d-1 and so r=h+borrow <= d-1.
86:
87: When c>=d, on the other hand, h=d-1 can certainly occur together with a
88: borrow, thereby giving only r<=d, as per the function definition above.
89:
90: As a design decision it's left to the caller to check for r=d if it might
91: be passing c>=d. Several applications have c<d initially so the extra
92: test is often unnecessary, for example the GCDs or a plain divisibility
93: d|a test will pass c=0.
94:
95:
96: The special case for size==1 is so that it can be assumed c<=d in the
97: high<=divisor test at the end. c<=d is only guaranteed after at least
98: one iteration of the main loop. There's also a decent chance one % is
99: faster than a modlimb_invert, though that will depend on the processor.
100:
101: A CPU specific implementation might want to omit the size==1 code or the
102: high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
103: useful. */
104:
105:
106: mp_limb_t
107: mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t c)
108: {
109: mp_limb_t s, h, l, inverse, dummy, dmul;
110: mp_size_t i;
111:
112: ASSERT (size >= 1);
113: ASSERT (d & 1);
114: ASSERT_MPN (src, size);
115: ASSERT_LIMB (d);
116: ASSERT_LIMB (c);
117:
118: if (size == 1)
119: {
120: s = src[0];
121: if (s > c)
122: {
123: l = s-c;
124: h = l % d;
125: if (h != 0)
126: h = d - h;
127: }
128: else
129: {
130: l = c-s;
131: h = l % d;
132: }
133: return h;
134: }
135:
136:
137: modlimb_invert (inverse, d);
138: dmul = d << GMP_NAIL_BITS;
139:
140: i = 0;
141: do
142: {
143: s = src[i];
144: SUBC_LIMB (c, l, s, c);
145: l = (l * inverse) & GMP_NUMB_MASK;
146: umul_ppmm (h, dummy, l, dmul);
147: c += h;
148: }
149: while (++i < size-1);
150:
151:
152: s = src[i];
153: if (s <= d)
154: {
155: /* With high<=d the final step can be a subtract and addback. If c==0
156: then the addback will restore to l>=0. If c==d then will get l==d
157: if s==0, but that's ok per the function definition. */
158:
159: l = c - s;
160: if (l > c)
161: l += d;
162:
163: ASSERT (l <= d);
164: return l;
165: }
166: else
167: {
168: /* Can't skip a divide, just do the loop code once more. */
169:
170: SUBC_LIMB (c, l, s, c);
171: l = (l * inverse) & GMP_NUMB_MASK;
172: umul_ppmm (h, dummy, l, dmul);
173: c += h;
174:
175: ASSERT (c <= d);
176: return c;
177: }
178: }
179:
180:
181:
182: #if 0
183:
184: /* The following is an alternate form that might shave one cycle on a
185: superscalar processor since it takes c+=h off the dependent chain,
186: leaving just a low product, high product, and a subtract.
187:
188: This is for CPU specific implementations to consider. A special case for
189: high<divisor and/or size==1 can be added if desired.
190:
191: Notice that c is only ever 0 or 1, since if s-c produces a borrow then
192: x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
193: c=(x==0xFF..FF) too, if that helped. */
194:
195: mp_limb_t
196: mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
197: {
198: mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
199: mp_limb_t c = 0;
200: mp_size_t i;
201:
202: ASSERT (size >= 1);
203: ASSERT (d & 1);
204:
205: modlimb_invert (inverse, d);
206: dmul = d << GMP_NAIL_BITS;
207:
208: for (i = 0; i < size; i++)
209: {
210: ASSERT (c==0 || c==1);
211:
212: s = src[i];
213: SUBC_LIMB (c1, x, s, c);
214:
215: SUBC_LIMB (c2, y, x, h);
216: c = c1 + c2;
217:
218: y = (y * inverse) & GMP_NUMB_MASK;
219: umul_ppmm (h, dummy, y, dmul);
220: }
221:
222: h += c;
223: return h;
224: }
225:
226: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>