/********************************************************************/
/**                                                                **/
/**                     INTEGER FACTORIZATION                      **/
/**                                                                **/
/********************************************************************/
/* $Id: ifactor1.c,v 1.1.1.1 1999/09/16 13:47:33 karim Exp $ */
#include "pari.h"

/*********************************************************************/
/**                                                                 **/
/**                        PSEUDO PRIMALITY                         **/
/**                                                                 **/
/*********************************************************************/
static GEN sqrt1, sqrt2, t1, t;
static long r1;

/* The following two internal routines don't restore avma -- the caller
   must do so at the end. */
static GEN
init_miller(GEN n)
{
  if (signe(n) < 0) n = absi(n);
  t=addsi(-1,n); r1=vali(t); t1 = shifti(t,-r1);
  sqrt1=cgeti(lg(t)); sqrt1[1]=evalsigne(0)|evallgefint(2);
  sqrt2=cgeti(lg(t)); sqrt2[1]=evalsigne(0)|evallgefint(2);
  return n;
}

/* is n strong pseudo-prime for base a ? `End matching' (check for square
 * roots of -1) added by GN */
/* TODO: If ends do mismatch, then we have factored n, and this information
   should somehow be made available to the factoring machinery. --GN */
static int
bad_for_base(GEN n, GEN a)
{
  long r, av=avma, lim=stack_lim(av,1);
  GEN c2, c = powmodulo(a,t1,n);

  if (!is_pm1(c) && !egalii(t,c)) /* go fishing for -1, not for 1 */
  {
    for (r=r1-1; r; r--)	/* (this saves one squaring/reduction) */
    {
      c2=c; c=resii(sqri(c),n);
      if (egalii(t,c)) break;
      if (low_stack(lim, stack_lim(av,1)))
      {
	GEN *gsav[2]; gsav[0]=&c; gsav[1]=&c2;
	if(DEBUGMEM>1) err(warnmem,"miller(rabin)");
	gerepilemany(av, gsav, 2);
      }
    }
    if (!r) return 1;
    /* sqrt(-1) seen, compare or remember */
    if (signe(sqrt1))		/* we saw one earlier: compare */
    {
      /* check if too many sqrt(-1)s mod n */
      if (!egalii(c2,sqrt1) && !egalii(c2,sqrt2)) return 1;
    }
    else { affii(c2,sqrt1); affii(subii(n,c2),sqrt2); } /* remember */
  }
  return 0;
}

/* Miller-Rabin test for k random bases */
long
millerrabin(GEN n, long k)
{
  long r,i,av2, av = avma;

  if (!signe(n)) return 0;
  /* If |n| <= 3, check if n = +- 1 */
  if (lgefint(n)==3 && (ulong)(n[2])<=3) return (n[2] != 1);

  if (!mod2(n)) return 0;
  n = init_miller(n); av2=avma;
  for (i=1; i<=k; i++)
  {
    do r = smodsi(mymyrand(),n); while (!r);
    if (DEBUGLEVEL > 4)
      fprintferr("Miller-Rabin: testing base %ld\n",
		 r);
    if (bad_for_base(n, stoi(r))) { avma=av; return 0; }
    avma=av2;
  }
  avma=av; return 1;
}

/* As above for k bases taken in pr (i.e not random).
 * We must have |n|>2 and 1<=k<=11 (not checked) or k in {16,17} to select
 * some special sets of bases.
 *
 * By computations of Gerhard Jaeschke, `On strong pseudoprimes to several
 * bases', Math.Comp. 61 (1993), 915--926  (see also Chris Caldwell's Prime
 * Number Pages at http://www.utm.edu/research/primes/prove2.html),  we have:
 *
 * k == 4  (bases 2,3,5,7)  correctly detects all composites
 *    n <     118 670 087 467 == 172243 * 688969  with the single exception of
 *    n ==      3 215 031 751 == 151 * 751 * 28351,
 *
 * k == 5  (bases 2,3,5,7,11)  correctly detects all composites
 *    n <   2 152 302 898 747 == 6763 * 10627 * 29947,
 *
 * k == 6  (bases 2,3,...,13)  correctly detects all composites
 *    n <   3 474 749 660 383 == 1303 * 16927 * 157543,
 *
 * k == 7  (bases 2,3,...,17)  correctly detects all composites
 *    n < 341 550 071 728 321 == 10670053 * 32010157,
 * and even this limiting value is caught by an end mismatch between bases
 * 2 and 5 (or 5 and 17).
 *
 * Moreover, the four bases chosen at
 *
 * k == 16  (2,13,23,1662803)  will correctly detect all composites up
 * to at least 10^12, and the combination at
 *
 * k == 17  (31,73)  detects most odd composites without prime factors > 100
 * in the range  n < 2^36  (with less than 250 exceptions, indeed with fewer
 * than 1400 exceptions up to 2^42). --GN
 * (DATA TO BE COMPLETED)
 */
int				/* no longer static -- needed in mpqs.c */
miller(GEN n, long k)
{
  long r,i,av2, av = avma;
  static long pr[] =
    { 0, 2,3,5,7,11,13,17,19,23,29, 31,73, 2,13,23,1662803UL, };
  long *p;

  if (!mod2(n)) return 0;
  if (k==16)
  {				/* use smaller (faster) bases if possible */
    if (lgefint(n)==3 && (ulong)(n[2]) < 3215031751UL) p = pr; /* 2,3,5,7 */
    else p = pr+13;		/* 2,13,23,1662803 */
    k=4;
  }
  else if (k==17)
  {
    if (lgefint(n)==3 && (ulong)(n[2]) < 1373653) p = pr; /* 2,3 */
    else p = pr+11;		/* 31,73 */
    k=2;
  }
  else p = pr;			/* 2,3,5,... */
  n = init_miller(n); av2=avma;
  for (i=1; i<=k; i++)
  {
    r = smodsi(p[i],n); if (!r) break;
    if (bad_for_base(n, stoi(r))) { avma = av; return 0; }
    avma=av2;
  }
  avma=av; return 1;
}

/***********************************************************************/
/**                                                                   **/
/**                       PRIMES IN SUCCESSION                        **/
/** (abstracted by GN 1998Aug21 mainly for use in ellfacteur() below) **/
/**                                                                   **/
/***********************************************************************/

/* map from prime residue classes mod 210 to their numbers in {0...47}.
   Subscripts into this array take the form ((k-1)%210)/2, ranging from
   0 to 104.  Unused entries are 128 */
#define NPRC 128

static
unsigned char prc210_no[] =
{
  0, NPRC, NPRC, NPRC, NPRC, 1, 2, NPRC, 3, 4, NPRC, /* 21 */
  5, NPRC, NPRC, 6, 7, NPRC, NPRC, 8, NPRC, 9, /* 41 */
  10, NPRC, 11, NPRC, NPRC, 12, NPRC, NPRC, 13, 14, NPRC,	/* 63 */
  NPRC, 15, NPRC, 16, 17, NPRC, NPRC, 18, NPRC, 19, /* 83 */
  NPRC, NPRC, 20, NPRC, NPRC, NPRC, 21, NPRC, 22, 23, NPRC, /* 105 */
  24, 25, NPRC, 26, NPRC, NPRC, NPRC, 27, NPRC, NPRC, /* 125 */
  28, NPRC, 29, NPRC, NPRC, 30, 31, NPRC, 32, NPRC, NPRC, /* 147 */
  33, 34, NPRC, NPRC, 35, NPRC, NPRC, 36, NPRC, 37, /* 167 */
  38, NPRC, 39, NPRC, NPRC, 40, 41, NPRC, NPRC, 42, NPRC,	/* 189 */
  43, 44, NPRC, 45, 46, NPRC, NPRC, NPRC, NPRC, 47, /* 209 */
};

/* map from prime residue classes mod 210 (by number) to their smallest
   positive representatives */
static
unsigned char prc210_rp[] =
{
  1, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79,
  83, 89, 97, 101, 103, 107, 109, 113, 121, 127, 131, 137, 139, 143, 149,
  151, 157, 163, 167, 169, 173, 179, 181, 187, 191, 193, 197, 199, 209,
};

/* first differences of the preceding */
static
unsigned char prc210_d1[] =
{
  10, 2, 4, 2, 4, 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6,
  4, 6, 8, 4, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6,
  2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4, 2, 10, 2,
};

GEN
nextprime(GEN n)
{
  long rc,rc0,rcd,rcn,av1,av2, av = avma;

  if (typ(n) != t_INT) n=gceil(n); /* accept arguments in R --GN */
  if (typ(n) != t_INT) err(arither1);
  if (signe(n) <= 0) { avma=av; return gdeux; }
  if (lgefint(n) <= 3)
  { /* check if n <= 7 */
    ulong k = n[2];
    if (k <= 2) { avma=av; return gdeux; }
    if (k == 3) { avma = av; return stoi(3); }
    if (k <= 5) { avma = av; return stoi(5); }
    if (k <= 7) { avma = av; return stoi(7); }
  }
  /* here n > 7 */
  if (!(mod2(n))) n = addsi(1,n);
  rc = rc0 = smodis(n, 210);
  rcn = (long)(prc210_no[rc0>>1]);
  /* find next prime residue class mod 210 */
  while (rcn == NPRC)
  {
    rc += 2;			/* cannot wrap since 209 is coprime */
    rcn = (long)(prc210_no[rc>>1]);
  }
  if (rc > rc0) n = addsi(rc - rc0, n);
  /* now find an actual prime */
  av2 = av1 = avma;
  for(;;)
  {
    if (miller(n,10)) break;
    av1 = avma;
    rcd = prc210_d1[rcn];
    if (++rcn > 47) rcn = 0;
    n = addsi(rcd,n);
  }
  if (av1!=av2) return gerepile(av,av1,n);
  return (av1==av)? icopy(n): n;
}

GEN
precprime(GEN n)
{
  long rc,rc0,rcd,rcn,av1,av2, av = avma;

  if (typ(n) != t_INT) n=gfloor(n); /* accept arguments in R --GN */
  if (typ(n) != t_INT) err(arither1);
  if (signe(n)<=0) { avma=av; return gzero; }
  if (lgefint(n) <= 3)
  { /* check if n <= 10 */
    ulong k = n[2];
    if (k <= 1) { avma=av; return gzero; }
    if (k == 2) { avma=av; return gdeux; }
    if (k <= 4) { avma=av; return stoi(3); }
    if (k <= 6) { avma=av; return stoi(5); }
    if (k <= 10) { avma=av; return stoi(7); }
  }
  /* here n >= 11 */
  if (!(mod2(n))) n = addsi(-1,n);
  rc = rc0 = smodis(n, 210);
  rcn = (long)(prc210_no[rc0>>1]);
  /* find last prime residue class mod 210 */
  while (rcn == NPRC)
  {
    rc -= 2;			/* cannot wrap since 1 is coprime */
    rcn = (long)(prc210_no[rc>>1]);
  }
  if (rc < rc0) n = addsi(rc - rc0, n);
  /* now find an actual prime */
  av2 = av1 = avma;
  for(;;)
  {
    if (miller(n,10)) break;
    av1 = avma;
    if (rcn == 0)
    { rcd = 2; rcn = 47; }
    else
      rcd = prc210_d1[--rcn];
    n = addsi(-rcd,n);
  }
  if (av1!=av2) return gerepile(av,av1,n);
  return (av1==av)? icopy(n): n;
}

/* find next single-word prime strictly larger than p.  If **d is non-NULL,
   this will be p + *(*d)++, using the diffptr table.  Otherwise imitate
   nextprime().  Apart from *d, caller must supply a long variable to which
   rcn points, initialized either to NPRC or to the correct residue class
   number for the current p;  we'll use this to track the current prime
   residue class mod 210 once we're out of range of the diffptr table, and
   we'll update it before that if it isn't NPRC.  *q is incremented when-
   ever q!=NULL and we wrap from 209 mod 210 to 1 mod 210;  this make sense
   only when *rcn already held the correct value.  Caller must also supply
   the second argument for miller(). --GN1998Aug22 */
ulong
snextpr(ulong p, byteptr *d, long *rcn, long *q, long k)
{
  static ulong pp[] =
    { evaltyp(t_INT)|m_evallg(3), evalsigne(1)|evallgefint(3), 0 };
  static ulong *pp2 = pp + 2;
  static GEN gp = (GEN)pp;
  long d1 = **d, rcn0;

  if (d1)
  {
    if (*rcn != NPRC)
    {
      rcn0 = *rcn;
      while (d1 > 0)
      {
	d1 -= prc210_d1[*rcn];
	if (++*rcn > 47) { *rcn = 0; if (q) (*q)++; }
      }
      if (d1 < 0)
      {
	fprintferr("snextpr: prime %lu wasn\'t %lu mod 210\n",
		   p, (ulong)prc210_rp[rcn0]);
	err(bugparier, "[caller of] snextpr");
      }
    }
    return p + *(*d)++;
  }
  /* we are beyond the diffptr table */
  if (*rcn == NPRC)		/* we need to initialize this now */
  {
    *rcn = prc210_no[(p % 210) >> 1];
    if (*rcn == NPRC)
    {
      fprintferr("snextpr: %lu should have been prime but isn\'t\n", p);
      err(bugparier, "[caller of] snextpr");
    }
  }
  /* look for the next one */
  *pp2 = p;
  *pp2 += prc210_d1[*rcn];
  if (++*rcn > 47) *rcn = 0;
  while (!miller(gp, k))
  {
    *pp2 += prc210_d1[*rcn];
    if (++*rcn > 47) { *rcn = 0; if (q) (*q)++; }
    if (*pp2 <= 11)		/* wraparound mod 2^BITS_IN_LONG */
    {
      fprintferr("snextpr: integer wraparound after prime %lu\n", p);
      err(bugparier, "[caller of] snextpr");
    }
  }
  return *pp2;
}


/***********************************************************************/
/**                                                                   **/
/**                        FACTORIZATION (ECM)                        **/
/**   Integer factorization using the elliptic curves method (ECM).   **/
/**   ellfacteur() returns a non trivial factor of N, assuming N>0,   **/
/**   is composite, and has no prime divisor below 2^14 or so.        **/
/**   Extensively modified by GN Jul-Aug 1998, with much helpful      **/
/**   advice by Paul Zimmermann.  Thanks also to Guillaume Hanrot     **/
/**   and Igor Schein for providing many CPU cycles whilst testing.   **/
/**                                                                   **/
/***********************************************************************/

static GEN N, gl, *XAUX;
#define nbcmax 64		/* max number of simultaneous curves */
#define bstpmax 1024		/* max number of baby step table entries */

/* addition/doubling/multiplication of a point on an `elliptic curve'
   mod N may result in one of three things:  a new bona fide point,
   a point at infinity  (betraying itself by a denominator divisible
   by N),  or a point which is at infinity mod some nontrivial factor
   of N but finite mod some other factor  (betraying itself by a denom-
   inator which has nontrivial gcd with N, and this is of course what
   we want). */
/* (In the second case, addition/doubling will simply abort, copying one
   of the summands to the destination array of points unless they coincide.
   Multiplication will stop at some unpredictable intermediate stage:  The
   destination will contain _some_ multiple of the input point, but not
   necessarily the desired one, which doesn't matter.  As long as we're
   multiplying (B1 phase) we simply carry on with the next multiplier.
   During the B2 phase, the only additions are the giant steps, and the
   worst that can happen here is that we lose one residue class mod 210
   of prime multipliers on 4 of the curves, so again, we ignore the problem
   and just carry on.) */
/* The idea is:  Select a handful of curves mod N and one point P on each of
   them.  Try to compute, for each such point, the multiple [M]P = Q where
   M is the product of all powers <= B2 of primes <= nextprime(B1), for some
   suitably chosen B1 and B2.  Then check whether multiplying Q by one of the
   primes < nextprime(B2) would betray a factor.  This second stage proceeds
   by looking separately at the primes in each residue class mod 210, four
   curves at a time, and stepping additively to ever larger multipliers,
   by comparing X coordinates of points which we would need to add in order
   to reach another prime multiplier in the same residue class.  `Comparing'
   means that we accumulate a product of differences of X coordinates, and
   from time to time take a gcd of this product with N. */
/* Montgomery's trick of hiding the cost of computing inverses mod N at a
   price of three extra multiplications mod N, by working on up to 64 or
   even 128 points in parallel, is used heavily. --GN */

/* *** auxiliary functions for ellfacteur: *** */

/* Parallel addition on nbc curves, assigning the result to locations at and
   following *X3, *Y3.  Safe to be called with X3,Y3 equal to X2,Y2  (_not_
   to X1,Y1).  It is also safe to overwrite Y2 with X3.  (If Y coords of
   result not desired, set Y3=NULL.)  If nbc1 < nbc, the first summand is
   assumed to hold only nbc1 distinct points, which are repeated as often
   as we need them  (useful for adding one point on each of a few curves
   to several other points on the same curves).
   Return 0 when successful, 1 when we hit a denominator divisible by N,
   and 2 when gcd(denominator, N) is a nontrivial factor of N, which will
   be preserved in gl.
   We use more stack space than the old code did, and thus run a bit of a
   risk of overflowing it, but it's still bounded by a constant multiple
   of lgefint(N)*nbc, as it was in the old version --GN1998Jul02,Aug12 */
/* (Lessee:  Second phase creates 12 items on the stack, per iteration,
   of which four are twice as long and one is thrice as long as N --
   makes 18 units per iteration.  First phase creates 4 units.  Total
   can be as large as about 4*nbcmax+18*8 units.  And elladd2() is just
   as bad, and elldouble() comes to about 3*nbcmax+29*8 units.  A few
   strategic garbage collections every 8 iterations should help when nbc
   is large...) --GN1998Aug23 */

static int
elladd0(long nbc, long nbc1,
	GEN *X1, GEN *Y1, GEN *X2, GEN *Y2, GEN *X3, GEN *Y3)
{
  GEN lambda;
  GEN W[2*nbcmax], *A=W+nbc;	/* W[0],A[0] never used */
  long i, av=avma, tetpil;
  ulong mask = ~0UL;

  /* actually, this is only ever called with nbc1==nbc or nbc1==4, so: */
  if (nbc1 == 4) mask = 3;
  else if (nbc1 < nbc) err(bugparier, "[caller of] elladd0");

  /* W[0] = gun; */
  W[1] = /* A[0] =*/ subii(X1[0], X2[0]);
  for (i=1; i<nbc; i++)
  {
    A[i] = subii(X1[i&mask], X2[i]); /* don't waste time reducing mod N here */
    W[i+1] = modii(mulii(A[i], W[i]), N);
  }
  tetpil = avma;

  /* if gl != N we have a factor */
  if (!invmod(W[nbc], N, &gl))
  {
    if (!egalii(N,gl)) { gl = gerepile(av,tetpil,gl); return 2; }
    if (X2 != X3)
    {
      long k;
      /* cannot add on one of the curves mod N:  make sure X3 contains
	 something useful before letting the caller proceed */
      for (k = 2*nbc; k--; ) affii(X2[k],X3[k]);
    }
    avma = av; return 1;
  }

  while (i--)			/* nbc times, actually */
  {
    lambda = modii(mulii(subii(Y1[i&mask], Y2[i]),
			 i?mulii(gl, W[i]):gl), N);
    modiiz(subii(sqri(lambda), addii(X2[i], X1[i&mask])), N, X3[i]);
    if (Y3)
      modiiz(subii(mulii(lambda, subii(X1[i&mask], X3[i])),
		   Y1[i&mask]),
	     N, Y3[i]);
    if (!i) break;
    gl = modii(mulii(gl, A[i]), N);
    if (!(i&7)) gl = gerepileupto(tetpil, gl);
  }
  avma=av; return 0;
}

/* Shortcut variant, for use in cases where Y coordinates follow their
   corresponding X coordinates, and the first summand doesn't need to be
   repeated */
static int
elladd(long nbc, GEN *X1, GEN *X2, GEN *X3)
{
  return elladd0(nbc, nbc, X1, X1+nbc, X2, X2+nbc, X3, X3+nbc);
}
/* this could perhaps become a macro --GN */

/* The next one is exactly the same except it does twice as many additions
   (and thus hides even more of the cost of the modular inverse);  the net
   effect is the same as elladd(nbc,X1,X2,X3) followed by elladd(nbc,X4,X5,X6).
   Safe to have X2==X3 and/or X5==X6, and of course safe to have X1 or X2
   coincide with X4 or X5, in any order. */

static int
elladd2(long nbc, GEN *X1, GEN *X2, GEN *X3, GEN *X4, GEN *X5, GEN *X6)
{
  GEN lambda, *Y1 = X1+nbc, *Y2 = X2+nbc, *Y3 = X3+nbc;
  GEN *Y4 = X4+nbc, *Y5 = X5+nbc, *Y6 = X6+nbc;
  GEN W[4*nbcmax], *A=W+2*nbc;	/* W[0],A[0] never used */
  long i,j, av=avma, tetpil;
  /* W[0] = gun; */
  W[1] = /* A[0] =*/ subii(X1[0], X2[0]);
  for (i=1; i<nbc; i++)
  {
    A[i] = subii(X1[i], X2[i]);	/* don't waste time reducing mod N here */
    W[i+1] = modii(mulii(A[i], W[i]), N);
  }
  for (j=0; j<nbc; i++,j++)
  {
    A[i] = subii(X4[j], X5[j]);
    W[i+1] = modii(mulii(A[i], W[i]), N);
  }
  tetpil = avma;

  /* if gl != N we have a factor */
  if (!invmod(W[2*nbc], N, &gl))
  {
    if (!egalii(N,gl)) { gl = gerepile(av,tetpil,gl); return 2; }
    if (X2 != X3)
    {
      long k;
      /* cannot add on one of the curves mod N:  make sure X3 contains
	 something useful before letting the caller proceed */
      for (k = 2*nbc; k--; ) affii(X2[k],X3[k]);
    }
    if (X5 != X6)
    {
      long k;
      /* same for X6 */
      for (k = 2*nbc; k--; ) affii(X5[k],X6[k]);
    }
    avma = av; return 1;
  }

  while (j--)			/* nbc times, actually */
  {
    i--;
    lambda = modii(mulii(subii(Y4[j], Y5[j]),
			 mulii(gl, W[i])), N);
    modiiz(subii(sqri(lambda), addii(X5[j], X4[j])), N, X6[j]);
    modiiz(subii(mulii(lambda, subii(X4[j], X6[j])), Y4[j]), N, Y6[j]);
    gl = modii(mulii(gl, A[i]), N);
    if (!(i&7)) gl = gerepileupto(tetpil, gl);
  }
  while (i--)			/* nbc times */
  {
    lambda = modii(mulii(subii(Y1[i], Y2[i]),
			 i?mulii(gl, W[i]):gl), N);
    modiiz(subii(sqri(lambda), addii(X2[i], X1[i])), N, X3[i]);
    modiiz(subii(mulii(lambda, subii(X1[i], X3[i])), Y1[i]), N, Y3[i]);
    if (!i) break;
    gl = modii(mulii(gl, A[i]), N);
    if (!(i&7)) gl = gerepileupto(tetpil, gl);
  }
  avma=av; return 0;
}

/* Parallel doubling on nbc curves, assigning the result to locations at
   and following *X2.  Safe to be called with X2 equal to X1.  Return
   value as for elladd() above.  If we find a point at infinity mod N,
   and if X1 != X2, we copy the points at X1 to X2.
   Use fewer assignments than the old code.  Strangely, whereas this gains
   about 3% on my P133 with elladd(), it makes hardly any difference here
   with elldouble() --GN */
static int
elldouble(long nbc, GEN *X1, GEN *X2)
{
  GEN lambda,v, *Y1 = X1+nbc, *Y2 = X2+nbc;
  GEN W[nbcmax+1];		/* W[0] never used */
  long i, av=avma, tetpil;
  /*W[0] = gun;*/ W[1] = Y1[0];
  for (i=1; i<nbc; i++)
    W[i+1] = modii(mulii(Y1[i], W[i]), N);
  tetpil = avma;

  if (!invmod(W[nbc], N, &gl))
  {
    if (!egalii(N,gl)) { gl = gerepile(av,tetpil,gl); return 2; }
    if (X1 != X2)
    {
      long k;
      /* cannot double on one of the curves mod N:  make sure X2 contains
	 something useful before letting the caller proceed */
      for (k = 2*nbc; k--; ) affii(X1[k],X2[k]);
    }
    avma = av; return 1;
  }

  while (i--)			/* nbc times, actually */
  {
    lambda = modii(mulii(addsi(1, mulsi(3, sqri(X1[i]))),
			 i?mulii(gl,W[i]):gl), N);
    if (signe(lambda))		/* half of zero is still zero */
      lambda = shifti(mod2(lambda)? addii(lambda, N): lambda, -1);
    v = modii(subii(sqri(lambda), shifti(X1[i],1)), N);
    if (i) gl = modii(mulii(gl, Y1[i]), N);
    modiiz(subii(mulii(lambda, subii(X1[i], v)), Y1[i]), N, Y2[i]);
    affii(v, X2[i]);
    if (!(i&7) && i) gl = gerepileupto(tetpil, gl);
  }
  avma = av; return 0;
}

/* Parallel multiplication by an odd prime k on nbc curves, storing the
   result to locations at and following *X2.  Safe to be called with X2
   equal to X1.  Return values as for elladd() and elldouble().
   Uses (a simplified variant of) Peter Montgomery's PRAC (PRactical Addition
   Chain) algorithm;  see ftp://ftp.cwi.nl/pub/pmontgom/Lucas.ps.gz .
   With thanks to Paul Zimmermann for the reference.  --GN1998Aug13 */

/* We use an array of GENs pointed at by XAUX as a scratchpad;  this will
   have been set up by ellfacteur()  (so we don't need to reinitialize it
   each time). */

static int
ellmult(long nbc, ulong k, GEN *X1, GEN *X2) /* k>2 prime, not checked */
{
  long i,d,e,e1,r,av=avma,tetpil;
  int res;
  GEN *A=X2, *B=XAUX, *S, *T=XAUX+2*nbc;

  for (i = 2*nbc; i--; ) { affii(X1[i],XAUX[i]); }
  tetpil = avma;

  /* first doubling picks up X1;  after this we'll be working in XAUX and
     X2 only, mostly via A and B and T */
  if ((res = elldouble(nbc, X1, X2)) != 0)
  {
    if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
    return res;
  }

  /* split the work at the golden ratio */
  r = (long)(k*0.61803398875 + .5);
  d = k - r; e = r - d;		/* NB d+e == r, so no danger of ofl below */

  while (d != e)
  {

    /* apply one of the nine transformations from PM's Table 4.  We first
       figure out which, and then go into an eight-way switch, because
       some of the transformations are similar enough to share code. */

    if (d <= e + (e>>2))	/* floor(1.25*e) */
    {
      if ((d+e)%3 == 0)
      { i = 0;	goto apply; }	/* Table 4, rule 1 */
      else if ((d-e)%6 == 0)
      { i = 1; goto apply; }	/* rule 2 */
      /* else fall through */
    }
    if ((d+3)>>2 <= e)		/* equiv to d <= 4*e but cannot ofl */
    { i = 2; goto apply; }	/* rule 3, the most common case */
    if ((d&1)==(e&1))
    { i = 1; goto apply; }	/* rule 4, which does the same as rule 2 */
    if (!(d&1))
    { i = 3; goto apply; }	/* rule 5 */
    if (d%3 == 0)
    { i = 4; goto apply; }	/* rule 6 */
    if ((d+e)%3 == 0)
    { i = 5; goto apply; }	/* rule 7 */
    if ((d-e)%3 == 0)
    { i = 6; goto apply; }	/* rule 8 */
    /* when we get here, e must be even, for otherwise one of rules 4,5
       would have applied */
    i = 7;			/* rule 9 */

  apply:
    switch(i)			/* i takes values in {0,...,7} here */
    {
    case 0:			/* rule 1 */
      e1 = d - e; d = (d + e1)/3; e = (e - e1)/3;
      if ((res = elladd(nbc, A, B, T)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      if ((res = elladd2(nbc, T, A, A, T, B, B)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      break;			/* end of rule 1 */
    case 1:			/* rules 2 and 4, part 1 */
      d -= e;
      if ((res = elladd(nbc, A, B, B)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      /* FALL THROUGH */
    case 3:			/* rule 5, and 2nd part of rules 2 and 4 */
      d >>= 1;
      if ((res = elldouble(nbc, A, A)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      break;			/* end of rules 2, 4, and 5 */
    case 4:			/* rule 6 */
      d /= 3;
      if ((res = elldouble(nbc, A, T)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      if ((res = elladd(nbc, T, A, A)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      /* FALL THROUGH */
    case 2:			/* rule 3, and 2nd part of rule 6 */
      d -= e;
      if ((res = elladd(nbc, A, B, B)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      break;			/* end of rules 3 and 6 */
    case 5:			/* rule 7 */
      d = (d - e - e)/3;
      if ((res = elldouble(nbc, A, T)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      if ((res = elladd2(nbc, T, A, A, T, B, B)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      break;			/* end of rule 7 */
    case 6:			/* rule 8 */
      d = (d - e)/3;
      if ((res = elladd(nbc, A, B, B)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      if ((res = elldouble(nbc, A, T)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      if ((res = elladd(nbc, T, A, A)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      break;			/* end of rule 8 */
    case 7:			/* rule 9 */
      e >>= 1;
      if ((res = elldouble(nbc, B, B)) != 0)
      {
	if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
	return res;
      }
      break;			/* end of rule 9 */
    default:			/* never reached */
      break;
    }
    /* end of Table 4 processing */

    /* swap d <-> e and A <-> B if necessary */
    if (d < e) { r = d; d = e; e = r; S = A; A = B; B = S; }
  } /* while */
  if ((res = elladd(nbc, XAUX, X2, X2)) != 0)
  {
    if (res > 1) { gl = gerepile(av,tetpil,gl); } else avma = av;
    return res;
  }
  avma = av; return 0;
}

/* PRAC implementation notes - main changes against the paper version:
   (1) The general function  [m+n]P = f([m]P,[n]P,[m-n]P)  collapses  (for
   m!=n)  to an elladd() which does not depend on the third argument;  and
   thus all references to the third variable (C in the paper) can be elimi-
   nated. (2) Since our multipliers are prime, the outer loop of the paper
   version executes only once, and thus is invisible above. (3) The first
   step in the inner loop of the paper version will always be rule 3, but
   the addition requested by this rule amounts to a doubling, and it will
   always be followed by a swap, so we have unrolled this first iteration.
   (4) Some simplifications in rules 6 and 7 are possible given the above,
   and we can save one addition in each of the two cases.  NB one can show
   that none of the other elladd()s in the loop can ever turn out to de-
   generate into an elldouble. (5) I tried to optimize for rule 3, which
   is used far more frequently than all others together, but it didn't
   improve things, so I removed the nested tight loop again.  --GN */

/* The main loop body of ellfacteur() runs slightly _slower_  under PRAC than
   under a straightforward left-shift binary multiplication algorithm when
   N has <30 digits and B1 is small;  PRAC wins when N and B1 get larger.
   Weird. --GN */


/* memory layout in ellfacteur():  We'll have a large-ish array of GEN
   pointers, and one huge chunk of memory containing all the actual GEN
   (t_INT) objects.
   nbc will be held constant throughout the invocation. */
/* The B1 stage of each iteration through the main loop needs little
   space:  enough for the X and Y coordinates of the current points,
   and twice as much again as scratchpad for ellmult(). */
/* The B2 stage, starting from some current set of points Q, needs, in
   succession:
   - space for [2]Q, [4]Q, ..., [10]Q, and [p]Q for building the helix;
   - space for 48*nbc X and Y coordinates to hold the helix.  Now this
   could re-use [2]Q,...,[8]Q, but only with difficulty, since we don't
   know in advance which residue class mod 210 our p is going to be in.
   It can and should re-use [p]Q, though;
   - space for (temporarily [30]Q and then) [210]Q, [420]Q, and several
   further doublings until the giant step multiplier is reached.  This
   _can_ re-use the remaining cells from above.  The computation of [210]Q
   will have been the last call to ellmult() within this iteration of the
   main loop, so the scratchpad is now also free to be re-used.  We also
   compute [630]Q by a parallel addition;  we'll need it later to get the
   baby-step table bootstrapped a little faster. */
/* Finally, for no more than 4 curves at a time, room for up to 1024 X
   coordinates only  (the Y coordinates needed whilst setting up this baby
   step table are temporarily stored in the upper half, and overwritten
   during the last series of additions). */

/* Graphically:  after end of B1 stage  (X,Y are the coords of Q):
   +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--
   | X Y |  scratch  | [2]Q| [4]Q| [6]Q| [8]Q|[10]Q|    ...    | ...
   +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--
   *X    *XAUX *XT   *XD                                       *XB

   [30]Q is computed from [10]Q.  [210]Q can go into XY, etc:
   +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--
   |[210]|[420]|[630]|[840]|[1680,3360,6720,...,2048*210]      |bstp table...
   +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--
   *X    *XAUX *XT   *XD      [*XG, somewhere here]            *XB .... *XH

   So we need (13 + 48) * 2 * nbc slots here, and another 4096 slots for
   the baby step table (not all of which will be used when we start with a
   small B1, but it's better to allocate and initialize ahead of time all
   the slots that might be needed later). */

/* Note on memory locality:  During the B2 phase, accesses to the helix
   (once it has been set up)  will be clustered by curves  (4 out of nbc at
   a time).  Accesses to the baby steps table will wander from one end of
   the array to the other and back, one such cycle per giant step, and
   during a full cycle we would expect on the order of 2E4 accesses when
   using the largest giant step size.  Thus we shouldn't be doing too bad
   with respect to thrashing a (512KBy) L2 cache.  However, we don't want
   the baby step table to grow larger than this, even if it would reduce
   the number of E.C. operations by a few more per cent for very large B2,
   lest cache thrashing slow down everything disproportionally. --GN */

/* parameters for miller() via snextpr(), for use by ellfacteur() */
#define miller_k1 16		/* B1 phase, foolproof below 10^12 */
#define miller_k2 1		/* B2 phase, not foolproof, much faster */
/* (miller_k2 will let thousands of composites slip through, which doesn't
   harm ECM, but ellmult() during the B1 phase should only be fed primes
   which really are prime) */

/* ellfacteur() has been re-tuned to be useful as a first stage before
   MPQS, especially for _large_ arguments, when insist is false, and now
   also for the case when insist is true, vaguely following suggestions
   by Paul Zimmermann  (see http://www.loria.fr/~zimmerma/ and especially
   http://www.loria.fr/~zimmerma/records/ecmnet.html)  of INRIA/LORIA.
   --GN 1998Jul,Aug */
GEN
ellfacteur(GEN n, int insist)
{
  static ulong TB1[] =
    {
      /* table revised, cf. below 1998Aug15 --GN */
      142,172,208,252,305,370,450,545,661,801,972,1180,1430,
      1735,2100,2550,3090,3745,4540,5505,6675,8090,9810,11900,
      14420,17490,21200,25700,31160,37780UL,45810UL,55550UL,67350UL,
      81660UL,99010UL,120050UL,145550UL,176475UL,213970UL,259430UL,
      314550UL,381380UL,462415UL,560660UL,679780UL,824220UL,999340UL,
      1211670UL,1469110UL,1781250UL,2159700UL,2618600UL,3175000UL,
      3849600UL,4667500UL,5659200UL,6861600UL,8319500UL,10087100UL,
      12230300UL,14828900UL,17979600UL,21799700UL,26431500UL,
      32047300UL,38856400UL,	/* 110 times that still fits into 32bits */
#ifdef LONG_IS_64BITS
      47112200UL,57122100UL,69258800UL,83974200UL,101816200UL,
      123449000UL,149678200UL,181480300UL,220039400UL,266791100UL,
      323476100UL,392204900UL,475536500UL,576573500UL,699077800UL,
      847610500UL,1027701900UL,1246057200UL,1510806400UL,1831806700UL,
      2221009800UL,2692906700UL,3265067200UL,3958794400UL,4799917500UL,
      /* the only reason to stop here is that I got bored  (and that users
	 will get bored watching their 64bit machines churning on such large
	 numbers for month after month).  Someone can extend this table when
	 the hardware has gotten 100 times faster than now --GN */
#endif
    };
  static ulong TB1_for_stage[] =
    {
      /* table revised 1998Aug11 --GN.  The idea is to start a little below
	 the optimal B1 for finding factors which would just have been missed
	 by pollardbrent(), and escalate gradually, changing curves suf-
	 ficiently frequently to give good coverage of the small factor
	 ranges.  The table entries grow a bit faster than what Paul says
	 would be optimal, but having a single table instead of a 2D array
	 keeps the code simple */
      500,520,560,620,700,800,900,1000,1150,1300,1450,1600,1800,2000,
      2200,2450,2700,2950,3250,3600,4000,4400,4850,5300,5800,6400,
      7100,7850,8700,9600,10600,11700,12900,14200,15700,17300,
      19000,21000,23200,25500,28000,31000,34500UL,38500UL,43000UL,
      48000UL,53800UL,60400UL,67750UL,76000UL,85300UL,95700UL,
      107400UL,120500UL,135400UL,152000UL,170800UL,191800UL,215400UL,
      241800UL,271400UL,304500UL,341500UL,383100UL,429700UL,481900UL,
      540400UL,606000UL,679500UL,761800UL,854100UL,957500UL,1073500UL,
    };
  long nbc,nbc2,dsn,dsnmax,rep,spc,gse,gss,rcn,rcn0,bstp,bstp0;
  long a,i,j,k, av,av1,lim, size = expi(n) + 1, tf = lgefint(n);
  ulong B1,B2,B2_p,B2_rt,m,p,p0,p2,dp;
  GEN w,w0,x,*X,*XT,*XD,*XG,*YG,*XH,*XB,*XB2,*Xh,*Yh,*Xb, res = cgeti(tf);
  int rflag, use_clones = 0;
  byteptr d, d0;

  av = avma;			/* taking res into account */
  N = n;			/* make n known to auxiliary functions */
  /* determine where we'll start, how long we'll persist, and how many
     curves we'll use in parallel */
  if (insist)
  {
    dsnmax = (size >> 2) - 10;
    if (dsnmax < 0) dsnmax = 0;
#ifdef LONG_IS_64BITS
    else if (dsnmax > 90) dsnmax = 90;
#else
    else if (dsnmax > 65) dsnmax = 65;
#endif
    dsn = (size >> 3) - 5;
    if (dsn < 0) dsn = 0;
    else if (dsn > 47) dsn = 47;
    /* pick up the torch where non-insistent stage would have given up */
    nbc = dsn + (dsn >> 2) + 9;	/* 8 or more curves in parallel */
    nbc &= ~3;			/* nbc is always a multiple of 4 */
    if (nbc > nbcmax) nbc = nbcmax;
    a = 1 + (nbcmax<<7);	/* seed for choice of curves */
  }
  else
  {
    dsn = (size - 140) >> 3;
    if (dsn > 12) dsn = 12;
    dsnmax = 72;
    if (dsn < 0)		/* < 140 bits: decline the task */
    {
#ifdef __EMX__
      /* MPQS's disk access under DOS/EMX would be abysmally slow, so... */
      dsn = 0;
      rep = 20;
      nbc = 8;
#else
      if (DEBUGLEVEL >= 4)
      {
	fprintferr("ECM: number too small to justify this stage\n");
	flusherr();
      }
      avma = av; return NULL;
#endif
    }
    else
    {
      rep = (size <= 248 ?
	     (size <= 176 ? (size - 124) >> 4 : (size - 148) >> 3) :
	     (size - 224) >> 1);
      nbc = ((size >> 3) << 2) - 80;
      if (nbc < 8) nbc = 8;
      else if (nbc > nbcmax) nbc = nbcmax;
#ifdef __EMX__
      rep += 20;
#endif
    }

    /* it may be convenient to use disjoint sets of curves for the non-insist
       and insist phases;  moreover, repeated non-insistent calls acting on
       factors of the same original number should try to use fresh curves.
       The following achieves this */
    a = 1 + (nbcmax<<3)*(size & 0xf);
  }
  if (dsn > dsnmax) dsn = dsnmax;

  if (DEBUGLEVEL >= 4)
  {
    (void) timer2();		/* clear timer */
    fprintferr("ECM: working on %ld curves at a time; initializing", nbc);
    if (!insist)
    {
      if (rep == 1)
	fprintferr(" for one round");
      else
	fprintferr(" for up to %ld rounds", rep);
    }
    fprintferr("...\n");
  }

  /* The auxiliary routines above need < (3*nbc+240)*tf words on the PARI
     stack, in addition to the spc*(tf+1) words occupied by our main table.
     If stack space is already tight, try the heap, using newbloc() and
     killbloc() */
  nbc2 = nbc << 1;
  spc = (13 + 48) * nbc2 + bstpmax * 4;
  if ((long)((GEN)avma - (GEN)bot) < spc + 385 + (spc + 3*nbc + 240)*tf)
  {
    if (DEBUGLEVEL >= 5)
    {
      fprintferr("ECM: stack tight, using clone space on the heap\n");
    }
    use_clones = 1;
    x = newbloc(spc + 385);
  }
  else
    x = new_chunk(spc + 385);
  X = 1 + (GEN*)x;		/* B1 phase: current point */
  XAUX = X    + nbc2;		/* scratchpad for ellmult() */
  XT   = XAUX + nbc2;		/* ditto, will later hold [3*210]Q */
  XD   = XT   + nbc2;		/* room for various multiples */
  XB   = XD   + 20*nbc;		/* start of baby steps table */
  XB2  = XB   + 2 * bstpmax;	/* middle of baby steps table */
  XH   = XB2  + 2 * bstpmax;	/* end of bstps table, start of helix */
  Xh   = XH   + 96*nbc;		/* little helix, X coords */
  Yh   = XH   + 192;		/* ditto, Y coords */
  /* XG will be set later, inside the main loop, since it depends on B2 */

  {
    long tw = evallg(tf) | evaltyp(t_INT);

    if (use_clones)
      w = newbloc(spc*tf);
    else
      w = new_chunk(spc*tf);
    w0 = w;			/* remember this for later... */
    for (i = spc; i--; )
    {
      *w = tw; X[i] = w; w += tf; /* hack for: w = cgeti(tf) */
    }
    /* Xh range of 384 pointers not set;  these will later duplicate the
       pointers in the XH range, 4 curves at a time.  Some of the cells
       reserved here for the XB range will never be used, instead, we'll
       warp the pointers to connect to (read-only) GENs in the X/XD range;
       it would be complicated to skip them here to conserve merely a few
       KBy of stack or heap space. --GN */
  }

  /* *** ECM MAIN LOOP *** */
  for(;;)
  {
    d = diffptr; rcn = NPRC;	/* multipliers begin at the beginning */

    /* pick curves */
    for (i = nbc2; i--; ) affsi(a++, X[i]);
    /* pick bounds */
    B1 = insist ? TB1[dsn] : TB1_for_stage[dsn];
    B2 = 110*B1;
    B2_rt = (ulong)(sqrt((double)B2));
    /* pick giant step exponent and size.
       With 32 baby steps, a giant step corresponds to 32*420 = 13440, appro-
       priate for the smallest B2s.  With 1024, a giant step will be 430080;
       this will be appropriate for B1 >~ 42000, where 512 baby steps would
       imply roughly the same number of E.C. additions. */
    gse = (B1 < 656 ?
	   (B1 < 200 ? 5 : 6) :
	   (B1 < 10500 ?
	    (B1 < 2625 ? 7 : 8) :
	    (B1 < 42000 ? 9 : 10)
	    )
	   );
    gss = 1UL << gse;
    XG = XT + gse*nbc2;		/* will later hold [2^(gse+1)*210]Q */
    YG = XG + nbc;

    if (DEBUGLEVEL >= 4)
    {
      fprintferr("ECM: time = %6ld ms\nECM: dsn = %2ld,\tB1 = %4lu,",
                 timer2(), dsn, B1);
      fprintferr("\tB2 = %6lu,\tgss = %4ld*420\n", B2, gss);
      flusherr();
    }
    p = *d++;

    /* ---B1 PHASE--- */
    /* treat p=2 separately */
    B2_p = B2 >> 1;
    for (m=1; m<=B2_p; m<<=1)
    {
      if ((rflag = elldouble(nbc, X, X)) > 1) goto fin;
      else if (rflag) break;
    }

    /* p=3,...,nextprime(B1) */
    while (p < B1 && p <= B2_rt)
    {
      p = snextpr(p, &d, &rcn, NULL, miller_k1);
      B2_p = B2/p;		/* beware integer overflow on 32-bit CPUs */
      for (m=1; m<=B2_p; m*=p)
      {
	if ((rflag = ellmult(nbc, p, X, X)) > 1) goto fin;
	else if (rflag) break;
      }
    }
    /* primes p larger than sqrt(B2) can appear only to the 1st power */
    while (p < B1)
    {
      p = snextpr(p, &d, &rcn, NULL, miller_k1);
      if (ellmult(nbc, p, X, X) > 1) goto fin; /* p^2 > B2: no loop */
    }

    if (DEBUGLEVEL >= 4)
    {
      fprintferr("ECM: time = %6ld ms, B1 phase done, ", timer2());
      fprintferr("p = %lu, setting up for B2\n", p);
    }

    /* ---B2 PHASE--- */
    /* compute [2]Q,...,[10]Q, which we need to build the helix */
    if (elldouble(nbc, X, XD) > 1)
      goto fin;			/* [2]Q */
    if (elldouble(nbc, XD, XD + nbc2) > 1)
      goto fin;			/* [4]Q */
    if (elladd(nbc, XD, XD + nbc2, XD + (nbc<<2)) > 1)
      goto fin;			/* [6]Q */
    if (elladd2(nbc,
		XD, XD + (nbc<<2), XT + (nbc<<3),
		XD + nbc2, XD + (nbc<<2), XD + (nbc<<3)) > 1)
      goto fin;			/* [8]Q and [10]Q */
    if (DEBUGLEVEL >= 7)
      fprintferr("\t(got [2]Q...[10]Q)\n");

    /* get next prime (still using the foolproof test) */
    p = snextpr(p, &d, &rcn, NULL, miller_k1);
    /* make sure we have the residue class number (mod 210) */
    if (rcn == NPRC)
    {
      rcn = prc210_no[(p % 210) >> 1];
      if (rcn == NPRC)
      {
	fprintferr("ECM: %lu should have been prime but isn\'t\n", p);
	err(bugparier, "ellfacteur");
      }
    }

    /* compute [p]Q and put it into its place in the helix */
    if (ellmult(nbc, p, X, XH + rcn*nbc2) > 1) goto fin;
    if (DEBUGLEVEL >= 7)
      fprintferr("\t(got [p]Q, p = %lu = %lu mod 210)\n",
		 p, (ulong)(prc210_rp[rcn]));

    /* save current p, d, and rcn;  we'll need them more than once below */
    p0 = p;
    d0 = d;
    rcn0 = rcn;			/* remember where the helix wraps */
    bstp0 = 0;			/* p is at baby-step offset 0 from itself */

    /* fill up the helix, stepping forward through the prime residue classes
       mod 210 until we're back at the r'class of p0.  Keep updating p so
       that we can print meaningful diagnostics if a factor shows up;  but
       don't bother checking which of these p's are in fact prime */
    for (i = 47; i; i--)	/* 47 iterations */
    {
      p += (dp = (ulong)prc210_d1[rcn]);
      if (rcn == 47)
      {				/* wrap mod 210 */
	if (elladd(nbc, XT + dp*nbc, XH + rcn*nbc2, XH) > 1)
	  goto fin;
	rcn = 0;
	continue;
      }
      if (elladd(nbc, XT + dp*nbc, XH + rcn*nbc2, XH + rcn*nbc2 + nbc2) > 1)
	goto fin;
      rcn++;
    }
    if (DEBUGLEVEL >= 7)
      fprintferr("\t(got initial helix)\n");

    /* compute [210]Q etc, which will be needed for the baby step table */
    if (ellmult(nbc, 3, XD + (nbc<<3), X) > 1) goto fin;
    if (ellmult(nbc, 7, X, X) > 1) goto fin; /* [210]Q */
    /* this was the last call to ellmult() in the main loop body;  may now
       overwrite XAUX and slots XD and following */
    if (elldouble(nbc, X, XAUX) > 1) goto fin; /* [420]Q */
    if (elladd(nbc, X, XAUX, XT) > 1) goto fin; /* [630]Q */
    if (elladd(nbc, X, XT, XD) > 1) goto fin; /* [840]Q */
    for (i=1; i <= gse; i++)	/* gse successive doublings */
    {
      if (elldouble(nbc, XT + i*nbc2, XD + i*nbc2) > 1) goto fin;
    }
    /* (the last iteration has initialized XG to [210*2^(gse+1)]Q) */

    if (DEBUGLEVEL >= 4)
    {
      fprintferr("ECM: time = %6ld ms, entering B2 phase, p = %lu\n",
		 timer2(), p);
    }

    /* inner loop over small sets of 4 curves at a time */
    for (i = nbc - 4; i >= 0; i -= 4)
    {
      if (DEBUGLEVEL >= 6)
	fprintferr("ECM: finishing curves %ld...%ld\n", i, i+3);
      /* copy relevant pointers from XH to Xh.  Recall memory layout in XH
         is:  nbc X coordinates followed by nbc Y coordinates for residue
	 class 1 mod 210, then the same for r.c. 11 mod 210, etc.  Memory
	 layout for Xh is: four X coords for 1 mod 210, four for 11 mod 210,
	 etc, four for 209 mod 210, and then the corresponding Y coordinates
	 in the same order.  This will allow us to do a giant step on Xh
	 using just three calls to elladd0() each acting on 64 points in
	 parallel */
      for (j = 48; j--; )
      {
	k = nbc2*j + i;
	m = j << 2;		/* X coordinates */
	Xh[m]   = XH[k];   Xh[m+1] = XH[k+1];
	Xh[m+2] = XH[k+2]; Xh[m+3] = XH[k+3];
	k += nbc;		/* Y coordinates */
	Yh[m]   = XH[k];   Yh[m+1] = XH[k+1];
	Yh[m+2] = XH[k+2]; Yh[m+3] = XH[k+3];
      }
      /* build baby step table of X coords of multiples of [210]Q.  XB[4*j]
	 will point at X coords on four curves from [(j+1)*210]Q.  Until
	 we're done, we need some Y coords as well, which we keep in the
	 second half of the table, overwriting them at the end when gse==10.
	 Those multiples which we already have  (by 1,2,3,4,8,16,...,2^gse)
	 are entered simply by copying the pointers, ignoring the small
	 number of slots in w that were initially reserved for them.
	 Here are the initial entries... */
      for (Xb=XB,k=2,j=i; k--; Xb=XB2,j+=nbc) /* do first X, then Y coords */
      {
	Xb[0]  = X[j];      Xb[1]  = X[j+1]; /* [210]Q */
	Xb[2]  = X[j+2];    Xb[3]  = X[j+3];
	Xb[4]  = XAUX[j];   Xb[5]  = XAUX[j+1]; /* [420]Q */
	Xb[6]  = XAUX[j+2]; Xb[7]  = XAUX[j+3];
	Xb[8]  = XT[j];     Xb[9]  = XT[j+1]; /* [630]Q */
	Xb[10] = XT[j+2];   Xb[11] = XT[j+3];
	Xb += 4;		/* this points at [420]Q */
	/* ... entries at powers of 2 times 210 .... */
	for (m = 2; m < gse+k; m++) /* omit Y coords of [2^gse*210]Q */
	{
	  long m2 = m*nbc2 + j;
	  Xb += (2UL<<m);	/* points now at [2^m*210]Q */
	  Xb[0] = XAUX[m2];   Xb[1] = XAUX[m2+1];
	  Xb[2] = XAUX[m2+2]; Xb[3] = XAUX[m2+3];
	}
      }
      if (DEBUGLEVEL >= 7)
	fprintferr("\t(extracted precomputed helix / baby step entries)\n");
      /* ... glue in between, up to 16*210 ... */
      if (elladd0(12, 4,	/* 12 pts + (4 pts replicated thrice) */
		  XB + 12, XB2 + 12,
		  XB,      XB2,
		  XB + 16, XB2 + 16)
	  > 1) goto fin;	/* 4 + {1,2,3} = {5,6,7} */
      if (elladd0(28, 4,	/* 28 pts + (4 pts replicated 7fold) */
		  XB + 28, XB2 + 28,
		  XB,      XB2,
		  XB + 32, XB2 + 32)
	  > 1) goto fin;	/* 8 + {1,...,7} = {9,...,15} */
      /* ... and the remainder of the lot */
      for (m = 5; m <= gse; m++)
      {
	/* fill in from 2^(m-1)+1 to 2^m-1 in chunks of 64 and 60 points */
	ulong m2 = 2UL << m;	/* will point at 2^(m-1)+1 */
	for (j = 0; j < m2-64; j+=64) /* executed 0 times when m == 5 */
	{
	  if (elladd0(64, 4,
		      XB + m2 - 4, XB2 + m2 - 4,
		      XB + j,      XB2 + j,
		      XB + m2 + j,
		      (m<gse ? XB2 + m2 + j : NULL))
	      > 1) goto fin;
	} /* j == m2-64 here, 60 points left */
	if (elladd0(60, 4,
		    XB + m2 - 4, XB2 + m2 - 4,
		    XB + j,      XB2 + j,
		    XB + m2 + j,
		    (m<gse ? XB2 + m2 + j : NULL))
	    > 1) goto fin;
	/* (when m==gse, drop Y coords of result, and when both equal 1024,
	   overwrite Y coords of second argument with X coords of result) */
      }
      if (DEBUGLEVEL >= 7)
	fprintferr("\t(baby step table complete)\n");
      /* initialize a few other things */
      bstp = bstp0;
      p = p0; d = d0; rcn = rcn0;
      gl = gun;
      av1 = avma;
      lim=stack_lim(av1,1);
      /* the correct entry in XB to use depends on bstp and on where we are
	 on the helix.  As we skip from prime to prime, bstp will be incre-
	 mented by snextpr() each time we wrap around through residue class
	 number 0 (1 mod 210),  but the baby step should not be taken until
	 rcn>=rcn0  (i.e. until we pass again the residue class of p0).
	 The correct signed multiplier is thus k = bstp - (rcn < rcn0),
	 and the offset from XB is four times (|k| - 1).  When k==0, we may
	 ignore the current prime  (if it had led to a factorization, this
	 would have been noted during the last giant step, or -- when we
	 first get here -- whilst initializing the helix).  When k > gss,
	 we must do a giant step and bump bstp back by -2*gss.
	 The gcd of the product of X coord differences against N is taken just
	 before we do a giant step. */

      /* loop over probable primes p0 < p <= nextprime(B2),
	 inserting giant steps as necessary */
      while (p < B2)
      {
	/* save current p for diagnostics */
	p2 = p;
	/* get next probable prime */
	p = snextpr(p, &d, &rcn, &bstp, miller_k2);
	/* work out the corresponding baby-step multiplier */
	k = bstp - (rcn < rcn0 ? 1 : 0);
	/* check whether it's giant-step time */
	if (k > gss)
	{
	  /* take gcd */
	  gl = mppgcd(gl, n);
	  if (!is_pm1(gl) && !egalii(gl, n)) { p = p2; goto fin; }
	  gl = gun;
	  avma = av1;
	  while (k > gss)	/* hm, just how large are those prime gaps? */
	  {
	    /* giant step */
	    if (DEBUGLEVEL >= 7)
	      fprintferr("\t(giant step at p = %lu)\n", p);
	    if (elladd0(64, 4,
			XG + i, YG + i,
			Xh, Yh, Xh, Yh) > 1) goto fin;
	    if (elladd0(64, 4,
			XG + i, YG + i,
			Xh + 64, Yh + 64, Xh + 64, Yh + 64) > 1) goto fin;
	    if (elladd0(64, 4,
			XG + i, YG + i,
			Xh + 128, Yh + 128, Xh + 128, Yh + 128)
		> 1) goto fin;
	    bstp -= (gss << 1);
	    /* recompute multiplier */
	    k = bstp - (rcn < rcn0 ? 1 : 0);
	  }
	}
	if (!k) continue;	/* point of interest is already in Xh */
	if (k < 0) k = -k;
	m = ((ulong)k - 1) << 2;
	/* accumulate product of differences of X coordinates */
	j = rcn<<2;
	gl = modii(mulii(gl, subii(XB[m],   Xh[j])), n);
	gl = modii(mulii(gl, subii(XB[m+1], Xh[j+1])), n);
	gl = modii(mulii(gl, subii(XB[m+2], Xh[j+2])), n);
	gl = modii(mulii(gl, subii(XB[m+3], Xh[j+3])), n);
	if (low_stack(lim, stack_lim(av1,1)))
	{
	  if(DEBUGMEM>1) err(warnmem,"ellfacteur");
	  gl = gerepileupto(av1, gl);
	}
      }	/* loop over p */
      avma = av1;
    } /* for i (loop over sets of 4 curves) */

    /* continuation part of main loop */

    if (dsn < dsnmax)
    {
      dsn += insist ? 1 : 2;
      if (dsn > dsnmax) dsn = dsnmax;
    }

    if (!insist && !--rep)
    {
      if (DEBUGLEVEL >= 4)
      {
	fprintferr("ECM: time = %6ld ms,\tellfacteur giving up.\n",
		   timer2());
	flusherr();
      }
      avma = av;
      if (use_clones) { gunclone(w0); gunclone(x); }
      return NULL;
    }
  }
  /* *** END OF ECM MAIN LOOP *** */
fin:
  affii(gl, res);

  if (DEBUGLEVEL >= 4)
  {
    fprintferr("ECM: time = %6ld ms,\tp <= %6lu,\n\tfound factor = %Z\n",
	       timer2(), p, res);
    flusherr();
  }
  avma=av;
  if (use_clones) { gunclone(w0); gunclone(x); }
  return res;
}

/***********************************************************************/
/**                                                                   **/
/**                FACTORIZATION (Pollard-Brent rho)                  **/
/**  pollardbrent() returns a non trivial factor of n, assuming n is  **/
/**  composite and has no small prime divisor, or NULL if going on    **/
/**  would take more time than we want to spend.  GN1998Jun18-26      **/
/**                 (Cf. Algorithm 8.5.2 in ACiCNT)                   **/
/**                                                                   **/
/***********************************************************************/
static void
rho_dbg(long c, long msg_mask)
{
  if (c & msg_mask) return;
  fprintferr("Rho: time = %6ld ms,\t%3ld round%s\n",
             timer2(), c, (c==1?"":"s"));
  flusherr();
}

/* Tuning parameter:  for input up to 64 bits long, we must not spend more
 * than a very short time, for fear of slowing things down on average.
 * With the current tuning formula, increase our efforts somewhat at 49 bit
 * input  (an extra round for each bit at first),  and go up more and more
 * rapidly after we pass 80 bits. */

#define tune_pb_min 14		/* even 15 seems too much */

/* We return NULL when we run out of time, or a single t_INT containing a
   nontrivial factor of n, or a vector of t_INTs, each triple of successive
   entries containing a factor, an exponent  (equal to un),  and a factor
   class  (NULL for unknown or zero for known composite),  matching the
   internal representation used by the ifac_*() routines below.  Repeated
   factors can arise and are legal;  the caller will be sorting the factors
   anyway. */
GEN
pollardbrent(GEN n)
{
  long tf = lgefint(n), size = 0, delta, retries = 0, msg_mask;
  long c0, c, k, k1, l, avP, avx, GGG, av = avma;
  GEN x, x1, y, P, g, g1, res;

  if (DEBUGLEVEL > 3) (void)timer2(); /* clear timer */

  if (tf >= 4)
    size = expi(n) + 1;
  else if (tf == 3)		/* try to keep purify happy...  */
    size = BITS_IN_LONG - bfffo(n[2]);

  if (size <= 32)
    c0 = 32;			/* amounts very nearly to `insist' */
  else if (size <= 48)
    c0 = tune_pb_min;
  else if (size <= 72)
    c0 = tune_pb_min + size - 24;
  else if (size <= 301)
    /* nonlinear increase in effort, kicking in around 80 bits */
    /* 301 gives 48121 + tune_pb_min */
    c0 = tune_pb_min + size - 60 +
      ((size-73)>>1)*((size-70)>>3)*((size-56)>>4);
  else 
    c0 = 49152;			/* ECM is faster when it'd take longer */

  c = c0 << 5;			/* 32 iterations per round */
  msg_mask = (size >= 448? 0x1fff:
                           (size >= 192? (256L<<((size-128)>>6))-1: 0xff));
PB_RETRY:
 /* trick to make a `random' choice determined by n.  Don't use x^2+0 or
  * x^2-2, ever.  Don't use x^2-3 or x^2-7 with a starting value of 2.
  * x^2+4, x^2+9 are affine conjugate to x^2+1, so don't use them either.
  *
  * (the point being that when we get called again on a composite cofactor
  * of something we've already seen, we had better avoid the same delta) */
  switch ((size + retries) & 7)
  {
    case 0: delta=  1; break;
    case 1: delta= -1; break;
    case 2: delta=  3; break;
    case 3: delta=  5; break;
    case 4: delta= -5; break;
    case 5: delta=  7; break;
    case 6: delta= 11; break;
    case 7: delta=-11; break;
  }
  if (DEBUGLEVEL > 3)
  {
    if (!retries)
    {
      if (size < 1536)
	fprintferr("Rho: searching small factor of %ld-bit integer\n", size);
      else
	fprintferr("Rho: searching small factor of %ld-word integer\n", tf-2);
    }
    else
      fprintferr("Rho: restarting for remaining rounds...\n");
    fprintferr("Rho: using X^2%+1ld for up to %ld rounds of 32 iterations\n",
               delta, c >> 5);
    flusherr();
  }
  x=gdeux; P=gun; g1 = NULL; k = 1; l = 1;
  (void)new_chunk(10 + 6 * tf); /* enough for cgetg(10) + 3 divii */
  y = cgeti(tf); affsi(2, y);
  x1= cgeti(tf); affsi(2, x1);
  avx = avma;
  avP = (long)new_chunk(2 * tf); /* enough for x = addsi(tf+1) */
  GGG = (long)new_chunk(4 * tf); /* enough for P = modii(2tf+1, tf) */

  for (;;)			/* terminated under the control of c */
  {
    /* use the polynomial  x^2 + delta */
#define one_iter() {\
    avma = GGG; x = resii(sqri(x), n); /* to garbage zone */\
    avma = avx; x = addsi(delta,x);    /* erase garbage */\
    avma = GGG; P = mulii(P, subii(x1, x));\
    avma = avP; P = modii(P,n); }

    one_iter();

    if ((--c & 0x1f)==0)	/* one round complete */
    {
      g = mppgcd(n, P);
      if (!is_pm1(g)) goto fin;	/* caught something */
      if (c <= 0)
      {				/* getting bored */
        if (DEBUGLEVEL > 3)
        {
          fprintferr("Rho: time = %6ld ms,\tPollard-Brent giving up.\n",
                     timer2());
          flusherr();
        }
        avma=av; return NULL;
      }
      P = gun;			/* not necessary, but saves 1 mulii/round */
      if (DEBUGLEVEL > 3) rho_dbg(c0-(c>>5), msg_mask);
      affii(x,y);
    }

    if (--k) continue;		/* normal end of loop body */

    if (c & 0x1f) /* otherwise, we already checked */
    {
      g = mppgcd(n, P);
      if (!is_pm1(g)) goto fin;
      P = gun;
    }

   /* Fast forward phase, doing l inner iterations without computing gcds.
    * Check first whether it would take us beyond the alloted time.
    * Fast forward rounds count only half  (although they're taking
    * more like 2/3 the time of normal rounds).  This to counteract the
    * nuisance that all c0 between 4096 and 6144 would act exactly as
    * 4096;  with the halving trick only the range 4096..5120 collapses
    * (similarly for all other powers of two) */
    if ((c-=(l>>1)) <= 0)
    {				/* got bored */
      if (DEBUGLEVEL > 3)
      {
	fprintferr("Rho: time = %6ld ms,\tPollard-Brent giving up.\n",
		   timer2());
	flusherr();
      }
      avma=av; return NULL;
    }
    c &= ~0x1f;			/* keep it on multiples of 32 */

    /* Fast forward loop */
    affii(x, x1); k = l; l <<= 1;
    /* don't show this for the first several (short) fast forward phases. */
    if (DEBUGLEVEL > 3 && (l>>7) > msg_mask)
    {
      fprintferr("Rho: fast forward phase (%ld rounds of 64)...\n", l>>7);
      flusherr();
    }
    for (k1=k; k1; k1--) one_iter();
    if (DEBUGLEVEL > 3 && (l>>7) > msg_mask)
    {
      fprintferr("Rho: time = %6ld ms,\t%3ld rounds, back to normal mode\n",
		 timer2(), c0-(c>>5));
      flusherr();
    }

    affii(x,y);
  } /* forever */

fin:
  /* An accumulated gcd was > 1 */
  /* if it isn't n, and looks prime, return it */
  if  (!egalii(g,n))
  {
    if (miller(g,17))
    {
      if (DEBUGLEVEL > 3)
      {
        rho_dbg(c0-(c>>5), 0);
	fprintferr("\tfound factor = %Z\n",g);
	flusherr();
      }
      avma=av; return icopy(g);
    }
    avma = avx; g1 = icopy(g);  /* known composite, keep it safe */
    avx = avma;
  }
  else g1 = n;			/* and work modulo g1 for backtracking */

  /* Here g1 is known composite */
  if (DEBUGLEVEL > 3 && size > 192)
  {
    fprintferr("Rho: hang on a second, we got something here...\n");
    flusherr();
  }
  for(;;) /* backtrack until period recovered. Must terminate */
  {
    avma = GGG; y = resii(sqri(y), g1);
    avma = avx; y = addsi(delta,y);
    g = mppgcd(subii(x1, y), g1);
    if (!is_pm1(g)) break;

    if (DEBUGLEVEL > 3 && (--c & 0x1f) == 0) rho_dbg(c0-(c>>5), msg_mask);
  }

  avma = av; /* safe */
  if (g1 == n || egalii(g,g1))
  {
    if (g1 == n && egalii(g,g1))
    { /* out of luck */
      if (DEBUGLEVEL > 3)
      {
        rho_dbg(c0-(c>>5), 0);
        fprintferr("\tPollard-Brent failed.\n"); flusherr();
      }
      if (++retries >= 4) return NULL;
      goto PB_RETRY;
    }
    /* half lucky: we've split n, but g1 equals either g or n */
    if (DEBUGLEVEL > 3)
    {
      rho_dbg(c0-(c>>5), 0);
      fprintferr("\tfound %sfactor = %Z\n",
                 (g1!=n ? "composite " : ""), g);
      flusherr();
    }
    res = cgetg(7, t_VEC);
    res[1] = licopy(g);         /* factor */
    res[2] = un;		/* exponent 1 */
    res[3] = (g1!=n? zero: (long)NULL); /* known composite when g1!=n */

    res[4] = ldivii(n,g);       /* cofactor */
    res[5] = un;		/* exponent 1 */
    res[6] = (long)NULL;	/* unknown */
    return res;
  }
  /* g < g1 < n : our lucky day -- we've split g1, too */
  res = cgetg(10, t_VEC);
  /* unknown status for all three factors */
  res[1] = licopy(g);    res[2] = un; res[3] = (long)NULL;
  res[4] = ldivii(g1,g); res[5] = un; res[6] = (long)NULL;
  res[7] = ldivii(n,g1); res[8] = un; res[9] = (long)NULL;
  if (DEBUGLEVEL > 3)
  {
    rho_dbg(c0-(c>>5), 0);
    fprintferr("\tfound factors = %Z, %Z,\n\tand %Z\n",
               res[1], res[4], res[7]);
    flusherr();
  }
  return res;
}

/***********************************************************************/
/**                                                                   **/
/**                      DETECTING ODD POWERS                         **/
/**  Factoring engines like MPQS which ultimately rely on computing   **/
/**  gcd(N, x^2-y^2) to find a nontrivial factor of N are fundamen-   **/
/**  tally incapable of splitting a proper power of an odd prime,     **/
/**  because of the cyclicity of the prime residue class group.  We   **/
/**  already have a square-detection function carrecomplet(), which   **/
/**  also returns the square root if appropriate.  Here's an analogue **/
/**  for cubes, fifth and 7th powers.  11th powers are a non-issue so **/
/**  long as mpqs() gives up beyond 100 decimal digits  (since ECM    **/
/**  easily find a 10-digit prime factor of a 100-digit number).      **/
/**  GN1998Jun28                                                      **/
/**                                                                   **/
/***********************************************************************/

/* Use a multistage sieve.  First stages work mod 211, 209, 61, 203;
   if the argument is larger than a word, we first reduce mod the product
   of these and then take the remainder apart.  Second stages use 117,
   31, 43, 71 in this order.  Moduli which are no longer interesting are
   skipped.  Everything is encoded in a single table of 106 24-bit masks.
   We only need the first half of the residues.  Three bits per modulus
   indicate which residues are 7th (bit 2), 5th (bit 1) powers or cubes
   (bit 0);  the eight moduli above are assigned right-to-left.  The table
   will err on the side of safety if one of the moduli divides the number
   to be tested, but as this leads to inefficiency it should still be
   avoided. */

static ulong powersmod[106] = {
  077777777ul,	/* 0 */
  077777777ul,	/* 1 */
  013562440ul,	/* 2 */
  012462540ul,	/* 3 */
  013562440ul,	/* 4 */
  052662441ul,	/* 5 */
  016663440ul,	/* 6 */
  016463450ul,	/* 7 */
  013573551ul,	/* 8 */
  012462540ul,	/* 9 */
  012462464ul,	/* 10 */
  013462771ul,	/* 11 */
  012466473ul,	/* 12 */
  012463641ul,	/* 13 */
  052463646ul,	/* 14 */
  012563446ul,	/* 15 */
  013762440ul,	/* 16 */
  052766440ul,	/* 17 */
  012772451ul,	/* 18 */
  012762454ul,	/* 19 */
  032763550ul,	/* 20 */
  013763664ul,	/* 21 */
  017763460ul,	/* 22 */
  037762565ul,	/* 23 */
  017762540ul,	/* 24 */
  057762441ul,	/* 25 */
  037772452ul,	/* 26 */
  017773551ul,	/* 27 */
  017767541ul,	/* 28 */
  017767640ul,	/* 29 */
  037766450ul,	/* 30 */
  017762752ul,	/* 31 */
  037762762ul,	/* 32 */
  017762742ul,	/* 33 */
  037763762ul,	/* 34 */
  017763740ul,	/* 35 */
  077763740ul,	/* 36 */
  077762750ul,	/* 37 */
  077762752ul,	/* 38 */
  077762750ul,	/* 39 */
  077762743ul,	/* 40 */
  077767740ul,	/* 41 */
  077763741ul,	/* 42 */
  077763762ul,	/* 43 */
  077772760ul,	/* 44 */
  077762770ul,	/* 45 */
  077766750ul,	/* 46 */
  077762740ul,	/* 47 */
  077763740ul,	/* 48 */
  077763750ul,	/* 49 */
  077763752ul,	/* 50 */
  077762740ul,	/* 51 */
  077762740ul,	/* 52 */
  077772740ul,	/* 53 */
  077762762ul,	/* 54 */
  077763765ul,	/* 55 */
  077763770ul,	/* 56 */
  077767750ul,	/* 57 */
  077766753ul,	/* 58 */
  077776740ul,	/* 59 */
  077772741ul,	/* 60 */
  077772744ul,	/* 61 */
  077773740ul,	/* 62 */
  077773743ul,	/* 63 */
  077773751ul,	/* 64 */
  077772771ul,	/* 65 */
  077772760ul,	/* 66 */
  077772763ul,	/* 67 */
  077772751ul,	/* 68 */
  077773750ul,	/* 69 */
  077777740ul,	/* 70 */
  077773745ul,	/* 71 */
  077772740ul,	/* 72 */
  077772742ul,	/* 73 */
  077772744ul,	/* 74 */
  077776750ul,	/* 75 */
  077773771ul,	/* 76 */
  077773774ul,	/* 77 */
  077773760ul,	/* 78 */
  077772741ul,	/* 79 */
  077772740ul,	/* 80 */
  077772740ul,	/* 81 */
  077772741ul,	/* 82 */
  077773754ul,	/* 83 */
  077773750ul,	/* 84 */
  077773740ul,	/* 85 */
  077776741ul,	/* 86 */
  077776771ul,	/* 87 */
  077776773ul,	/* 88 */
  077772761ul,	/* 89 */
  077773741ul,	/* 90 */
  077773740ul,	/* 91 */
  077773740ul,	/* 92 */
  077772740ul,	/* 93 */
  077772752ul,	/* 94 */
  077772750ul,	/* 95 */
  077772751ul,	/* 96 */
  077773741ul,	/* 97 */
  077773761ul,	/* 98 */
  077777760ul,	/* 99 */
  077772765ul,	/* 100 */
  077772742ul,	/* 101 */
  077777751ul,	/* 102 */
  077777750ul,	/* 103 */
  077777745ul,	/* 104 */
  077777770ul	/* 105 */
};

/* Returns 3, 5, or 7 if x is a cube (but not a 5th or 7th power),  a 5th
   power (but not a 7th),  or a 7th power, and in this case creates the
   base on the stack and assigns its address to *pt.  Otherwise returns 0.
   x must be of type t_INT and nonzero;  this is not checked.  The *mask
   argument tells us which things to check -- bit 0: 3rd, bit 1: 5th,
   bit 2: 7th pwr;  set a bit to have the corresponding power examined --
   and is updated appropriately for a possible follow-up call */

long				/* no longer static -- used in mpqs.c */
is_odd_power(GEN x, GEN *pt, long *mask)
{
  long av=avma, tetpil, lgx=lgefint(x), exponent=0, residue, resbyte;
  GEN y;

  *mask &= 7;			/* paranoia */
  if (!*mask) return 0;		/* useful when running in a loop */
  if (signe(x) < 0) x=absi(x);

  if (DEBUGLEVEL >= 5)
  {
    fprintferr("OddPwrs: is %Z\n\t...a", x);
    if (*mask&1) fprintferr(" 3rd%s",
			    (*mask==7?",":(*mask!=1?" or":"")));
    if (*mask&2) fprintferr(" 5th%s",
			    (*mask==7?", or":(*mask&4?" or":"")));
    if (*mask&4) fprintferr(" 7th");
    fprintferr(" power?\n");
  }
  if (lgx > 3) residue = smodis(x, 211*209*61*203);
  else residue = x[2];

  resbyte=residue%211; if (resbyte > 105) resbyte = 211 - resbyte;
  *mask &= powersmod[resbyte];
  if (DEBUGLEVEL >= 5)
  {
    fprintferr("\tmodulo: resid. (remaining possibilities)\n");
    fprintferr("\t   211:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
	       resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
  }
  if (!*mask) { avma=av; return 0; }

  if (*mask & 3)
  {
    resbyte=residue%209; if (resbyte > 104) resbyte = 209 - resbyte;
    *mask &= (powersmod[resbyte] >> 3);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t   209:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }
  if (*mask & 3)
  {
    resbyte=residue%61; if (resbyte > 30) resbyte = 61 - resbyte;
    *mask &= (powersmod[resbyte] >> 6);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t    61:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }
  if (*mask & 5)
  {
    resbyte=residue%203; if (resbyte > 101) resbyte = 203 - resbyte;
    *mask &= (powersmod[resbyte] >> 9);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t   203:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }

  if (lgx > 3) residue = smodis(x, 117*31*43*71);
  else residue = x[2];

  if (*mask & 1)
  {
    resbyte=residue%117; if (resbyte > 58) resbyte = 117 - resbyte;
    *mask &= (powersmod[resbyte] >> 12);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t   117:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }
  if (*mask & 3)
  {
    resbyte=residue%31; if (resbyte > 15) resbyte = 31 - resbyte;
    *mask &= (powersmod[resbyte] >> 15);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t    31:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }
  if (*mask & 5)
  {
    resbyte=residue%43; if (resbyte > 21) resbyte = 43 - resbyte;
    *mask &= (powersmod[resbyte] >> 18);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t    43:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }
  if (*mask & 6)
  {
    resbyte=residue%71; if (resbyte > 35) resbyte = 71 - resbyte;
    *mask &= (powersmod[resbyte] >> 21);
    if (DEBUGLEVEL >= 5)
      fprintferr("\t    71:  %3ld   (3rd %ld, 5th %ld, 7th %ld)\n",
		 resbyte, *mask&1, (*mask>>1)&1, (*mask>>2)&1);
    if (!*mask) { avma=av; return 0; }
  }

  /* priority to higher powers -- if we have a 21st, it'll be easier to
     rediscover that its 7th root is a cube than that its cube root is
     a 7th power */
  if ((resbyte = *mask & 4))	/* assignment */
    exponent = 7;
  else if ((resbyte = *mask & 2))
    exponent = 5;
  else
    { resbyte = 1; exponent = 3; }
  /* leave that mask bit on for the moment, we might need it for a
     subsequent call */

  /* precision in the following is one extra significant word (overkill) */
  y=ground(gpow(x, ginv(stoi(exponent)), lgx));
  if (!egalii(gpowgs(y, exponent), x))
  {
    if (DEBUGLEVEL >= 5)
    {
      if (exponent == 3)
	fprintferr("\tBut it nevertheless wasn't a cube.\n");
      else
	fprintferr("\tBut it nevertheless wasn't a %ldth power.\n",
		   exponent);
    }
    *mask &= ~resbyte;		/* _now_ turn the bit off */
    avma=av; return 0;
  }
  /* caller (ifac_crack() below) will report the final result if it was
     a pure power, so no further diagnostics here */

  tetpil=avma;
  if (!pt) { avma=av; return exponent; } /* this branch not used */
  *pt=gerepile(av,tetpil,icopy(y));
  return exponent;
}

/***********************************************************************/
/**                                                                   **/
/**                FACTORIZATION  (master iteration)                  **/
/**      Driver for the various methods of finding large factors      **/
/**      (after trial division has cast out the very small ones).     **/
/**                        GN1998Jun24--30                            **/
/**                                                                   **/
/***********************************************************************/

/**  Direct use:
 **  ifac_start()  registers a number  (without prime factors < 100)
 **    with the iterative factorizer, and also registers whether or
 **    not we should terminate early if we find that the number is
 **    not squarefree, and a hint about which method(s) to use.  This
 **    must always be called first.  The input _must_ have been checked
 **    to be composite by the caller.  The routine immediately tries
 **    to decompose it nontrivially into a product of two factors,
 **    except in squarefreeness (`Moebius') mode.
 **  ifac_primary_factor()  returns a prime divisor  (not necessarily
 **    the smallest)  and the corresponding exponent. */

/**  Encapsulated user interface:
 **  ifac_decomp()  does the right thing for auxdecomp()  (put a succession
 **    of prime divisor / exponent pairs onto the stack, not necessarily
 **    sorted, although in practice they will tend not to be too far from
 **    the correct order).
 **
 **  For each of the additive/multiplicative arithmetic functions, there is
 **  a `contributor' below, to be called on any large composite cofactor
 **  left over after trial division by small primes, whose result can then
 **  be added to or multiplied with whatever we already have:
 **  ifac_moebius()  ifac_issquarefree()  ifac_totient()  ifac_omega()
 **  ifac_bigomega()  ifac_numdiv()  ifac_sumdiv()  ifac_sumdivk() */

/* We never test whether the input number is prime or composite, since
   presumably it will have come out of the small factors finder stage
   (which doesn't really exist yet but which will test the left-over
   cofactor for primality once it does). */

/* The data structure in which we preserve whatever we know at any given
   time about our number N is kept on the PARI stack, and updated as needed.
   This makes the machinery re-entrant  (you can have more than one fac-
   torization using ifac_start()/ifac_primary_factor() in progress simul-
   taneously so long as you preserve the GEN across garbage collections),
   and which avoids memory leaks when a lengthy factorization is interrupted.
   We also make an effort to keep the whole affair connected, and the parent
   object will always be older than its children.  This may in rare cases
   lead to some extra copying around, and knowing what is garbage at any
   given time is not entirely trivial.  See below for examples how to do
   it right.  (Connectedness can be destroyed if callers of ifac_main()
   create other stuff on the stack in between calls.  This is harmless
   as long as ifac_realloc() is used to re-create a connected object at
   the head of the stack just before collecting garbage.) */

/* Note that a PARI integer can have hundreds of millions of distinct prime
   factors larger than 2^16, given enough memory.  And since there's no
   guarantee that we will find factors in order of increasing size, we must
   be prepared to drag a very large amount of data around  (although this
   will _very_ rarely happen for random input!).  So we start with a small
   structure and extend it when necessary. */

/* The idea of data structure and algorithm is:
   Let N0 be whatever is currently left of N after dividing off all the
   prime powers we have already returned to the caller.  Then we maintain
   N0 as a product
   (1)   N0 = \prod_i P_i^{e_i} * \prod_j Q_j^{f_j} * \prod_k C_k^{g_k}
   where the P_i and Q_j are distinct primes, each C_k is known composite,
   none of the P_i divides any C_k, and we also know the total ordering
   of all the P_i, Q_j and C_k  (in particular, we will never try to divide
   a C_k by a larger Q_j).  Some of the C_k may have common factors, although
   this will not often be the case. */

/* Caveat implementor:  Taking gcds among C_k's is very likely to cost at
   least as much time as dividing off any primes as we find them, and book-
   keeping would be a nightmare  (since D=gcd(C_1,C_2) can still have common
   factors with both C_1/D and C_2/D, and so on...). */

/* At startup, we just initialize the structure to
   (2)        N = C_1^1   (composite). */

/* Whenever ifac_primary_factor() or ifac_decomp()  (or, mutatis mutandis,
   one of the three arithmetic user interface routines)  needs a primary
   factor, and the smallest thing in our list is P_1, we return that and
   its exponent, and remove it from our list.
   (When nothing is left, we return a sentinel value -- gun.  And in Moebius
   mode, when we see something with exponent > 1, whether prime or composite,
   we yell at our caller by returning gzero or 0, depending on the function).
   In all other cases, ifac_main() iterates the following steps until we have
   a P_1 in the smallest position. */

/* When the smallest item is C_1  (as it is initially):
   (3.1) Crack C_1 into a nontrivial product  U_1 * U_2  by whatever method
   comes to mind for this size.  (U for `unknown'.)  Cracking will detect
   squares  (and biquadrates etc),  and it may detect odd powers, so we
   might instead see a power of some U_1 here, or even something of the form
   U_1^k*U_2^k.  (Of course the exponent already attached to C_1 is taken
   into account in the following.)
   (3.2) If we have U_1*U_2, sort the two factors;  convert to U_1^2 if they
   happen to be equal  (which they shouldn't -- squares should have been
   caught at the preceding stage).  Note that U_1 and  (if it exists)  U_2
   are automatically smaller than anything else in our list.
   (3.3) Check U_1  (and U_2)  for primality, and flag them accordingly.
   (3.4) Iterate. */

/* When the smallest item is Q_1:
   This is the potentially unpleasant case.  The idea is to go through the
   entire list and try to divide Q_1 off each of the current C_k's, which
   will usually fail, but may succeed several times.  When a division was
   successful, the corresponding C_k is removed from our list, and the co-
   factor becomes a U_l for the moment unless it is 1  (which happens when
   C_k was a power of Q_1).  When we're through we upgrade Q_1 to P_1 status,
   and then do a primality check on each U_l and sort it back into the list
   either as a Q_j or as a C_k.  If during the insertion sort we discover
   that some U_l equals some P_i or Q_j or C_k we already have, we just add
   U_l's exponent to that of its twin.  (The sorting should therefore happen
   before the primality test).
   Note that this may produce one or more elements smaller than the P_1
   we just confirmed, so we may have to repeat the iteration. */

/* There's a little trick that avoids some Q_1 instances.  Just after we do
   a sweep to classify all current unknowns as either composites or primes,
   we do another downward sweep beginning with the largest current factor
   and stopping just above the largest current composite.  Every Q_j we
   pass is turned into a P_i.  (Different primes are automatically coprime
   among each other, and primes tend not to divide smaller composites.) */

/* (We have no use for comparing the square of a prime to N0.  Normally
   we will get called after casting out only the smallest primes, and
   since we cannot guarantee that we see the large prime factors in as-
   cending order, we cannot stop when we find one larger than sqrt(N0).) */

/* Data structure:  We keep everything in a single t_VEC of t_INTs.  The
   first component records whether we're doing full (NULL) or Moebius (un)
   factorization;  in the latter case many subroutines return a sentinel
   value as soon as they spot an exponent > 1.  The second component records
   the hint from factorint()'s optional flag, for use by ifac_crack().
   The remaining components  (initially 15)  are used in groups of three:
   a GEN pointer at the t_INT value of the factor, a pointer at the t_INT
   exponent  (usually gun or gdeux so we don't clutter up the stack too
   much),  and another t_INT GEN pointer to record the class of the factor:
   NULL for unknown, zero for known composite C_k, un for known prime Q_j
   awaiting trial division, and deux for finished prime P_i. */

/* When during the division stage we re-sort a C_k-turned-U_l to a lower
   position, we rotate any intervening material upward towards its old
   slot.  When a C_k was divided down to 1, its slot is left empty at
   first;  similarly when the re-sorting detects a repeated factor.
   After the sorting phase, we de-fragment the list and squeeze all the
   occupied slots together to the high end, so that ifac_crack() has room
   for new factors.  When this doesn't suffice, we abandon the current
   vector and allocate a somewhat larger one, defragmenting again during
   copying. */

/* (For internal use, note that all exponents will fit into C longs, given
   PARI's lgefint field size.  When we work with them, we sometimes read
   out the GEN pointer, and sometimes do an itos, whatever is more con-
   venient for the task at hand.) */


/*** Overview and forward declarations: ***/

/* The `*where' argument in the following points into *partial at the
   first of the three fields of the first occupied slot.  It's there
   because the caller would already know where `here' is, so we don't
   want to search for it again, although it wouldn't take much time.
   On the other hand, we do not preserve this from one user-interface
   call to the next. */

static GEN
ifac_find(GEN *partial, GEN *where);
/* Return GEN pointing at the first nonempty slot strictly behind the
   current *where, or NULL if such doesn't exist.  Can be used to skip
   a range of vacant slots, or to initialize *where in the first place
   (pass partial in both args).  Does not modify its argument pointers. */

void
ifac_realloc(GEN *partial, GEN *where, long new_lg);
/* Move to a larger main vector, updating *where if it points into it.
   Certainly updates *partial.  Can be used as a specialized gcopy before
   a gerepileupto()/gerepilemanysp()  (pass 0 as the new length).
   Normally, one would pass new_lg=1 to let this function guess the
   new size.  To be used sparingly. */

static long
ifac_crack(GEN *partial, GEN *where);
/* Split the first (composite) entry.  There _must_ already be room for
   another factor below *where, and *where will be updated.  Factor and
   cofactor will be inserted in the correct order, updating *where, or
   factor^k will be inserted if such should be the case  (leaving *where
   unchanged).  The factor or factors will be set to unknown, and inherit
   the exponent  (or a multiple thereof)  of its/their ancestor.  Returns
   number of factors written into the structure  (normally 2, but 1 if a
   factor equalled its cofactor, and may be more than 1 if a factoring
   engine returned a vector of factors instead of a single factor).  Can
   reallocate the data structure in the vector-of-factors case  (but not
   in the more common single-factor case) */

static long
ifac_insert_multiplet(GEN *partial, GEN *where, GEN facvec);
/* Gets called to complete ifac_crack()'s job when a factoring engine
   splits the current factor into a product of three or more new factors.
   Makes room for them if necessary, sorts them, gives them the right
   exponents and class etc.  Also returns the number of factors actually
   written, which may be less than the number of components in facvec
   if there are duplicates.--- Vectors of factors  (cf pollardbrent()
   above)  actually contain `slots' of three GENs per factor with the
   three fields being interpreted exactly as in our partial factorization
   data structure.  Thus `engines' can tell us what they already happen to
   know about factors being prime or composite and/or appearing to a power
   larger than the first */

static long
ifac_divide(GEN *partial, GEN *where);
/* Divide all current composites by first  (prime, class Q)  entry, updating
   its exponent, and turning it into a finished prime  (class P).  Return 1
   if any such divisions succeeded  (in Moebius mode, the update may then
   not have been completed),  or 0 if none of them succeeded.  Doesn't
   modify *where. */

static long
ifac_sort_one(GEN *partial, GEN *where, GEN washere);
/* re-sort one  (typically unknown)  entry from washere to a new position,
   rotating intervening entries upward to fill the vacant space.  It may
   happen (rarely) that the new position is the same as the old one, or
   that the new value of the entry coincides with a value already occupying
   a lower slot, in which latter case we just add exponents  (and use the
   `more known' class, and return 1 immediately when in Moebius mode).
   The slots between *where and washere must be in sorted order, so a
   sweep using this to re-sort several unknowns must proceed upward  (see
   ifac_resort() below).  Return 1 if we see an exponent > 1  (in Moebius
   mode without completing the update),  0 otherwise. */

static long
ifac_resort(GEN *partial, GEN *where);
/* sort all current unknowns downward to where they belong.  Sweeps
   in the upward direction.  Not needed after ifac_crack(), only when
   ifac_divide() returned true.  May update *where.  Returns 1 when an
   ifac_sort_one() call does so to indicate a repeated factor, or 0 if
   any and all such calls returned 0 */

static void
ifac_defrag(GEN *partial, GEN *where);
/* defragment: collect and squeeze out any unoccupied slots above *where
   during a downward sweep.  Unoccupied slots arise when a composite factor
   dissolves completely whilst dividing off a prime, or when ifac_resort()
   spots a coincidence and merges two factors.  *where will be updated */

static void
ifac_whoiswho(GEN *partial, GEN *where, long after_crack);
/* determine primality or compositeness of all current unknowns, and set
   class Q primes to finished (class P) if everything larger is already
   known to be prime.  When after_crack is nonnegative, only look at the
   first after_crack things in the list (do nothing when it's zero) */

static GEN
ifac_main(GEN *partial);
/* main loop:  iterate until smallest entry is a finished prime;  returns
   a `where' pointer, or gun if nothing left, or gzero in Moebius mode if
   we aren't squarefree */

/* NB In the most common cases, control flows from the user interface to
   ifac_main() and then to a succession of ifac_crack()s and ifac_divide()s,
   with (typically) none of the latter finding anything. */

/** user interface: **/
/* return initial data structure, see ifac_crack() below for semantics
   of the hint argument */
GEN
ifac_start(GEN n, long moebius, long hint);

/* run main loop until primary factor is found, return the prime and
   assign the exponent.  If nothing left, return gun and set exponent
   to 0;  if in Moebius mode and a square factor is discovered, return
   gzero and set exponent to 0 */
GEN
ifac_primary_factor(GEN *partial, long *exponent);

/* call ifac_start() and run main loop until factorization is complete,
   accumulating prime / exponent pairs on the PARI stack to be picked up
   by aux_end().  Return number of distinct primes found */
long
ifac_decomp(GEN n, long hint);

/* completely encapsulated functions;  these call ifac_start() themselves,
   and ensure proper stack housekeeping etc.  Call them on any large
   composite left over after trial division, and multiply/add the result
   onto whatever you already have from the small factors.  Don't call
   them on large primes;  they will run into trouble */
long
ifac_moebius(GEN n, long hint);

long
ifac_issquarefree(GEN n, long hint);

long
ifac_omega(GEN n, long hint);

long
ifac_bigomega(GEN n, long hint);

GEN
ifac_totient(GEN n, long hint);	/* for gp's eulerphi() */

GEN
ifac_numdiv(GEN n, long hint);

GEN
ifac_sumdiv(GEN n, long hint);

GEN
ifac_sumdivk(GEN n, long k, long hint);

/*** implementation ***/

#define ifac_initial_length 24	/* codeword, moebius flag, hint, 7 slots */
/* (more than enough in most cases -- a 512-bit product of distinct 8-bit
   primes needs at most 7 slots at a time) */

GEN
ifac_start(GEN n, long moebius, long hint)
{
  GEN part, here;

  if (typ(n) != t_INT) err(typeer, "ifac_start");
  if (signe(n) == 0)
    err(talker, "factoring 0 in ifac_start");

  part = cgetg(ifac_initial_length, t_VEC);
  here = part + ifac_initial_length;
  part[1] = moebius? un : (long)NULL;
  switch(hint)
  {
  case 0:
    part[2] = zero; break;
  case 1:
    part[2] = un; break;
  case 2:
    part[2] = deux; break;
  default:
    part[2] = (long)stoi(hint);
  }
  if (isonstack(n))
    n = absi(n);
  /* make copy, because we'll later want to mpdivis() into it in place.
     If it's not on stack, then we assume it is a clone made for us by
     auxdecomp0(), and we assume the sign has already been set positive */
  /* fill first slot at the top end */
  *--here = zero;		/* initially composite */
  *--here = un;			/* initial exponent 1 */
  *--here = (long) n;
  /* and NULL out the remaining slots */
  while (here > part + 3) *--here = (long)NULL;
  return part;
}

static GEN
ifac_find(GEN *partial, GEN *where)
{
  long lgp = lg(*partial);
  GEN end = *partial + lgp;
  GEN scan = *where + 3;

  if (DEBUGLEVEL >= 5)
  {
    if (!*partial || typ(*partial) != t_VEC)
      err(typeer, "ifac_find");
    if (lg(*partial) < ifac_initial_length)
      err(talker, "partial impossibly short in ifac_find");
    if (!(*where) ||
	*where > *partial + lgp - 3 ||
        *where < *partial)	/* sic */
      err(talker, "`*where\' out of bounds in ifac_find");
  }
  while (scan < end && !*scan) scan += 3;
  /* paranoia -- check completely NULLed ? nope -- we never inspect the
     exponent field for deciding whether a slot is empty or occupied */
  if (scan < end)
  {
    if (DEBUGLEVEL >= 5)
    {
      if (!scan[1])
	err(talker, "factor has NULL exponent in ifac_find");
    }
    return scan;
  }
  return NULL;
}

/* simple defragmenter */
static void
ifac_defrag(GEN *partial, GEN *where)
{
  long lgp = lg(*partial);
  GEN scan_new = *partial + lgp - 3, scan_old = scan_new;

  while (scan_old >= *where)
  {
    if (*scan_old)		/* slot occupied? */
    {
      if (scan_old < scan_new)
      {
	scan_new[2] = scan_old[2];
	scan_new[1] = scan_old[1];
	*scan_new = *scan_old;
      }
      scan_new -= 3;		/* point at next slot to be written */
    }
    scan_old -= 3;
  }
  scan_new += 3;		/* back up to last slot written */
  *where = scan_new;
  while (scan_new > *partial + 3)
    *--scan_new = (long)NULL;	/* erase junk */
}

/* and complex version combined with reallocation.  If new_lg is 0, we
   use the old length, so this acts just like gcopy except that the where
   pointer is carried along;  if it is 1, we make an educated guess.
   Exception:  If new_lg is 0, the vector is full to the brim, and the
   first entry is composite, we make it longer to avoid being called again
   a microsecond later  (at significant cost).
   It is safe to call this with NULL for the where argument;  if it doesn't
   point anywhere within the old structure, it will be left alone */
void
ifac_realloc(GEN *partial, GEN *where, long new_lg)
{
  long old_lg = lg(*partial);
  GEN newpart, scan_new, scan_old;

  if (DEBUGLEVEL >= 5)		/* none of these should ever happen */
  {
    if (!*partial || typ(*partial) != t_VEC)
      err(typeer, "ifac_realloc");
    if (lg(*partial) < ifac_initial_length)
      err(talker, "partial impossibly short in ifac_realloc");
  }

  if (new_lg == 1)
    new_lg = 2*old_lg - 6;	/* from 7 slots to 13 to 25... */
  else if (new_lg <= old_lg)	/* includes case new_lg == 0 */
  {
    new_lg = old_lg;
    if ((*partial)[3] &&	/* structure full */
	((*partial)[5]==zero || (*partial)[5]==(long)NULL))
				/* and first entry composite or unknown */
      new_lg += 6;		/* give it a little more breathing space */
  }
  newpart = cgetg(new_lg, t_VEC);
  if (DEBUGMEM >= 3)
  {
    fprintferr("IFAC: new partial factorization structure (%ld slots)\n",
	       (new_lg - 3)/3);
    flusherr();
  }
  newpart[1] = (*partial)[1];	/* moebius */
  newpart[2] = (*partial)[2];	/* hint */
  /* downward sweep through the old *partial, picking up where1 and carry-
     ing it over if and when we pass it.  (This will only be useful if
     it pointed at a non-empty slot.)  Factors are licopy()d so that we
     again have a nice object  (parent older than children, connected),
     except the one factor that may still be living in a clone where n
     originally was;  exponents are similarly copied if they aren't global
     constants;  class-of-factor fields are always global constants so we
     need only copy them as pointers.  Caller may then do a gerepileupto()
     or a gerepilemanysp() */
  scan_new = newpart + new_lg - 3;
  scan_old = *partial + old_lg - 3;
  for (; scan_old > *partial + 2; scan_old -= 3)
  {
    if (*where == scan_old) *where = scan_new;
    if (!*scan_old) continue;	/* skip empty slots */

    *scan_new =
      isonstack((GEN)(*scan_old)) ?
	licopy((GEN)(*scan_old)) : *scan_old;
    scan_new[1] =
      isonstack((GEN)(scan_old[1])) ?
	licopy((GEN)(scan_old[1])) : scan_old[1];
    scan_new[2] = scan_old[2];
    scan_new -= 3;
  }
  scan_new += 3;		/* back up to last slot written */
  while (scan_new > newpart + 3)
    *--scan_new = (long)NULL;
  *partial = newpart;
}

#define moebius_mode ((*partial)[1])

/* Bubble-sort-of-thing sort.  Won't be exercised frequently,
   so this is ok. */
static long
ifac_sort_one(GEN *partial, GEN *where, GEN washere)
{
  GEN scan = washere - 3;
  GEN value, exponent, class0, class1;
  long cmp_res;

  if (DEBUGLEVEL >= 5)		/* none of these should ever happen */
  {
    long lgp;
    if (!*partial || typ(*partial) != t_VEC)
      err(typeer, "ifac_sort_one");
    if ((lgp = lg(*partial)) < ifac_initial_length)
      err(talker, "partial impossibly short in ifac_sort_one");
    if (!(*where) ||
	*where < *partial + 3 ||
	*where > *partial + lgp - 3)
      err(talker, "`*where\' out of bounds in ifac_sort_one");
    if (!washere ||
	washere < *where ||
	washere > *partial + lgp - 3)
      err(talker, "`washere\' out of bounds in ifac_sort_one");
  }
  value = (GEN)(*washere);
  exponent = (GEN)(washere[1]);
  if (exponent != gun && moebius_mode && cmpsi(1,exponent) < 0)
    return 1;			/* should have been detected by caller */
  class0 = (GEN)(washere[2]);

  if (scan < *where) return 0;	/* nothing to do, washere==*where */

  cmp_res = -1;			/* sentinel */
  while (scan >= *where)	/* therefore at least once */
  {
    if (*scan)			/* current slot nonempty */
    {
      /* check against where */
      cmp_res = cmpii(value, (GEN)(*scan));
      if (cmp_res >= 0) break;	/* have found where to stop */
    }
    /* copy current slot upward by one position and move pointers down */
    scan[5] = scan[2];
    scan[4] = scan[1];
    scan[3] = *scan;
    scan -= 3;
  }
  scan += 3;
  /* at this point there are the following possibilities:
     (*) cmp_res == -1.  Either value is less than that at *where, or for
     some reason *where was pointing at one or more vacant slots and any
     factors we saw en route were larger than value.  At any rate,
     scan == *where now, and scan is pointing at an empty slot, into
     which we'll stash our entry.
     (*) cmp_res == 0.  The entry at scan-3 is the one, we compare class0
     fields and add exponents, and put it all into the vacated scan slot,
     NULLing the one at scan-3  (and possibly updating *where).
     (*) cmp_res == 1.  The slot at scan is the one to store our entry
     into. */
  if (cmp_res != 0)
  {
    if (cmp_res < 0 && scan != *where)
      err(talker, "misaligned partial detected in ifac_sort_one");
    *scan = (long)value;
    scan[1] = (long)exponent;
    scan[2] = (long)class0;
    return 0;
  }
  /* case cmp_res == 0: repeated factor detected */
  if (DEBUGLEVEL >= 4)
  {
    fprintferr("IFAC: repeated factor %Z\n\tdetected in ifac_sort_one\n",
	       value);
    flusherr();
  }
  if (moebius_mode) return 1;	/* not squarefree */
  /* if old class0 was composite and new is prime, or vice versa,
     complain  (and if one class0 was unknown and the other wasn't,
     use the known one) */
  class1 = (GEN)(scan[-1]);
  if (class0)			/* should never be used */
  {
    if(class1)
    {
      if (class0 == gzero && class1 != gzero)
	err(talker, "composite equals prime in ifac_sort_one");
      else if (class0 != gzero && class1 == gzero)
	err(talker, "prime equals composite in ifac_sort_one");
      else if (class0 == gdeux)	/* should happen even less */
	scan[2] = (long)class0;	/* use it */
    }
    else			/* shouldn't happen either */
      scan[2] = (long)class0;	/* use it */
  }
  /* else stay with the existing known class0 */
  scan[2] = (long)class1;
  /* in any case, add exponents */
  if (scan[-2] == un && exponent == gun)
    scan[1] = deux;
  else
    scan[1] = laddii((GEN)(scan[-2]), exponent);
  /* move the value over */
  *scan = scan[-3];
  /* null out the vacated slot below */
  *--scan = (long)NULL;
  *--scan = (long)NULL;
  *--scan = (long)NULL;
  /* finally, see whether *where should be pulled in */
  if (scan == *where) *where += 3;
  return 0;
}

/* the following loop around the former doesn't need to check moebius_mode
   because ifac_sort_one() never returns 1 in normal mode */
static long
ifac_resort(GEN *partial, GEN *where)
{
  long lgp = lg(*partial), res = 0;
  GEN scan = *where;

  for (; scan < *partial + lgp; scan += 3)
  {
    if (*scan &&		/* slot occupied */
	!scan[2])		/* with an unknown */
    {
      res |= ifac_sort_one(partial, where, scan);
      if (res) return res;	/* early exit */
    }
  }
  return res;
}

/* sweep downward so we can with luck turn some Qs into Ps */
static void
ifac_whoiswho(GEN *partial, GEN *where, long after_crack)
{
  long lgp = lg(*partial), larger_compos = 0;
  GEN scan, scan_end = *partial + lgp - 3;

  if (DEBUGLEVEL >= 5)
  {
    if (!*partial || typ(*partial) != t_VEC)
      err(typeer, "ifac_whoiswho");
    if (lg(*partial) < ifac_initial_length)
      err(talker, "partial impossibly short in ifac_whoiswho");
    if (!(*where) ||
	*where > scan_end ||
        *where < *partial + 3)
      err(talker, "`*where\' out of bounds in ifac_whoiswho");
  }

  if (after_crack == 0) return;
  if (after_crack > 0)
  {
    larger_compos = 1;		/* disable Q-to-P trick */
    scan = *where + 3*(after_crack - 1);
				/* check at most after_crack entries */
    if (scan > scan_end)	/* ooops... */
    {
      err(warner, "avoiding nonexistent factors in ifac_whoiswho");
      scan = scan_end;
    }
  }
  else { larger_compos = 0; scan = scan_end; }

  for (; scan >= *where; scan -= 3)
  {
    if (scan[2])		/* known class of factor */
    {
      if (scan[2] == zero) larger_compos = 1;
      else if (!larger_compos && scan[2] == un)
      {
	if (DEBUGLEVEL >= 3)
	{
	  fprintferr("IFAC: factor %Z\n\tis prime (no larger composite)\n",
		     **where);
	  fprintferr("IFAC: prime %Z\n\tappears with exponent = %ld\n",
		     **where, itos((GEN)(*where)[1]));
	}
	scan[2] = deux;
      }	/* no else case */
      continue;
    }
    scan[2] =
      (isprime((GEN)(*scan)) ?
       (larger_compos ? un : deux) : /* un- or finished prime */
       zero);			/* composite */

    if (scan[2] == zero) larger_compos = 1;
    if (DEBUGLEVEL >= 3)
    {
      fprintferr("IFAC: factor %Z\n\tis %s\n", *scan,
		 (scan[2] == zero ? "composite" : "prime"));
    }
  }
}

/* Here we normally do not check that the first entry is a not-finished
   prime.  Stack management: we may allocate a new exponent */
static long
ifac_divide(GEN *partial, GEN *where)
{
  long lgp = lg(*partial);
  GEN scan = *where + 3;
  long res = 0, exponent, newexp, otherexp;

  if (DEBUGLEVEL >= 5)		/* none of these should ever happen */
  {
    if (!*partial || typ(*partial) != t_VEC)
      err(typeer, "ifac_divide");
    if (lg(*partial) < ifac_initial_length)
      err(talker, "partial impossibly short in ifac_divide");
    if (!(*where) ||
	*where > *partial + lgp - 3 ||
        *where < *partial + 3)
      err(talker, "`*where\' out of bounds in ifac_divide");
    if ((*where)[2] != un)
      err(talker, "division by composite or finished prime in ifac_divide");
  }
  if (!(**where))		/* always test just this one */
    err(talker, "division by nothing in ifac_divide");

  newexp = exponent = itos((GEN)((*where)[1]));
  if (exponent > 1 && moebius_mode) return 1;
  /* should've been caught by caller already */

  /* go for it */
  for (; scan < *partial + lgp; scan += 3)
  {
    if (scan[2] != zero) continue; /* the other thing ain't composite */
    otherexp = 0;
    /* let mpdivis divide the other factor in place to keep stack clutter
       minimal */
    while (mpdivis((GEN)(*scan), (GEN)(**where), (GEN)(*scan)))
    {
      if (moebius_mode) return 1; /* immediately */
      if (!otherexp) otherexp = itos((GEN)(scan[1]));
      newexp += otherexp;
    }
    if (newexp > exponent)	/* did anything happen? */
    {
      (*where)[1] = (newexp == 2 ? deux : (long)(stoi(newexp)));
      exponent = newexp;
      if (is_pm1((GEN)(*scan))) /* factor dissolved completely */
      {
	*scan = scan[1] = (long)NULL;
	if (DEBUGLEVEL >= 4)
	  fprintferr("IFAC: a factor was a power of another prime factor\n");
      }
      else if (DEBUGLEVEL >= 4)
      {
	fprintferr("IFAC: a factor was divisible by another prime factor,\n");
	fprintferr("\tleaving a cofactor = %Z\n", *scan);
      }
      scan[2] = (long)NULL;	/* at any rate it's Unknown now */
      res = 1;
      if (DEBUGLEVEL >= 5)
      {
	fprintferr("IFAC: prime %Z\n\tappears at least to the power %ld\n",
		   **where, newexp);
      }
    }
  } /* for */
  (*where)[2] = deux;		/* make it a finished prime */
  if (DEBUGLEVEL >= 3)
  {
    fprintferr("IFAC: prime %Z\n\tappears with exponent = %ld\n",
	       **where, newexp);
  }
  return res;
}


GEN mpqs(GEN N);		/* in src/modules/mpqs.c, maybe a dummy,
				   returns a factor, or a vector of factors,
				   or NULL */

/* The following takes the place of 2.0.9.alpha's find_factor(). */

/* The meaning of the hint changes against 2.0.9.alpha to:
   hint == 0 : Use our own strategy, and this should be the default
   hint & 1  : Avoid mpqs(), use ellfacteur() after pollardbrent()
   hint & 2  : Avoid first-stage ellfacteur() in favour of mpqs()
   (which may still fall back to ellfacteur() if mpqs() is not installed
   or gives up)
   hint & 4  : Avoid even the pollardbrent() stage
   hint & 8  : Avoid final ellfacteur();  this may `declare' a composite
   to be prime. */

/* stack housekeeping:  this routine may create one or more objects  (a new
   factor, or possibly several, and perhaps one or more new exponents > 2) */
static long
ifac_crack(GEN *partial, GEN *where)
{
  long hint, cmp_res, exp1 = 1, exp2 = 1, av;
  GEN factor = NULL, exponent;

  if (DEBUGLEVEL >= 5)		/* none of these should ever happen */
  {
    long lgp;
    if (!*partial || typ(*partial) != t_VEC)
      err(typeer, "ifac_crack");
    if ((lgp = lg(*partial)) < ifac_initial_length)
      err(talker, "partial impossibly short in ifac_crack");
    if (!(*where) ||
	*where < *partial + 6 || /* sic -- caller must realloc first */
	*where > *partial + lgp - 3)
      err(talker, "`*where\' out of bounds in ifac_crack");
    if (!(**where) || typ((GEN)(**where)) != t_INT)
      err(typeer, "ifac_crack");
    if ((*where)[2] != zero)
      err(talker, "operand not known composite in ifac_crack");
  }
  hint = itos((GEN)((*partial)[2])) & 15;
  exponent = (GEN)((*where)[1]);

  if (DEBUGLEVEL >= 3)
    fprintferr("IFAC: cracking composite\n\t%Z\n", **where);

  /* crack squares.  Quite fast due to the initial square residue test */
  if (DEBUGLEVEL >= 4)
    fprintferr("IFAC: checking for pure square\n");
  av = avma;
  while (carrecomplet((GEN)(**where), &factor))
  {
    if (DEBUGLEVEL >= 4)
      fprintferr("IFAC: found %Z =\n\t%Z ^2\n", **where, factor);
    affii(factor, (GEN)(**where)); avma = av; factor = NULL;
    if (exponent == gun)
      (*where)[1] = deux;
    else if (exponent == gdeux)
    { (*where)[1] = (long)stoi(4); av = avma; }
    else
    { affii(shifti(exponent, 1), (GEN)((*where)[1])); avma = av; }
    exponent = (GEN)((*where)[1]);
    if (moebius_mode) return 0;	/* no need to carry on... */
    exp1 = 2;
  } /* while carrecomplet */

  /* check whether our composite hasn't become prime */
  if (exp1 > 1 && isprime((GEN)(**where)))
  {
    (*where)[2] = un;
    if (DEBUGLEVEL >= 4)
    {
      fprintferr("IFAC: factor %Z\n\tis prime\n",**where);
      flusherr();
    }
    return 0;			/* bypass subsequent ifac_whoiswho() call */
  }
  /* still composite -- carry on */

  /* MPQS cannot factor prime powers;  check for cubes/5th/7th powers.
     Do this even if MPQS is blocked by hint -- it still serves a useful
     purpose in bounded factorization */
  {
    long mask = 7;
    if (DEBUGLEVEL == 4)
      fprintferr("IFAC: checking for odd power\n");
    /* (At debug levels > 4, is_odd_power() itself prints something more
       informative) */
    av = avma;
    while ((exp1 =		/* assignment */
	    is_odd_power((GEN)(**where), &factor, &mask)))
    {
      if (exp2 == 1) exp2 = exp1; /* remember this after the loop */
      if (DEBUGLEVEL >= 4)
	fprintferr("IFAC: found %Z =\n\t%Z ^%ld\n", **where, factor, exp1);
      affii(factor, (GEN)(**where)); avma = av; factor = NULL;
      if (exponent == gun)
      { (*where)[1] = (long)stoi(exp1); av = avma; }
      else if (exponent == gdeux)
      { (*where)[1] = (long)stoi(exp1<<1); av = avma; }
      else
      { affii(mulsi(exp1, exponent), (GEN)((*where)[1])); avma = av; }
      exponent = (GEN)((*where)[1]);
      if (moebius_mode) return 0; /* no need to carry on... */
    } /* while is_odd_power */

    if (exp2 > 1)
    {				/* Something nice has happened */
      /* check whether our composite hasn't become prime */
      if (isprime((GEN)(**where)))
      {
        (*where)[2] = un;
	if (DEBUGLEVEL >= 4)
	{
	  fprintferr("IFAC: factor %Z\n\tis prime\n", **where);
	  flusherr();
	}
	return 0;		/* bypass subsequent ifac_whoiswho() call */
      }
      /* base of power is still composite  (an exceedingly rare case),
	 fall through */
    }
  } /* odd power stage */

  /* pollardbrent() Rho usually gets a first chance */
  if (!(hint & 4))
  {
    if (DEBUGLEVEL >= 4)
      fprintferr("IFAC: trying Pollard-Brent rho method first\n");
    factor = pollardbrent((GEN)(**where));
  } /* Rho stage */

  /* if this didn't work, try one of our high-power beasties */
  if (!factor && !(hint & 2))
  {
    if (DEBUGLEVEL >= 4)
      fprintferr("IFAC: trying Lenstra-Montgomery ECM\n");
    factor = ellfacteur((GEN)(**where), 0); /* do not insist */
  } /* First ECM stage */

  if (!factor && !(hint & 1))
  {
    if (DEBUGLEVEL >= 4)
      fprintferr("IFAC: trying Multi-Polynomial Quadratic Sieve\n");
    factor = mpqs((GEN)(**where));
  } /* MPQS stage */

  if (!factor)
  {
    if (!(hint & 8))		/* still no luck?  force it */
    {
      if (DEBUGLEVEL >= 4)
	fprintferr("IFAC: forcing ECM, may take some time\n");
      factor = ellfacteur((GEN)(**where), 1);
    } /* final ECM stage, guaranteed to succeed */
    else			/* limited factorization */
    {
      if (DEBUGLEVEL >= 2)
      {
	err(warner, "IFAC: unfactored composite declared prime");
	/* don't print it out at level 3 or above, where it would appear
	   several times before and after this message already */
	if (DEBUGLEVEL == 2)
	{
	  fprintferr("\t%Z\n",**where);
	  flusherr();
	}
      }
      (*where)[2] = un;		/* might as well trial-divide by it... */
      return 1;
    }
  } /* Final ECM stage */

  if (DEBUGLEVEL >= 1)
  {
    if (!factor)		/* never reached */
      err(talker, "all available factoring methods failed in ifac_crack");
  }
  if (typ(factor) == t_VEC)	/* delegate this case */
    return ifac_insert_multiplet(partial, where, factor);

  else if (typ(factor) != t_INT)
  {
    fprintferr("IFAC: factorizer returned strange object to ifac_crack\n");
    outerr(factor);
    err(bugparier, "factoring");
  }

  /* got single integer back:  work out the cofactor (in place) */
  if (!mpdivis((GEN)(**where), factor, (GEN)(**where)))
  {
    fprintferr("IFAC: factoring %Z\n", **where);
    fprintferr("\tyielded `factor\' %Z\n\twhich isn't!\n", factor);
    err(bugparier, "factoring");
  }

  /* the factoring engines report the factor found when DEBUGLEVEL is
     large enough;  let's tell about the cofactor */
  if (DEBUGLEVEL >= 4)
    fprintferr("IFAC: cofactor = %Z\n", **where);

  /* ok, now `factor' is one factor and **where is the other, find out
     which is larger */
  cmp_res = cmpii(factor, (GEN)(**where));
  if (cmp_res < 0)		/* common case */
  {
    (*where)[2] = (long)NULL;	/* mark cofactor `unknown' */
    (*where)[-1] = (long)NULL;	/* mark factor `unknown' */
    (*where)[-2] =
      isonstack(exponent) ? licopy(exponent) : (long)exponent;
    *where -= 3;
    **where = (long)factor;
    return 2;
  }
  else if (cmp_res == 0)	/* hep, split a square in the middle */
  {
    err(warner,
	"square not found by carrecomplet, ifac_crack recovering");
    cgiv(factor);
    (*where)[2] = (long)NULL;	/* mark the sqrt `unknown' */
    if (exponent == gun)	/* double the exponent */
      (*where)[1] = deux;
    else if (exponent == gdeux)
      (*where)[1] = (long)stoi(4); /* make a new one */
    else			/* overwrite old exponent */
    {
      av = avma;
      affii(shifti(exponent, 1), (GEN)((*where)[1]));
      avma = av;
      /* leave *where unchanged */
    }
    if (moebius_mode) return 0;
    return 1;
  }
  else				/* factor > cofactor, rearrange */
  {
    (*where)[2] = (long)NULL;	/* mark factor `unknown' */
    (*where)[-1] = (long)NULL;	/* mark cofactor `unknown' */
    (*where)[-2] =
      isonstack(exponent) ? licopy(exponent) : (long)exponent;
    *where -= 3;
    **where = (*where)[3];	/* move cofactor pointer to lowest slot */
    (*where)[3] = (long)factor;	/* save factor */
    return 2;
  }
}

/* the following doesn't collect garbage;  caller's caller should do it
   (which means ifac_main()).  No diagnostics either, the factoring engine
   should have printed what it found when DEBUGLEVEL>=4 or so.  Note facvec
   contains slots of three components per factor;  repeated factors are
   expressly allowed  (and their classes shouldn't contradict each other
   whereas their exponents will be added up) */
static long
ifac_insert_multiplet(GEN *partial, GEN *where, GEN facvec)
{
  long j,k=1, lfv=lg(facvec)-1, nf=lfv/3, room=(long)(*where-*partial);
  /* one of the factors will go into the *where slot, so room is now
     3 times the number of slots we can use */
  long needroom = lfv - room;
  GEN sorted, auxvec = cgetg(nf+1, t_VEC), factor;
  long exponent = itos((GEN)((*where)[1])); /* the old exponent */
  GEN newexp;

  if (DEBUGLEVEL >= 5)
    fprintferr("IFAC: incorporating set of %ld factors%s\n",
	       nf, (DEBUGLEVEL >=6 ? "..." : ""));
  if (needroom > 0)
    ifac_realloc(partial, where, lg(*partial) + needroom + 3);
  /* one extra slot for paranoia, errm, future use */

  /* create sort permutation from the values of the factors */
  for (j=nf; j; j--) auxvec[j] = facvec[3*j-2]; /* just the pointers */
  sorted = sindexsort(auxvec);
  /* and readjust the result for the triple spacing */
  for (j=nf; j; j--) sorted[j] = 3*sorted[j]-2;
  if (DEBUGLEVEL >= 6)
    fprintferr("\tsorted them...\n");

  /* store factors, beginning at *where, and catching any duplicates */
  **where = facvec[sorted[nf]];
  if ((newexp = (GEN)(facvec[sorted[nf]+1])) != gun) /* new exponent > 1 */
  {
    if (exponent == 1)
      (*where)[1] = isonstack(newexp) ? licopy(newexp) : (long)newexp;
    else
      (*where)[1] = lmulsi(exponent, newexp);
  } /* if new exponent is 1, the old exponent already in place will do */
  (*where)[2] = facvec[sorted[nf]+2]; /* copy class */
  if (DEBUGLEVEL >= 6)
    fprintferr("\tstored (largest) factor no. %ld...\n", nf);

  for (j=nf-1; j; j--)
  {
    factor = (GEN)(facvec[sorted[j]]);
    if (egalii(factor, (GEN)(**where)))
    {
      if (DEBUGLEVEL >= 6)
	fprintferr("\tfactor no. %ld is a duplicate%s\n",
		   j, (j>1 ? "..." : ""));
      /* update exponent, ignore class which would already have been set,
	 and then forget current factor */
      if ((newexp = (GEN)(facvec[sorted[j]+1])) != gun) /* new exp > 1 */
      {				/* now we have at least 3 */
	(*where)[1] = laddii((GEN)((*where)[1]),
			     mulsi(exponent, newexp));
      }
      else
      {
	if ((*where)[1] == un && exponent == 1)
	  (*where)[1] = deux;
	else
	  (*where)[1] = laddsi(exponent, (GEN)((*where)[1]));
	/* not safe to add 1 in place -- that might overwrite gdeux,
	   with `interesting' consequences */
      }
      if (moebius_mode) return 0; /* stop now, but with exponent updated */
      continue;
    }
    (*where)[-1] = facvec[sorted[j]+2];	/* class as given */
    if ((newexp = (GEN)(facvec[sorted[j]+1])) != gun) /* new exp > 1 */
    {
      if (exponent == 1 && newexp == gdeux)
	(*where)[-2] = deux;
      else			/* exponent*newexp > 2 */
	(*where)[-2] = lmulsi(exponent, newexp);
    }
    else
    {
      (*where)[-2] = (exponent == 1 ? un :
		      (exponent == 2 ? deux :
		       (long)stoi(exponent))); /* inherit parent's exponent */
    }
    (*where)[-3] = isonstack(factor) ? licopy(factor) : (long)factor;
				/* keep components younger than *partial */
    *where -= 3;
    k++;
    if (DEBUGLEVEL >= 6)
      fprintferr("\tfactor no. %ld was unique%s\n",
		 j, (j>1 ? " (so far)..." : ""));
  }
  /* make the `sorted' object safe for garbage collection  (probably not a
     problem, since it should be in the garbage zone from everybody's
     perspective, but it's easy to do it) */
  *sorted = evaltyp(t_INT) | evallg(nf+1);
  return k;
}

static GEN
ifac_main(GEN *partial)
{
  /* leave the basic error checking to ifac_find() */
  GEN here = ifac_find(partial, partial);
  long res, nf;

  /* if nothing left, return gun */
  if (!here) return gun;

  /* if we are in Moebius mode and have already detected a repeated factor,
     stop right here.  Shouldn't really happen */
  if (moebius_mode && here[1] != un)
  {
    if (DEBUGLEVEL >= 3)
    {
      fprintferr("IFAC: main loop: repeated old factor\n\t%Z\n", *here);
      flusherr();
    }
    return gzero;
  }

  /* loop until first entry is a finished prime.  May involve reallocations
     and thus updates of *partial */
  while (here[2] != deux)
  {
    /* if it's unknown, something has gone wrong;  try to recover */
    if (!(here[2]))
    {
      err(warner, "IFAC: unknown factor seen in main loop");
      res = ifac_resort(partial, &here);
      if (res) return gzero;	/* can only happen in Moebius mode */
      ifac_whoiswho(partial, &here, -1);
      /* defrag for good measure */
      ifac_defrag(partial, &here);
      continue;
    }
    /* if it's composite, crack it */
    if (here[2] == zero)
    {
      /* make sure there's room for another factor */
      if (here < *partial + 6)
      {				/* try defrag first */
	ifac_defrag(partial, &here);
	if (here < *partial + 6) /* no luck */
	{
	  ifac_realloc(partial, &here, 1); /* guaranteed to work */
	  /* Unfortunately, we can't do a garbage collection here since we
	     know too little about where in the stack the old components
	     were. */
	}
      }
      nf = ifac_crack(partial, &here);
      if (moebius_mode && here[1] != un) /* that was a power */
      {
	if (DEBUGLEVEL >= 3)
	{
	  fprintferr("IFAC: main loop: repeated new factor\n\t%Z\n", *here);
	  flusherr();
	}
	return gzero;
      }
      /* deal with the new unknowns.  No resort, since ifac_crack will
	 already have sorted them */
      ifac_whoiswho(partial, &here, nf);
      continue;
    }
    /* if it's prime but not yet finished, finish it */
    if (here[2] == un)
    {
      res = ifac_divide(partial, &here);
      if (res)
      {
	if (moebius_mode)
	{
	  if (DEBUGLEVEL >= 3)
	  {
	    fprintferr("IFAC: main loop: another factor was divisible by\n");
	    fprintferr("\t%Z\n", *here); flusherr();
	  }
	  return gzero;
	}
	ifac_defrag(partial, &here);
	(void)(ifac_resort(partial, &here)); /* sort new cofactors down */
	/* it doesn't matter right now whether this finds a repeated factor,
	   since we never get to this point in Moebius mode */
	ifac_defrag(partial, &here); /* resort may have created new gaps */
	ifac_whoiswho(partial, &here, -1);
      }
      continue;
    }
    /* there are no other cases, never reached */
    err(talker, "non-existent factor class in ifac_main");
  } /* while */
  if (moebius_mode && here[1] != un)
  {
    if (DEBUGLEVEL >= 3)
    {
      fprintferr("IFAC: after main loop: repeated old factor\n\t%Z\n", *here);
      flusherr();
    }
    return gzero; /* just a safety net */
  }
  if (DEBUGLEVEL >= 4)
  {
    long nf = (*partial + lg(*partial) - here - 3)/3;
    if (nf)
      fprintferr("IFAC: main loop: %ld factor%s left\n",
		 nf, (nf>1 ? "s" : ""));
    else
      fprintferr("IFAC: main loop: this was the last factor\n");
    flusherr();
  }
  return here;
}

/* Caller of the following should worry about stack management, it makes
   a rather shameless mess :^) */
GEN
ifac_primary_factor(GEN *partial, long *exponent)
{
  GEN here = ifac_main(partial);
  GEN res;

  if (here == gun) { *exponent = 0; return gun; }
  else if (here == gzero) { *exponent = 0; return gzero; }

  res = icopy((GEN)(*here));
  *exponent = itos((GEN)(here[1]));
  here[2] = here[1] = *here = (long)NULL;
  return res;
}

/* encapsulated routines */

/* prime/exponent pairs need to appear contiguously on the stack, but we
   also need to have our data structure somewhere, and we don't know in
   advance how many primes will turn up.  The following discipline achieves
   this:  When ifac_decomp() is called, n should point at an object older
   than the oldest small prime/exponent pair  (auxdecomp0() guarantees
   this easily since it mpdivis()es any divisors it discovers off its own
   copy of the original N).  We allocate sufficient space to accommodate
   several pairs -- eleven pairs ought to fit in a space not much larger
   than n itself -- before calling ifac_start().  If we manage to complete
   the factorization before we run out of space, we free the data structure
   and cull the excess reserved space before returning.  When we do run out,
   we have to leapfrog to generate more  (guesstimating the requirements
   from what is left in the partial factorization structure);  room for
   fresh pairs is allocated at the head of the stack, followed by an
   ifac_realloc() to reconnect the data structure and move it out of the
   way, followed by a few pointer tweaks to connect the new pairs space
   to the old one.-- This whole affair translates into a surprisingly
   compact little routine. */

#define ifac_overshoot 64	/* lgefint(n)+64 words reserved */

long
ifac_decomp(GEN n, long hint)
{
  long tf=lgefint(n), av=avma, lim=stack_lim(av,1);
  long nb=0;
  GEN part, here, workspc = new_chunk(tf + ifac_overshoot), pairs = (GEN)av;
  /* workspc will be doled out by us in pairs of smaller t_INTs */
  long tetpil = avma;		/* remember head of workspc zone */

  if (!n || typ(n) != t_INT) err(typeer, "ifac_decomp");
  if (!signe(n) || tf < 3) err(talker, "factoring 0 in ifac_decomp");

  part = ifac_start(n, 0, hint);
  here = ifac_main(&part);

  while (here != gun)
  {
    long lf=lgefint((GEN)(*here));
    if (pairs - workspc < lf + 3) /* out of room, leapfrog */
    {
      /* the ifac_realloc() below will clear tetpil - avma words
	 on the stack, which should be about enough for the extra
	 primes we're going to see, and we'll want several more to
	 accommodate further exponents.  In most cases, the lf + 3
	 below is pure paranoia, but the factor we're about to copy
	 might be the one sitting off the stack in the original n,
	 so let's play safe */
      workspc = new_chunk(lf + 3 + ifac_overshoot);
      ifac_realloc(&part, &here, 0);
      here = ifac_find(&part, &part);
      tetpil = (long)workspc;
    }
    /* room enough now */
    nb++;
    pairs -= lf;
    *pairs = evaltyp(t_INT) | evallg(lf);
    affii((GEN)(*here), pairs);
    pairs -= 3;
    *pairs = evaltyp(t_INT) | evallg(3);
    affii((GEN)(here[1]), pairs);
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      if(DEBUGMEM>1) err(warnmem,"[2] ifac_decomp");
      ifac_realloc(&part, &here, 0);
      part = gerepileupto(tetpil, part);
    }
  }
  avma = (long)pairs;
  if (DEBUGLEVEL >= 3)
  {
    fprintferr("IFAC: found %ld large prime (power) factor%s.\n",
	       nb, (nb>1? "s": ""));
    flusherr();
  }
  return nb;
}

long
ifac_moebius(GEN n, long hint)
{
  long mu=1, av=avma, lim=stack_lim(av,1);
  GEN part = ifac_start(n, 1, hint);
  GEN here = ifac_main(&part);

  while (here != gun && here != gzero)
  {
    if (itos((GEN)(here[1])) > 1)
    { here = gzero; break; }	/* shouldn't happen */
    mu = -mu;
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      if(DEBUGMEM>1) err(warnmem,"ifac_moebius");
      ifac_realloc(&part, &here, 0);
      part = gerepileupto(av, part);
    }
  }
  avma = av;
  return (here == gun ? mu : 0);
}

long
ifac_issquarefree(GEN n, long hint)
{
  long av=avma, lim=stack_lim(av,1);
  GEN part = ifac_start(n, 1, hint);
  GEN here = ifac_main(&part);

  while (here != gun && here != gzero)
  {
    if (itos((GEN)(here[1])) > 1)
    { here = gzero; break; }	/* shouldn't happen */
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      if(DEBUGMEM>1) err(warnmem,"ifac_issquarefree");
      ifac_realloc(&part, &here, 0);
      part = gerepileupto(av, part);
    }
  }
  avma = av;
  return (here == gun ? 1 : 0);
}

long
ifac_omega(GEN n, long hint)
{
  long omega=0, av=avma, lim=stack_lim(av,1);
  GEN part = ifac_start(n, 0, hint);
  GEN here = ifac_main(&part);

  while (here != gun)
  {
    omega++;
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      if(DEBUGMEM>1) err(warnmem,"ifac_omega");
      ifac_realloc(&part, &here, 0);
      part = gerepileupto(av, part);
    }
  }
  avma = av;
  return omega;
}

long
ifac_bigomega(GEN n, long hint)
{
  long Omega=0, av=avma, lim=stack_lim(av,1);
  GEN part = ifac_start(n, 0, hint);
  GEN here = ifac_main(&part);

  while (here != gun)
  {
    Omega += itos((GEN)(here[1]));
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      if(DEBUGMEM>1) err(warnmem,"ifac_bigomega");
      ifac_realloc(&part, &here, 0);
      part = gerepileupto(av, part);
    }
  }
  avma = av;
  return Omega;
}

GEN
ifac_totient(GEN n, long hint)
{
  GEN res = cgeti(lgefint(n));
  long exponent, av=avma, tetpil, lim=stack_lim(av,1);
  GEN phi = gun;
  GEN part = ifac_start(n, 0, hint);
  GEN here = ifac_main(&part);

  while (here != gun)
  {
    phi = mulii(phi, addsi(-1, (GEN)(*here)));
    if (here[1] != un)
    {
      if (here[1] == deux)
      {
	phi = mulii(phi, (GEN)(*here));
      }
      else
      {
	exponent = itos((GEN)(here[1]));
	phi = mulii(phi, gpowgs((GEN)(*here), exponent-1));
      }
    }
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      GEN *gsav[2];
      if(DEBUGMEM>1) err(warnmem,"ifac_totient");
      tetpil = avma;
      ifac_realloc(&part, &here, 0);
      phi = icopy(phi);
      gsav[0] = &phi; gsav[1] = &part;
      gerepilemanysp(av, tetpil, gsav, 2);
      /* don't try to preserve here, safer to pick it up again
	 (and ifac_find does a lot of sanity checking at high
	 debuglevels) */
      here = ifac_find(&part, &part);
    }
  }
  affii(phi, res);
  avma = av;
  return res;
}

GEN
ifac_numdiv(GEN n, long hint)
{
  /* we don't preallocate since it's too hard to guess the right
     size here */
  GEN res;
  long av=avma, tetpil, lim=stack_lim(av,1);
  GEN exponent, tau = gun;
  GEN part = ifac_start(n, 0, hint);
  GEN here = ifac_main(&part);

  while (here != gun)
  {
    exponent = (GEN)(here[1]);
    tau = mulii(tau, addsi(1, exponent));
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      GEN *gsav[2];
      if(DEBUGMEM>1) err(warnmem,"ifac_numdiv");
      tetpil = avma;
      ifac_realloc(&part, &here, 0);
      tau = icopy(tau);
      gsav[0] = &tau; gsav[1] = &part;
      gerepilemanysp(av, tetpil, gsav, 2);
      /* (see ifac_totient()) */
      here = ifac_find(&part, &part);
    }
  }
  tetpil = avma;
  res = icopy(tau);
  return gerepile(av, tetpil, res);
}

GEN
ifac_sumdiv(GEN n, long hint)
{
  /* don't preallocate */
  GEN res;
  long exponent, av=avma, tetpil, lim=stack_lim(av,1);
  GEN contrib, sigma = gun;
  GEN part = ifac_start(n, 0, hint);
  GEN here = ifac_main(&part);

  while (here != gun)
  {
    exponent = itos((GEN)(here[1]));
    contrib = addsi(1, (GEN)(*here));
    for (; exponent > 1; exponent--)
      contrib = addsi(1, mulii((GEN)(*here), contrib));
    sigma = mulii(sigma, contrib);
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      GEN *gsav[2];
      if(DEBUGMEM>1) err(warnmem,"ifac_sumdiv");
      tetpil = avma;
      ifac_realloc(&part, &here, 0);
      sigma = icopy(sigma);
      gsav[0] = &sigma; gsav[1] = &part;
      gerepilemanysp(av, tetpil, gsav, 2);
      /* (see ifac_totient()) */
      here = ifac_find(&part, &part);
    }
  }
  tetpil = avma;
  res = icopy(sigma);
  return gerepile(av, tetpil, res);
}

/* k should be positive, and indeed it had better be > 1  (not checked).
   The calling function knows what to do with the other cases. */
GEN
ifac_sumdivk(GEN n, long k, long hint)
{
  /* don't preallocate */
  GEN res;
  long exponent, av=avma, tetpil, lim=stack_lim(av,1);
  GEN contrib, q, sigma = gun;
  GEN part = ifac_start(n, 0, hint);
  GEN here = ifac_main(&part);

  while (here != gun)
  {
    exponent = itos((GEN)(here[1]));
    q = gpowgs((GEN)(*here), k);
    contrib = addsi(1, q);
    for (; exponent > 1; exponent--)
      contrib = addsi(1, mulii(q, contrib));
    sigma = mulii(sigma, contrib);
    here[2] = here[1] = *here = (long)NULL;
    here = ifac_main(&part);
    if (low_stack(lim, stack_lim(av,1)))
    {
      GEN *gsav[2];
      if(DEBUGMEM>1) err(warnmem,"ifac_sumdivk");
      tetpil = avma;
      ifac_realloc(&part, &here, 0);
      sigma = icopy(sigma);
      gsav[0] = &sigma; gsav[1] = &part;
      gerepilemanysp(av, tetpil, gsav, 2);
      /* (see ifac_totient()) */
      here = ifac_find(&part, &part);
    }
  }
  tetpil = avma;
  res = icopy(sigma);
  return gerepile(av, tetpil, res);
}