(root)/
gmp-6.3.0/
mpn/
generic/
gcdext.c
       1  /* mpn_gcdext -- Extended Greatest Common Divisor.
       2  
       3  Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
       4  Inc.
       5  
       6  This file is part of the GNU MP Library.
       7  
       8  The GNU MP Library is free software; you can redistribute it and/or modify
       9  it under the terms of either:
      10  
      11    * the GNU Lesser General Public License as published by the Free
      12      Software Foundation; either version 3 of the License, or (at your
      13      option) any later version.
      14  
      15  or
      16  
      17    * the GNU General Public License as published by the Free Software
      18      Foundation; either version 2 of the License, or (at your option) any
      19      later version.
      20  
      21  or both in parallel, as here.
      22  
      23  The GNU MP Library is distributed in the hope that it will be useful, but
      24  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      25  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      26  for more details.
      27  
      28  You should have received copies of the GNU General Public License and the
      29  GNU Lesser General Public License along with the GNU MP Library.  If not,
      30  see https://www.gnu.org/licenses/.  */
      31  
      32  #include "gmp-impl.h"
      33  #include "longlong.h"
      34  
      35  /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
      36     the size is returned (if inputs are non-normalized, result may be
      37     non-normalized too). Temporary space needed is M->n + n.
      38   */
      39  static size_t
      40  hgcd_mul_matrix_vector (struct hgcd_matrix *M,
      41  			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
      42  {
      43    mp_limb_t ah, bh;
      44  
      45    /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
      46  
      47       t  = u00 * a
      48       r  = u10 * b
      49       r += t;
      50  
      51       t  = u11 * b
      52       b  = u01 * a
      53       b += t;
      54    */
      55  
      56    if (M->n >= n)
      57      {
      58        mpn_mul (tp, M->p[0][0], M->n, ap, n);
      59        mpn_mul (rp, M->p[1][0], M->n, bp, n);
      60      }
      61    else
      62      {
      63        mpn_mul (tp, ap, n, M->p[0][0], M->n);
      64        mpn_mul (rp, bp, n, M->p[1][0], M->n);
      65      }
      66  
      67    ah = mpn_add_n (rp, rp, tp, n + M->n);
      68  
      69    if (M->n >= n)
      70      {
      71        mpn_mul (tp, M->p[1][1], M->n, bp, n);
      72        mpn_mul (bp, M->p[0][1], M->n, ap, n);
      73      }
      74    else
      75      {
      76        mpn_mul (tp, bp, n, M->p[1][1], M->n);
      77        mpn_mul (bp, ap, n, M->p[0][1], M->n);
      78      }
      79    bh = mpn_add_n (bp, bp, tp, n + M->n);
      80  
      81    n += M->n;
      82    if ( (ah | bh) > 0)
      83      {
      84        rp[n] = ah;
      85        bp[n] = bh;
      86        n++;
      87      }
      88    else
      89      {
      90        /* Normalize */
      91        while ( (rp[n-1] | bp[n-1]) == 0)
      92  	n--;
      93      }
      94  
      95    return n;
      96  }
      97  
      98  #define COMPUTE_V_ITCH(n) (2*(n))
      99  
     100  /* Computes |v| = |(g - u a)| / b, where u may be positive or
     101     negative, and v is of the opposite sign. max(a, b) is of size n, u and
     102     v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  /* Normalized sizes of a and b, and the size of the resulting v.  */
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  /* The sign of u is carried in usize; work with |u| of size limbs.  */
  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  /* t = |u| * a; mpn_mul wants the longer operand first.  */
  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
	 (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  /* Exact division; quotient has vn or vn-1 limbs.  */
  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}
     164  
     165  /* Temporary storage:
     166  
     167     Initial division: Quotient of at most an - n + 1 <= an limbs.
     168  
     169     Storage for u0 and u1: 2(n+1).
     170  
     171     Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
     172  
     173     Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
     174  
     175     When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
     176  
     177     When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
     178  
   For the lehmer call after the loop, let T denote
     180     GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
     181     u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
     182     for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
     183     sufficient for both operations.
     184  
     185  */
     186  
     187  /* Optimal choice of p seems difficult. In each iteration the division
     188   * of work between hgcd and the updates of u0 and u1 depends on the
     189   * current size of the u. It may be desirable to use a different
     190   * choice of p in each iteration. Also the input size seems to matter;
     191   * choosing p = n / 3 in the first iteration seems to improve
     192   * performance slightly for input size just above the threshold, but
     193   * degrade performance for larger inputs. */
     194  #define CHOOSE_P_1(n) ((n) / 2)
     195  #define CHOOSE_P_2(n) ((n) / 3)
     196  
/* Computes the gcd g of A = {ap,an} and B = {bp,n}, storing g at gp
   and returning its size.  A cofactor u with g = u a + v b (see the
   in-body comments for the exact invariants) is stored at up, with
   its signed size in *usizep.  Uses Lehmer's algorithm below
   GCDEXT_DC_THRESHOLD and the subquadratic HGCD-based algorithm
   above it.  ap and bp are destroyed.  */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  /* Compute the scratch budget as the maximum over all phases; see
     the storage analysis in the comment preceding this function.  */
  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  /* Initial reduction: replace a by a mod b, so both operands have n
     limbs.  The quotient (at most an - n + 1 limbs) goes into tp.  */
  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  /* b divides a, so g = b and the cofactor u is 0.  */
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  /* Carve the cofactors u0, u1 out of the scratch area; zero them so
     that unused high limbs are well defined.  */
  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	/* Seed the cofactors from the second row of M.  */
	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	u1[0] = 1;

	ctx.u0 = u0;
	ctx.u1 = u1;
	ctx.tp = tp + n; /* ualloc */
	ctx.un = 1;

	/* Temporary storage n */
	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	if (n == 0)
	  {
	    /* The gcd was found by the hook; size stored in ctx.gn.  */
	    TMP_FREE;
	    return ctx.gn;
	  }

	un = ctx.un;
	ASSERT (un < ualloc);
      }
  }

  /* Main loop: each iteration reduces n via hgcd (or a subdiv step on
     failure), and updates the cofactors u0, u1 accordingly.  */
  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  ctx.u0 = u0;
	  ctx.u1 = u1;
	  ctx.tp = tp + n; /* ualloc */
	  ctx.un = un;

	  /* Temporary storage n */
	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return ctx.gn;
	    }

	  un = ctx.un;
	  ASSERT (un < ualloc);
	}
    }
  /* We have A = ... a + ... b
	     B =  u0 a +  u1 b

	     a = u1  A + ... B
	     b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* General case: finish with Lehmer on copies of a and b, then
	 combine its cofactors (u, v) with the accumulated (u0, u1)
	 to obtain the cofactor relative to the original inputs.  */
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u0 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  /* up = u*u1 + v*u0, accumulated with the shorter operand
	     added into the longer.  */
	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
     557  }