1  /* Generate perfect square testing data.
       2  
       3  Copyright 2002-2004, 2012, 2014 Free Software Foundation, Inc.
       4  
       5  This file is part of the GNU MP Library.
       6  
       7  The GNU MP Library is free software; you can redistribute it and/or modify
       8  it under the terms of either:
       9  
      10    * the GNU Lesser General Public License as published by the Free
      11      Software Foundation; either version 3 of the License, or (at your
      12      option) any later version.
      13  
      14  or
      15  
      16    * the GNU General Public License as published by the Free Software
      17      Foundation; either version 2 of the License, or (at your option) any
      18      later version.
      19  
      20  or both in parallel, as here.
      21  
      22  The GNU MP Library is distributed in the hope that it will be useful, but
      23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      25  for more details.
      26  
      27  You should have received copies of the GNU General Public License and the
      28  GNU Lesser General Public License along with the GNU MP Library.  If not,
      29  see https://www.gnu.org/licenses/.  */
      30  
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include "bootstrap.c"
      35  
      36  
      37  /* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
      38     (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
      39     residues and non-residues modulo small factors of that modulus.
      40  
      41     For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
      42     function exists specifically because 2^24-1 and 2^48-1 have nice sets of
      43     prime factors.  For other limb sizes it's considered, but if it doesn't
      44     have good factors then mpn_mod_1 will be used instead.
      45  
      46     When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
      47     selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
      48     with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
      49     GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
      50     calculation within a single limb.
      51  
      52     In either case primes can be combined to make divisors.  The table data
      53     then effectively indicates remainders which are quadratic residues mod
      54     all the primes.  This sort of combining reduces the number of steps
      55     needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
      56     Nothing is gained or lost in terms of detections, the same total fraction
      57     of non-residues will be identified.
      58  
      59     Nothing particularly sophisticated is attempted for combining factors to
      60     make divisors.  This is probably a kind of knapsack problem so it'd be
      61     too hard to attempt anything completely general.  For the usual 32 and 64
      62     bit limbs we get a good enough result just pairing the biggest and
      63     smallest which fit together, repeatedly.
      64  
   Another aim is to get powerful combinations, i.e. divisors which identify
   the biggest fraction of non-residues, and have those run first.  Again for
   the usual 32 and 64 bits it seems good enough just to pair for big
   divisors then sort according to the resulting fraction of non-residues
   identified.
      70  
      71     Also in this program, a table sq_res_0x100 of residues modulo 256 is
      72     generated.  This simply fills bits into limbs of the appropriate
      73     build-time GMP_LIMB_BITS each.
      74  
      75  */
      76  
      77  
      78  /* Normally we aren't using const in gen*.c programs, so as not to have to
      79     bother figuring out if it works, but using it with f_cmp_divisor and
      80     f_cmp_fraction avoids warnings from the qsort calls. */
      81  
      82  /* Same tests as gmp.h. */
      83  #if  defined (__STDC__)                                 \
      84    || defined (__cplusplus)                              \
      85    || defined (_AIX)                                     \
      86    || defined (__DECC)                                   \
      87    || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
      88    || defined (_MSC_VER)                                 \
      89    || defined (_WIN32)
      90  #define HAVE_CONST        1
      91  #endif
      92  
      93  #if ! HAVE_CONST
      94  #define const
      95  #endif
      96  
      97  
      98  mpz_t  *sq_res_0x100;          /* table of limbs */
      99  int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
     100  int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
     101  double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
     102  
     103  int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
     104  int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
     105  int     max_divisor;       /* all divisors <= max_divisor */
     106  int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
     107  double  total_fraction;    /* of squares */
     108  mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
     109  mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
     110  mpz_t   pp_inverted;       /* invert_limb style inverse */
     111  mpz_t   mod_mask;          /* 2^mod_bits-1 */
     112  char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
     113  
     114  /* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
     115  struct rawfactor_t {
     116    int     divisor;
     117    int     multiplicity;
     118  };
     119  struct rawfactor_t  *rawfactor;
     120  int                 nrawfactor;
     121  
     122  /* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
     123  struct factor_t {
     124    int     divisor;
     125    mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
     126    mpz_t   mask;      /* indicating squares mod divisor */
     127    double  fraction;  /* squares/total */
     128  };
     129  struct factor_t  *factor;
     130  int              nfactor;       /* entries in use in factor array */
     131  int              factor_alloc;  /* entries allocated to factor array */
     132  
     133  
     134  int
     135  f_cmp_divisor (const void *parg, const void *qarg)
     136  {
     137    const struct factor_t *p, *q;
     138    p = (const struct factor_t *) parg;
     139    q = (const struct factor_t *) qarg;
     140    if (p->divisor > q->divisor)
     141      return 1;
     142    else if (p->divisor < q->divisor)
     143      return -1;
     144    else
     145      return 0;
     146  }
     147  
     148  int
     149  f_cmp_fraction (const void *parg, const void *qarg)
     150  {
     151    const struct factor_t *p, *q;
     152    p = (const struct factor_t *) parg;
     153    q = (const struct factor_t *) qarg;
     154    if (p->fraction > q->fraction)
     155      return 1;
     156    else if (p->fraction < q->fraction)
     157      return -1;
     158    else
     159      return 0;
     160  }
     161  
     162  /* Remove array[idx] by copying the remainder down, and adjust narray
     163     accordingly.  */
     164  #define COLLAPSE_ELEMENT(array, idx, narray)                    \
     165    do {                                                          \
     166      memmove (&(array)[idx],					\
     167  	     &(array)[idx+1],					\
     168  	     ((narray)-((idx)+1)) * sizeof (array[0]));		\
     169      (narray)--;                                                 \
     170    } while (0)
     171  
     172  
     173  /* return n*2^p mod m */
     174  int
     175  mul_2exp_mod (int n, int p, int m)
     176  {
     177    while (--p >= 0)
     178      n = (2 * n) % m;
     179    return n;
     180  }
     181  
     182  /* return -n mod m */
     183  int
     184  neg_mod (int n, int m)
     185  {
     186    assert (n >= 0 && n < m);
     187    return (n == 0 ? 0 : m-n);
     188  }
     189  
     190  /* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
     191     "-(idx<<mod_bits)" can be a square modulo m.  */
     192  void
     193  square_mask (mpz_t mask, int m)
     194  {
     195    int    p, i, r, idx;
     196  
     197    p = mul_2exp_mod (1, mod_bits, m);
     198    p = neg_mod (p, m);
     199  
     200    mpz_set_ui (mask, 0L);
     201    for (i = 0; i < m; i++)
     202      {
     203        r = (i * i) % m;
     204        idx = (r * p) % m;
     205        mpz_setbit (mask, (unsigned long) idx);
     206      }
     207  }
     208  
     209  void
     210  generate_sq_res_0x100 (int limb_bits)
     211  {
     212    int  i, res;
     213  
     214    nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
     215    sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
     216  
     217    for (i = 0; i < nsq_res_0x100; i++)
     218      mpz_init_set_ui (sq_res_0x100[i], 0L);
     219  
     220    for (i = 0; i < 0x100; i++)
     221      {
     222        res = (i * i) % 0x100;
     223        mpz_setbit (sq_res_0x100[res / limb_bits],
     224                    (unsigned long) (res % limb_bits));
     225      }
     226  
     227    sq_res_0x100_num = 0;
     228    for (i = 0; i < nsq_res_0x100; i++)
     229      sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
     230    sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
     231  }
     232  
     233  void
     234  generate_mod (int limb_bits, int nail_bits)
     235  {
     236    int    numb_bits = limb_bits - nail_bits;
     237    int    i, divisor;
     238  
     239    mpz_init_set_ui (pp, 0L);
     240    mpz_init_set_ui (pp_norm, 0L);
     241    mpz_init_set_ui (pp_inverted, 0L);
     242  
     243    /* no more than limb_bits many factors in a one limb modulus (and of
     244       course in reality nothing like that many) */
     245    factor_alloc = limb_bits;
     246    factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
     247    rawfactor = (struct rawfactor_t *) xmalloc (factor_alloc * sizeof (*rawfactor));
     248  
     249    if (numb_bits % 4 != 0)
     250      {
     251        strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
     252        goto use_pp;
     253      }
     254  
     255    max_divisor = 2*limb_bits;
     256    max_divisor_bits = log2_ceil (max_divisor);
     257  
     258    if (numb_bits / 4 < max_divisor_bits)
     259      {
     260        /* Wind back to one limb worth of max_divisor, if that will let us use
     261           mpn_mod_34lsub1.  */
     262        max_divisor = limb_bits;
     263        max_divisor_bits = log2_ceil (max_divisor);
     264  
     265        if (numb_bits / 4 < max_divisor_bits)
     266          {
     267            strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
     268            goto use_pp;
     269          }
     270      }
     271  
     272    {
     273      /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
     274      mpz_t  m, q, r;
     275      int    multiplicity;
     276  
     277      mod34_bits = (numb_bits / 4) * 3;
     278  
     279      /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
     280         the mod34_bits mark, adding the two halves for a remainder of at most
     281         mod34_bits+1 many bits */
     282      mod_bits = mod34_bits + 1;
     283  
     284      mpz_init_set_ui (m, 1L);
     285      mpz_mul_2exp (m, m, mod34_bits);
     286      mpz_sub_ui (m, m, 1L);
     287  
     288      mpz_init (q);
     289      mpz_init (r);
     290  
     291      for (i = 3; i <= max_divisor; i+=2)
     292        {
     293          if (! isprime (i))
     294            continue;
     295  
     296          mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
     297          if (mpz_sgn (r) != 0)
     298            continue;
     299  
     300          /* if a repeated prime is found it's used as an i^n in one factor */
     301          divisor = 1;
     302          multiplicity = 0;
     303          do
     304            {
     305              if (divisor > max_divisor / i)
     306                break;
     307              multiplicity++;
     308              mpz_set (m, q);
     309              mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
     310            }
     311          while (mpz_sgn (r) == 0);
     312  
     313          assert (nrawfactor < factor_alloc);
     314          rawfactor[nrawfactor].divisor = i;
     315          rawfactor[nrawfactor].multiplicity = multiplicity;
     316          nrawfactor++;
     317        }
     318  
     319      mpz_clear (m);
     320      mpz_clear (q);
     321      mpz_clear (r);
     322    }
     323  
     324    if (nrawfactor <= 2)
     325      {
     326        mpz_t  new_pp;
     327  
     328        sprintf (mod34_excuse, "only %d small factor%s",
     329                 nrawfactor, nrawfactor == 1 ? "" : "s");
     330  
     331      use_pp:
     332        /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
     333           tried with just one */
     334        max_divisor = 2*limb_bits;
     335        max_divisor_bits = log2_ceil (max_divisor);
     336  
     337        mpz_init (new_pp);
     338        nrawfactor = 0;
     339        mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
     340  
     341        /* one copy of each small prime */
     342        mpz_set_ui (pp, 1L);
     343        for (i = 3; i <= max_divisor; i+=2)
     344          {
     345            if (! isprime (i))
     346              continue;
     347  
     348            mpz_mul_ui (new_pp, pp, (unsigned long) i);
     349            if (mpz_sizeinbase (new_pp, 2) > mod_bits)
     350              break;
     351            mpz_set (pp, new_pp);
     352  
     353            assert (nrawfactor < factor_alloc);
     354            rawfactor[nrawfactor].divisor = i;
     355            rawfactor[nrawfactor].multiplicity = 1;
     356            nrawfactor++;
     357          }
     358  
     359        /* Plus an extra copy of one or more of the primes selected, if that
     360           still fits in max_divisor and the total in mod_bits.  Usually only
     361           3 or 5 will be candidates */
     362        for (i = nrawfactor-1; i >= 0; i--)
     363          {
     364            if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
     365              continue;
     366            mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
     367            if (mpz_sizeinbase (new_pp, 2) > mod_bits)
     368              continue;
     369            mpz_set (pp, new_pp);
     370  
     371            rawfactor[i].multiplicity++;
     372          }
     373  
     374        mod_bits = mpz_sizeinbase (pp, 2);
     375  
     376        mpz_set (pp_norm, pp);
     377        while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
     378          mpz_add (pp_norm, pp_norm, pp_norm);
     379  
     380        mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
     381  
     382        mpz_clear (new_pp);
     383      }
     384  
     385    /* start the factor array */
     386    for (i = 0; i < nrawfactor; i++)
     387      {
     388        int  j;
     389        assert (nfactor < factor_alloc);
     390        factor[nfactor].divisor = 1;
     391        for (j = 0; j < rawfactor[i].multiplicity; j++)
     392          factor[nfactor].divisor *= rawfactor[i].divisor;
     393        nfactor++;
     394      }
     395  
     396   combine:
     397    /* Combine entries in the factor array.  Combine the smallest entry with
     398       the biggest one that will fit with it (ie. under max_divisor), then
     399       repeat that with the new smallest entry. */
     400    qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
     401    for (i = nfactor-1; i >= 1; i--)
     402      {
     403        if (factor[i].divisor <= max_divisor / factor[0].divisor)
     404          {
     405            factor[0].divisor *= factor[i].divisor;
     406            COLLAPSE_ELEMENT (factor, i, nfactor);
     407            goto combine;
     408          }
     409      }
     410  
     411    total_fraction = 1.0;
     412    for (i = 0; i < nfactor; i++)
     413      {
     414        mpz_init (factor[i].inverse);
     415        mpz_invert_ui_2exp (factor[i].inverse,
     416                            (unsigned long) factor[i].divisor,
     417                            (unsigned long) mod_bits);
     418  
     419        mpz_init (factor[i].mask);
     420        square_mask (factor[i].mask, factor[i].divisor);
     421  
     422        /* fraction of possible squares */
     423        factor[i].fraction = (double) mpz_popcount (factor[i].mask)
     424          / factor[i].divisor;
     425  
     426        /* total fraction of possible squares */
     427        total_fraction *= factor[i].fraction;
     428      }
     429  
     430    /* best tests first (ie. smallest fraction) */
     431    qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
     432  }
     433  
     434  void
     435  print (int limb_bits, int nail_bits)
     436  {
     437    int    i;
     438    mpz_t  mhi, mlo;
     439  
     440    printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
     441    printf ("\n");
     442  
     443    printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
     444            limb_bits, nail_bits);
     445    printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
     446            limb_bits, nail_bits);
     447    printf ("#endif\n");
     448    printf ("\n");
     449  
     450    printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
     451    printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
     452            (1.0 - sq_res_0x100_fraction) * 100.0,
     453            0x100 - sq_res_0x100_num);
     454    printf ("static const mp_limb_t\n");
     455    printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
     456    for (i = 0; i < nsq_res_0x100; i++)
     457      {
     458        printf ("  CNST_LIMB(0x");
     459        mpz_out_str (stdout, 16, sq_res_0x100[i]);
     460        printf ("),\n");
     461      }
     462    printf ("};\n");
     463    printf ("\n");
     464  
     465    if (mpz_sgn (pp) != 0)
     466      {
     467        printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
     468        printf ("/* PERFSQR_PP = ");
     469      }
     470    else
     471      printf ("/* 2^%d-1 = ", mod34_bits);
     472    for (i = 0; i < nrawfactor; i++)
     473      {
     474        if (i != 0)
     475          printf (" * ");
     476        printf ("%d", rawfactor[i].divisor);
     477        if (rawfactor[i].multiplicity != 1)
     478          printf ("^%d", rawfactor[i].multiplicity);
     479      }
     480    printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
     481  
     482    printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
     483    if (mpz_sgn (pp) != 0)
     484      {
     485        printf ("#define PERFSQR_PP            CNST_LIMB(0x");
     486        mpz_out_str (stdout, 16, pp);
     487        printf (")\n");
     488        printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
     489        mpz_out_str (stdout, 16, pp_norm);
     490        printf (")\n");
     491        printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
     492        mpz_out_str (stdout, 16, pp_inverted);
     493        printf (")\n");
     494      }
     495    printf ("\n");
     496  
     497    mpz_init (mhi);
     498    mpz_init (mlo);
     499  
     500    printf ("/* This test identifies %.2f%% as non-squares. */\n",
     501            (1.0 - total_fraction) * 100.0);
     502    printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
     503    printf ("  do {                              \\\n");
     504    printf ("    mp_limb_t  r;                   \\\n");
     505    if (mpz_sgn (pp) != 0)
     506      printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
     507    else
     508      printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
     509  
     510    for (i = 0; i < nfactor; i++)
     511      {
     512        printf ("                                    \\\n");
     513        printf ("    /* %5.2f%% */                    \\\n",
     514                (1.0 - factor[i].fraction) * 100.0);
     515  
     516        printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
     517                factor[i].divisor <= limb_bits ? 1 : 2,
     518                factor[i].divisor);
     519        mpz_out_str (stdout, 16, factor[i].inverse);
     520        printf ("), \\\n");
     521        printf ("                   CNST_LIMB(0x");
     522  
     523        if ( factor[i].divisor <= limb_bits)
     524          {
     525            mpz_out_str (stdout, 16, factor[i].mask);
     526          }
     527        else
     528          {
     529            mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
     530            mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
     531            mpz_out_str (stdout, 16, mhi);
     532            printf ("), CNST_LIMB(0x");
     533            mpz_out_str (stdout, 16, mlo);
     534          }
     535        printf (")); \\\n");
     536      }
     537  
     538    printf ("  } while (0)\n");
     539    printf ("\n");
     540  
     541    printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
     542            (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
     543    printf ("\n");
     544  
     545    printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
     546    printf ("#define PERFSQR_DIVISORS  { 256,");
     547    for (i = 0; i < nfactor; i++)
     548        printf (" %d,", factor[i].divisor);
     549    printf (" }\n");
     550  
     551  
     552    mpz_clear (mhi);
     553    mpz_clear (mlo);
     554  }
     555  
     556  int
     557  main (int argc, char *argv[])
     558  {
     559    int  limb_bits, nail_bits;
     560  
     561    if (argc != 3)
     562      {
     563        fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
     564        exit (1);
     565      }
     566  
     567    limb_bits = atoi (argv[1]);
     568    nail_bits = atoi (argv[2]);
     569  
     570    if (limb_bits <= 0
     571        || nail_bits < 0
     572        || nail_bits >= limb_bits)
     573      {
     574        fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
     575                 limb_bits, nail_bits);
     576        exit (1);
     577      }
     578  
     579    generate_sq_res_0x100 (limb_bits);
     580    generate_mod (limb_bits, nail_bits);
     581  
     582    print (limb_bits, nail_bits);
     583  
     584    return 0;
     585  }