(root)/
gmp-6.3.0/
tune/
tuneup.c
       1  /* Create tuned thresholds for various algorithms.
       2  
       3  Copyright 1999-2003, 2005, 2006, 2008-2017 Free Software Foundation, Inc.
       4  
       5  This file is part of the GNU MP Library.
       6  
       7  The GNU MP Library is free software; you can redistribute it and/or modify
       8  it under the terms of either:
       9  
      10    * the GNU Lesser General Public License as published by the Free
      11      Software Foundation; either version 3 of the License, or (at your
      12      option) any later version.
      13  
      14  or
      15  
      16    * the GNU General Public License as published by the Free Software
      17      Foundation; either version 2 of the License, or (at your option) any
      18      later version.
      19  
      20  or both in parallel, as here.
      21  
      22  The GNU MP Library is distributed in the hope that it will be useful, but
      23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      25  for more details.
      26  
      27  You should have received copies of the GNU General Public License and the
      28  GNU Lesser General Public License along with the GNU MP Library.  If not,
      29  see https://www.gnu.org/licenses/.  */
      30  
      31  
      32  /* Usage: tuneup [-t] [-t] [-p precision]
      33  
      34     -t turns on some diagnostic traces, a second -t turns on more traces.
      35  
      36     Notes:
      37  
      38     The code here isn't a vision of loveliness, mainly because it's subject
      39     to ongoing changes according to new things wanting to be tuned, and
      40     practical requirements of systems tested.
      41  
      42     Sometimes running the program twice produces slightly different results.
      43     This is probably because there's so little separating algorithms near
      44     their crossover, and on that basis it should make little or no difference
      45     to the final speed of the relevant routines, but nothing has been done to
      46     check that carefully.
      47  
      48     Algorithm:
      49  
      50     The thresholds are determined as follows.  A crossover may not be a
      51     single size but rather a range where it oscillates between method A or
      52     method B faster.  If the threshold is set making B used where A is faster
      53     (or vice versa) that's bad.  Badness is the percentage time lost and
      54     total badness is the sum of this over all sizes measured.  The threshold
      55     is set to minimize total badness.
      56  
      57     Suppose, as sizes increase, method B becomes faster than method A.  The
      58     effect of the rule is that, as you look at increasing sizes, isolated
      59     points where B is faster are ignored, but when it's consistently faster,
      60     or faster on balance, then the threshold is set there.  The same result
      61     is obtained thinking in the other direction of A becoming faster at
      62     smaller sizes.
      63  
      64     In practice the thresholds tend to be chosen to bring on the next
      65     algorithm fairly quickly.
      66  
      67     This rule is attractive because it's got a basis in reason and is fairly
      68     easy to implement, but no work has been done to actually compare it in
      69     absolute terms to other possibilities.
      70  
      71     Implementation:
      72  
      73     In a normal library build the thresholds are constants.  To tune them
      74     selected objects are recompiled with the thresholds as global variables
      75     instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
      76     the end of gmp-impl.h, and rules in tune/Makefile.am.
      77  
      78     MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
      79     threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
      80     level, but recurse into the basecase.
      81  
      82     MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
      83     Other routines in turn will make use of both of those.  Naturally the
      84     dependants must be tuned first.
      85  
      86     In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
      87     just a threshold based on comparing two routines (mpn_divrem_1 and
      88     mpn_divexact_1), and no further use of the value determined.
      89  
      90     Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
      91     just comparisons between certain routines on representative data.
      92  
      93     Shortcuts are applied when native (assembler) versions of routines exist.
      94     For instance a native mpn_sqr_basecase is assumed to be always faster
      95     than mpn_mul_basecase, with no measuring.
      96  
      97     No attempt is made to tune within assembler routines, for instance
      98     DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
      99     written and tuned all by hand.  Assembler routines that might have hard
     100     limits are recompiled though, to make them accept a bigger range of sizes
     101     than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
     102  
     103     Limitations:
     104  
     105     The FFTs aren't subject to the same badness rule as the other thresholds,
     106     so each k is probably being brought on a touch early.  This isn't likely
     107     to make a difference, and the simpler probing means fewer tests.
     108  
     109  */
     110  
     111  #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
     112  
     113  #include "config.h"
     114  
     115  #include <math.h>
     116  #include <stdio.h>
     117  #include <stdlib.h>
     118  #include <time.h>
     119  #if HAVE_UNISTD_H
     120  #include <unistd.h>
     121  #endif
     122  
     123  #include "gmp-impl.h"
     124  #include "longlong.h"
     125  
     126  #include "tests.h"
     127  #include "speed.h"
     128  
     129  #if !HAVE_DECL_OPTARG
     130  extern char *optarg;
     131  extern int optind, opterr;
     132  #endif
     133  
     134  
     135  #define DEFAULT_MAX_SIZE   1000  /* limbs */
     136  
     137  #if WANT_FFT
     138  mp_size_t  option_fft_max_size = 50000;  /* limbs */
     139  #else
     140  mp_size_t  option_fft_max_size = 0;
     141  #endif
     142  int        option_trace = 0;
     143  int        option_fft_trace = 0;
     144  struct speed_params  s;
     145  
     146  struct dat_t {
     147    mp_size_t  size;
     148    double     d;
     149  } *dat = NULL;
     150  int  ndat = 0;
     151  int  allocdat = 0;
     152  
     153  /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
     154     case use zero here, which for params.max_size means no limit.  */
     155  #ifndef TUNE_SQR_TOOM2_MAX
     156  #define TUNE_SQR_TOOM2_MAX  0
     157  #endif
     158  
     159  mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
     160  mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
     161  mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
     162  mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
     163  mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
     164  mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
     165  mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
     166  mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
     167  mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
     168  mp_size_t  mul_toom43_to_toom54_threshold = MP_SIZE_T_MAX;
     169  mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
     170  mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
     171  mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
     172  mp_size_t  sqr_toom2_threshold
     173    = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
     174  mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
     175  mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
     176  mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
     177  mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
     178  mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
     179  mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
     180  mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
     181  mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
     182  mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
     183  mp_size_t  sqrlo_basecase_threshold     = MP_SIZE_T_MAX;
     184  mp_size_t  sqrlo_dc_threshold           = MP_SIZE_T_MAX;
     185  mp_size_t  sqrlo_sqr_threshold          = MP_SIZE_T_MAX;
     186  mp_size_t  mulmid_toom42_threshold      = MP_SIZE_T_MAX;
     187  mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
     188  mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
     189  mp_size_t  div_qr_2_pi2_threshold       = MP_SIZE_T_MAX;
     190  mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
     191  mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
     192  mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
     193  mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
     194  mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
     195  mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
     196  mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
     197  mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
     198  mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
     199  mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
     200  mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
     201  mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
     202  mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
     203  mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
     204  mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
     205  mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
     206  mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
     207  mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
     208  mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
     209  mp_size_t  hgcd_appr_threshold          = MP_SIZE_T_MAX;
     210  mp_size_t  hgcd_reduce_threshold        = MP_SIZE_T_MAX;
     211  mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
     212  mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
     213  int	   div_qr_1n_pi1_method		= 0;
     214  mp_size_t  div_qr_1_norm_threshold      = MP_SIZE_T_MAX;
     215  mp_size_t  div_qr_1_unnorm_threshold    = MP_SIZE_T_MAX;
     216  mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
     217  mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
     218  mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
     219  mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
     220  int	   mod_1_1p_method		= 0;
     221  mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
     222  mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
     223  mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
     224  mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
     225  mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
     226  mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
     227  mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
     228  mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
     229  mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
     230  mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
     231  mp_size_t  fac_odd_threshold            = 0;
     232  mp_size_t  fac_dsc_threshold            = FAC_DSC_THRESHOLD_LIMIT;
     233  
     234  mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
     235  mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
     236  
     237  struct param_t {
     238    const char        *name;
     239    speed_function_t  function;
     240    speed_function_t  function2;
     241    double            step_factor;    /* how much to step relatively */
     242    int               step;           /* how much to step absolutely */
     243    double            function_fudge; /* multiplier for "function" speeds */
     244    int               stop_since_change;
     245    double            stop_factor;
     246    mp_size_t         min_size;
     247    int               min_is_always;
     248    mp_size_t         max_size;
     249    mp_size_t         check_size;
     250    mp_size_t         size_extra;
     251  
     252  #define DATA_HIGH_LT_R  1
     253  #define DATA_HIGH_GE_R  2
     254    int               data_high;
     255  
     256    int               noprint;
     257  };
     258  
     259  
     260  /* These are normally undefined when false, which suits "#if" fine.
     261     But give them zero values so they can be used in plain C "if"s.  */
     262  #ifndef UDIV_PREINV_ALWAYS
     263  #define UDIV_PREINV_ALWAYS 0
     264  #endif
     265  #ifndef HAVE_NATIVE_mpn_divexact_1
     266  #define HAVE_NATIVE_mpn_divexact_1 0
     267  #endif
     268  #ifndef HAVE_NATIVE_mpn_div_qr_1n_pi1
     269  #define HAVE_NATIVE_mpn_div_qr_1n_pi1 0
     270  #endif
     271  #ifndef HAVE_NATIVE_mpn_divrem_1
     272  #define HAVE_NATIVE_mpn_divrem_1 0
     273  #endif
     274  #ifndef HAVE_NATIVE_mpn_divrem_2
     275  #define HAVE_NATIVE_mpn_divrem_2 0
     276  #endif
     277  #ifndef HAVE_NATIVE_mpn_mod_1
     278  #define HAVE_NATIVE_mpn_mod_1 0
     279  #endif
     280  #ifndef HAVE_NATIVE_mpn_mod_1_1p
     281  #define HAVE_NATIVE_mpn_mod_1_1p 0
     282  #endif
     283  #ifndef HAVE_NATIVE_mpn_modexact_1_odd
     284  #define HAVE_NATIVE_mpn_modexact_1_odd 0
     285  #endif
     286  #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
     287  #define HAVE_NATIVE_mpn_preinv_divrem_1 0
     288  #endif
     289  #ifndef HAVE_NATIVE_mpn_preinv_mod_1
     290  #define HAVE_NATIVE_mpn_preinv_mod_1 0
     291  #endif
     292  #ifndef HAVE_NATIVE_mpn_sqr_basecase
     293  #define HAVE_NATIVE_mpn_sqr_basecase 0
     294  #endif
     295  
     296  
     297  #define MAX3(a,b,c)  MAX (MAX (a, b), c)
     298  
     299  mp_limb_t
     300  randlimb_norm (void)
     301  {
     302    mp_limb_t  n;
     303    mpn_random (&n, 1);
     304    n |= GMP_NUMB_HIGHBIT;
     305    return n;
     306  }
     307  
     308  #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
     309  
     310  mp_limb_t
     311  randlimb_half (void)
     312  {
     313    mp_limb_t  n;
     314    mpn_random (&n, 1);
     315    n &= GMP_NUMB_HALFMASK;
     316    n += (n==0);
     317    return n;
     318  }
     319  
     320  
     321  /* Add an entry to the end of the dat[] array, reallocing to make it bigger
     322     if necessary.  */
     323  void
     324  add_dat (mp_size_t size, double d)
     325  {
     326  #define ALLOCDAT_STEP  500
     327  
     328    ASSERT_ALWAYS (ndat <= allocdat);
     329  
     330    if (ndat == allocdat)
     331      {
     332        dat = (struct dat_t *) __gmp_allocate_or_reallocate
     333          (dat, allocdat * sizeof(dat[0]),
     334           (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
     335        allocdat += ALLOCDAT_STEP;
     336      }
     337  
     338    dat[ndat].size = size;
     339    dat[ndat].d = d;
     340    ndat++;
     341  }
     342  
     343  
     344  /* Return the threshold size based on the data accumulated. */
     345  mp_size_t
     346  analyze_dat (int final)
     347  {
     348    double  x, min_x;
     349    int     j, min_j;
     350  
     351    /* If the threshold is set at dat[0].size, any positive values are bad. */
     352    x = 0.0;
     353    for (j = 0; j < ndat; j++)
     354      if (dat[j].d > 0.0)
     355        x += dat[j].d;
     356  
     357    if (option_trace >= 2 && final)
     358      {
     359        printf ("\n");
     360        printf ("x is the sum of the badness from setting thresh at given size\n");
     361        printf ("  (minimum x is sought)\n");
     362        printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
     363      }
     364  
     365    min_x = x;
     366    min_j = 0;
     367  
     368  
     369    /* When stepping to the next dat[j].size, positive values are no longer
     370       bad (so subtracted), negative values become bad (so add the absolute
     371       value, meaning subtract). */
     372    for (j = 0; j < ndat; x -= dat[j].d, j++)
     373      {
     374        if (option_trace >= 2 && final)
     375          printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
     376  
     377        if (x < min_x)
     378          {
     379            min_x = x;
     380            min_j = j;
     381          }
     382      }
     383  
     384    return min_j;
     385  }
     386  
     387  
     388  /* Measuring for recompiled mpn/generic/div_qr_1.c,
     389   * mpn/generic/divrem_1.c, mpn/generic/mod_1.c and mpz/fac_ui.c */
     390  
     391  mp_limb_t mpn_div_qr_1_tune (mp_ptr, mp_limb_t *, mp_srcptr, mp_size_t, mp_limb_t);
     392  
     393  #if defined (__cplusplus)
     394  extern "C" {
     395  #endif
     396  
     397  mp_limb_t mpn_divrem_1_tune (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
     398  mp_limb_t mpn_mod_1_tune (mp_srcptr, mp_size_t, mp_limb_t);
     399  void mpz_fac_ui_tune (mpz_ptr, unsigned long);
     400  
     401  #if defined (__cplusplus)
     402  }
     403  #endif
     404  
     405  double
     406  speed_mpn_mod_1_tune (struct speed_params *s)
     407  {
     408    SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
     409  }
     410  double
     411  speed_mpn_divrem_1_tune (struct speed_params *s)
     412  {
     413    SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
     414  }
     415  double
     416  speed_mpz_fac_ui_tune (struct speed_params *s)
     417  {
     418    SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui_tune);
     419  }
     420  double
     421  speed_mpn_div_qr_1_tune (struct speed_params *s)
     422  {
     423    SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1_tune);
     424  }
     425  
     426  double
     427  tuneup_measure (speed_function_t fun,
     428                  const struct param_t *param,
     429                  struct speed_params *s)
     430  {
     431    static struct param_t  dummy;
     432    double   t;
     433    TMP_DECL;
     434  
     435    if (! param)
     436      param = &dummy;
     437  
     438    s->size += param->size_extra;
     439  
     440    TMP_MARK;
     441    SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
     442    SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
     443  
     444    mpn_random (s->xp, s->size);
     445    mpn_random (s->yp, s->size);
     446  
     447    switch (param->data_high) {
     448    case DATA_HIGH_LT_R:
     449      s->xp[s->size-1] %= s->r;
     450      s->yp[s->size-1] %= s->r;
     451      break;
     452    case DATA_HIGH_GE_R:
     453      s->xp[s->size-1] |= s->r;
     454      s->yp[s->size-1] |= s->r;
     455      break;
     456    }
     457  
     458    t = speed_measure (fun, s);
     459  
     460    s->size -= param->size_extra;
     461  
     462    TMP_FREE;
     463    return t;
     464  }
     465  
     466  
     467  #define PRINT_WIDTH  31
     468  
     469  void
     470  print_define_start (const char *name)
     471  {
     472    printf ("#define %-*s  ", PRINT_WIDTH, name);
     473    if (option_trace)
     474      printf ("...\n");
     475  }
     476  
     477  void
     478  print_define_end_remark (const char *name, mp_size_t value, const char *remark)
     479  {
     480    if (option_trace)
     481      printf ("#define %-*s  ", PRINT_WIDTH, name);
     482  
     483    if (value == MP_SIZE_T_MAX)
     484      printf ("MP_SIZE_T_MAX");
     485    else
     486      printf ("%5ld", (long) value);
     487  
     488    if (remark != NULL)
     489      printf ("  /* %s */", remark);
     490    printf ("\n");
     491    fflush (stdout);
     492  }
     493  
     494  void
     495  print_define_end (const char *name, mp_size_t value)
     496  {
     497    const char  *remark;
     498    if (value == MP_SIZE_T_MAX)
     499      remark = "never";
     500    else if (value == 0)
     501      remark = "always";
     502    else
     503      remark = NULL;
     504    print_define_end_remark (name, value, remark);
     505  }
     506  
     507  void
     508  print_define (const char *name, mp_size_t value)
     509  {
     510    print_define_start (name);
     511    print_define_end (name, value);
     512  }
     513  
     514  void
     515  print_define_remark (const char *name, mp_size_t value, const char *remark)
     516  {
     517    print_define_start (name);
     518    print_define_end_remark (name, value, remark);
     519  }
     520  
     521  void
     522  print_define_with_speedup (const char *name, mp_size_t value,
     523  			   mp_size_t runner_up, double speedup)
     524  {
     525    char buf[100];
     526    snprintf (buf, sizeof(buf), "%.2f%% faster than %ld",
     527  	    100.0 * (speedup - 1), runner_up);
     528    print_define_remark (name, value, buf);
     529  }
     530  
     531  void
     532  one (mp_size_t *threshold, struct param_t *param)
     533  {
     534    int  since_positive, since_thresh_change;
     535    int  thresh_idx, new_thresh_idx;
     536  
     537  #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
     538  
     539    DEFAULT (param->function_fudge, 1.0);
     540    DEFAULT (param->function2, param->function);
     541    DEFAULT (param->step_factor, 0.01);  /* small steps by default */
     542    DEFAULT (param->step, 1);            /* small steps by default */
     543    DEFAULT (param->stop_since_change, 80);
     544    DEFAULT (param->stop_factor, 1.2);
     545    DEFAULT (param->min_size, 10);
     546    DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
     547  
     548    if (param->check_size != 0)
     549      {
     550        double   t1, t2;
     551        s.size = param->check_size;
     552  
     553        *threshold = s.size+1;
     554        t1 = tuneup_measure (param->function, param, &s);
     555  
     556        *threshold = s.size;
     557        t2 = tuneup_measure (param->function2, param, &s);
     558        if (t1 == -1.0 || t2 == -1.0)
     559          {
     560            printf ("Oops, can't run both functions at size %ld\n",
     561                    (long) s.size);
     562            abort ();
     563          }
     564        t1 *= param->function_fudge;
     565  
     566        /* ask that t2 is at least 4% below t1 */
     567        if (t1 < t2*1.04)
     568          {
     569            if (option_trace)
     570              printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
     571            *threshold = MP_SIZE_T_MAX;
     572            if (! param->noprint)
     573              print_define (param->name, *threshold);
     574            return;
     575          }
     576  
     577        if (option_trace >= 2)
     578          printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
     579                  (long) s.size, t1, t2);
     580      }
     581  
     582    if (! param->noprint || option_trace)
     583      print_define_start (param->name);
     584  
     585    ndat = 0;
     586    since_positive = 0;
     587    since_thresh_change = 0;
     588    thresh_idx = 0;
     589  
     590    if (option_trace >= 2)
     591      {
     592        printf ("             algorithm-A  algorithm-B   ratio  possible\n");
     593        printf ("              (seconds)    (seconds)    diff    thresh\n");
     594      }
     595  
     596    for (s.size = param->min_size;
     597         s.size < param->max_size;
     598         s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
     599      {
     600        double   ti, tiplus1, d;
     601  
     602        /*
     603          FIXME: check minimum size requirements are met, possibly by just
     604          checking for the -1 returns from the speed functions.
     605        */
     606  
     607        /* using method A at this size */
     608        *threshold = s.size+1;
     609        ti = tuneup_measure (param->function, param, &s);
     610        if (ti == -1.0)
     611          abort ();
     612        ti *= param->function_fudge;
     613  
     614        /* using method B at this size */
     615        *threshold = s.size;
     616        tiplus1 = tuneup_measure (param->function2, param, &s);
     617        if (tiplus1 == -1.0)
     618          abort ();
     619  
     620        /* Calculate the fraction by which the one or the other routine is
     621           slower.  */
     622        if (tiplus1 >= ti)
     623          d = (tiplus1 - ti) / tiplus1;  /* negative */
     624        else
     625          d = (tiplus1 - ti) / ti;       /* positive */
     626  
     627        add_dat (s.size, d);
     628  
     629        new_thresh_idx = analyze_dat (0);
     630  
     631        if (option_trace >= 2)
     632          printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
     633                  (long) s.size, ti, tiplus1, d,
     634                  ti > tiplus1 ? '#' : ' ',
     635                  (long) dat[new_thresh_idx].size);
     636  
     637        /* Stop if the last time method i was faster was more than a
     638           certain number of measurements ago.  */
     639  #define STOP_SINCE_POSITIVE  200
     640        if (d >= 0)
     641          since_positive = 0;
     642        else
     643          if (++since_positive > STOP_SINCE_POSITIVE)
     644            {
     645              if (option_trace >= 1)
     646                printf ("stopped due to since_positive (%d)\n",
     647                        STOP_SINCE_POSITIVE);
     648              break;
     649            }
     650  
     651        /* Stop if method A has become slower by a certain factor. */
     652        if (ti >= tiplus1 * param->stop_factor)
     653          {
     654            if (option_trace >= 1)
     655              printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
     656                      param->stop_factor);
     657            break;
     658          }
     659  
     660        /* Stop if the threshold implied hasn't changed in a certain
     661           number of measurements.  (It's this condition that usually
     662           stops the loop.) */
     663        if (thresh_idx != new_thresh_idx)
     664          since_thresh_change = 0, thresh_idx = new_thresh_idx;
     665        else
     666          if (++since_thresh_change > param->stop_since_change)
     667            {
     668              if (option_trace >= 1)
     669                printf ("stopped due to since_thresh_change (%d)\n",
     670                        param->stop_since_change);
     671              break;
     672            }
     673  
     674        /* Stop if the threshold implied is more than a certain number of
     675           measurements ago.  */
     676  #define STOP_SINCE_AFTER   500
     677        if (ndat - thresh_idx > STOP_SINCE_AFTER)
     678          {
     679            if (option_trace >= 1)
     680              printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
     681                      STOP_SINCE_AFTER);
     682            break;
     683          }
     684  
     685        /* Stop when the size limit is reached before the end of the
     686           crossover, but only show this as an error for >= the default max
     687           size.  FIXME: Maybe should make it a param choice whether this is
     688           an error.  */
     689        if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
     690          {
     691            fprintf (stderr, "%s\n", param->name);
     692            fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
     693                     (long) dat[0].size, (long) dat[ndat-1].size, ndat);
     694            fprintf (stderr, "    max size reached before end of crossover\n");
     695            break;
     696          }
     697      }
     698  
     699    if (option_trace >= 1)
     700      printf ("sizes %ld to %ld total %d measurements\n",
     701              (long) dat[0].size, (long) dat[ndat-1].size, ndat);
     702  
     703    *threshold = dat[analyze_dat (1)].size;
     704  
     705    if (param->min_is_always)
     706      {
     707        if (*threshold == param->min_size)
     708          *threshold = 0;
     709      }
     710  
     711    if (! param->noprint || option_trace)
     712      print_define_end (param->name, *threshold);
     713  }
     714  
     715  /* Time N different FUNCTIONS with the same parameters and size, to
     716     select the fastest. Since *_METHOD defines start numbering from
     717     one, if functions[i] is fastest, the value of the define is i+1.
     718     Also output a comment with speedup compared to the next fastest
     719     function. The NAME argument is used only for trace output.
     720  
     721     Returns the index of the fastest function.
     722  */
     723  int
     724  one_method (int n, speed_function_t *functions,
     725  	    const char *name, const char *define,
     726  	    const struct param_t *param)
     727  {
     728    double *t;
     729    int i;
     730    int method;
     731    int method_runner_up;
     732  
     733    TMP_DECL;
     734    TMP_MARK;
     735    t = (double*) TMP_ALLOC (n * sizeof (*t));
     736  
     737    for (i = 0; i < n; i++)
     738      {
     739        t[i] = tuneup_measure (functions[i], param, &s);
     740        if (option_trace >= 1)
     741  	printf ("size=%ld, %s, method %d %.9f\n",
     742  		(long) s.size, name, i + 1, t[i]);
     743        if (t[i] == -1.0)
     744  	{
     745  	  printf ("Oops, can't measure all %s methods\n", name);
     746  	  abort ();
     747  	}
     748      }
     749    method = 0;
     750    for (i = 1; i < n; i++)
     751      if (t[i] < t[method])
     752        method = i;
     753  
     754    method_runner_up = (method == 0);
     755    for (i = 0; i < n; i++)
     756      if (i != method && t[i] < t[method_runner_up])
     757        method_runner_up = i;
     758  
     759    print_define_with_speedup (define, method + 1, method_runner_up + 1,
     760  			     t[method_runner_up] / t[method]);
     761  
     762    TMP_FREE;
     763    return method;
     764  }
     765  
     766  
     767  /* Special probing for the fft thresholds.  The size restrictions on the
     768     FFTs mean the graph of time vs size has a step effect.  See this for
     769     example using
     770  
     771         ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
     772         gnuplot foo.gnuplot
     773  
     774     The current approach is to compare routines at the midpoint of relevant
     775     steps.  Arguably a more sophisticated system of threshold data is wanted
     776     if this step effect remains. */
     777  
     778  struct fft_param_t {
     779    const char        *table_name;
     780    const char        *threshold_name;
     781    const char        *modf_threshold_name;
     782    mp_size_t         *p_threshold;
     783    mp_size_t         *p_modf_threshold;
     784    mp_size_t         first_size;
     785    mp_size_t         max_size;
     786    speed_function_t  function;
     787    speed_function_t  mul_modf_function;
     788    speed_function_t  mul_function;
     789    mp_size_t         sqr;
     790  };
     791  
     792  
     793  /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
     794     N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
     795     of 2^(k-1) bits. */
     796  
     797  mp_size_t
     798  fft_step_size (int k)
     799  {
     800    mp_size_t  step;
     801  
     802    step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
     803    step *= (mp_size_t) 1 << k;
     804  
     805    if (step <= 0)
     806      {
     807        printf ("Can't handle k=%d\n", k);
     808        abort ();
     809      }
     810  
     811    return step;
     812  }
     813  
     814  mp_size_t
     815  fft_next_size (mp_size_t pl, int k)
     816  {
     817    mp_size_t  m = fft_step_size (k);
     818  
     819  /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
     820  
     821    if (pl == 0 || (pl & (m-1)) != 0)
     822      pl = (pl | (m-1)) + 1;
     823  
     824  /*    printf (" %ld\n", pl); */
     825    return pl;
     826  }
     827  
     828  #define NMAX_DEFAULT 1000000
     829  #define MAX_REPS 25
     830  #define MIN_REPS 5
     831  
     832  static inline size_t
     833  mpn_mul_fft_lcm (size_t a, unsigned int k)
     834  {
     835    unsigned int l = k;
     836  
     837    while (a % 2 == 0 && k > 0)
     838      {
     839        a >>= 1;
     840        k--;
     841      }
     842    return a << l;
     843  }
     844  
     845  mp_size_t
     846  fftfill (mp_size_t pl, int k, int sqr)
     847  {
     848    mp_size_t maxLK;
     849    mp_bitcnt_t N, Nprime, nprime, M;
     850  
     851    N = pl * GMP_NUMB_BITS;
     852    M = N >> k;
     853  
     854    maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
     855  
     856    Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
     857    nprime = Nprime / GMP_NUMB_BITS;
     858    if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
     859      {
     860        size_t K2;
     861        for (;;)
     862  	{
     863  	  K2 = 1L << mpn_fft_best_k (nprime, sqr);
     864  	  if ((nprime & (K2 - 1)) == 0)
     865  	    break;
     866  	  nprime = (nprime + K2 - 1) & -K2;
     867  	  Nprime = nprime * GMP_LIMB_BITS;
     868  	}
     869      }
     870    ASSERT_ALWAYS (nprime < pl);
     871  
     872    return Nprime;
     873  }
     874  
     875  static int
     876  compare_double (const void *ap, const void *bp)
     877  {
     878    double a = * (const double *) ap;
     879    double b = * (const double *) bp;
     880  
     881    if (a < b)
     882      return -1;
     883    else if (a > b)
     884      return 1;
     885    else
     886      return 0;
     887  }
     888  
     889  double
     890  median (double *times, int n)
     891  {
     892    qsort (times, n, sizeof (double), compare_double);
     893    return times[n/2];
     894  }
     895  
     896  #define FFT_CACHE_SIZE 25
     897  typedef struct fft_cache
     898  {
     899    mp_size_t n;
     900    double time;
     901  } fft_cache_t;
     902  
     903  fft_cache_t fft_cache[FFT_CACHE_SIZE];
     904  
     905  double
     906  cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
     907  		int n_measurements)
     908  {
     909    int i;
     910    double t, ttab[MAX_REPS];
     911  
     912    if (fft_cache[k].n == n)
     913      return fft_cache[k].time;
     914  
     915    for (i = 0; i < n_measurements; i++)
     916      {
     917        speed_starttime ();
     918        mpn_mul_fft (rp, n, ap, n, bp, n, k);
     919        ttab[i] = speed_endtime ();
     920      }
     921  
     922    t = median (ttab, n_measurements);
     923    fft_cache[k].n = n;
     924    fft_cache[k].time = t;
     925    return t;
     926  }
     927  
     928  #define INSERT_FFTTAB(idx, nval, kval)					\
     929    do {									\
     930      fft_tab[idx].n = nval;						\
     931      fft_tab[idx].k = kval;						\
     932      fft_tab[idx+1].n = (1 << 27) - 1;	/* sentinel, 27b wide field */	\
     933      fft_tab[idx+1].k = (1 <<  5) - 1;					\
     934    } while (0)
     935  
     936  int
     937  fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
     938  {
     939    mp_size_t n, n1, prev_n1;
     940    int k, best_k, last_best_k, kmax;
     941    int eff, prev_eff;
     942    double t0, t1;
     943    int n_measurements;
     944    mp_limb_t *ap, *bp, *rp;
     945    mp_size_t alloc;
     946    struct fft_table_nk *fft_tab;
     947  
     948    fft_tab = mpn_fft_table3[p->sqr];
     949  
     950    for (k = 0; k < FFT_CACHE_SIZE; k++)
     951      fft_cache[k].n = 0;
     952  
     953    if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
     954      {
     955        nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
     956      }
     957  
     958    if (print)
     959      printf ("#define %s%*s", p->table_name, 38, "");
     960  
     961    if (idx == 0)
     962      {
     963        INSERT_FFTTAB (0, nmin, initial_k);
     964  
     965        if (print)
     966  	{
     967  	  printf ("\\\n  { ");
     968  	  printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
     969  	}
     970  
     971        idx = 1;
     972      }
     973  
     974    ap = (mp_ptr) malloc (sizeof (mp_limb_t));
     975    if (p->sqr)
     976      bp = ap;
     977    else
     978      bp = (mp_ptr) malloc (sizeof (mp_limb_t));
     979    rp = (mp_ptr) malloc (sizeof (mp_limb_t));
     980    alloc = 1;
     981  
     982    /* Round n to comply to initial k value */
     983    n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
     984  
     985    n_measurements = (18 - initial_k) | 1;
     986    n_measurements = MAX (n_measurements, MIN_REPS);
     987    n_measurements = MIN (n_measurements, MAX_REPS);
     988  
     989    last_best_k = initial_k;
     990    best_k = initial_k;
     991  
     992    while (n < nmax)
     993      {
     994        int start_k, end_k;
     995  
     996        /* Assume the current best k is best until we hit its next FFT step.  */
     997        t0 = 99999;
     998  
     999        prev_n1 = n + 1;
    1000  
    1001        start_k = MAX (4, best_k - 4);
    1002        end_k = MIN (24, best_k + 4);
    1003        for (k = start_k; k <= end_k; k++)
    1004  	{
    1005            n1 = mpn_fft_next_size (prev_n1, k);
    1006  
    1007  	  eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
    1008  
    1009  	  if (eff < 70)		/* avoid measuring too slow fft:s */
    1010  	    continue;
    1011  
    1012  	  if (n1 > alloc)
    1013  	    {
    1014  	      alloc = n1;
    1015  	      if (p->sqr)
    1016  		{
    1017  		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
    1018  		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
    1019  		  ap = bp = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
    1020  		  mpn_random (ap, alloc);
    1021  		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
    1022  		}
    1023  	      else
    1024  		{
    1025  		  ap = (mp_ptr) realloc (ap, sizeof (mp_limb_t));
    1026  		  bp = (mp_ptr) realloc (bp, sizeof (mp_limb_t));
    1027  		  rp = (mp_ptr) realloc (rp, sizeof (mp_limb_t));
    1028  		  ap = (mp_ptr) realloc (ap, alloc * sizeof (mp_limb_t));
    1029  		  mpn_random (ap, alloc);
    1030  		  bp = (mp_ptr) realloc (bp, alloc * sizeof (mp_limb_t));
    1031  		  mpn_random (bp, alloc);
    1032  		  rp = (mp_ptr) realloc (rp, alloc * sizeof (mp_limb_t));
    1033  		}
    1034  	    }
    1035  
    1036  	  t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
    1037  
    1038  	  if (t1 * n_measurements > 0.3)
    1039  	    n_measurements -= 2;
    1040  	  n_measurements = MAX (n_measurements, MIN_REPS);
    1041  
    1042  	  if (t1 < t0)
    1043  	    {
    1044  	      best_k = k;
    1045  	      t0 = t1;
    1046  	    }
    1047  	}
    1048  
    1049        n1 = mpn_fft_next_size (prev_n1, best_k);
    1050  
    1051        if (last_best_k != best_k)
    1052  	{
    1053  	  ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
    1054  
    1055  	  if (idx >= FFT_TABLE3_SIZE)
    1056  	    {
    1057  	      printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
    1058  	      abort ();
    1059  	    }
    1060  	  INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
    1061  
    1062  	  if (print)
    1063  	    {
    1064  	      printf (", ");
    1065  	      if (idx % 4 == 0)
    1066  		printf ("\\\n    ");
    1067  	      printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
    1068  	    }
    1069  
    1070  	  if (option_trace >= 2)
    1071  	    {
    1072  	      printf ("{%lu,%u}\n", prev_n1, best_k);
    1073  	      fflush (stdout);
    1074  	    }
    1075  
    1076  	  last_best_k = best_k;
    1077  	  idx++;
    1078  	}
    1079  
    1080        for (;;)
    1081  	{
    1082  	  prev_n1 = n1;
    1083  	  prev_eff = fftfill (prev_n1, best_k, p->sqr);
    1084  	  n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
    1085  	  eff = fftfill (n1, best_k, p->sqr);
    1086  
    1087  	  if (eff != prev_eff)
    1088  	    break;
    1089  	}
    1090  
    1091        n = prev_n1;
    1092      }
    1093  
    1094    kmax = sizeof (mp_size_t) * 4;	/* GMP_MP_SIZE_T_BITS / 2 */
    1095    kmax = MIN (kmax, 25-1);
    1096    for (k = last_best_k + 1; k <= kmax; k++)
    1097      {
    1098        if (idx >= FFT_TABLE3_SIZE)
    1099  	{
    1100  	  printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
    1101  	  abort ();
    1102  	}
    1103        INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
    1104  
    1105        if (print)
    1106  	{
    1107  	  printf (", ");
    1108  	  if (idx % 4 == 0)
    1109  	    printf ("\\\n    ");
    1110  	  printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
    1111  	}
    1112  
    1113        idx++;
    1114      }
    1115  
    1116    if (print)
    1117      printf (" }\n");
    1118  
    1119    free (ap);
    1120    if (! p->sqr)
    1121      free (bp);
    1122    free (rp);
    1123  
    1124    return idx;
    1125  }
    1126  
    1127  void
    1128  fft (struct fft_param_t *p)
    1129  {
    1130    mp_size_t  size;
    1131    int        k, idx, initial_k;
    1132  
    1133    /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
    1134  
    1135  #if 1
    1136    {
    1137      /* Use plain one() mechanism, for some reasonable initial values of k.  The
    1138         advantage is that we don't depend on mpn_fft_table3, which can therefore
    1139         leave it completely uninitialized.  */
    1140  
    1141      static struct param_t param;
    1142      mp_size_t thres, best_thres;
    1143      int best_k;
    1144      char buf[20];
    1145  
    1146      best_thres = MP_SIZE_T_MAX;
    1147      best_k = -1;
    1148  
    1149      for (k = 5; k <= 7; k++)
    1150        {
    1151  	param.name = p->modf_threshold_name;
    1152  	param.min_size = 100;
    1153  	param.max_size = 2000;
    1154  	param.function  = p->mul_function;
    1155  	param.step_factor = 0.0;
    1156  	param.step = 4;
    1157  	param.function2 = p->mul_modf_function;
    1158  	param.noprint = 1;
    1159  	s.r = k;
    1160  	one (&thres, &param);
    1161  	if (thres < best_thres)
    1162  	  {
    1163  	    best_thres = thres;
    1164  	    best_k = k;
    1165  	  }
    1166        }
    1167  
    1168      *(p->p_modf_threshold) = best_thres;
    1169      sprintf (buf, "k = %d", best_k);
    1170      print_define_remark (p->modf_threshold_name, best_thres, buf);
    1171      initial_k = best_k;
    1172    }
    1173  #else
    1174    size = p->first_size;
    1175    for (;;)
    1176      {
    1177        double  tk, tm;
    1178  
    1179        size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
    1180        k = mpn_fft_best_k (size, p->sqr);
    1181  
    1182        if (size >= p->max_size)
    1183          break;
    1184  
    1185        s.size = size + fft_step_size (k) / 2;
    1186        s.r = k;
    1187        tk = tuneup_measure (p->mul_modf_function, NULL, &s);
    1188        if (tk == -1.0)
    1189          abort ();
    1190  
    1191        tm = tuneup_measure (p->mul_function, NULL, &s);
    1192        if (tm == -1.0)
    1193          abort ();
    1194  
    1195        if (option_trace >= 2)
    1196          printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
    1197                  (long) size,
    1198                  (long) size + fft_step_size (k) / 2, k, tk,
    1199                  (long) s.size, tm);
    1200  
    1201        if (tk < tm)
    1202          {
    1203  	  *p->p_modf_threshold = s.size;
    1204  	  print_define (p->modf_threshold_name, *p->p_modf_threshold);
    1205  	  break;
    1206          }
    1207      }
    1208    initial_k = ?;
    1209  #endif
    1210  
    1211    /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
    1212  
    1213    idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
    1214    printf ("#define %s_SIZE %d\n", p->table_name, idx);
    1215  
    1216    /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
    1217  
    1218    size = 2 * *p->p_modf_threshold;	/* OK? */
    1219    for (;;)
    1220      {
    1221        double  tk, tm;
    1222        mp_size_t mulmod_size, mul_size;;
    1223  
    1224        if (size >= p->max_size)
    1225          break;
    1226  
    1227        mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
    1228        mul_size = (size + mulmod_size) / 2;	/* middle of step */
    1229  
    1230        s.size = mulmod_size;
    1231        tk = tuneup_measure (p->function, NULL, &s);
    1232        if (tk == -1.0)
    1233          abort ();
    1234  
    1235        s.size = mul_size;
    1236        tm = tuneup_measure (p->mul_function, NULL, &s);
    1237        if (tm == -1.0)
    1238          abort ();
    1239  
    1240        if (option_trace >= 2)
    1241          printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
    1242                  (long) size,
    1243                  (long) mulmod_size, tk,
    1244                  (long) mul_size, tm);
    1245  
    1246        size = mulmod_size;
    1247  
    1248        if (tk < tm)
    1249          {
    1250  	  *p->p_threshold = s.size;
    1251  	  print_define (p->threshold_name, *p->p_threshold);
    1252  	  break;
    1253          }
    1254      }
    1255  }
    1256  
    1257  /* Compare mpn_mul_1 to whatever fast exact single-limb division we have.  This
    1258     is currently mpn_divexact_1, but will become mpn_bdiv_1_qr_pi2 or somesuch.
    1259     This is used in get_str and set_str.  */
    1260  void
    1261  relspeed_div_1_vs_mul_1 (void)
    1262  {
    1263    const size_t max_opsize = 100;
    1264    mp_size_t n;
    1265    long j;
    1266    mp_limb_t rp[max_opsize];
    1267    mp_limb_t ap[max_opsize];
    1268    double multime, divtime;
    1269  
    1270    mpn_random (ap, max_opsize);
    1271  
    1272    multime = 0;
    1273    for (n = max_opsize; n > 1; n--)
    1274      {
    1275        mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
    1276        speed_starttime ();
    1277        for (j = speed_precision; j != 0 ; j--)
    1278  	mpn_mul_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
    1279        multime += speed_endtime () / n;
    1280      }
    1281  
    1282    divtime = 0;
    1283    for (n = max_opsize; n > 1; n--)
    1284      {
    1285        /* Make input divisible for good measure.  */
    1286        ap[n - 1] = mpn_mul_1 (ap, ap, n - 1, MP_BASES_BIG_BASE_10);
    1287  
    1288  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
    1289  	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
    1290  			    MP_BASES_BIG_BASE_BINVERTED_10,
    1291  			    MP_BASES_BIG_BASE_CTZ_10);
    1292  #else
    1293  	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
    1294  #endif
    1295        speed_starttime ();
    1296        for (j = speed_precision; j != 0 ; j--)
    1297  	{
    1298  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
    1299  	  mpn_pi1_bdiv_q_1 (rp, ap, n, MP_BASES_BIG_BASE_10,
    1300  			    MP_BASES_BIG_BASE_BINVERTED_10,
    1301  			    MP_BASES_BIG_BASE_CTZ_10);
    1302  #else
    1303  	  mpn_divexact_1 (rp, ap, n, MP_BASES_BIG_BASE_10);
    1304  #endif
    1305  	}
    1306        divtime += speed_endtime () / n;
    1307      }
    1308  
    1309    print_define ("DIV_1_VS_MUL_1_PERCENT", (int) (100 * divtime/multime));
    1310  }
    1311  
    1312  
    1313  /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
    1314     giving wrong results.  */
    1315  void
    1316  tune_mul_n (void)
    1317  {
    1318    static struct param_t  param;
    1319    mp_size_t next_toom_start;
    1320    int something_changed;
    1321  
    1322    param.function = speed_mpn_mul_n;
    1323  
    1324    param.name = "MUL_TOOM22_THRESHOLD";
    1325    param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
    1326    param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
    1327    one (&mul_toom22_threshold, &param);
    1328  
    1329    param.noprint = 1;
    1330  
    1331    /* Threshold sequence loop.  Disable functions that would be used in a very
    1332       narrow range, re-measuring things when that happens.  */
    1333    something_changed = 1;
    1334    while (something_changed)
    1335      {
    1336        something_changed = 0;
    1337  
    1338  	next_toom_start = mul_toom22_threshold;
    1339  
    1340  	if (mul_toom33_threshold != 0)
    1341  	  {
    1342  	    param.name = "MUL_TOOM33_THRESHOLD";
    1343  	    param.min_size = MAX (next_toom_start, MPN_TOOM33_MUL_MINSIZE);
    1344  	    param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
    1345  	    one (&mul_toom33_threshold, &param);
    1346  
    1347  	    if (next_toom_start * 1.05 >= mul_toom33_threshold)
    1348  	      {
    1349  		mul_toom33_threshold = 0;
    1350  		something_changed = 1;
    1351  	      }
    1352  	  }
    1353  
    1354  	next_toom_start = MAX (next_toom_start, mul_toom33_threshold);
    1355  
    1356  	if (mul_toom44_threshold != 0)
    1357  	  {
    1358  	    param.name = "MUL_TOOM44_THRESHOLD";
    1359  	    param.min_size = MAX (next_toom_start, MPN_TOOM44_MUL_MINSIZE);
    1360  	    param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
    1361  	    one (&mul_toom44_threshold, &param);
    1362  
    1363  	    if (next_toom_start * 1.05 >= mul_toom44_threshold)
    1364  	      {
    1365  		mul_toom44_threshold = 0;
    1366  		something_changed = 1;
    1367  	      }
    1368  	  }
    1369  
    1370  	next_toom_start = MAX (next_toom_start, mul_toom44_threshold);
    1371  
    1372  	if (mul_toom6h_threshold != 0)
    1373  	  {
    1374  	    param.name = "MUL_TOOM6H_THRESHOLD";
    1375  	    param.min_size = MAX (next_toom_start, MPN_TOOM6H_MUL_MINSIZE);
    1376  	    param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
    1377  	    one (&mul_toom6h_threshold, &param);
    1378  
    1379  	    if (next_toom_start * 1.05 >= mul_toom6h_threshold)
    1380  	      {
    1381  		mul_toom6h_threshold = 0;
    1382  		something_changed = 1;
    1383  	      }
    1384  	  }
    1385  
    1386  	next_toom_start = MAX (next_toom_start, mul_toom6h_threshold);
    1387  
    1388  	if (mul_toom8h_threshold != 0)
    1389  	  {
    1390  	    param.name = "MUL_TOOM8H_THRESHOLD";
    1391  	    param.min_size = MAX (next_toom_start, MPN_TOOM8H_MUL_MINSIZE);
    1392  	    param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
    1393  	    one (&mul_toom8h_threshold, &param);
    1394  
    1395  	    if (next_toom_start * 1.05 >= mul_toom8h_threshold)
    1396  	      {
    1397  		mul_toom8h_threshold = 0;
    1398  		something_changed = 1;
    1399  	      }
    1400  	  }
    1401      }
    1402  
    1403      print_define ("MUL_TOOM33_THRESHOLD", MUL_TOOM33_THRESHOLD);
    1404      print_define ("MUL_TOOM44_THRESHOLD", MUL_TOOM44_THRESHOLD);
    1405      print_define ("MUL_TOOM6H_THRESHOLD", MUL_TOOM6H_THRESHOLD);
    1406      print_define ("MUL_TOOM8H_THRESHOLD", MUL_TOOM8H_THRESHOLD);
    1407  
    1408    /* disabled until tuned */
    1409    MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
    1410  }
    1411  
    1412  void
    1413  tune_mul (void)
    1414  {
    1415    static struct param_t  param;
    1416    mp_size_t thres;
    1417  
    1418    param.noprint = 1;
    1419  
    1420    param.function = speed_mpn_toom32_for_toom43_mul;
    1421    param.function2 = speed_mpn_toom43_for_toom32_mul;
    1422    param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
    1423    param.min_size = MPN_TOOM43_MUL_MINSIZE * 24 / 17;
    1424    one (&thres, &param);
    1425    mul_toom32_to_toom43_threshold = thres * 17 / 24;
    1426    print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
    1427  
    1428    param.function = speed_mpn_toom32_for_toom53_mul;
    1429    param.function2 = speed_mpn_toom53_for_toom32_mul;
    1430    param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
    1431    param.min_size = MPN_TOOM53_MUL_MINSIZE * 30 / 19;
    1432    one (&thres, &param);
    1433    mul_toom32_to_toom53_threshold = thres * 19 / 30;
    1434    print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
    1435  
    1436    param.function = speed_mpn_toom42_for_toom53_mul;
    1437    param.function2 = speed_mpn_toom53_for_toom42_mul;
    1438    param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
    1439    param.min_size = MPN_TOOM53_MUL_MINSIZE * 20 / 11;
    1440    one (&thres, &param);
    1441    mul_toom42_to_toom53_threshold = thres * 11 / 20;
    1442    print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
    1443  
    1444    param.function = speed_mpn_toom42_mul;
    1445    param.function2 = speed_mpn_toom63_mul;
    1446    param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
    1447    param.min_size = MPN_TOOM63_MUL_MINSIZE * 2;
    1448    one (&thres, &param);
    1449    mul_toom42_to_toom63_threshold = thres / 2;
    1450    print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
    1451  
    1452    /* Use ratio 5/6 when measuring, the middle of the range 2/3 to 1. */
    1453    param.function = speed_mpn_toom43_for_toom54_mul;
    1454    param.function2 = speed_mpn_toom54_for_toom43_mul;
    1455    param.name = "MUL_TOOM43_TO_TOOM54_THRESHOLD";
    1456    param.min_size = MPN_TOOM54_MUL_MINSIZE * 6 / 5;
    1457    one (&thres, &param);
    1458    mul_toom43_to_toom54_threshold = thres * 5 / 6;
    1459    print_define ("MUL_TOOM43_TO_TOOM54_THRESHOLD", mul_toom43_to_toom54_threshold);
    1460  }
    1461  
    1462  
    1463  void
    1464  tune_mullo (void)
    1465  {
    1466    static struct param_t  param;
    1467  
    1468    param.function = speed_mpn_mullo_n;
    1469  
    1470    param.name = "MULLO_BASECASE_THRESHOLD";
    1471    param.min_size = 2;
    1472    param.min_is_always = 1;
    1473    param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
    1474    param.stop_factor = 1.5;
    1475    param.noprint = 1;
    1476    one (&mullo_basecase_threshold, &param);
    1477  
    1478    param.name = "MULLO_DC_THRESHOLD";
    1479    param.min_size = 8;
    1480    param.min_is_always = 0;
    1481    param.max_size = 1000;
    1482    one (&mullo_dc_threshold, &param);
    1483  
    1484    if (mullo_basecase_threshold >= mullo_dc_threshold)
    1485      {
    1486        print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
    1487        print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
    1488      }
    1489    else
    1490      {
    1491        print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
    1492        print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
    1493      }
    1494  
    1495    if (WANT_FFT && mul_fft_threshold < MP_SIZE_T_MAX / 2)
    1496      {
    1497        param.name = "MULLO_MUL_N_THRESHOLD";
    1498        param.min_size = mullo_dc_threshold;
    1499        param.max_size = 2 * mul_fft_threshold;
    1500        param.noprint = 0;
    1501        param.step_factor = 0.03;
    1502        one (&mullo_mul_n_threshold, &param);
    1503      }
    1504    else
    1505      print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
    1506  			 "without FFT use mullo forever");
    1507  }
    1508  
    1509  void
    1510  tune_sqrlo (void)
    1511  {
    1512    static struct param_t  param;
    1513  
    1514    param.function = speed_mpn_sqrlo;
    1515  
    1516    param.name = "SQRLO_BASECASE_THRESHOLD";
    1517    param.min_size = 2;
    1518    param.min_is_always = 1;
    1519    param.max_size = SQRLO_BASECASE_THRESHOLD_LIMIT-1;
    1520    param.stop_factor = 1.5;
    1521    param.noprint = 1;
    1522    one (&sqrlo_basecase_threshold, &param);
    1523  
    1524    param.name = "SQRLO_DC_THRESHOLD";
    1525    param.min_size = 8;
    1526    param.min_is_always = 0;
    1527    param.max_size = SQRLO_DC_THRESHOLD_LIMIT-1;
    1528    one (&sqrlo_dc_threshold, &param);
    1529  
    1530    if (sqrlo_basecase_threshold >= sqrlo_dc_threshold)
    1531      {
    1532        print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_dc_threshold);
    1533        print_define_remark ("SQRLO_DC_THRESHOLD", 0, "never mpn_sqrlo_basecase");
    1534      }
    1535    else
    1536      {
    1537        print_define ("SQRLO_BASECASE_THRESHOLD", sqrlo_basecase_threshold);
    1538        print_define ("SQRLO_DC_THRESHOLD", sqrlo_dc_threshold);
    1539      }
    1540  
    1541    if (WANT_FFT && sqr_fft_threshold < MP_SIZE_T_MAX / 2)
    1542      {
    1543        param.name = "SQRLO_SQR_THRESHOLD";
    1544        param.min_size = sqrlo_dc_threshold;
    1545        param.max_size = 2 * sqr_fft_threshold;
    1546        param.noprint = 0;
    1547        param.step_factor = 0.03;
    1548        one (&sqrlo_sqr_threshold, &param);
    1549      }
    1550    else
    1551      print_define_remark ("SQRLO_SQR_THRESHOLD", MP_SIZE_T_MAX,
    1552  			 "without FFT use sqrlo forever");
    1553  }
    1554  
    1555  void
    1556  tune_mulmid (void)
    1557  {
    1558    static struct param_t  param;
    1559  
    1560    param.name = "MULMID_TOOM42_THRESHOLD";
    1561    param.function = speed_mpn_mulmid_n;
    1562    param.min_size = 4;
    1563    param.max_size = 100;
    1564    one (&mulmid_toom42_threshold, &param);
    1565  }
    1566  
    1567  void
    1568  tune_mulmod_bnm1 (void)
    1569  {
    1570    static struct param_t  param;
    1571  
    1572    param.name = "MULMOD_BNM1_THRESHOLD";
    1573    param.function = speed_mpn_mulmod_bnm1;
    1574    param.min_size = 4;
    1575    param.max_size = 100;
    1576    one (&mulmod_bnm1_threshold, &param);
    1577  }
    1578  
    1579  void
    1580  tune_sqrmod_bnm1 (void)
    1581  {
    1582    static struct param_t  param;
    1583  
    1584    param.name = "SQRMOD_BNM1_THRESHOLD";
    1585    param.function = speed_mpn_sqrmod_bnm1;
    1586    param.min_size = 4;
    1587    param.max_size = 100;
    1588    one (&sqrmod_bnm1_threshold, &param);
    1589  }
    1590  
    1591  
    1592  /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
    1593     is faster only at size==2 then we don't want to bother with extra code
    1594     just for that.  Start karatsuba from 4 same as MUL above.  */
    1595  
    1596  void
    1597  tune_sqr (void)
    1598  {
    1599    /* disabled until tuned */
    1600    SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
    1601  
    1602    if (HAVE_NATIVE_mpn_sqr_basecase)
    1603      {
    1604        print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
    1605        sqr_basecase_threshold = 0;
    1606      }
    1607    else
    1608      {
    1609        static struct param_t  param;
    1610        param.name = "SQR_BASECASE_THRESHOLD";
    1611        param.function = speed_mpn_sqr;
    1612        param.min_size = 3;
    1613        param.min_is_always = 1;
    1614        param.max_size = TUNE_SQR_TOOM2_MAX;
    1615        param.noprint = 1;
    1616        one (&sqr_basecase_threshold, &param);
    1617      }
    1618  
    1619    {
    1620      static struct param_t  param;
    1621      param.name = "SQR_TOOM2_THRESHOLD";
    1622      param.function = speed_mpn_sqr;
    1623      param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
    1624      param.max_size = TUNE_SQR_TOOM2_MAX;
    1625      param.noprint = 1;
    1626      one (&sqr_toom2_threshold, &param);
    1627  
    1628      if (! HAVE_NATIVE_mpn_sqr_basecase
    1629          && sqr_toom2_threshold < sqr_basecase_threshold)
    1630        {
    1631          /* Karatsuba becomes faster than mul_basecase before
    1632             sqr_basecase does.  Arrange for the expression
    1633             "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
    1634             selects mpn_sqr_basecase in mpn_sqr to be false, by setting
    1635             SQR_TOOM2_THRESHOLD to zero, making
    1636             SQR_BASECASE_THRESHOLD the toom2 threshold.  */
    1637  
    1638          sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
    1639          SQR_TOOM2_THRESHOLD = 0;
    1640  
    1641          print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
    1642                               "toom2");
    1643          print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
    1644                               "never sqr_basecase");
    1645        }
    1646      else
    1647        {
    1648          if (! HAVE_NATIVE_mpn_sqr_basecase)
    1649            print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
    1650          print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
    1651        }
    1652    }
    1653  
    1654    {
    1655      static struct param_t  param;
    1656      mp_size_t next_toom_start;
    1657      int something_changed;
    1658  
    1659      param.function = speed_mpn_sqr;
    1660      param.noprint = 1;
    1661  
    1662    /* Threshold sequence loop.  Disable functions that would be used in a very
    1663       narrow range, re-measuring things when that happens.  */
    1664      something_changed = 1;
    1665      while (something_changed)
    1666        {
    1667  	something_changed = 0;
    1668  
    1669  	next_toom_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
    1670  
    1671  	sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
    1672  	param.name = "SQR_TOOM3_THRESHOLD";
    1673  	param.min_size = MAX (next_toom_start, MPN_TOOM3_SQR_MINSIZE);
    1674  	param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
    1675  	one (&sqr_toom3_threshold, &param);
    1676  
    1677  	next_toom_start = MAX (next_toom_start, sqr_toom3_threshold);
    1678  
    1679  	if (sqr_toom4_threshold != 0)
    1680  	  {
    1681  	    param.name = "SQR_TOOM4_THRESHOLD";
    1682  	    sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
    1683  	    param.min_size = MAX (next_toom_start, MPN_TOOM4_SQR_MINSIZE);
    1684  	    param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
    1685  	    one (&sqr_toom4_threshold, &param);
    1686  
    1687  	    if (next_toom_start * 1.05 >= sqr_toom4_threshold)
    1688  	      {
    1689  		sqr_toom4_threshold = 0;
    1690  		something_changed = 1;
    1691  	      }
    1692  	  }
    1693  
    1694  	next_toom_start = MAX (next_toom_start, sqr_toom4_threshold);
    1695  
    1696  	if (sqr_toom6_threshold != 0)
    1697  	  {
    1698  	    param.name = "SQR_TOOM6_THRESHOLD";
    1699  	    sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
    1700  	    param.min_size = MAX (next_toom_start, MPN_TOOM6_SQR_MINSIZE);
    1701  	    param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
    1702  	    one (&sqr_toom6_threshold, &param);
    1703  
    1704  	    if (next_toom_start * 1.05 >= sqr_toom6_threshold)
    1705  	      {
    1706  		sqr_toom6_threshold = 0;
    1707  		something_changed = 1;
    1708  	      }
    1709  	  }
    1710  
    1711  	next_toom_start = MAX (next_toom_start, sqr_toom6_threshold);
    1712  
    1713  	if (sqr_toom8_threshold != 0)
    1714  	  {
    1715  	    param.name = "SQR_TOOM8_THRESHOLD";
    1716  	    sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
    1717  	    param.min_size = MAX (next_toom_start, MPN_TOOM8_SQR_MINSIZE);
    1718  	    param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
    1719  	    one (&sqr_toom8_threshold, &param);
    1720  
    1721  	    if (next_toom_start * 1.05 >= sqr_toom8_threshold)
    1722  	      {
    1723  		sqr_toom8_threshold = 0;
    1724  		something_changed = 1;
    1725  	      }
    1726  	  }
    1727        }
    1728  
    1729      print_define ("SQR_TOOM3_THRESHOLD", SQR_TOOM3_THRESHOLD);
    1730      print_define ("SQR_TOOM4_THRESHOLD", SQR_TOOM4_THRESHOLD);
    1731      print_define ("SQR_TOOM6_THRESHOLD", SQR_TOOM6_THRESHOLD);
    1732      print_define ("SQR_TOOM8_THRESHOLD", SQR_TOOM8_THRESHOLD);
    1733    }
    1734  }
    1735  
    1736  
    1737  void
    1738  tune_dc_div (void)
    1739  {
    1740    s.r = 0;		/* clear to make speed function do 2n/n */
    1741    {
    1742      static struct param_t  param;
    1743      param.name = "DC_DIV_QR_THRESHOLD";
    1744      param.function = speed_mpn_sbpi1_div_qr;
    1745      param.function2 = speed_mpn_dcpi1_div_qr;
    1746      param.min_size = 6;
    1747      one (&dc_div_qr_threshold, &param);
    1748    }
    1749    {
    1750      static struct param_t  param;
    1751      param.name = "DC_DIVAPPR_Q_THRESHOLD";
    1752      param.function = speed_mpn_sbpi1_divappr_q;
    1753      param.function2 = speed_mpn_dcpi1_divappr_q;
    1754      param.min_size = 6;
    1755      one (&dc_divappr_q_threshold, &param);
    1756    }
    1757  }
    1758  
    1759  static double
    1760  speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
    1761  {
    1762    if (s->size < DC_DIV_QR_THRESHOLD)
    1763      return speed_mpn_sbpi1_div_qr (s);
    1764    else
    1765      return speed_mpn_dcpi1_div_qr (s);
    1766  }
    1767  
    1768  void
    1769  tune_mu_div (void)
    1770  {
    1771    s.r = 0;		/* clear to make speed function do 2n/n */
    1772    {
    1773      static struct param_t  param;
    1774      param.name = "MU_DIV_QR_THRESHOLD";
    1775      param.function = speed_mpn_dcpi1_div_qr;
    1776      param.function2 = speed_mpn_mu_div_qr;
    1777      param.min_size = mul_toom22_threshold;
    1778      param.max_size = 5000;
    1779      param.step_factor = 0.02;
    1780      one (&mu_div_qr_threshold, &param);
    1781    }
    1782    {
    1783      static struct param_t  param;
    1784      param.name = "MU_DIVAPPR_Q_THRESHOLD";
    1785      param.function = speed_mpn_dcpi1_divappr_q;
    1786      param.function2 = speed_mpn_mu_divappr_q;
    1787      param.min_size = mul_toom22_threshold;
    1788      param.max_size = 5000;
    1789      param.step_factor = 0.02;
    1790      one (&mu_divappr_q_threshold, &param);
    1791    }
    1792    {
    1793      static struct param_t  param;
    1794      param.name = "MUPI_DIV_QR_THRESHOLD";
    1795      param.function = speed_mpn_sbordcpi1_div_qr;
    1796      param.function2 = speed_mpn_mupi_div_qr;
    1797      param.min_size = 6;
    1798      param.min_is_always = 1;
    1799      param.max_size = 1000;
    1800      param.step_factor = 0.02;
    1801      one (&mupi_div_qr_threshold, &param);
    1802    }
    1803  }
    1804  
    1805  void
    1806  tune_dc_bdiv (void)
    1807  {
    1808    s.r = 0;		/* clear to make speed function do 2n/n*/
    1809    {
    1810      static struct param_t  param;
    1811      param.name = "DC_BDIV_QR_THRESHOLD";
    1812      param.function = speed_mpn_sbpi1_bdiv_qr;
    1813      param.function2 = speed_mpn_dcpi1_bdiv_qr;
    1814      param.min_size = 4;
    1815      one (&dc_bdiv_qr_threshold, &param);
    1816    }
    1817    {
    1818      static struct param_t  param;
    1819      param.name = "DC_BDIV_Q_THRESHOLD";
    1820      param.function = speed_mpn_sbpi1_bdiv_q;
    1821      param.function2 = speed_mpn_dcpi1_bdiv_q;
    1822      param.min_size = 4;
    1823      one (&dc_bdiv_q_threshold, &param);
    1824    }
    1825  }
    1826  
    1827  void
    1828  tune_mu_bdiv (void)
    1829  {
    1830    s.r = 0;		/* clear to make speed function do 2n/n*/
    1831    {
    1832      static struct param_t  param;
    1833      param.name = "MU_BDIV_QR_THRESHOLD";
    1834      param.function = speed_mpn_dcpi1_bdiv_qr;
    1835      param.function2 = speed_mpn_mu_bdiv_qr;
    1836      param.min_size = dc_bdiv_qr_threshold;
    1837      param.max_size = 5000;
    1838      param.step_factor = 0.02;
    1839      one (&mu_bdiv_qr_threshold, &param);
    1840    }
    1841    {
    1842      static struct param_t  param;
    1843      param.name = "MU_BDIV_Q_THRESHOLD";
    1844      param.function = speed_mpn_dcpi1_bdiv_q;
    1845      param.function2 = speed_mpn_mu_bdiv_q;
    1846      param.min_size = dc_bdiv_q_threshold;
    1847      param.max_size = 5000;
    1848      param.step_factor = 0.02;
    1849      one (&mu_bdiv_q_threshold, &param);
    1850    }
    1851  }
    1852  
    1853  void
    1854  tune_invertappr (void)
    1855  {
    1856    static struct param_t  param;
    1857  
    1858    param.function = speed_mpn_ni_invertappr;
    1859    param.name = "INV_MULMOD_BNM1_THRESHOLD";
    1860    param.min_size = 5;
    1861    one (&inv_mulmod_bnm1_threshold, &param);
    1862  
    1863    param.function = speed_mpn_invertappr;
    1864    param.name = "INV_NEWTON_THRESHOLD";
    1865    param.min_size = 5;
    1866    one (&inv_newton_threshold, &param);
    1867  }
    1868  
    1869  void
    1870  tune_invert (void)
    1871  {
    1872    static struct param_t  param;
    1873  
    1874    param.function = speed_mpn_invert;
    1875    param.name = "INV_APPR_THRESHOLD";
    1876    param.min_size = 5;
    1877    one (&inv_appr_threshold, &param);
    1878  }
    1879  
    1880  void
    1881  tune_binvert (void)
    1882  {
    1883    static struct param_t  param;
    1884  
    1885    param.function = speed_mpn_binvert;
    1886    param.name = "BINV_NEWTON_THRESHOLD";
    1887    param.min_size = 8;		/* pointless with smaller operands */
    1888    one (&binv_newton_threshold, &param);
    1889  }
    1890  
    1891  void
    1892  tune_redc (void)
    1893  {
    1894  #define TUNE_REDC_2_MAX 100
    1895  #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
    1896  #define WANT_REDC_2 1
    1897  #endif
    1898  
    1899  #if WANT_REDC_2
    1900    {
    1901      static struct param_t  param;
    1902      param.name = "REDC_1_TO_REDC_2_THRESHOLD";
    1903      param.function = speed_mpn_redc_1;
    1904      param.function2 = speed_mpn_redc_2;
    1905      param.min_size = 1;
    1906      param.min_is_always = 1;
    1907      param.max_size = TUNE_REDC_2_MAX;
    1908      param.noprint = 1;
    1909      param.stop_factor = 1.5;
    1910      one (&redc_1_to_redc_2_threshold, &param);
    1911    }
    1912    {
    1913      static struct param_t  param;
    1914      param.name = "REDC_2_TO_REDC_N_THRESHOLD";
    1915      param.function = speed_mpn_redc_2;
    1916      param.function2 = speed_mpn_redc_n;
    1917      param.min_size = 16;
    1918      param.noprint = 1;
    1919      one (&redc_2_to_redc_n_threshold, &param);
    1920    }
    1921    if (redc_1_to_redc_2_threshold >= redc_2_to_redc_n_threshold)
    1922      {
    1923        redc_2_to_redc_n_threshold = 0;	/* disable redc_2 */
    1924  
    1925        /* Never use redc2, measure redc_1 -> redc_n cutoff, store result as
    1926  	 REDC_1_TO_REDC_2_THRESHOLD.  */
    1927        {
    1928  	static struct param_t  param;
    1929  	param.name = "REDC_1_TO_REDC_2_THRESHOLD";
    1930  	param.function = speed_mpn_redc_1;
    1931  	param.function2 = speed_mpn_redc_n;
    1932  	param.min_size = 16;
    1933  	param.noprint = 1;
    1934  	one (&redc_1_to_redc_2_threshold, &param);
    1935        }
    1936      }
    1937    print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
    1938    print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
    1939  #else
    1940    {
    1941      static struct param_t  param;
    1942      param.name = "REDC_1_TO_REDC_N_THRESHOLD";
    1943      param.function = speed_mpn_redc_1;
    1944      param.function2 = speed_mpn_redc_n;
    1945      param.min_size = 16;
    1946      one (&redc_1_to_redc_n_threshold, &param);
    1947    }
    1948  #endif
    1949  }
    1950  
    1951  void
    1952  tune_matrix22_mul (void)
    1953  {
    1954    static struct param_t  param;
    1955    param.name = "MATRIX22_STRASSEN_THRESHOLD";
    1956    param.function = speed_mpn_matrix22_mul;
    1957    param.min_size = 2;
    1958    one (&matrix22_strassen_threshold, &param);
    1959  }
    1960  
    1961  void
    1962  tune_hgcd2 (void)
    1963  {
    1964    static struct param_t  param;
    1965    hgcd2_func_t *f[5] =
    1966      { mpn_hgcd2_1,
    1967        mpn_hgcd2_2,
    1968        mpn_hgcd2_3,
    1969        mpn_hgcd2_4,
    1970        mpn_hgcd2_5 };
    1971    speed_function_t speed_f[5] =
    1972      { speed_mpn_hgcd2_1,
    1973        speed_mpn_hgcd2_2,
    1974        speed_mpn_hgcd2_3,
    1975        speed_mpn_hgcd2_4,
    1976        speed_mpn_hgcd2_5 };
    1977    int best;
    1978  
    1979    s.size = 1;
    1980    best = one_method (5, speed_f, "mpn_hgcd2", "HGCD2_DIV1_METHOD", &param);
    1981  
    1982    /* Use selected function when tuning hgcd and gcd */
    1983    hgcd2_func = f[best];
    1984  }
    1985  
    1986  void
    1987  tune_hgcd (void)
    1988  {
    1989    static struct param_t  param;
    1990    param.name = "HGCD_THRESHOLD";
    1991    param.function = speed_mpn_hgcd;
    1992    /* We seem to get strange results for small sizes */
    1993    param.min_size = 30;
    1994    one (&hgcd_threshold, &param);
    1995  }
    1996  
    1997  void
    1998  tune_hgcd_appr (void)
    1999  {
    2000    static struct param_t  param;
    2001    param.name = "HGCD_APPR_THRESHOLD";
    2002    param.function = speed_mpn_hgcd_appr;
    2003    /* We seem to get strange results for small sizes */
    2004    param.min_size = 50;
    2005    param.stop_since_change = 150;
    2006    one (&hgcd_appr_threshold, &param);
    2007  }
    2008  
    2009  void
    2010  tune_hgcd_reduce (void)
    2011  {
    2012    static struct param_t  param;
    2013    param.name = "HGCD_REDUCE_THRESHOLD";
    2014    param.function = speed_mpn_hgcd_reduce;
    2015    param.min_size = 30;
    2016    param.max_size = 7000;
    2017    param.step_factor = 0.04;
    2018    one (&hgcd_reduce_threshold, &param);
    2019  }
    2020  
    2021  void
    2022  tune_gcd_dc (void)
    2023  {
    2024    static struct param_t  param;
    2025    param.name = "GCD_DC_THRESHOLD";
    2026    param.function = speed_mpn_gcd;
    2027    param.min_size = hgcd_threshold;
    2028    param.max_size = 3000;
    2029    param.step_factor = 0.02;
    2030    one (&gcd_dc_threshold, &param);
    2031  }
    2032  
    2033  void
    2034  tune_gcdext_dc (void)
    2035  {
    2036    static struct param_t  param;
    2037    param.name = "GCDEXT_DC_THRESHOLD";
    2038    param.function = speed_mpn_gcdext;
    2039    param.min_size = hgcd_threshold;
    2040    param.max_size = 3000;
    2041    param.step_factor = 0.02;
    2042    one (&gcdext_dc_threshold, &param);
    2043  }
    2044  
    2045  /* In tune_powm_sec we compute the table used by the win_size function.  The
    2046     cutoff points are in exponent bits, disregarding other operand sizes.  It is
    2047     not possible to use the one framework since it currently uses a granularity
    2048     of full limbs.
    2049  */
    2050  
    2051  /* This win_size replaces the variant in the powm code, allowing us to
    2052     control k in the k-ary algorithms.  */
    2053  int winsize;
    2054  int
    2055  win_size (mp_bitcnt_t eb)
    2056  {
    2057    return winsize;
    2058  }
    2059  
    2060  void
    2061  tune_powm_sec (void)
    2062  {
    2063    mp_size_t n;
    2064    int k, i;
    2065    mp_size_t itch;
    2066    mp_bitcnt_t nbits, nbits_next, possible_nbits_cutoff;
    2067    const int n_max = 3000 / GMP_NUMB_BITS;
    2068    const int n_measurements = 5;
    2069    mp_ptr rp, bp, ep, mp, tp;
    2070    double ttab[n_measurements], tk, tkp1;
    2071    TMP_DECL;
    2072    TMP_MARK;
    2073  
    2074    possible_nbits_cutoff = 0;
    2075  
    2076    k = 1;
    2077  
    2078    winsize = 10;			/* the itch function needs this */
    2079    itch = mpn_sec_powm_itch (n_max, n_max * GMP_NUMB_BITS, n_max);
    2080  
    2081    rp = TMP_ALLOC_LIMBS (n_max);
    2082    bp = TMP_ALLOC_LIMBS (n_max);
    2083    ep = TMP_ALLOC_LIMBS (n_max);
    2084    mp = TMP_ALLOC_LIMBS (n_max);
    2085    tp = TMP_ALLOC_LIMBS (itch);
    2086  
    2087    mpn_random (bp, n_max);
    2088    mpn_random (mp, n_max);
    2089    mp[0] |= 1;
    2090  
    2091  /* How about taking the M operand size into account?
    2092  
    2093     An operation R=powm(B,E,N) will take time O(log(E)*M(log(N))) (assuming
    2094     B = O(M)).
    2095  
    2096     Using k-ary and no sliding window, the precomputation will need time
    2097     O(2^(k-1)*M(log(N))) and the main computation will need O(log(E)*S(N)) +
    2098     O(log(E)/k*M(N)), for the squarings, multiplications, respectively.
    2099  
    2100     An operation R=powm_sec(B,E,N) will take time like powm.
    2101  
    2102     Using k-ary, the precomputation will need time O(2^k*M(log(N))) and the
    2103     main computation will need O(log(E)*S(N)) + O(log(E)/k*M(N)) +
    2104     O(log(E)/k*2^k*log(N)), for the squarings, multiplications, and full
    2105     table reads, respectively.  */
    2106  
    2107    printf ("#define POWM_SEC_TABLE  ");
    2108  
    2109    /* For nbits == 1, we should always use k == 1, so no need to tune
    2110       that. Starting with nbits == 2 also ensure that nbits always is
    2111       larger than the windowsize k+1. */
    2112    for (nbits = 2; nbits <= n_max * GMP_NUMB_BITS; )
    2113      {
    2114        n = (nbits - 1) / GMP_NUMB_BITS + 1;
    2115  
    2116        /* Generate E such that sliding-window for k and k+1 works equally
    2117  	 well/poorly (but sliding is not used in powm_sec, of course). */
    2118        for (i = 0; i < n; i++)
    2119  	ep[i] = ~CNST_LIMB(0);
    2120  
    2121        winsize = k;
    2122        for (i = 0; i < n_measurements; i++)
    2123  	{
    2124  	  speed_starttime ();
    2125  	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
    2126  	  ttab[i] = speed_endtime ();
    2127  	}
    2128        tk = median (ttab, n_measurements);
    2129  
    2130        winsize = k + 1;
    2131        speed_starttime ();
    2132        for (i = 0; i < n_measurements; i++)
    2133  	{
    2134  	  speed_starttime ();
    2135  	  mpn_sec_powm (rp, bp, n, ep, nbits, mp, n, tp);
    2136  	  ttab[i] = speed_endtime ();
    2137  	}
    2138        tkp1 = median (ttab, n_measurements);
    2139  /*
    2140        printf ("testing: %ld, %d", nbits, k, ep[n-1]);
    2141        printf ("   %10.5f  %10.5f\n", tk, tkp1);
    2142  */
    2143        if (tkp1 < tk)
    2144  	{
    2145  	  if (possible_nbits_cutoff)
    2146  	    {
    2147  	      /* Two consecutive sizes indicate k increase, obey.  */
    2148  
    2149  	      /* Must always have x[k] >= k */
    2150  	      ASSERT_ALWAYS (possible_nbits_cutoff >= k);
    2151  
    2152  	      if (k > 1)
    2153  		printf (",");
    2154  	      printf ("%ld", (long) possible_nbits_cutoff);
    2155  	      k++;
    2156  	      possible_nbits_cutoff = 0;
    2157  	    }
    2158  	  else
    2159  	    {
    2160  	      /* One measurement indicate k increase, save nbits for further
    2161  		 consideration.  */
    2162  	      /* The new larger k gets used for sizes > the cutoff
    2163  		 value, hence the cutoff should be one less than the
    2164  		 smallest size where it gives a speedup. */
    2165  	      possible_nbits_cutoff = nbits - 1;
    2166  	    }
    2167  	}
    2168        else
    2169  	possible_nbits_cutoff = 0;
    2170  
    2171        nbits_next = nbits * 65 / 64;
    2172        nbits = nbits_next + (nbits_next == nbits);
    2173      }
    2174    printf ("\n");
    2175    TMP_FREE;
    2176  }
    2177  
    2178  
    2179  /* size_extra==1 reflects the fact that with high<divisor one division is
    2180     always skipped.  Forcing high<divisor while testing ensures consistency
    2181     while stepping through sizes, ie. that size-1 divides will be done each
    2182     time.
    2183  
    2184     min_size==2 and min_is_always are used so that if plain division is only
    2185     better at size==1 then don't bother including that code just for that
    2186     case, instead go with preinv always and get a size saving.  */
    2187  
    2188  #define DIV_1_PARAMS                    \
    2189    param.check_size = 256;               \
    2190    param.min_size = 2;                   \
    2191    param.min_is_always = 1;              \
    2192    param.data_high = DATA_HIGH_LT_R;     \
    2193    param.size_extra = 1;                 \
    2194    param.stop_factor = 2.0;
    2195  
    2196  
    2197  double (*tuned_speed_mpn_divrem_1) (struct speed_params *);
    2198  
    2199  void
    2200  tune_divrem_1 (void)
    2201  {
    2202    /* plain version by default */
    2203    tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
    2204  
    2205    /* No support for tuning native assembler code, do that by hand and put
    2206       the results in the .asm file, there's no need for such thresholds to
    2207       appear in gmp-mparam.h.  */
    2208    if (HAVE_NATIVE_mpn_divrem_1)
    2209      return;
    2210  
    2211    if (GMP_NAIL_BITS != 0)
    2212      {
    2213        print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
    2214                             "no preinv with nails");
    2215        print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
    2216                             "no preinv with nails");
    2217        return;
    2218      }
    2219  
    2220    if (UDIV_PREINV_ALWAYS)
    2221      {
    2222        print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
    2223        print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
    2224        return;
    2225      }
    2226  
    2227    tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
    2228  
    2229    /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
    2230       a bit out for the fractional part, but that's too bad, the integer part
    2231       is more important. */
    2232    {
    2233      static struct param_t  param;
    2234      param.name = "DIVREM_1_NORM_THRESHOLD";
    2235      DIV_1_PARAMS;
    2236      s.r = randlimb_norm ();
    2237      param.function = speed_mpn_divrem_1_tune;
    2238      one (&divrem_1_norm_threshold, &param);
    2239    }
    2240    {
    2241      static struct param_t  param;
    2242      param.name = "DIVREM_1_UNNORM_THRESHOLD";
    2243      DIV_1_PARAMS;
    2244      s.r = randlimb_half ();
    2245      param.function = speed_mpn_divrem_1_tune;
    2246      one (&divrem_1_unnorm_threshold, &param);
    2247    }
    2248  }
    2249  
    2250  void
    2251  tune_div_qr_1 (void)
    2252  {
    2253    if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
    2254      {
    2255        static struct param_t  param;
    2256        speed_function_t f[] =
    2257  	{
    2258  	 speed_mpn_div_qr_1n_pi1_1,
    2259  	 speed_mpn_div_qr_1n_pi1_2,
    2260  	 speed_mpn_div_qr_1n_pi1_3,
    2261  	 speed_mpn_div_qr_1n_pi1_4,
    2262  	};
    2263  
    2264        s.size = 10;
    2265        s.r = randlimb_norm ();
    2266  
    2267        one_method (numberof(f), f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
    2268      }
    2269  
    2270    {
    2271      static struct param_t  param;
    2272      param.name = "DIV_QR_1_NORM_THRESHOLD";
    2273      DIV_1_PARAMS;
    2274      param.min_size = 1;
    2275      param.min_is_always = 0;
    2276      s.r = randlimb_norm ();
    2277      param.function = speed_mpn_div_qr_1_tune;
    2278      one (&div_qr_1_norm_threshold, &param);
    2279    }
    2280    {
    2281      static struct param_t  param;
    2282      param.name = "DIV_QR_1_UNNORM_THRESHOLD";
    2283      DIV_1_PARAMS;
    2284      param.min_size = 1;
    2285      param.min_is_always = 0;
    2286      s.r = randlimb_half();
    2287      param.function = speed_mpn_div_qr_1_tune;
    2288      one (&div_qr_1_unnorm_threshold, &param);
    2289    }
    2290  }
    2291  
    2292  
    2293  void
    2294  tune_mod_1 (void)
    2295  {
    2296    /* No support for tuning native assembler code, do that by hand and put
    2297       the results in the .asm file, there's no need for such thresholds to
    2298       appear in gmp-mparam.h.  */
    2299    if (HAVE_NATIVE_mpn_mod_1)
    2300      return;
    2301  
    2302    if (GMP_NAIL_BITS != 0)
    2303      {
    2304        print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
    2305                             "no preinv with nails");
    2306        print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
    2307                             "no preinv with nails");
    2308        return;
    2309      }
    2310  
    2311    if (!HAVE_NATIVE_mpn_mod_1_1p)
    2312      {
    2313        static struct param_t  param;
    2314        speed_function_t f[2] =
    2315  	{
    2316  	 speed_mpn_mod_1_1_1,
    2317  	 speed_mpn_mod_1_1_2,
    2318  	};
    2319  
    2320        s.size = 10;
    2321        s.r = randlimb_half ();
    2322        one_method (2, f, "mpn_mod_1_1", "MOD_1_1P_METHOD", &param);
    2323      }
    2324  
    2325    if (UDIV_PREINV_ALWAYS)
    2326      {
    2327        print_define ("MOD_1_NORM_THRESHOLD", 0L);
    2328        print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
    2329      }
    2330    else
    2331      {
    2332        {
    2333  	static struct param_t  param;
    2334  	param.name = "MOD_1_NORM_THRESHOLD";
    2335  	DIV_1_PARAMS;
    2336  	s.r = randlimb_norm ();
    2337  	param.function = speed_mpn_mod_1_tune;
    2338  	one (&mod_1_norm_threshold, &param);
    2339        }
    2340        {
    2341  	static struct param_t  param;
    2342  	param.name = "MOD_1_UNNORM_THRESHOLD";
    2343  	DIV_1_PARAMS;
    2344  	s.r = randlimb_half ();
    2345  	param.function = speed_mpn_mod_1_tune;
    2346  	one (&mod_1_unnorm_threshold, &param);
    2347        }
    2348      }
    2349    {
    2350      static struct param_t  param;
    2351  
    2352      param.check_size = 256;
    2353  
    2354      s.r = randlimb_norm ();
    2355      param.function = speed_mpn_mod_1_tune;
    2356  
    2357      param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
    2358      param.min_size = 2;
    2359      one (&mod_1n_to_mod_1_1_threshold, &param);
    2360    }
    2361  
    2362    {
    2363      static struct param_t  param;
    2364  
    2365      param.check_size = 256;
    2366      s.r = randlimb_half ();
    2367      param.noprint = 1;
    2368  
    2369      param.function = speed_mpn_mod_1_1;
    2370      param.function2 = speed_mpn_mod_1_2;
    2371      param.min_is_always = 1;
    2372      param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
    2373      param.min_size = 2;
    2374      one (&mod_1_1_to_mod_1_2_threshold, &param);
    2375  
    2376      param.function = speed_mpn_mod_1_2;
    2377      param.function2 = speed_mpn_mod_1_4;
    2378      param.min_is_always = 1;
    2379      param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
    2380      param.min_size = 1;
    2381      one (&mod_1_2_to_mod_1_4_threshold, &param);
    2382  
    2383      if (mod_1_1_to_mod_1_2_threshold >= mod_1_2_to_mod_1_4_threshold)
    2384        {
    2385  	/* Never use mod_1_2, measure mod_1_1 -> mod_1_4 */
    2386  	mod_1_2_to_mod_1_4_threshold = 0;
    2387  
    2388  	param.function = speed_mpn_mod_1_1;
    2389  	param.function2 = speed_mpn_mod_1_4;
    2390  	param.min_is_always = 1;
    2391  	param.name = "MOD_1_1_TO_MOD_1_4_THRESHOLD fake";
    2392  	param.min_size = 2;
    2393  	one (&mod_1_1_to_mod_1_2_threshold, &param);
    2394        }
    2395  
    2396      param.function = speed_mpn_mod_1_tune;
    2397      param.function2 = NULL;
    2398      param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
    2399      param.min_size = 2;
    2400      param.min_is_always = 0;
    2401      one (&mod_1u_to_mod_1_1_threshold, &param);
    2402  
    2403      if (mod_1u_to_mod_1_1_threshold >= mod_1_1_to_mod_1_2_threshold)
    2404        mod_1_1_to_mod_1_2_threshold = 0;
    2405      if (mod_1u_to_mod_1_1_threshold >= mod_1_2_to_mod_1_4_threshold)
    2406        mod_1_2_to_mod_1_4_threshold = 0;
    2407  
    2408      print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
    2409      print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold,
    2410  			 mod_1_1_to_mod_1_2_threshold == 0 ? "never mpn_mod_1_1p" : NULL);
    2411      print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold,
    2412  			 mod_1_2_to_mod_1_4_threshold == 0 ? "never mpn_mod_1s_2p" : NULL);
    2413    }
    2414  
    2415    {
    2416      static struct param_t  param;
    2417  
    2418      param.check_size = 256;
    2419  
    2420      param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
    2421      s.r = randlimb_norm ();
    2422      param.function = speed_mpn_preinv_mod_1;
    2423      param.function2 = speed_mpn_mod_1_tune;
    2424      param.min_size = 1;
    2425      one (&preinv_mod_1_to_mod_1_threshold, &param);
    2426    }
    2427  }
    2428  
    2429  
    2430  /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
    2431     imply that udiv_qrnnd_preinv is worth using, but it seems most
    2432     straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
    2433     directly.  */
    2434  
    2435  void
    2436  tune_preinv_divrem_1 (void)
    2437  {
    2438    static struct param_t  param;
    2439    speed_function_t  divrem_1;
    2440    const char        *divrem_1_name;
    2441    double            t1, t2;
    2442  
    2443    if (GMP_NAIL_BITS != 0)
    2444      {
    2445        print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
    2446        return;
    2447      }
    2448  
    2449    /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
    2450       it's faster than mpn_divrem_1.  */
    2451    if (HAVE_NATIVE_mpn_preinv_divrem_1)
    2452      {
    2453        print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
    2454        return;
    2455      }
    2456  
    2457    /* If udiv_qrnnd_preinv is the only division method then of course
    2458       mpn_preinv_divrem_1 should be used.  */
    2459    if (UDIV_PREINV_ALWAYS)
    2460      {
    2461        print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
    2462        return;
    2463      }
    2464  
    2465    /* If we've got an assembler version of mpn_divrem_1, then compare against
    2466       that, not the mpn_divrem_1_div generic C.  */
    2467    if (HAVE_NATIVE_mpn_divrem_1)
    2468      {
    2469        divrem_1 = speed_mpn_divrem_1;
    2470        divrem_1_name = "mpn_divrem_1";
    2471      }
    2472    else
    2473      {
    2474        divrem_1 = speed_mpn_divrem_1_div;
    2475        divrem_1_name = "mpn_divrem_1_div";
    2476      }
    2477  
    2478    param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
    2479    s.size = 200;                     /* generous but not too big */
    2480    /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
    2481       since in general that's probably most common, though in fact for a
    2482       64-bit limb mp_bases[10].big_base is normalized.  */
    2483    s.r = urandom() & (GMP_NUMB_MASK >> 4);
    2484    if (s.r == 0) s.r = 123;
    2485  
    2486    t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
    2487    t2 = tuneup_measure (divrem_1, &param, &s);
    2488    if (t1 == -1.0 || t2 == -1.0)
    2489      {
    2490        printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
    2491                divrem_1_name, (long) s.size);
    2492        abort ();
    2493      }
    2494    if (option_trace >= 1)
    2495      printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
    2496              (long) s.size, t1, divrem_1_name, t2);
    2497  
    2498    print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
    2499  }
    2500  
    2501  
    2502  
    2503  void
    2504  tune_divrem_2 (void)
    2505  {
    2506    static struct param_t  param;
    2507  
    2508    /* No support for tuning native assembler code, do that by hand and put
    2509       the results in the .asm file, and there's no need for such thresholds
    2510       to appear in gmp-mparam.h.  */
    2511    if (HAVE_NATIVE_mpn_divrem_2)
    2512      return;
    2513  
    2514    if (GMP_NAIL_BITS != 0)
    2515      {
    2516        print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
    2517                             "no preinv with nails");
    2518        return;
    2519      }
    2520  
    2521    if (UDIV_PREINV_ALWAYS)
    2522      {
    2523        print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
    2524        return;
    2525      }
    2526  
    2527    /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
    2528       a bit out for the fractional part, but that's too bad, the integer part
    2529       is more important.
    2530  
    2531       min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
    2532       code space if plain division is better only at size==2 or size==3. */
    2533    param.name = "DIVREM_2_THRESHOLD";
    2534    param.check_size = 256;
    2535    param.min_size = 4;
    2536    param.min_is_always = 1;
    2537    param.size_extra = 2;      /* does qsize==nsize-2 divisions */
    2538    param.stop_factor = 2.0;
    2539  
    2540    s.r = randlimb_norm ();
    2541    param.function = speed_mpn_divrem_2;
    2542    one (&divrem_2_threshold, &param);
    2543  }
    2544  
    2545  void
    2546  tune_div_qr_2 (void)
    2547  {
    2548    static struct param_t  param;
    2549    param.name = "DIV_QR_2_PI2_THRESHOLD";
    2550    param.function = speed_mpn_div_qr_2n;
    2551    param.check_size = 500;
    2552    param.min_size = 4;
    2553    one (&div_qr_2_pi2_threshold, &param);
    2554  }
    2555  
    2556  /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
    2557     tune for that.  Its speed can differ on odd or even divisor, so take an
    2558     average threshold for the two.
    2559  
    2560     mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
    2561     might not vary that way, but don't test this since high<divisor isn't
    2562     expected to occur often with small divisors.  */
    2563  
    2564  void
    2565  tune_divexact_1 (void)
    2566  {
    2567    static struct param_t  param;
    2568    mp_size_t  thresh[2], average;
    2569    int        low, i;
    2570  
    2571    /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
    2572       full mpn_divrem_1.  */
    2573    if (HAVE_NATIVE_mpn_divexact_1)
    2574      {
    2575        print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
    2576        return;
    2577      }
    2578  
    2579    ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
    2580  
    2581    param.name = "DIVEXACT_1_THRESHOLD";
    2582    param.data_high = DATA_HIGH_GE_R;
    2583    param.check_size = 256;
    2584    param.min_size = 2;
    2585    param.stop_factor = 1.5;
    2586    param.function  = tuned_speed_mpn_divrem_1;
    2587    param.function2 = speed_mpn_divexact_1;
    2588    param.noprint = 1;
    2589  
    2590    print_define_start (param.name);
    2591  
    2592    for (low = 0; low <= 1; low++)
    2593      {
    2594        s.r = randlimb_half();
    2595        if (low == 0)
    2596          s.r |= 1;
    2597        else
    2598          s.r &= ~CNST_LIMB(7);
    2599  
    2600        one (&thresh[low], &param);
    2601        if (option_trace)
    2602          printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
    2603  
    2604        if (thresh[low] == MP_SIZE_T_MAX)
    2605          {
    2606            average = MP_SIZE_T_MAX;
    2607            goto divexact_1_done;
    2608          }
    2609      }
    2610  
    2611    if (option_trace)
    2612      {
    2613        printf ("average of:");
    2614        for (i = 0; i < numberof(thresh); i++)
    2615          printf (" %ld", (long) thresh[i]);
    2616        printf ("\n");
    2617      }
    2618  
    2619    average = 0;
    2620    for (i = 0; i < numberof(thresh); i++)
    2621      average += thresh[i];
    2622    average /= numberof(thresh);
    2623  
    2624    /* If divexact turns out to be better as early as 3 limbs, then use it
    2625       always, so as to reduce code size and conditional jumps.  */
    2626    if (average <= 3)
    2627      average = 0;
    2628  
    2629   divexact_1_done:
    2630    print_define_end (param.name, average);
    2631  }
    2632  
    2633  
    2634  /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
    2635     same as mpn_mod_1, but this might not be true of an assembler
    2636     implementation.  The threshold used is an average based on data where a
    2637     divide can be skipped and where it can't.
    2638  
    2639     If modexact turns out to be better as early as 3 limbs, then use it
    2640     always, so as to reduce code size and conditional jumps.  */
    2641  
    2642  void
    2643  tune_modexact_1_odd (void)
    2644  {
    2645    static struct param_t  param;
    2646    mp_size_t  thresh_lt, thresh_ge, average;
    2647  
    2648  #if 0
    2649    /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
    2650       of a full mpn_mod_1.  */
    2651    if (HAVE_NATIVE_mpn_modexact_1_odd)
    2652      {
    2653        print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
    2654        return;
    2655      }
    2656  #endif
    2657  
    2658    param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
    2659    param.check_size = 256;
    2660    param.min_size = 2;
    2661    param.stop_factor = 1.5;
    2662    param.function  = speed_mpn_modexact_1c_odd;
    2663    param.function2 = speed_mpn_mod_1_tune;
    2664    param.noprint = 1;
    2665    s.r = randlimb_half () | 1;
    2666  
    2667    print_define_start (param.name);
    2668  
    2669    param.data_high = DATA_HIGH_LT_R;
    2670    one (&thresh_lt, &param);
    2671    if (option_trace)
    2672      printf ("lt thresh %ld\n", (long) thresh_lt);
    2673  
    2674    average = thresh_lt;
    2675    if (thresh_lt != MP_SIZE_T_MAX)
    2676      {
    2677        param.data_high = DATA_HIGH_GE_R;
    2678        one (&thresh_ge, &param);
    2679        if (option_trace)
    2680          printf ("ge thresh %ld\n", (long) thresh_ge);
    2681  
    2682        if (thresh_ge != MP_SIZE_T_MAX)
    2683          {
    2684            average = (thresh_ge + thresh_lt) / 2;
    2685            if (thresh_ge <= 3)
    2686              average = 0;
    2687          }
    2688      }
    2689  
    2690    print_define_end (param.name, average);
    2691  }
    2692  
    2693  
    2694  void
    2695  tune_jacobi_base (void)
    2696  {
    2697    static struct param_t  param;
    2698    speed_function_t f[4] =
    2699      {
    2700       speed_mpn_jacobi_base_1,
    2701       speed_mpn_jacobi_base_2,
    2702       speed_mpn_jacobi_base_3,
    2703       speed_mpn_jacobi_base_4,
    2704      };
    2705  
    2706    s.size = GMP_LIMB_BITS * 3 / 4;
    2707  
    2708    one_method (4, f, "mpn_jacobi_base", "JACOBI_BASE_METHOD", &param);
    2709  }
    2710  
    2711  
    2712  void
    2713  tune_get_str (void)
    2714  {
    2715    /* Tune for decimal, it being most common.  Some rough testing suggests
    2716       other bases are different, but not by very much.  */
    2717    s.r = 10;
    2718    {
    2719      static struct param_t  param;
    2720      GET_STR_PRECOMPUTE_THRESHOLD = 0;
    2721      param.name = "GET_STR_DC_THRESHOLD";
    2722      param.function = speed_mpn_get_str;
    2723      param.min_size = 4;
    2724      param.max_size = GET_STR_THRESHOLD_LIMIT;
    2725      one (&get_str_dc_threshold, &param);
    2726    }
    2727    {
    2728      static struct param_t  param;
    2729      param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
    2730      param.function = speed_mpn_get_str;
    2731      param.min_size = GET_STR_DC_THRESHOLD;
    2732      param.max_size = GET_STR_THRESHOLD_LIMIT;
    2733      one (&get_str_precompute_threshold, &param);
    2734    }
    2735  }
    2736  
    2737  
    2738  double
    2739  speed_mpn_pre_set_str (struct speed_params *s)
    2740  {
    2741    unsigned char *str;
    2742    mp_ptr     wp;
    2743    mp_size_t  wn;
    2744    unsigned   i;
    2745    int        base;
    2746    double     t;
    2747    mp_ptr powtab_mem, tp;
    2748    powers_t powtab[GMP_LIMB_BITS];
    2749    mp_size_t un;
    2750    int chars_per_limb;
    2751    TMP_DECL;
    2752  
    2753    SPEED_RESTRICT_COND (s->size >= 1);
    2754  
    2755    base = s->r == 0 ? 10 : s->r;
    2756    SPEED_RESTRICT_COND (base >= 2 && base <= 256);
    2757  
    2758    TMP_MARK;
    2759  
    2760    str = (unsigned char *) TMP_ALLOC (s->size);
    2761    for (i = 0; i < s->size; i++)
    2762      str[i] = s->xp[i] % base;
    2763  
    2764    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);
    2765    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
    2766  
    2767    /* use this during development to check wn is big enough */
    2768    /*
    2769    ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
    2770    */
    2771  
    2772    speed_operand_src (s, (mp_ptr) str, s->size/GMP_LIMB_BYTES);
    2773    speed_operand_dst (s, wp, wn);
    2774    speed_cache_fill (s);
    2775  
    2776    chars_per_limb = mp_bases[base].chars_per_limb;
    2777    un = s->size / chars_per_limb + 1;
    2778    powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
    2779    size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
    2780    powers_t *pt = powtab + n_pows;
    2781    tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
    2782  
    2783    speed_starttime ();
    2784    i = s->reps;
    2785    do
    2786      {
    2787        mpn_pre_set_str (wp, str, s->size, pt, tp);
    2788      }
    2789    while (--i != 0);
    2790    t = speed_endtime ();
    2791  
    2792    TMP_FREE;
    2793    return t;
    2794  }
    2795  
    2796  void
    2797  tune_set_str (void)
    2798  {
    2799    s.r = 10;  /* decimal */
    2800    {
    2801      static struct param_t  param;
    2802      SET_STR_PRECOMPUTE_THRESHOLD = 0;
    2803      param.step_factor = 0.01;
    2804      param.name = "SET_STR_DC_THRESHOLD";
    2805      param.function = speed_mpn_pre_set_str;
    2806      param.min_size = 100;
    2807      param.max_size = 50000;
    2808      one (&set_str_dc_threshold, &param);
    2809    }
    2810    {
    2811      static struct param_t  param;
    2812      param.step_factor = 0.02;
    2813      param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
    2814      param.function = speed_mpn_set_str;
    2815      param.min_size = SET_STR_DC_THRESHOLD;
    2816      param.max_size = 100000;
    2817      one (&set_str_precompute_threshold, &param);
    2818    }
    2819  }
    2820  
    2821  
    2822  void
    2823  tune_fft_mul (void)
    2824  {
    2825    static struct fft_param_t  param;
    2826  
    2827    if (option_fft_max_size == 0)
    2828      return;
    2829  
    2830    param.table_name          = "MUL_FFT_TABLE3";
    2831    param.threshold_name      = "MUL_FFT_THRESHOLD";
    2832    param.p_threshold         = &mul_fft_threshold;
    2833    param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
    2834    param.p_modf_threshold    = &mul_fft_modf_threshold;
    2835    param.first_size          = MUL_TOOM33_THRESHOLD / 2;
    2836    param.max_size            = option_fft_max_size;
    2837    param.function            = speed_mpn_fft_mul;
    2838    param.mul_modf_function   = speed_mpn_mul_fft;
    2839    param.mul_function        = speed_mpn_mul_n;
    2840    param.sqr = 0;
    2841    fft (&param);
    2842  }
    2843  
    2844  
    2845  void
    2846  tune_fft_sqr (void)
    2847  {
    2848    static struct fft_param_t  param;
    2849  
    2850    if (option_fft_max_size == 0)
    2851      return;
    2852  
    2853    param.table_name          = "SQR_FFT_TABLE3";
    2854    param.threshold_name      = "SQR_FFT_THRESHOLD";
    2855    param.p_threshold         = &sqr_fft_threshold;
    2856    param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
    2857    param.p_modf_threshold    = &sqr_fft_modf_threshold;
    2858    param.first_size          = SQR_TOOM3_THRESHOLD / 2;
    2859    param.max_size            = option_fft_max_size;
    2860    param.function            = speed_mpn_fft_sqr;
    2861    param.mul_modf_function   = speed_mpn_mul_fft_sqr;
    2862    param.mul_function        = speed_mpn_sqr;
    2863    param.sqr = 1;
    2864    fft (&param);
    2865  }
    2866  
    2867  void
    2868  tune_fac_ui (void)
    2869  {
    2870    static struct param_t  param;
    2871  
    2872    param.function = speed_mpz_fac_ui_tune;
    2873  
    2874    param.name = "FAC_DSC_THRESHOLD";
    2875    param.min_size = 70;
    2876    param.max_size = FAC_DSC_THRESHOLD_LIMIT;
    2877    one (&fac_dsc_threshold, &param);
    2878  
    2879    param.name = "FAC_ODD_THRESHOLD";
    2880    param.min_size = 22;
    2881    param.stop_factor = 1.7;
    2882    param.min_is_always = 1;
    2883    one (&fac_odd_threshold, &param);
    2884  }
    2885  
    2886  void
    2887  all (void)
    2888  {
    2889    time_t  start_time, end_time;
    2890    TMP_DECL;
    2891  
    2892    TMP_MARK;
    2893    SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
    2894    SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
    2895  
    2896    mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
    2897    mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
    2898  
    2899    fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
    2900  
    2901    speed_time_init ();
    2902    fprintf (stderr, "Using: %s\n", speed_time_string);
    2903  
    2904    fprintf (stderr, "speed_precision %d", speed_precision);
    2905    if (speed_unittime == 1.0)
    2906      fprintf (stderr, ", speed_unittime 1 cycle");
    2907    else
    2908      fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
    2909    if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
    2910      fprintf (stderr, ", CPU freq unknown\n");
    2911    else
    2912      fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
    2913  
    2914    fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
    2915             DEFAULT_MAX_SIZE, (long) option_fft_max_size);
    2916    fprintf (stderr, "\n");
    2917  
    2918    time (&start_time);
    2919    {
    2920      struct tm  *tp;
    2921      tp = localtime (&start_time);
    2922      printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
    2923              tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
    2924  
    2925  #ifdef __GNUC__
    2926      /* gcc sub-minor version doesn't seem to come through as a define */
    2927      printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
    2928  #define PRINTED_COMPILER
    2929  #endif
    2930  #if defined (__SUNPRO_C)
    2931      printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
    2932  #define PRINTED_COMPILER
    2933  #endif
    2934  #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
    2935      /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
    2936      printf ("MIPSpro C %d.%d.%d */\n",
    2937  	    _COMPILER_VERSION / 100,
    2938  	    _COMPILER_VERSION / 10 % 10,
    2939  	    _COMPILER_VERSION % 10);
    2940  #define PRINTED_COMPILER
    2941  #endif
    2942  #if defined (__DECC) && defined (__DECC_VER)
    2943      printf ("DEC C %d */\n", __DECC_VER);
    2944  #define PRINTED_COMPILER
    2945  #endif
    2946  #if ! defined (PRINTED_COMPILER)
    2947      printf ("system compiler */\n");
    2948  #endif
    2949    }
    2950    printf ("\n");
    2951  
    2952    tune_divrem_1 ();
    2953    tune_mod_1 ();
    2954    tune_preinv_divrem_1 ();
    2955    tune_div_qr_1 ();
    2956  #if 0
    2957    tune_divrem_2 ();
    2958  #endif
    2959    tune_div_qr_2 ();
    2960    tune_divexact_1 ();
    2961    tune_modexact_1_odd ();
    2962    printf("\n");
    2963  
    2964    relspeed_div_1_vs_mul_1 ();
    2965    printf("\n");
    2966  
    2967    tune_mul_n ();
    2968    printf("\n");
    2969  
    2970    tune_mul ();
    2971    printf("\n");
    2972  
    2973    tune_sqr ();
    2974    printf("\n");
    2975  
    2976    tune_mulmid ();
    2977    printf("\n");
    2978  
    2979    tune_mulmod_bnm1 ();
    2980    tune_sqrmod_bnm1 ();
    2981    printf("\n");
    2982  
    2983    tune_fft_mul ();
    2984    printf("\n");
    2985  
    2986    tune_fft_sqr ();
    2987    printf ("\n");
    2988  
    2989    tune_mullo ();
    2990    tune_sqrlo ();
    2991    printf("\n");
    2992  
    2993    tune_dc_div ();
    2994    tune_dc_bdiv ();
    2995  
    2996    printf("\n");
    2997    tune_invertappr ();
    2998    tune_invert ();
    2999    printf("\n");
    3000  
    3001    tune_binvert ();
    3002    tune_redc ();
    3003    printf("\n");
    3004  
    3005    tune_mu_div ();
    3006    tune_mu_bdiv ();
    3007    printf("\n");
    3008  
    3009    tune_powm_sec ();
    3010    printf("\n");
    3011  
    3012    tune_get_str ();
    3013    tune_set_str ();
    3014    printf("\n");
    3015  
    3016    tune_fac_ui ();
    3017    printf("\n");
    3018  
    3019    tune_matrix22_mul ();
    3020    tune_hgcd2 ();
    3021    tune_hgcd ();
    3022    tune_hgcd_appr ();
    3023    tune_hgcd_reduce();
    3024    tune_gcd_dc ();
    3025    tune_gcdext_dc ();
    3026    tune_jacobi_base ();
    3027    printf("\n");
    3028  
    3029    time (&end_time);
    3030    printf ("/* Tuneup completed successfully, took %ld seconds */\n",
    3031            (long) (end_time - start_time));
    3032  
    3033    TMP_FREE;
    3034  }
    3035  
    3036  
    3037  int
    3038  main (int argc, char *argv[])
    3039  {
    3040    int  opt;
    3041  
    3042    /* Unbuffered so if output is redirected to a file it isn't lost if the
    3043       program is killed part way through.  */
    3044    setbuf (stdout, NULL);
    3045    setbuf (stderr, NULL);
    3046  
    3047    while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
    3048      {
    3049        switch (opt) {
    3050        case 'f':
    3051          if (optarg[0] == 't')
    3052            option_fft_trace = 2;
    3053          else
    3054            option_fft_max_size = atol (optarg);
    3055          break;
    3056        case 'o':
    3057          speed_option_set (optarg);
    3058          break;
    3059        case 'p':
    3060          speed_precision = atoi (optarg);
    3061          break;
    3062        case 't':
    3063          option_trace++;
    3064          break;
    3065        case '?':
    3066          exit(1);
    3067        }
    3068      }
    3069  
    3070    all ();
    3071    exit (0);
    3072  }