(root)/
gmp-6.3.0/
tune/
speed.c
       1  /* Speed measuring program.
       2  
       3  Copyright 1999-2003, 2005, 2006, 2008-2022 Free Software Foundation, Inc.
       4  
       5  This file is part of the GNU MP Library.
       6  
       7  The GNU MP Library is free software; you can redistribute it and/or modify
       8  it under the terms of either:
       9  
      10    * the GNU Lesser General Public License as published by the Free
      11      Software Foundation; either version 3 of the License, or (at your
      12      option) any later version.
      13  
      14  or
      15  
      16    * the GNU General Public License as published by the Free Software
      17      Foundation; either version 2 of the License, or (at your option) any
      18      later version.
      19  
      20  or both in parallel, as here.
      21  
      22  The GNU MP Library is distributed in the hope that it will be useful, but
      23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      25  for more details.
      26  
      27  You should have received copies of the GNU General Public License and the
      28  GNU Lesser General Public License along with the GNU MP Library.  If not,
      29  see https://www.gnu.org/licenses/.  */
      30  
      31  /* The usage message is in the code below; run with no arguments to print it.
      32     See README for interesting applications.
      33  
      34     To add a new routine foo(), create a speed_foo() function in the style of
      35     the existing ones and add an entry in the routine[] array.  Put FLAG_R if
      36     speed_foo() wants an "r" parameter.
      37  
      38     The routines don't have help messages or descriptions, but most have
      39     suggestive names.  See the source code for full details.
      40  
      41  */
      42  
      43  #include "config.h"
      44  
      45  #include <limits.h>
      46  #include <stdio.h>
      47  #include <stdlib.h>
      48  #include <string.h>
      49  
      50  #if HAVE_UNISTD_H
      51  #include <unistd.h>  /* for getpid, R_OK */
      52  #endif
      53  
      54  #if TIME_WITH_SYS_TIME
      55  # include <sys/time.h>  /* for struct timeval */
      56  # include <time.h>
      57  #else
      58  # if HAVE_SYS_TIME_H
      59  #  include <sys/time.h>
      60  # else
      61  #  include <time.h>
      62  # endif
      63  #endif
      64  
      65  #if HAVE_SYS_RESOURCE_H
      66  #include <sys/resource.h>  /* for getrusage() */
      67  #endif
      68  
      69  
      70  #include "gmp-impl.h"
      71  #include "longlong.h"  /* for the benefit of speed-many.c */
      72  #include "tests.h"
      73  #include "speed.h"
      74  
      75  
      76  #if !HAVE_DECL_OPTARG
      77  extern char *optarg;
      78  extern int optind, opterr;
      79  #endif
      80  
      81  #if !HAVE_STRTOUL
      82  #define strtoul(p,e,b)  (unsigned long) strtol(p,e,b)
      83  #endif
      84  
      85  #ifdef SPEED_EXTRA_PROTOS
      86  SPEED_EXTRA_PROTOS
      87  #endif
      88  #ifdef SPEED_EXTRA_PROTOS2
      89  SPEED_EXTRA_PROTOS2
      90  #endif
      91  
      92  
          /* GMP_NUMB_0xAA is a limb holding the repeating bit pattern 0xAA..AA,
             masked with GMP_NUMB_MASK.  data_fill() uses it as the fill value for
             DATA_AAS.  A definition is provided only for 32-bit and 64-bit limbs.  */
      93  #if GMP_LIMB_BITS == 32
      94  #define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
      95  #endif
      96  #if GMP_LIMB_BITS == 64
      97  #define GMP_NUMB_0xAA  (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
      98  #endif
      99  
     100  
          /* How measured times are presented; option_cmp holds one of these.  With
             CMP_DIFFPREV, run_one() reports each time minus the time measured at the
             previous size.  */
     101  #define CMP_ABSOLUTE     1
     102  #define CMP_RATIO        2
     103  #define CMP_DIFFERENCE   3
     104  #define CMP_DIFFPREV     4
     105  int  option_cmp = CMP_ABSOLUTE;
     106  
          /* Unit for reported times; option_unit holds one of these.  run_one()
             divides by speed_cycletime for UNIT_CYCLES, and additionally by
             SIZE_TO_DIVISOR() of the size for UNIT_CYCLESPERLIMB.  */
     107  #define UNIT_SECONDS        1
     108  #define UNIT_CYCLES         2
     109  #define UNIT_CYCLESPERLIMB  3
     110  int  option_unit = UNIT_SECONDS;
     111  
          /* Operand fill pattern; option_data holds one of these and data_fill()
             switches on it.  */
     112  #define DATA_RANDOM   1
     113  #define DATA_RANDOM2  2
     114  #define DATA_ZEROS    3
     115  #define DATA_AAS      4
     116  #define DATA_FFS      5
     117  #define DATA_2FD      6
     118  int  option_data = DATA_RANDOM;
     119  
          /* option_square is read by SIZE_TO_DIVISOR(): 1 selects n*n, 2 selects
             n*(n+1)/2, anything else selects n.
             NOTE(review): the other options below are not used in the part of the
             file reviewed here; presumably they are set by the command line parsing
             further down -- confirm there before relying on their meaning.  */
     120  int        option_square = 0;
     121  double     option_factor = 0.0;
     122  mp_size_t  option_step = 1;
     123  int        option_gnuplot = 0;
     124  char      *option_gnuplot_basename;
          /* NOTE(review): presumably a growable list of size ranges to measure, with
             size_num entries in use out of size_allocnum allocated -- confirm.  */
     125  struct size_array_t {
     126    mp_size_t start, end;
     127  } *size_array = NULL;
     128  mp_size_t  size_num = 0;
     129  mp_size_t  size_allocnum = 0;
     130  int        option_resource_usage = 0;
     131  long       option_seed = 123456789;
     132  
          /* Global measuring parameters.  run_one() allocates sp.xp and sp.yp, or
             sets them to NULL when no routine wants data.  */
     133  struct speed_params  sp;
     134  
     135  #define COLUMN_WIDTH  13  /* for the free-form output */
     136  
          /* Bits for the flag field of struct routine_t.
             NOTE(review): FLAG_RSIZE is not used in the code visible here; its
             meaning is to be confirmed.  run_one() skips the xp/yp allocation only
             when every selected routine has FLAG_NODATA.  */
     137  #define FLAG_R            (1<<0)  /* require ".r" */
     138  #define FLAG_R_OPTIONAL   (1<<1)  /* optional ".r" */
     139  #define FLAG_RSIZE        (1<<2)
     140  #define FLAG_NODATA       (1<<3)  /* don't alloc xp, yp */
     141  
          /* Table of the routines that can be measured.  Each entry gives the name
             of a routine, the speed_xxx function that measures it, and FLAG_xxx bits
             (zero when the initializer leaves the field out).  Entries for native
             or optional routines are included only when the corresponding
             preprocessor condition holds.  SPEED_EXTRA_ROUTINES and
             SPEED_EXTRA_ROUTINES2, when defined, append further entries.  */
     142  const struct routine_t {
     143    /* constants */
     144    const char        *name;  /* routine name as a string */
     145    speed_function_t  fun;    /* passed to speed_measure() by run_one() */
     146    int               flag;   /* bitwise OR of FLAG_xxx values */
     147  } routine[] = {
     148  
     149    { "noop",              speed_noop                 },
     150    { "noop_wxs",          speed_noop_wxs             },
     151    { "noop_wxys",         speed_noop_wxys            },
     152  
     153    { "mpn_add_n",         speed_mpn_add_n,     FLAG_R_OPTIONAL },
     154    { "mpn_sub_n",         speed_mpn_sub_n,     FLAG_R_OPTIONAL },
     155    { "mpn_add_1",         speed_mpn_add_1,     FLAG_R },
     156    { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R },
     157    { "mpn_sub_1",         speed_mpn_sub_1,     FLAG_R },
     158    { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R },
     159  
     160    { "mpn_add_err1_n",    speed_mpn_add_err1_n    },
     161    { "mpn_add_err2_n",    speed_mpn_add_err2_n    },
     162    { "mpn_add_err3_n",    speed_mpn_add_err3_n    },
     163    { "mpn_sub_err1_n",    speed_mpn_sub_err1_n    },
     164    { "mpn_sub_err2_n",    speed_mpn_sub_err2_n    },
     165    { "mpn_sub_err3_n",    speed_mpn_sub_err3_n    },
     166  
     167  #if HAVE_NATIVE_mpn_add_n_sub_n
     168    { "mpn_add_n_sub_n",      speed_mpn_add_n_sub_n,     FLAG_R_OPTIONAL },
     169  #endif
     170  
     171    { "mpn_addmul_1",      speed_mpn_addmul_1,  FLAG_R },
     172    { "mpn_submul_1",      speed_mpn_submul_1,  FLAG_R },
     173  #if HAVE_NATIVE_mpn_addmul_2
     174    { "mpn_addmul_2",      speed_mpn_addmul_2,  FLAG_R_OPTIONAL },
     175  #endif
     176  #if HAVE_NATIVE_mpn_addmul_3
     177    { "mpn_addmul_3",      speed_mpn_addmul_3,  FLAG_R_OPTIONAL },
     178  #endif
     179  #if HAVE_NATIVE_mpn_addmul_4
     180    { "mpn_addmul_4",      speed_mpn_addmul_4,  FLAG_R_OPTIONAL },
     181  #endif
     182  #if HAVE_NATIVE_mpn_addmul_5
     183    { "mpn_addmul_5",      speed_mpn_addmul_5,  FLAG_R_OPTIONAL },
     184  #endif
     185  #if HAVE_NATIVE_mpn_addmul_6
     186    { "mpn_addmul_6",      speed_mpn_addmul_6,  FLAG_R_OPTIONAL },
     187  #endif
     188  #if HAVE_NATIVE_mpn_addmul_7
     189    { "mpn_addmul_7",      speed_mpn_addmul_7,  FLAG_R_OPTIONAL },
     190  #endif
     191  #if HAVE_NATIVE_mpn_addmul_8
     192    { "mpn_addmul_8",      speed_mpn_addmul_8,  FLAG_R_OPTIONAL },
     193  #endif
     194  #if HAVE_NATIVE_mpn_addaddmul_1msb0
     195    { "mpn_addaddmul_1msb0",      speed_mpn_addaddmul_1msb0, FLAG_R_OPTIONAL },
     196  #endif
     197    { "mpn_mul_1",         speed_mpn_mul_1,     FLAG_R },
     198    { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
     199  #if HAVE_NATIVE_mpn_mul_2
     200    { "mpn_mul_2",         speed_mpn_mul_2,     FLAG_R_OPTIONAL },
     201  #endif
     202  #if HAVE_NATIVE_mpn_mul_3
     203    { "mpn_mul_3",         speed_mpn_mul_3,     FLAG_R_OPTIONAL },
     204  #endif
     205  #if HAVE_NATIVE_mpn_mul_4
     206    { "mpn_mul_4",         speed_mpn_mul_4,     FLAG_R_OPTIONAL },
     207  #endif
     208  #if HAVE_NATIVE_mpn_mul_5
     209    { "mpn_mul_5",         speed_mpn_mul_5,     FLAG_R_OPTIONAL },
     210  #endif
     211  #if HAVE_NATIVE_mpn_mul_6
     212    { "mpn_mul_6",         speed_mpn_mul_6,     FLAG_R_OPTIONAL },
     213  #endif
     214  
     215    { "mpn_divrem_1",      speed_mpn_divrem_1,  FLAG_R },
     216    { "mpn_divrem_1f",     speed_mpn_divrem_1f, FLAG_R },
     217  #if HAVE_NATIVE_mpn_divrem_1c
     218    { "mpn_divrem_1c",     speed_mpn_divrem_1c, FLAG_R },
     219    { "mpn_divrem_1cf",    speed_mpn_divrem_1cf,FLAG_R },
     220  #endif
     221    { "mpn_mod_1",         speed_mpn_mod_1,     FLAG_R },
     222  #if HAVE_NATIVE_mpn_mod_1c
     223    { "mpn_mod_1c",        speed_mpn_mod_1c,    FLAG_R },
     224  #endif
     225    { "mpn_preinv_divrem_1",  speed_mpn_preinv_divrem_1,  FLAG_R },
     226    { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
     227    { "mpn_preinv_mod_1",  speed_mpn_preinv_mod_1, FLAG_R },
     228  
     229    { "mpn_mod_1_1",       speed_mpn_mod_1_1,       FLAG_R },
     230    { "mpn_mod_1_1_1",     speed_mpn_mod_1_1_1,     FLAG_R },
     231    { "mpn_mod_1_1_2",     speed_mpn_mod_1_1_2,     FLAG_R },
            /* The next three names are spelled "mod_1s_N" while their measuring
               functions are named speed_mpn_mod_1_N.  */
     232    { "mpn_mod_1s_2",      speed_mpn_mod_1_2,       FLAG_R },
     233    { "mpn_mod_1s_3",      speed_mpn_mod_1_3,       FLAG_R },
     234    { "mpn_mod_1s_4",      speed_mpn_mod_1_4,       FLAG_R },
     235  
     236    { "mpn_divrem_1_div",  speed_mpn_divrem_1_div,  FLAG_R },
     237    { "mpn_divrem_1_inv",  speed_mpn_divrem_1_inv,  FLAG_R },
     238    { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
     239    { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
     240    { "mpn_mod_1_div",     speed_mpn_mod_1_div,     FLAG_R },
     241    { "mpn_mod_1_inv",     speed_mpn_mod_1_inv,     FLAG_R },
     242  
     243    { "mpn_divrem_2",      speed_mpn_divrem_2,        },
     244    { "mpn_divrem_2_div",  speed_mpn_divrem_2_div,    },
     245    { "mpn_divrem_2_inv",  speed_mpn_divrem_2_inv,    },
     246  
     247    { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R  },
     248    { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R  },
     249    { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R  },
     250    { "mpn_div_qr_1n_pi1_3",speed_mpn_div_qr_1n_pi1_3, FLAG_R  },
     251    { "mpn_div_qr_1n_pi1_4",speed_mpn_div_qr_1n_pi1_4, FLAG_R  },
     252    { "mpn_div_qr_1",      speed_mpn_div_qr_1,      FLAG_R },
     253  
     254    { "mpn_div_qr_2n",     speed_mpn_div_qr_2n,       },
     255    { "mpn_div_qr_2u",     speed_mpn_div_qr_2u,       },
     256  
     257    { "mpn_divexact_1",    speed_mpn_divexact_1,    FLAG_R },
     258    { "mpn_divexact_by3",  speed_mpn_divexact_by3          },
     259  
     260    { "mpn_bdiv_q_1",      speed_mpn_bdiv_q_1,      FLAG_R },
     261    { "mpn_pi1_bdiv_q_1",  speed_mpn_pi1_bdiv_q_1,  FLAG_R_OPTIONAL },
     262    { "mpn_bdiv_dbm1c",    speed_mpn_bdiv_dbm1c,    FLAG_R_OPTIONAL },
     263  
     264  #if HAVE_NATIVE_mpn_modexact_1_odd
     265    { "mpn_modexact_1_odd",  speed_mpn_modexact_1_odd,  FLAG_R },
     266  #endif
     267    { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },
     268  
     269  #if GMP_NUMB_BITS % 4 == 0
     270    { "mpn_mod_34lsub1",   speed_mpn_mod_34lsub1 },
     271  #endif
     272  
     273    { "mpn_lshift",        speed_mpn_lshift, FLAG_R   },
     274    { "mpn_lshiftc",       speed_mpn_lshiftc, FLAG_R   },
     275    { "mpn_rshift",        speed_mpn_rshift, FLAG_R   },
     276  
     277    { "mpn_and_n",         speed_mpn_and_n,  FLAG_R_OPTIONAL },
     278    { "mpn_andn_n",        speed_mpn_andn_n, FLAG_R_OPTIONAL },
     279    { "mpn_nand_n",        speed_mpn_nand_n, FLAG_R_OPTIONAL },
     280    { "mpn_ior_n",         speed_mpn_ior_n,  FLAG_R_OPTIONAL },
     281    { "mpn_iorn_n",        speed_mpn_iorn_n, FLAG_R_OPTIONAL },
     282    { "mpn_nior_n",        speed_mpn_nior_n, FLAG_R_OPTIONAL },
     283    { "mpn_xor_n",         speed_mpn_xor_n,  FLAG_R_OPTIONAL },
     284    { "mpn_xnor_n",        speed_mpn_xnor_n, FLAG_R_OPTIONAL },
     285    { "mpn_com",           speed_mpn_com              },
     286    { "mpn_neg",           speed_mpn_neg              },
     287  
     288    { "mpn_popcount",      speed_mpn_popcount         },
     289    { "mpn_hamdist",       speed_mpn_hamdist          },
     290  
     291    { "mpn_matrix22_mul",  speed_mpn_matrix22_mul     },
     292  
     293    { "mpn_hgcd2",         speed_mpn_hgcd2, FLAG_NODATA },
     294    { "mpn_hgcd2_1",       speed_mpn_hgcd2_1, FLAG_NODATA },
     295    { "mpn_hgcd2_2",       speed_mpn_hgcd2_2, FLAG_NODATA },
     296    { "mpn_hgcd2_3",       speed_mpn_hgcd2_3, FLAG_NODATA },
     297    { "mpn_hgcd2_4",       speed_mpn_hgcd2_4, FLAG_NODATA },
     298    { "mpn_hgcd2_5",       speed_mpn_hgcd2_5, FLAG_NODATA },
     299    { "mpn_hgcd",          speed_mpn_hgcd             },
     300    { "mpn_hgcd_lehmer",   speed_mpn_hgcd_lehmer      },
     301    { "mpn_hgcd_appr",     speed_mpn_hgcd_appr        },
     302    { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },
     303  
     304    { "mpn_hgcd_reduce",   speed_mpn_hgcd_reduce      },
     305    { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1    },
     306    { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2    },
     307  
     308    { "mpn_gcd_1",         speed_mpn_gcd_1,  FLAG_R_OPTIONAL },
     309    { "mpn_gcd_11",        speed_mpn_gcd_11, FLAG_R_OPTIONAL },
     310    { "mpn_gcd_1N",        speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
     311    { "mpn_gcd_22",        speed_mpn_gcd_22, FLAG_R_OPTIONAL },
     312  
     313    { "mpn_gcd",           speed_mpn_gcd                    },
     314  
     315    { "mpn_gcdext",            speed_mpn_gcdext            },
     316    { "mpn_gcdext_single",     speed_mpn_gcdext_single     },
     317    { "mpn_gcdext_double",     speed_mpn_gcdext_double     },
     318    { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
     319    { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
            /* This entry is compiled out.  */
     320  #if 0
     321    { "mpn_gcdext_lehmer",     speed_mpn_gcdext_lehmer     },
     322  #endif
     323  
     324    { "gmp_primesieve",    speed_gmp_primesieve, FLAG_NODATA      },
     325    { "mpz_nextprime",     speed_mpz_nextprime        },
     326    { "mpz_nextprime_1",   speed_mpz_nextprime_1, FLAG_R_OPTIONAL },
     327    { "mpz_prevprime",     speed_mpz_prevprime        },
     328    { "mpz_prevprime_1",   speed_mpz_prevprime_1, FLAG_R_OPTIONAL },
     329  
     330    { "mpz_jacobi",        speed_mpz_jacobi           },
     331    { "mpn_jacobi_base",   speed_mpn_jacobi_base      },
     332    { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1    },
     333    { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2    },
     334    { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3    },
     335    { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4    },
     336  
     337    { "mpn_mul",           speed_mpn_mul,         FLAG_R_OPTIONAL },
     338    { "mpn_mul_basecase",  speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
     339    { "mpn_sqr_basecase",  speed_mpn_sqr_basecase     },
     340  #if HAVE_NATIVE_mpn_sqr_diagonal
     341    { "mpn_sqr_diagonal",  speed_mpn_sqr_diagonal     },
     342  #endif
     343  #if HAVE_NATIVE_mpn_sqr_diag_addlsh1
     344    { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 },
     345  #endif
     346  
     347    { "mpn_mul_n",         speed_mpn_mul_n            },
     348    { "mpn_sqr",           speed_mpn_sqr              },
     349  
     350    { "mpn_toom2_sqr",     speed_mpn_toom2_sqr        },
     351    { "mpn_toom3_sqr",     speed_mpn_toom3_sqr        },
     352    { "mpn_toom4_sqr",     speed_mpn_toom4_sqr        },
     353    { "mpn_toom6_sqr",     speed_mpn_toom6_sqr        },
     354    { "mpn_toom8_sqr",     speed_mpn_toom8_sqr        },
     355    { "mpn_toom22_mul",    speed_mpn_toom22_mul       },
     356    { "mpn_toom33_mul",    speed_mpn_toom33_mul       },
     357    { "mpn_toom44_mul",    speed_mpn_toom44_mul       },
     358    { "mpn_toom6h_mul",    speed_mpn_toom6h_mul       },
     359    { "mpn_toom8h_mul",    speed_mpn_toom8h_mul       },
     360    { "mpn_toom32_mul",    speed_mpn_toom32_mul       },
     361    { "mpn_toom42_mul",    speed_mpn_toom42_mul       },
     362    { "mpn_toom43_mul",    speed_mpn_toom43_mul       },
     363    { "mpn_toom63_mul",    speed_mpn_toom63_mul       },
     364    { "mpn_nussbaumer_mul",    speed_mpn_nussbaumer_mul    },
     365    { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
     366  #if WANT_OLD_FFT_FULL
     367    { "mpn_mul_fft_full",      speed_mpn_mul_fft_full      },
     368    { "mpn_mul_fft_full_sqr",  speed_mpn_mul_fft_full_sqr  },
     369  #endif
     370    { "mpn_mul_fft",       speed_mpn_mul_fft,     FLAG_R_OPTIONAL },
     371    { "mpn_mul_fft_sqr",   speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },
     372  
     373    { "mpn_sqrlo",          speed_mpn_sqrlo           },
     374    { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase  },
     375    { "mpn_mullo_n",        speed_mpn_mullo_n         },
     376    { "mpn_mullo_basecase", speed_mpn_mullo_basecase  },
     377  
     378    { "mpn_mulmid_basecase",  speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL },
     379    { "mpn_toom42_mulmid",    speed_mpn_toom42_mulmid },
     380    { "mpn_mulmid_n",         speed_mpn_mulmid_n },
     381    { "mpn_mulmid",           speed_mpn_mulmid, FLAG_R_OPTIONAL },
     382  
     383    { "mpn_bc_mulmod_bnm1",      speed_mpn_bc_mulmod_bnm1      },
     384    { "mpn_mulmod_bnm1",         speed_mpn_mulmod_bnm1         },
     385    { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
     386    { "mpn_sqrmod_bnm1",         speed_mpn_sqrmod_bnm1         },
     387  
     388    { "mpn_mulmod_bknp1",        speed_mpn_mulmod_bknp1, FLAG_R_OPTIONAL },
     389    { "mpn_sqrmod_bknp1",        speed_mpn_sqrmod_bknp1, FLAG_R_OPTIONAL },
     390    { "mpn_mulmod_bnp1",         speed_mpn_mulmod_bnp1         },
     391    { "mpn_sqrmod_bnp1",         speed_mpn_sqrmod_bnp1         },
     392  
     393    { "mpn_invert",              speed_mpn_invert              },
     394    { "mpn_invertappr",          speed_mpn_invertappr          },
     395    { "mpn_ni_invertappr",       speed_mpn_ni_invertappr       },
     396    { "mpn_binvert",             speed_mpn_binvert             },
     397    { "mpn_sec_invert",          speed_mpn_sec_invert          },
     398  
     399    { "mpn_sbpi1_div_qr",        speed_mpn_sbpi1_div_qr,    FLAG_R_OPTIONAL},
     400    { "mpn_dcpi1_div_qr",        speed_mpn_dcpi1_div_qr,    FLAG_R_OPTIONAL},
     401    { "mpn_mu_div_qr",           speed_mpn_mu_div_qr,       FLAG_R_OPTIONAL},
     402    { "mpn_mupi_div_qr",         speed_mpn_mupi_div_qr,     FLAG_R_OPTIONAL},
     403    { "mpn_sbpi1_divappr_q",     speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
     404    { "mpn_dcpi1_divappr_q",     speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},
     405  
     406    { "mpn_sbpi1_bdiv_qr",       speed_mpn_sbpi1_bdiv_qr       },
     407    { "mpn_dcpi1_bdiv_qr",       speed_mpn_dcpi1_bdiv_qr       },
     408    { "mpn_sbpi1_bdiv_q",        speed_mpn_sbpi1_bdiv_q        },
     409    { "mpn_dcpi1_bdiv_q",        speed_mpn_dcpi1_bdiv_q        },
     410    { "mpn_sbpi1_bdiv_r",        speed_mpn_sbpi1_bdiv_r        },
     411  
     412    { "mpn_broot",               speed_mpn_broot,    FLAG_R },
     413    { "mpn_broot_invm1",         speed_mpn_broot_invm1, FLAG_R },
     414    { "mpn_brootinv",            speed_mpn_brootinv, FLAG_R },
     415  
     416    { "mpn_get_str",          speed_mpn_get_str,     FLAG_R_OPTIONAL },
     417    { "mpn_set_str",          speed_mpn_set_str,     FLAG_R_OPTIONAL },
     418    { "mpn_set_str_basecase", speed_mpn_bc_set_str,  FLAG_R_OPTIONAL },
     419  
     420    { "mpn_sqrtrem",       speed_mpn_sqrtrem          },
     421    { "mpn_rootrem",       speed_mpn_rootrem, FLAG_R  },
     422    { "mpn_sqrt",          speed_mpn_sqrt             },
     423    { "mpn_root",          speed_mpn_root, FLAG_R     },
     424  
     425    { "mpn_perfect_power_p",  speed_mpn_perfect_power_p,       },
     426    { "mpn_perfect_square_p", speed_mpn_perfect_square_p,      },
     427  
     428    { "mpn_fib2_ui",       speed_mpn_fib2_ui,    FLAG_NODATA },
     429    { "mpz_fib_ui",        speed_mpz_fib_ui,     FLAG_NODATA },
     430    { "mpz_fib2_ui",       speed_mpz_fib2_ui,    FLAG_NODATA },
     431    { "mpz_lucnum_ui",     speed_mpz_lucnum_ui,  FLAG_NODATA },
     432    { "mpz_lucnum2_ui",    speed_mpz_lucnum2_ui, FLAG_NODATA },
     433  
     434    { "mpz_add",           speed_mpz_add              },
     435    { "mpz_invert",        speed_mpz_invert,   FLAG_R_OPTIONAL },
     436    { "mpz_bin_uiui",      speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
     437    { "mpz_bin_ui",        speed_mpz_bin_ui,   FLAG_NODATA | FLAG_R_OPTIONAL },
     438    { "mpz_fac_ui",        speed_mpz_fac_ui,   FLAG_NODATA   },
     439    { "mpz_2fac_ui",       speed_mpz_2fac_ui,  FLAG_NODATA   },
     440    { "mpz_mfac_uiui",     speed_mpz_mfac_uiui,  FLAG_NODATA | FLAG_R_OPTIONAL },
     441    { "mpz_primorial_ui",  speed_mpz_primorial_ui, FLAG_NODATA },
     442    { "mpz_powm",          speed_mpz_powm,     FLAG_R_OPTIONAL },
     443    { "mpz_powm_mod",      speed_mpz_powm_mod         },
     444    { "mpz_powm_redc",     speed_mpz_powm_redc        },
     445    { "mpz_powm_sec",      speed_mpz_powm_sec        },
     446    { "mpz_powm_ui",       speed_mpz_powm_ui,  FLAG_R_OPTIONAL },
     447  
     448    { "mpz_mod",           speed_mpz_mod              },
     449    { "mpn_redc_1",        speed_mpn_redc_1           },
     450    { "mpn_redc_2",        speed_mpn_redc_2           },
     451    { "mpn_redc_n",        speed_mpn_redc_n           },
     452  
     453    { "MPN_COPY",          speed_MPN_COPY             },
     454    { "MPN_COPY_INCR",     speed_MPN_COPY_INCR        },
     455    { "MPN_COPY_DECR",     speed_MPN_COPY_DECR        },
     456    { "memcpy",            speed_memcpy               },
     457  #if HAVE_NATIVE_mpn_copyi
     458    { "mpn_copyi",         speed_mpn_copyi            },
     459  #endif
     460  #if HAVE_NATIVE_mpn_copyd
     461    { "mpn_copyd",         speed_mpn_copyd            },
     462  #endif
     463    { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL },
     464  #if HAVE_NATIVE_mpn_addlsh1_n == 1
     465    { "mpn_addlsh1_n",     speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
     466  #endif
     467  #if HAVE_NATIVE_mpn_sublsh1_n == 1
     468    { "mpn_sublsh1_n",     speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
     469  #endif
     470  #if HAVE_NATIVE_mpn_addlsh1_n_ip1
     471    { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1    },
     472  #endif
     473  #if HAVE_NATIVE_mpn_addlsh1_n_ip2
     474    { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2    },
     475  #endif
     476  #if HAVE_NATIVE_mpn_sublsh1_n_ip1
     477    { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1    },
     478  #endif
     479  #if HAVE_NATIVE_mpn_rsblsh1_n == 1
     480    { "mpn_rsblsh1_n",     speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
     481  #endif
     482  #if HAVE_NATIVE_mpn_addlsh2_n == 1
     483    { "mpn_addlsh2_n",     speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
     484  #endif
     485  #if HAVE_NATIVE_mpn_sublsh2_n == 1
     486    { "mpn_sublsh2_n",     speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },
     487  #endif
     488  #if HAVE_NATIVE_mpn_addlsh2_n_ip1
     489    { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1    },
     490  #endif
     491  #if HAVE_NATIVE_mpn_addlsh2_n_ip2
     492    { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2    },
     493  #endif
     494  #if HAVE_NATIVE_mpn_sublsh2_n_ip1
     495    { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1    },
     496  #endif
     497  #if HAVE_NATIVE_mpn_rsblsh2_n == 1
     498    { "mpn_rsblsh2_n",     speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL },
     499  #endif
     500  #if HAVE_NATIVE_mpn_addlsh_n
     501    { "mpn_addlsh_n",     speed_mpn_addlsh_n, FLAG_R_OPTIONAL },
     502  #endif
     503  #if HAVE_NATIVE_mpn_sublsh_n
     504    { "mpn_sublsh_n",     speed_mpn_sublsh_n, FLAG_R_OPTIONAL },
     505  #endif
     506  #if HAVE_NATIVE_mpn_addlsh_n_ip1
     507    { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1    },
     508  #endif
     509  #if HAVE_NATIVE_mpn_addlsh_n_ip2
     510    { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2    },
     511  #endif
     512  #if HAVE_NATIVE_mpn_sublsh_n_ip1
     513    { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1    },
     514  #endif
     515  #if HAVE_NATIVE_mpn_rsblsh_n
     516    { "mpn_rsblsh_n",     speed_mpn_rsblsh_n, FLAG_R_OPTIONAL },
     517  #endif
     518  #if HAVE_NATIVE_mpn_rsh1add_n
     519    { "mpn_rsh1add_n",     speed_mpn_rsh1add_n, FLAG_R_OPTIONAL },
     520  #endif
     521  #if HAVE_NATIVE_mpn_rsh1sub_n
     522    { "mpn_rsh1sub_n",     speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
     523  #endif
     524  
     525    { "mpn_cnd_add_n",     speed_mpn_cnd_add_n, FLAG_R_OPTIONAL },
     526    { "mpn_cnd_sub_n",     speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL },
     527  
     528    { "MPN_ZERO",          speed_MPN_ZERO             },
     529  
     530    { "binvert_limb",       speed_binvert_limb,       FLAG_NODATA },
     531    { "binvert_limb_mul1",  speed_binvert_limb_mul1,  FLAG_NODATA },
     532    { "binvert_limb_loop",  speed_binvert_limb_loop,  FLAG_NODATA },
     533    { "binvert_limb_cond",  speed_binvert_limb_cond,  FLAG_NODATA },
     534    { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },
     535  
     536    { "malloc_free",                  speed_malloc_free                  },
     537    { "malloc_realloc_free",          speed_malloc_realloc_free          },
     538    { "gmp_allocate_free",            speed_gmp_allocate_free            },
     539    { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
     540    { "mpz_init_clear",               speed_mpz_init_clear               },
     541    { "mpq_init_clear",               speed_mpq_init_clear               },
     542    { "mpf_init_clear",               speed_mpf_init_clear               },
     543    { "mpz_init_realloc_clear",       speed_mpz_init_realloc_clear       },
     544  
     545    { "umul_ppmm",         speed_umul_ppmm,     FLAG_R_OPTIONAL },
     546  #if HAVE_NATIVE_mpn_umul_ppmm
     547    { "mpn_umul_ppmm",     speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
     548  #endif
     549  #if HAVE_NATIVE_mpn_umul_ppmm_r
     550    { "mpn_umul_ppmm_r",   speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
     551  #endif
     552  
     553    { "count_leading_zeros",  speed_count_leading_zeros,  FLAG_NODATA | FLAG_R_OPTIONAL },
     554    { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
     555  
     556    { "udiv_qrnnd",             speed_udiv_qrnnd,             FLAG_R_OPTIONAL },
     557    { "udiv_qrnnd_c",           speed_udiv_qrnnd_c,           FLAG_R_OPTIONAL },
     558  #if HAVE_NATIVE_mpn_udiv_qrnnd
     559    { "mpn_udiv_qrnnd",         speed_mpn_udiv_qrnnd,         FLAG_R_OPTIONAL },
     560  #endif
     561  #if HAVE_NATIVE_mpn_udiv_qrnnd_r
     562    { "mpn_udiv_qrnnd_r",       speed_mpn_udiv_qrnnd_r,       FLAG_R_OPTIONAL },
     563  #endif
     564    { "invert_limb",            speed_invert_limb,            FLAG_R_OPTIONAL },
     565  
     566    { "operator_div",           speed_operator_div,           FLAG_R_OPTIONAL },
     567    { "operator_mod",           speed_operator_mod,           FLAG_R_OPTIONAL },
     568  
     569    { "gmp_randseed",    speed_gmp_randseed,    FLAG_R_OPTIONAL               },
     570    { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
     571    { "mpz_urandomb",    speed_mpz_urandomb,    FLAG_R_OPTIONAL | FLAG_NODATA },
     572  
     573  #ifdef SPEED_EXTRA_ROUTINES
     574    SPEED_EXTRA_ROUTINES
     575  #endif
     576  #ifdef SPEED_EXTRA_ROUTINES2
     577    SPEED_EXTRA_ROUTINES2
     578  #endif
     579  };
     580  
     581  
          /* One routine selected for measuring, together with its most recent
             results.  run_one() walks choice[0] .. choice[num_choices-1].  */
     582  struct choice_t {
     583    const struct routine_t  *p;         /* entry supplying the fun and flag used by run_one() */
     584    mp_limb_t               r;          /* copied into s->r before each measurement */
     585    double                  scale;      /* multiplier applied to each measured time */
     586    double                  time;       /* speed_measure() result, then scaled and converted for output */
     587    int                     no_time;    /* non-zero when there is no time to show for this size */
     588    double                  prev_time;  /* scaled time from the previous run_one(), before differencing */
     589    const char              *name;      /* NOTE(review): not referenced in the code visible here -- presumably the name as requested by the user; confirm */
     590  };
     591  struct choice_t  *choice;
     592  int  num_choices = 0;
     593  
     594  
     595  void
     596  data_fill (mp_ptr ptr, mp_size_t size)
     597  {
     598    switch (option_data) {
     599    case DATA_RANDOM:
     600      mpn_random (ptr, size);
     601      break;
     602    case DATA_RANDOM2:
     603      mpn_random2 (ptr, size);
     604      break;
     605    case DATA_ZEROS:
     606      MPN_ZERO (ptr, size);
     607      break;
     608    case DATA_AAS:
     609      MPN_FILL (ptr, size, GMP_NUMB_0xAA);
     610      break;
     611    case DATA_FFS:
     612      MPN_FILL (ptr, size, GMP_NUMB_MAX);
     613      break;
     614    case DATA_2FD:
     615      MPN_FILL (ptr, size, GMP_NUMB_MAX);
     616      ptr[0] -= 2;
     617      break;
     618    default:
     619      abort();
     620      /*NOTREACHED*/
     621    }
     622  }
     623  
     624  /* The code here handling the various combinations of output options isn't
     625     too attractive, but it works and is fairly clean.  */
     626  
     627  #define SIZE_TO_DIVISOR(n)              \
     628    (option_square == 1 ? (n)*(n)         \
     629    : option_square == 2 ? (n)*((n)+1)/2  \
     630    : (n))
     631  
     632  void
     633  run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
     634  {
     635    const char  *first_open_fastest, *first_open_notfastest, *first_close;
     636    int         i, fastest, want_data;
     637    double      fastest_time;
     638    TMP_DECL;
     639  
     640    TMP_MARK;
     641  
     642    /* allocate data, unless all routines are NODATA */
     643    want_data = 0;
     644    for (i = 0; i < num_choices; i++)
     645      want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
     646  
     647    if (want_data)
     648      {
     649        SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
     650        SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
     651  
     652        data_fill (s->xp, s->size);
     653        data_fill (s->yp, s->size);
     654      }
     655    else
     656      {
     657        sp.xp = NULL;
     658        sp.yp = NULL;
     659      }
     660  
     661    if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
     662      {
     663        first_open_fastest = "(#";
     664        first_open_notfastest = " (";
     665        first_close = ")";
     666      }
     667    else
     668      {
     669        first_open_fastest = "#";
     670        first_open_notfastest = " ";
     671        first_close = "";
     672      }
     673  
     674    fastest = -1;
     675    fastest_time = -1.0;
     676    for (i = 0; i < num_choices; i++)
     677      {
     678        s->r = choice[i].r;
     679        choice[i].time = speed_measure (choice[i].p->fun, s);
     680        choice[i].no_time = (choice[i].time == -1.0);
     681        if (! choice[i].no_time)
     682          choice[i].time *= choice[i].scale;
     683  
     684        /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
     685           is before any differences.  */
     686        {
     687          double     t;
     688          t = choice[i].time;
     689          if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
     690            {
     691              if (choice[i].prev_time == -1.0)
     692                choice[i].no_time = 1;
     693              else
     694                choice[i].time = choice[i].time - choice[i].prev_time;
     695            }
     696          choice[i].prev_time = t;
     697        }
     698  
     699        if (choice[i].no_time)
     700          continue;
     701  
     702        /* Look for the fastest after CMP_DIFFPREV has been applied, but
     703           before CMP_RATIO or CMP_DIFFERENCE.  There's only a fastest shown
     704           if there's more than one routine.  */
     705        if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
     706          {
     707            fastest = i;
     708            fastest_time = choice[i].time;
     709          }
     710  
     711        if (option_cmp == CMP_DIFFPREV)
     712          {
     713            /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
     714            if (option_unit == UNIT_CYCLES)
     715              choice[i].time /= speed_cycletime;
     716            else if (option_unit == UNIT_CYCLESPERLIMB)
     717              {
     718                if (prev_size == -1)
     719                  choice[i].time /= speed_cycletime;
     720                else
     721                  choice[i].time /=  (speed_cycletime
     722                                      * (SIZE_TO_DIVISOR(s->size)
     723                                         - SIZE_TO_DIVISOR(prev_size)));
     724              }
     725          }
     726        else
     727          {
     728            if (option_unit == UNIT_CYCLES)
     729              choice[i].time /= speed_cycletime;
     730            else if (option_unit == UNIT_CYCLESPERLIMB)
     731              choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
     732  
     733            if (option_cmp == CMP_RATIO && i > 0)
     734              {
     735                /* A ratio isn't affected by the units chosen. */
     736                if (choice[0].no_time || choice[0].time == 0.0)
     737                  choice[i].no_time = 1;
     738                else
     739                  choice[i].time /= choice[0].time;
     740              }
     741            else if (option_cmp == CMP_DIFFERENCE && i > 0)
     742              {
     743                if (choice[0].no_time)
     744                  {
     745                    choice[i].no_time = 1;
     746                    continue;
     747                  }
     748                choice[i].time -= choice[0].time;
     749              }
     750          }
     751      }
     752  
     753    if (option_gnuplot)
     754      {
     755        /* In CMP_DIFFPREV, don't print anything for the first size, start
     756           with the second where an actual difference is available.
     757  
     758           In CMP_RATIO, print the first column as 1.0.
     759  
     760           The 9 decimals printed is much more than the expected precision of
     761           the measurements actually. */
     762  
     763        if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
     764          {
     765            fprintf (fp, "%-6ld ", s->size);
     766            for (i = 0; i < num_choices; i++)
     767              fprintf (fp, "  %.9e",
     768                       choice[i].no_time ? 0.0
     769                       : (option_cmp == CMP_RATIO && i == 0) ? 1.0
     770                       : choice[i].time);
     771            fprintf (fp, "\n");
     772          }
     773      }
     774    else
     775      {
     776        fprintf (fp, "%-6ld ", s->size);
     777        for (i = 0; i < num_choices; i++)
     778          {
     779            char  buf[128];
     780            int   decimals;
     781  
     782            if (choice[i].no_time)
     783              {
     784                fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
     785              }
     786            else
     787              {if (option_unit == UNIT_CYCLESPERLIMB
     788                   || (option_cmp == CMP_RATIO && i > 0))
     789                  decimals = 4;
     790                else if (option_unit == UNIT_CYCLES)
     791                  decimals = 2;
     792                else
     793                  decimals = 9;
     794  
     795                sprintf (buf, "%s%.*f%s",
     796                         i == fastest ? first_open_fastest : first_open_notfastest,
     797                         decimals, choice[i].time, first_close);
     798                fprintf (fp, " %*s", COLUMN_WIDTH, buf);
     799              }
     800          }
     801        fprintf (fp, "\n");
     802      }
     803  
     804    TMP_FREE;
     805  }
     806  
     807  void
     808  run_all (FILE *fp)
     809  {
     810    mp_size_t  prev_size;
     811    int        i;
     812    TMP_DECL;
     813  
     814    TMP_MARK;
     815    SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
     816    SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
     817  
     818    data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
     819    data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
     820  
     821    for (i = 0; i < size_num; i++)
     822      {
     823        sp.size = size_array[i].start;
     824        prev_size = -1;
     825        for (;;)
     826          {
     827            mp_size_t  step;
     828  
     829            if (option_data == DATA_2FD && sp.size >= 2)
     830              sp.xp[sp.size-1] = 2;
     831  
     832            run_one (fp, &sp, prev_size);
     833            prev_size = sp.size;
     834  
     835            if (option_data == DATA_2FD && sp.size >= 2)
     836              sp.xp[sp.size-1] = MP_LIMB_T_MAX;
     837  
     838            if (option_factor != 0.0)
     839              {
     840                step = (mp_size_t) (sp.size * option_factor - sp.size);
     841                if (step < 1)
     842                  step = 1;
     843              }
     844            else
     845              step = 1;
     846            if (step < option_step)
     847              step = option_step;
     848  
     849            sp.size += step;
     850            if (sp.size > size_array[i].end)
     851              break;
     852          }
     853      }
     854  
     855    TMP_FREE;
     856  }
     857  
     858  
     859  FILE *
     860  fopen_for_write (const char *filename)
     861  {
     862    FILE  *fp;
     863    if ((fp = fopen (filename, "w")) == NULL)
     864      {
     865        fprintf (stderr, "Cannot create %s\n", filename);
     866        exit(1);
     867      }
     868    return fp;
     869  }
     870  
     871  void
     872  fclose_written (FILE *fp, const char *filename)
     873  {
     874    int  err;
     875  
     876    err = ferror (fp);
     877    err |= fclose (fp);
     878  
     879    if (err)
     880      {
     881        fprintf (stderr, "Error writing %s\n", filename);
     882        exit(1);
     883      }
     884  }
     885  
     886  
     887  void
     888  run_gnuplot (int argc, char *argv[])
     889  {
     890    char  *plot_filename;
     891    char  *data_filename;
     892    FILE  *fp;
     893    int   i;
     894  
     895    plot_filename = (char *) (*__gmp_allocate_func)
     896      (strlen (option_gnuplot_basename) + 20);
     897    data_filename = (char *) (*__gmp_allocate_func)
     898      (strlen (option_gnuplot_basename) + 20);
     899  
     900    sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
     901    sprintf (data_filename, "%s.data",    option_gnuplot_basename);
     902  
     903    fp = fopen_for_write (plot_filename);
     904  
     905    fprintf (fp, "# Generated with:\n");
     906    fprintf (fp, "#");
     907    for (i = 0; i < argc; i++)
     908      fprintf (fp, " %s", argv[i]);
     909    fprintf (fp, "\n");
     910    fprintf (fp, "\n");
     911  
     912    fprintf (fp, "reset\n");
     913  
     914    /* Putting the key at the top left is usually good, and you can change it
     915       interactively if it's not. */
     916    fprintf (fp, "set key left\n");
     917  
     918    /* write underscores, not subscripts */
     919    fprintf (fp, "set termoption noenhanced\n");
     920  
     921    /* designed to make it possible to see crossovers easily */
     922    fprintf (fp, "set style data lines\n");
     923  
     924    fprintf (fp, "plot ");
     925    for (i = 0; i < num_choices; i++)
     926      {
     927        fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
     928        fprintf (fp, " title \"%s\"", choice[i].name);
     929  
     930        if (i != num_choices-1)
     931          fprintf (fp, ", \\");
     932        fprintf (fp, "\n");
     933      }
     934  
     935    fprintf (fp, "load \"-\"\n");
     936    fclose_written (fp, plot_filename);
     937  
     938    fp = fopen_for_write (data_filename);
     939  
     940    /* Unbuffered so you can see where the program was up to if it crashes or
     941       you kill it. */
     942    setbuf (fp, NULL);
     943  
     944    run_all (fp);
     945    fclose_written (fp, data_filename);
     946  }
     947  
     948  
     949  /* Return a limb with n many one bits (starting from the least significant) */
     950  
     951  #define LIMB_ONES(n) \
     952    ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX      \
     953      : (n) == 0 ? CNST_LIMB(0)                   \
     954      : (CNST_LIMB(1) << (n)) - 1)
     955  
     956  mp_limb_t
     957  r_string (const char *s)
     958  {
     959    const char  *s_orig = s;
     960    long        n;
     961  
     962    if (strcmp (s, "aas") == 0)
     963      return GMP_NUMB_0xAA;
     964  
     965    {
     966      mpz_t      z;
     967      mp_limb_t  l;
     968      int        set, siz;
     969  
     970      mpz_init (z);
     971      set = mpz_set_str (z, s, 0);
     972      siz = SIZ(z);
     973      l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
     974      mpz_clear (z);
     975      if (set == 0)
     976        {
     977          if (siz > 1 || siz < -1)
     978            printf ("Warning, r parameter %s truncated to %d bits\n",
     979                    s_orig, GMP_LIMB_BITS);
     980          return l;
     981        }
     982    }
     983  
     984    if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
     985      n = strtoul (s+2, (char **) &s, 16);
     986    else
     987      n = strtol (s, (char **) &s, 10);
     988  
     989    if (strcmp (s, "bits") == 0)
     990      {
     991        mp_limb_t  l;
     992        if (n > GMP_LIMB_BITS)
     993          {
     994            fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
     995                     n, GMP_LIMB_BITS);
     996            exit (1);
     997          }
     998        mpn_random (&l, 1);
     999        return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
    1000      }
    1001    else  if (strcmp (s, "ones") == 0)
    1002      {
    1003        if (n > GMP_LIMB_BITS)
    1004          {
    1005            fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
    1006                     n, GMP_LIMB_BITS);
    1007            exit (1);
    1008          }
    1009        return LIMB_ONES (n);
    1010      }
    1011    else if (*s != '\0')
    1012      {
    1013        fprintf (stderr, "invalid r parameter: %s\n", s_orig);
    1014        exit (1);
    1015      }
    1016  
    1017    return n;
    1018  }
    1019  
    1020  
    1021  void
    1022  routine_find (struct choice_t *c, const char *s_orig)
    1023  {
    1024    const char  *s;
    1025    int     i;
    1026    size_t  nlen;
    1027  
    1028    c->name = s_orig;
    1029    s = strchr (s_orig, '*');
    1030    if (s != NULL)
    1031      {
    1032        c->scale = atof(s_orig);
    1033        s++;
    1034      }
    1035    else
    1036      {
    1037        c->scale = 1.0;
    1038        s = s_orig;
    1039      }
    1040  
    1041    for (i = 0; i < numberof (routine); i++)
    1042      {
    1043        nlen = strlen (routine[i].name);
    1044        if (memcmp (s, routine[i].name, nlen) != 0)
    1045          continue;
    1046  
    1047        if (s[nlen] == '.')
    1048          {
    1049            /* match, with a .r parameter */
    1050  
    1051            if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
    1052              {
    1053                fprintf (stderr,
    1054                         "Choice %s bad: doesn't take a \".<r>\" parameter\n",
    1055                         s_orig);
    1056                exit (1);
    1057              }
    1058  
    1059            c->p = &routine[i];
    1060            c->r = r_string (s + nlen + 1);
    1061            return;
    1062          }
    1063  
    1064        if (s[nlen] == '\0')
    1065          {
    1066            /* match, with no parameter */
    1067  
    1068            if (routine[i].flag & FLAG_R)
    1069              {
    1070                fprintf (stderr,
    1071                         "Choice %s bad: needs a \".<r>\" parameter\n",
    1072                         s_orig);
    1073                exit (1);
    1074              }
    1075  
    1076            c->p = &routine[i];
    1077            c->r = 0;
    1078            return;
    1079          }
    1080      }
    1081  
    1082    fprintf (stderr, "Choice %s unrecognised\n", s_orig);
    1083    exit (1);
    1084  }
    1085  
    1086  
    1087  void
    1088  usage (void)
    1089  {
    1090    int  i;
    1091  
    1092    speed_time_init ();
    1093  
    1094    printf ("Usage: speed [-options] -s size <routine>...\n");
    1095    printf ("Measure the speed of some routines.\n");
    1096    printf ("Times are in seconds, accuracy is shown.\n");
    1097    printf ("\n");
    1098    printf ("   -p num     set precision as number of time units each routine must run\n");
    1099    printf ("   -s size[-end][,size[-end]]...   sizes to measure\n");
    1100    printf ("              single sizes or ranges, sep with comma or use multiple -s\n");
    1101    printf ("   -t step    step through sizes by given amount\n");
    1102    printf ("   -f factor  step through sizes by given factor (eg. 1.05)\n");
    1103    printf ("   -r         show times as ratios of the first routine\n");
    1104    printf ("   -d         show times as difference from the first routine\n");
    1105    printf ("   -D         show times as difference from previous size shown\n");
    1106    printf ("   -c         show times in CPU cycles\n");
    1107    printf ("   -C         show times in cycles per limb\n");
    1108    printf ("   -u         print resource usage (memory) at end\n");
    1109    printf ("   -P name    output plot files \"name.gnuplot\" and \"name.data\"\n");
    1110    printf ("   -a <type>  use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
    1111    printf ("   -x, -y, -w, -W <align>  specify data alignments, sources and dests\n");
    1112    printf ("   -o addrs   print addresses of data blocks\n");
    1113    printf ("\n");
    1114    printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
    1115    printf ("is greater.\n");
    1116    printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
    1117    printf ("size and the previous size.\n");
    1118    printf ("\n");
    1119    printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
    1120    printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
    1121    printf ("a log/log plot).\n");
    1122    printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
    1123    printf ("when viewing more than one routine, it means same axis scales for all data).\n");
    1124    printf ("\n");
    1125    printf ("The available routines are as follows.\n");
    1126    printf ("\n");
    1127  
    1128    for (i = 0; i < numberof (routine); i++)
    1129      {
    1130        if (routine[i].flag & FLAG_R)
    1131          printf ("\t%s.r\n", routine[i].name);
    1132        else if (routine[i].flag & FLAG_R_OPTIONAL)
    1133          printf ("\t%s (optional .r)\n", routine[i].name);
    1134        else
    1135          printf ("\t%s\n", routine[i].name);
    1136      }
    1137    printf ("\n");
    1138    printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
    1139    printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
    1140    printf ("\n");
    1141    printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
    1142    printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
    1143    printf ("\n");
    1144    printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
    1145    printf ("The fastest routine at each size is marked with a # (free form output only).\n");
    1146    printf ("\n");
    1147    printf ("%s", speed_time_string);
    1148    printf ("\n");
    1149    printf ("Gnuplot home page http://www.gnuplot.info/\n");
    1150    printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
    1151  }
    1152  
    1153  void
    1154  check_align_option (const char *name, mp_size_t align)
    1155  {
    1156    if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
    1157      {
    1158        fprintf (stderr, "Alignment request out of range: %s %ld\n",
    1159                 name, (long) align);
    1160        fprintf (stderr, "  should be 0 to %d (limbs), inclusive\n",
    1161                 SPEED_TMP_ALLOC_ADJUST_MASK);
    1162        exit (1);
    1163      }
    1164  }
    1165  
    1166  int
    1167  main (int argc, char *argv[])
    1168  {
    1169    int  i;
    1170    int  opt;
    1171  
    1172    /* Unbuffered so output goes straight out when directed to a pipe or file
    1173       and isn't lost on killing the program half way.  */
    1174    setbuf (stdout, NULL);
    1175  
    1176    for (;;)
    1177      {
    1178        opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
    1179        if (opt == EOF)
    1180          break;
    1181  
    1182        switch (opt) {
    1183        case 'a':
    1184          if (strcmp (optarg, "random") == 0)       option_data = DATA_RANDOM;
    1185          else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
    1186          else if (strcmp (optarg, "zeros") == 0)   option_data = DATA_ZEROS;
    1187          else if (strcmp (optarg, "aas") == 0)     option_data = DATA_AAS;
    1188          else if (strcmp (optarg, "ffs") == 0)     option_data = DATA_FFS;
    1189          else if (strcmp (optarg, "2fd") == 0)     option_data = DATA_2FD;
    1190          else
    1191            {
    1192              fprintf (stderr, "unrecognised data option: %s\n", optarg);
    1193              exit (1);
    1194            }
    1195          break;
    1196        case 'C':
    1197          if (option_unit  != UNIT_SECONDS) goto bad_unit;
    1198          option_unit = UNIT_CYCLESPERLIMB;
    1199          break;
    1200        case 'c':
    1201          if (option_unit != UNIT_SECONDS)
    1202            {
    1203            bad_unit:
    1204              fprintf (stderr, "cannot use more than one of -c, -C\n");
    1205              exit (1);
    1206            }
    1207          option_unit = UNIT_CYCLES;
    1208          break;
    1209        case 'D':
    1210          if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
    1211          option_cmp = CMP_DIFFPREV;
    1212          break;
    1213        case 'd':
    1214          if (option_cmp != CMP_ABSOLUTE)
    1215            {
    1216            bad_cmp:
    1217              fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
    1218              exit (1);
    1219            }
    1220          option_cmp = CMP_DIFFERENCE;
    1221          break;
    1222        case 'E':
    1223          option_square = 1;
    1224          break;
    1225        case 'F':
    1226          option_square = 2;
    1227          break;
    1228        case 'f':
    1229          option_factor = atof (optarg);
    1230          if (option_factor <= 1.0)
    1231            {
    1232              fprintf (stderr, "-f factor must be > 1.0\n");
    1233              exit (1);
    1234            }
    1235          break;
    1236        case 'o':
    1237          speed_option_set (optarg);
    1238          break;
    1239        case 'P':
    1240          option_gnuplot = 1;
    1241          option_gnuplot_basename = optarg;
    1242          break;
    1243        case 'p':
    1244          speed_precision = atoi (optarg);
    1245          break;
    1246        case 'R':
    1247          option_seed = time (NULL);
    1248          break;
    1249        case 'r':
    1250          if (option_cmp != CMP_ABSOLUTE)
    1251            goto bad_cmp;
    1252          option_cmp = CMP_RATIO;
    1253          break;
    1254        case 's':
    1255          {
    1256            char  *s;
    1257            for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
    1258              {
    1259                if (size_num == size_allocnum)
    1260                  {
    1261                    size_array = (struct size_array_t *)
    1262                      __gmp_allocate_or_reallocate
    1263                      (size_array,
    1264                       size_allocnum * sizeof(size_array[0]),
    1265                       (size_allocnum+10) * sizeof(size_array[0]));
    1266                    size_allocnum += 10;
    1267                  }
    1268                if (sscanf (s, "%ld-%ld",
    1269                            &size_array[size_num].start,
    1270                            &size_array[size_num].end) != 2)
    1271                  {
    1272                    size_array[size_num].start = size_array[size_num].end
    1273                      = atol (s);
    1274                  }
    1275  
    1276                if (size_array[size_num].start < 0
    1277                    || size_array[size_num].end < 0
    1278                    || size_array[size_num].start > size_array[size_num].end)
    1279                  {
    1280                    fprintf (stderr, "invalid size parameter: %s\n", s);
    1281                    exit (1);
    1282                  }
    1283  
    1284                size_num++;
    1285              }
    1286          }
    1287          break;
    1288        case 't':
    1289          option_step = atol (optarg);
    1290          if (option_step < 1)
    1291            {
    1292              fprintf (stderr, "-t step must be >= 1\n");
    1293              exit (1);
    1294            }
    1295          break;
    1296        case 'u':
    1297          option_resource_usage = 1;
    1298          break;
    1299        case 'z':
    1300          sp.cache = 1;
    1301          break;
    1302        case 'x':
    1303          sp.align_xp = atol (optarg);
    1304          check_align_option ("-x", sp.align_xp);
    1305          break;
    1306        case 'y':
    1307          sp.align_yp = atol (optarg);
    1308          check_align_option ("-y", sp.align_yp);
    1309          break;
    1310        case 'w':
    1311          sp.align_wp = atol (optarg);
    1312          check_align_option ("-w", sp.align_wp);
    1313          break;
    1314        case 'W':
    1315          sp.align_wp2 = atol (optarg);
    1316          check_align_option ("-W", sp.align_wp2);
    1317          break;
    1318        case '?':
    1319          exit(1);
    1320        }
    1321      }
    1322  
    1323    if (optind >= argc)
    1324      {
    1325        usage ();
    1326        exit (1);
    1327      }
    1328  
    1329    if (size_num == 0)
    1330      {
    1331        fprintf (stderr, "-s <size> must be specified\n");
    1332        exit (1);
    1333      }
    1334  
    1335    gmp_randinit_default (__gmp_rands);
    1336    __gmp_rands_initialized = 1;
    1337    gmp_randseed_ui (__gmp_rands, option_seed);
    1338  
    1339    choice = (struct choice_t *) (*__gmp_allocate_func)
    1340      ((argc - optind) * sizeof(choice[0]));
    1341    for ( ; optind < argc; optind++)
    1342      {
    1343        struct choice_t  c;
    1344        routine_find (&c, argv[optind]);
    1345        choice[num_choices] = c;
    1346        num_choices++;
    1347      }
    1348  
    1349    if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
    1350        num_choices < 2)
    1351      {
    1352        fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
    1353      }
    1354  
    1355    speed_time_init ();
    1356    if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
    1357      speed_cycletime_need_cycles ();
    1358    else
    1359      speed_cycletime_need_seconds ();
    1360  
    1361    if (option_gnuplot)
    1362      {
    1363        run_gnuplot (argc, argv);
    1364      }
    1365    else
    1366      {
    1367        if (option_unit == UNIT_SECONDS)
    1368          printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
    1369        else
    1370          printf ("overhead %.2f cycles",
    1371                  speed_measure (speed_noop, NULL) / speed_cycletime);
    1372        printf (", precision %d units of %.2e secs",
    1373                speed_precision, speed_unittime);
    1374  
    1375        if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
    1376          printf (", CPU freq unknown\n");
    1377        else
    1378          printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
    1379  
    1380        printf ("       ");
    1381        for (i = 0; i < num_choices; i++)
    1382          printf (" %*s", COLUMN_WIDTH, choice[i].name);
    1383        printf ("\n");
    1384  
    1385        run_all (stdout);
    1386      }
    1387  
    1388    if (option_resource_usage)
    1389      {
    1390  #if HAVE_GETRUSAGE
    1391        {
    1392          /* This doesn't give data sizes on linux 2.0.x, only utime. */
    1393          struct rusage  r;
    1394          if (getrusage (RUSAGE_SELF, &r) != 0)
    1395            perror ("getrusage");
    1396          else
    1397            printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
    1398                    (long) r.ru_utime.tv_sec, (long) r.ru_utime.tv_usec,
    1399                    r.ru_idrss, r.ru_isrss, r.ru_ixrss);
    1400        }
    1401  #else
    1402        printf ("getrusage() not available\n");
    1403  #endif
    1404  
    1405        /* Linux kernel. */
    1406        {
    1407          char  buf[128];
    1408          sprintf (buf, "/proc/%d/status", getpid());
    1409          if (access (buf, R_OK) == 0)
    1410            {
    1411              sprintf (buf, "cat /proc/%d/status", getpid());
    1412              system (buf);
    1413            }
    1414  
    1415        }
    1416      }
    1417  
    1418    return 0;
    1419  }