1 /* Speed measuring program.
2
3 Copyright 1999-2003, 2005, 2006, 2008-2022 Free Software Foundation, Inc.
4
5 This file is part of the GNU MP Library.
6
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9
10 * the GNU Lesser General Public License as published by the Free
11 Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13
14 or
15
16 * the GNU General Public License as published by the Free Software
17 Foundation; either version 2 of the License, or (at your option) any
18 later version.
19
20 or both in parallel, as here.
21
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 for more details.
26
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library. If not,
29 see https://www.gnu.org/licenses/. */
30
31 /* Usage message is in the code below, run with no arguments to print it.
32 See README for interesting applications.
33
34 To add a new routine foo(), create a speed_foo() function in the style of
35 the existing ones and add an entry in the routine[] array. Put FLAG_R if
36 speed_foo() wants an "r" parameter.
37
38 The routines don't have help messages or descriptions, but most have
39 suggestive names. See the source code for full details.
40
41 */
42
43 #include "config.h"
44
45 #include <limits.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49
50 #if HAVE_UNISTD_H
51 #include <unistd.h> /* for getpid, R_OK */
52 #endif
53
54 #if TIME_WITH_SYS_TIME
55 # include <sys/time.h> /* for struct timeval */
56 # include <time.h>
57 #else
58 # if HAVE_SYS_TIME_H
59 # include <sys/time.h>
60 # else
61 # include <time.h>
62 # endif
63 #endif
64
65 #if HAVE_SYS_RESOURCE_H
66 #include <sys/resource.h> /* for getrusage() */
67 #endif
68
69
70 #include "gmp-impl.h"
71 #include "longlong.h" /* for the benefit of speed-many.c */
72 #include "tests.h"
73 #include "speed.h"
74
75
/* Declare the getopt globals here when HAVE_DECL_OPTARG says the system
   headers do not (the macro is presumably set by configure -- it is not
   defined in the code visible here).  */
#if !HAVE_DECL_OPTARG
extern char *optarg;
extern int optind, opterr;
#endif

/* Without a native strtoul, fall back on strtol and convert its result to
   unsigned long.  */
#if !HAVE_STRTOUL
#define strtoul(p,e,b) (unsigned long) strtol(p,e,b)
#endif

/* Expanded verbatim when defined.  Presumably these let a build supply
   prototypes for the extra entries SPEED_EXTRA_ROUTINES adds to routine[]
   -- TODO confirm against the build setup.  */
#ifdef SPEED_EXTRA_PROTOS
SPEED_EXTRA_PROTOS
#endif
#ifdef SPEED_EXTRA_PROTOS2
SPEED_EXTRA_PROTOS2
#endif


/* A limb of alternating bits (binary 1010...), restricted to the bits kept
   by GMP_NUMB_MASK.  Used for the DATA_AAS fill pattern and the "aas" r
   parameter.  Only defined for 32 and 64 bit limbs.  */
#if GMP_LIMB_BITS == 32
#define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
#endif
#if GMP_LIMB_BITS == 64
#define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
#endif
100
/* How each time is reported, selected by option_cmp.  */
#define CMP_ABSOLUTE 1    /* the time itself */
#define CMP_RATIO 2       /* ratio to the first routine's time */
#define CMP_DIFFERENCE 3  /* difference from the first routine's time */
#define CMP_DIFFPREV 4    /* difference from the same routine at the previous size */
int option_cmp = CMP_ABSOLUTE;

/* Unit the times are shown in, selected by option_unit.  */
#define UNIT_SECONDS 1
#define UNIT_CYCLES 2         /* time divided by speed_cycletime */
#define UNIT_CYCLESPERLIMB 3  /* cycles, further divided by SIZE_TO_DIVISOR */
int option_unit = UNIT_SECONDS;

/* Operand contents written by data_fill(), selected by option_data.  */
#define DATA_RANDOM 1   /* mpn_random */
#define DATA_RANDOM2 2  /* mpn_random2 */
#define DATA_ZEROS 3    /* every limb zero */
#define DATA_AAS 4      /* every limb GMP_NUMB_0xAA */
#define DATA_FFS 5      /* every limb GMP_NUMB_MAX */
#define DATA_2FD 6      /* every limb GMP_NUMB_MAX, then 2 subtracted from the low limb */
int option_data = DATA_RANDOM;

/* Chooses the divisor in SIZE_TO_DIVISOR: 1 gives n*n, 2 gives n*(n+1)/2,
   anything else gives n.  */
int option_square = 0;
/* When non-zero, run_all() advances the size by multiplying with this
   factor instead of adding 1.  */
double option_factor = 0.0;
/* Smallest size increment run_all() will use.  */
mp_size_t option_step = 1;
/* Non-zero makes run_one() print rows in the gnuplot data format.  */
int option_gnuplot = 0;
/* Prefix of the ".gnuplot" and ".data" files written by run_gnuplot().  */
char *option_gnuplot_basename;
/* Size ranges to measure; run_all() walks each one from start up to end.  */
struct size_array_t {
  mp_size_t start, end;
} *size_array = NULL;
/* Number of entries of size_array that run_all() processes.  */
mp_size_t size_num = 0;
/* Presumably the allocated capacity of size_array; it is maintained by code
   outside the part of the file visible here -- TODO confirm.  */
mp_size_t size_allocnum = 0;
/* Not referenced in the code visible here.  */
int option_resource_usage = 0;
/* Not referenced in the code visible here; presumably the random seed.  */
long option_seed = 123456789;

/* The measurement parameters shared by run_all() and run_one().  */
struct speed_params sp;

#define COLUMN_WIDTH 13 /* for the free-form output */

/* Bits for routine_t.flag.  */
#define FLAG_R (1<<0) /* require ".r" */
#define FLAG_R_OPTIONAL (1<<1) /* optional ".r" */
#define FLAG_RSIZE (1<<2) /* not tested in the code visible here */
#define FLAG_NODATA (1<<3) /* don't alloc xp, yp */
141
/* Table of every routine that can be named on the command line.
   routine_find() searches it in order and takes the first entry whose name
   matches.  Entries written without a third initializer get flag == 0, i.e.
   no ".r" parameter is accepted and operand data is allocated.  Entries
   inside #if blocks only exist when the corresponding feature macro is
   true for the build.  */
const struct routine_t {
  /* constants */
  const char *name;      /* name as written by the user */
  speed_function_t fun;  /* measuring function handed to speed_measure() */
  int flag;              /* FLAG_* bits */
} routine[] = {

  { "noop", speed_noop },
  { "noop_wxs", speed_noop_wxs },
  { "noop_wxys", speed_noop_wxys },

  { "mpn_add_n", speed_mpn_add_n, FLAG_R_OPTIONAL },
  { "mpn_sub_n", speed_mpn_sub_n, FLAG_R_OPTIONAL },
  { "mpn_add_1", speed_mpn_add_1, FLAG_R },
  { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R },
  { "mpn_sub_1", speed_mpn_sub_1, FLAG_R },
  { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R },

  { "mpn_add_err1_n", speed_mpn_add_err1_n },
  { "mpn_add_err2_n", speed_mpn_add_err2_n },
  { "mpn_add_err3_n", speed_mpn_add_err3_n },
  { "mpn_sub_err1_n", speed_mpn_sub_err1_n },
  { "mpn_sub_err2_n", speed_mpn_sub_err2_n },
  { "mpn_sub_err3_n", speed_mpn_sub_err3_n },

#if HAVE_NATIVE_mpn_add_n_sub_n
  { "mpn_add_n_sub_n", speed_mpn_add_n_sub_n, FLAG_R_OPTIONAL },
#endif

  { "mpn_addmul_1", speed_mpn_addmul_1, FLAG_R },
  { "mpn_submul_1", speed_mpn_submul_1, FLAG_R },
#if HAVE_NATIVE_mpn_addmul_2
  { "mpn_addmul_2", speed_mpn_addmul_2, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_3
  { "mpn_addmul_3", speed_mpn_addmul_3, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_4
  { "mpn_addmul_4", speed_mpn_addmul_4, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_5
  { "mpn_addmul_5", speed_mpn_addmul_5, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_6
  { "mpn_addmul_6", speed_mpn_addmul_6, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_7
  { "mpn_addmul_7", speed_mpn_addmul_7, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addmul_8
  { "mpn_addmul_8", speed_mpn_addmul_8, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addaddmul_1msb0
  { "mpn_addaddmul_1msb0", speed_mpn_addaddmul_1msb0, FLAG_R_OPTIONAL },
#endif
  { "mpn_mul_1", speed_mpn_mul_1, FLAG_R },
  { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
#if HAVE_NATIVE_mpn_mul_2
  { "mpn_mul_2", speed_mpn_mul_2, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_3
  { "mpn_mul_3", speed_mpn_mul_3, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_4
  { "mpn_mul_4", speed_mpn_mul_4, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_5
  { "mpn_mul_5", speed_mpn_mul_5, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_mul_6
  { "mpn_mul_6", speed_mpn_mul_6, FLAG_R_OPTIONAL },
#endif

  { "mpn_divrem_1", speed_mpn_divrem_1, FLAG_R },
  { "mpn_divrem_1f", speed_mpn_divrem_1f, FLAG_R },
#if HAVE_NATIVE_mpn_divrem_1c
  { "mpn_divrem_1c", speed_mpn_divrem_1c, FLAG_R },
  { "mpn_divrem_1cf", speed_mpn_divrem_1cf,FLAG_R },
#endif
  { "mpn_mod_1", speed_mpn_mod_1, FLAG_R },
#if HAVE_NATIVE_mpn_mod_1c
  { "mpn_mod_1c", speed_mpn_mod_1c, FLAG_R },
#endif
  { "mpn_preinv_divrem_1", speed_mpn_preinv_divrem_1, FLAG_R },
  { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
  { "mpn_preinv_mod_1", speed_mpn_preinv_mod_1, FLAG_R },

  { "mpn_mod_1_1", speed_mpn_mod_1_1, FLAG_R },
  { "mpn_mod_1_1_1", speed_mpn_mod_1_1_1, FLAG_R },
  { "mpn_mod_1_1_2", speed_mpn_mod_1_1_2, FLAG_R },
  { "mpn_mod_1s_2", speed_mpn_mod_1_2, FLAG_R },
  { "mpn_mod_1s_3", speed_mpn_mod_1_3, FLAG_R },
  { "mpn_mod_1s_4", speed_mpn_mod_1_4, FLAG_R },

  { "mpn_divrem_1_div", speed_mpn_divrem_1_div, FLAG_R },
  { "mpn_divrem_1_inv", speed_mpn_divrem_1_inv, FLAG_R },
  { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
  { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
  { "mpn_mod_1_div", speed_mpn_mod_1_div, FLAG_R },
  { "mpn_mod_1_inv", speed_mpn_mod_1_inv, FLAG_R },

  { "mpn_divrem_2", speed_mpn_divrem_2, },
  { "mpn_divrem_2_div", speed_mpn_divrem_2_div, },
  { "mpn_divrem_2_inv", speed_mpn_divrem_2_inv, },

  { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R },
  { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R },
  { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R },
  { "mpn_div_qr_1n_pi1_3",speed_mpn_div_qr_1n_pi1_3, FLAG_R },
  { "mpn_div_qr_1n_pi1_4",speed_mpn_div_qr_1n_pi1_4, FLAG_R },
  { "mpn_div_qr_1", speed_mpn_div_qr_1, FLAG_R },

  { "mpn_div_qr_2n", speed_mpn_div_qr_2n, },
  { "mpn_div_qr_2u", speed_mpn_div_qr_2u, },

  { "mpn_divexact_1", speed_mpn_divexact_1, FLAG_R },
  { "mpn_divexact_by3", speed_mpn_divexact_by3 },

  { "mpn_bdiv_q_1", speed_mpn_bdiv_q_1, FLAG_R },
  { "mpn_pi1_bdiv_q_1", speed_mpn_pi1_bdiv_q_1, FLAG_R_OPTIONAL },
  { "mpn_bdiv_dbm1c", speed_mpn_bdiv_dbm1c, FLAG_R_OPTIONAL },

#if HAVE_NATIVE_mpn_modexact_1_odd
  { "mpn_modexact_1_odd", speed_mpn_modexact_1_odd, FLAG_R },
#endif
  { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },

#if GMP_NUMB_BITS % 4 == 0
  { "mpn_mod_34lsub1", speed_mpn_mod_34lsub1 },
#endif

  { "mpn_lshift", speed_mpn_lshift, FLAG_R },
  { "mpn_lshiftc", speed_mpn_lshiftc, FLAG_R },
  { "mpn_rshift", speed_mpn_rshift, FLAG_R },

  { "mpn_and_n", speed_mpn_and_n, FLAG_R_OPTIONAL },
  { "mpn_andn_n", speed_mpn_andn_n, FLAG_R_OPTIONAL },
  { "mpn_nand_n", speed_mpn_nand_n, FLAG_R_OPTIONAL },
  { "mpn_ior_n", speed_mpn_ior_n, FLAG_R_OPTIONAL },
  { "mpn_iorn_n", speed_mpn_iorn_n, FLAG_R_OPTIONAL },
  { "mpn_nior_n", speed_mpn_nior_n, FLAG_R_OPTIONAL },
  { "mpn_xor_n", speed_mpn_xor_n, FLAG_R_OPTIONAL },
  { "mpn_xnor_n", speed_mpn_xnor_n, FLAG_R_OPTIONAL },
  { "mpn_com", speed_mpn_com },
  { "mpn_neg", speed_mpn_neg },

  { "mpn_popcount", speed_mpn_popcount },
  { "mpn_hamdist", speed_mpn_hamdist },

  { "mpn_matrix22_mul", speed_mpn_matrix22_mul },

  { "mpn_hgcd2", speed_mpn_hgcd2, FLAG_NODATA },
  { "mpn_hgcd2_1", speed_mpn_hgcd2_1, FLAG_NODATA },
  { "mpn_hgcd2_2", speed_mpn_hgcd2_2, FLAG_NODATA },
  { "mpn_hgcd2_3", speed_mpn_hgcd2_3, FLAG_NODATA },
  { "mpn_hgcd2_4", speed_mpn_hgcd2_4, FLAG_NODATA },
  { "mpn_hgcd2_5", speed_mpn_hgcd2_5, FLAG_NODATA },
  { "mpn_hgcd", speed_mpn_hgcd },
  { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer },
  { "mpn_hgcd_appr", speed_mpn_hgcd_appr },
  { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },

  { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce },
  { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 },
  { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2 },

  { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL },
  { "mpn_gcd_11", speed_mpn_gcd_11, FLAG_R_OPTIONAL },
  { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
  { "mpn_gcd_22", speed_mpn_gcd_22, FLAG_R_OPTIONAL },

  { "mpn_gcd", speed_mpn_gcd },

  { "mpn_gcdext", speed_mpn_gcdext },
  { "mpn_gcdext_single", speed_mpn_gcdext_single },
  { "mpn_gcdext_double", speed_mpn_gcdext_double },
  { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
  { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
#if 0
  { "mpn_gcdext_lehmer", speed_mpn_gcdext_lehmer },
#endif

  { "gmp_primesieve", speed_gmp_primesieve, FLAG_NODATA },
  { "mpz_nextprime", speed_mpz_nextprime },
  { "mpz_nextprime_1", speed_mpz_nextprime_1, FLAG_R_OPTIONAL },
  { "mpz_prevprime", speed_mpz_prevprime },
  { "mpz_prevprime_1", speed_mpz_prevprime_1, FLAG_R_OPTIONAL },

  { "mpz_jacobi", speed_mpz_jacobi },
  { "mpn_jacobi_base", speed_mpn_jacobi_base },
  { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1 },
  { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2 },
  { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3 },
  { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4 },

  { "mpn_mul", speed_mpn_mul, FLAG_R_OPTIONAL },
  { "mpn_mul_basecase", speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
  { "mpn_sqr_basecase", speed_mpn_sqr_basecase },
#if HAVE_NATIVE_mpn_sqr_diagonal
  { "mpn_sqr_diagonal", speed_mpn_sqr_diagonal },
#endif
#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
  { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 },
#endif

  { "mpn_mul_n", speed_mpn_mul_n },
  { "mpn_sqr", speed_mpn_sqr },

  { "mpn_toom2_sqr", speed_mpn_toom2_sqr },
  { "mpn_toom3_sqr", speed_mpn_toom3_sqr },
  { "mpn_toom4_sqr", speed_mpn_toom4_sqr },
  { "mpn_toom6_sqr", speed_mpn_toom6_sqr },
  { "mpn_toom8_sqr", speed_mpn_toom8_sqr },
  { "mpn_toom22_mul", speed_mpn_toom22_mul },
  { "mpn_toom33_mul", speed_mpn_toom33_mul },
  { "mpn_toom44_mul", speed_mpn_toom44_mul },
  { "mpn_toom6h_mul", speed_mpn_toom6h_mul },
  { "mpn_toom8h_mul", speed_mpn_toom8h_mul },
  { "mpn_toom32_mul", speed_mpn_toom32_mul },
  { "mpn_toom42_mul", speed_mpn_toom42_mul },
  { "mpn_toom43_mul", speed_mpn_toom43_mul },
  { "mpn_toom63_mul", speed_mpn_toom63_mul },
  { "mpn_nussbaumer_mul", speed_mpn_nussbaumer_mul },
  { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
#if WANT_OLD_FFT_FULL
  { "mpn_mul_fft_full", speed_mpn_mul_fft_full },
  { "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr },
#endif
  { "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL },
  { "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },

  { "mpn_sqrlo", speed_mpn_sqrlo },
  { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase },
  { "mpn_mullo_n", speed_mpn_mullo_n },
  { "mpn_mullo_basecase", speed_mpn_mullo_basecase },

  { "mpn_mulmid_basecase", speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL },
  { "mpn_toom42_mulmid", speed_mpn_toom42_mulmid },
  { "mpn_mulmid_n", speed_mpn_mulmid_n },
  { "mpn_mulmid", speed_mpn_mulmid, FLAG_R_OPTIONAL },

  { "mpn_bc_mulmod_bnm1", speed_mpn_bc_mulmod_bnm1 },
  { "mpn_mulmod_bnm1", speed_mpn_mulmod_bnm1 },
  { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
  { "mpn_sqrmod_bnm1", speed_mpn_sqrmod_bnm1 },

  { "mpn_mulmod_bknp1", speed_mpn_mulmod_bknp1, FLAG_R_OPTIONAL },
  { "mpn_sqrmod_bknp1", speed_mpn_sqrmod_bknp1, FLAG_R_OPTIONAL },
  { "mpn_mulmod_bnp1", speed_mpn_mulmod_bnp1 },
  { "mpn_sqrmod_bnp1", speed_mpn_sqrmod_bnp1 },

  { "mpn_invert", speed_mpn_invert },
  { "mpn_invertappr", speed_mpn_invertappr },
  { "mpn_ni_invertappr", speed_mpn_ni_invertappr },
  { "mpn_binvert", speed_mpn_binvert },
  { "mpn_sec_invert", speed_mpn_sec_invert },

  { "mpn_sbpi1_div_qr", speed_mpn_sbpi1_div_qr, FLAG_R_OPTIONAL},
  { "mpn_dcpi1_div_qr", speed_mpn_dcpi1_div_qr, FLAG_R_OPTIONAL},
  { "mpn_mu_div_qr", speed_mpn_mu_div_qr, FLAG_R_OPTIONAL},
  { "mpn_mupi_div_qr", speed_mpn_mupi_div_qr, FLAG_R_OPTIONAL},
  { "mpn_sbpi1_divappr_q", speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
  { "mpn_dcpi1_divappr_q", speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},

  { "mpn_sbpi1_bdiv_qr", speed_mpn_sbpi1_bdiv_qr },
  { "mpn_dcpi1_bdiv_qr", speed_mpn_dcpi1_bdiv_qr },
  { "mpn_sbpi1_bdiv_q", speed_mpn_sbpi1_bdiv_q },
  { "mpn_dcpi1_bdiv_q", speed_mpn_dcpi1_bdiv_q },
  { "mpn_sbpi1_bdiv_r", speed_mpn_sbpi1_bdiv_r },

  { "mpn_broot", speed_mpn_broot, FLAG_R },
  { "mpn_broot_invm1", speed_mpn_broot_invm1, FLAG_R },
  { "mpn_brootinv", speed_mpn_brootinv, FLAG_R },

  { "mpn_get_str", speed_mpn_get_str, FLAG_R_OPTIONAL },
  { "mpn_set_str", speed_mpn_set_str, FLAG_R_OPTIONAL },
  { "mpn_set_str_basecase", speed_mpn_bc_set_str, FLAG_R_OPTIONAL },

  { "mpn_sqrtrem", speed_mpn_sqrtrem },
  { "mpn_rootrem", speed_mpn_rootrem, FLAG_R },
  { "mpn_sqrt", speed_mpn_sqrt },
  { "mpn_root", speed_mpn_root, FLAG_R },

  { "mpn_perfect_power_p", speed_mpn_perfect_power_p, },
  { "mpn_perfect_square_p", speed_mpn_perfect_square_p, },

  { "mpn_fib2_ui", speed_mpn_fib2_ui, FLAG_NODATA },
  { "mpz_fib_ui", speed_mpz_fib_ui, FLAG_NODATA },
  { "mpz_fib2_ui", speed_mpz_fib2_ui, FLAG_NODATA },
  { "mpz_lucnum_ui", speed_mpz_lucnum_ui, FLAG_NODATA },
  { "mpz_lucnum2_ui", speed_mpz_lucnum2_ui, FLAG_NODATA },

  { "mpz_add", speed_mpz_add },
  { "mpz_invert", speed_mpz_invert, FLAG_R_OPTIONAL },
  { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_bin_ui", speed_mpz_bin_ui, FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA },
  { "mpz_2fac_ui", speed_mpz_2fac_ui, FLAG_NODATA },
  { "mpz_mfac_uiui", speed_mpz_mfac_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
  { "mpz_primorial_ui", speed_mpz_primorial_ui, FLAG_NODATA },
  { "mpz_powm", speed_mpz_powm, FLAG_R_OPTIONAL },
  { "mpz_powm_mod", speed_mpz_powm_mod },
  { "mpz_powm_redc", speed_mpz_powm_redc },
  { "mpz_powm_sec", speed_mpz_powm_sec },
  { "mpz_powm_ui", speed_mpz_powm_ui, FLAG_R_OPTIONAL },

  { "mpz_mod", speed_mpz_mod },
  { "mpn_redc_1", speed_mpn_redc_1 },
  { "mpn_redc_2", speed_mpn_redc_2 },
  { "mpn_redc_n", speed_mpn_redc_n },

  { "MPN_COPY", speed_MPN_COPY },
  { "MPN_COPY_INCR", speed_MPN_COPY_INCR },
  { "MPN_COPY_DECR", speed_MPN_COPY_DECR },
  { "memcpy", speed_memcpy },
#if HAVE_NATIVE_mpn_copyi
  { "mpn_copyi", speed_mpn_copyi },
#endif
#if HAVE_NATIVE_mpn_copyd
  { "mpn_copyd", speed_mpn_copyd },
#endif
  { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_addlsh1_n == 1
  { "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n == 1
  { "mpn_sublsh1_n", speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
  { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1 },
#endif
#if HAVE_NATIVE_mpn_addlsh1_n_ip2
  { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2 },
#endif
#if HAVE_NATIVE_mpn_sublsh1_n_ip1
  { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1 },
#endif
#if HAVE_NATIVE_mpn_rsblsh1_n == 1
  { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n == 1
  { "mpn_addlsh2_n", speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n == 1
  { "mpn_sublsh2_n", speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip1
  { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1 },
#endif
#if HAVE_NATIVE_mpn_addlsh2_n_ip2
  { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2 },
#endif
#if HAVE_NATIVE_mpn_sublsh2_n_ip1
  { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1 },
#endif
#if HAVE_NATIVE_mpn_rsblsh2_n == 1
  { "mpn_rsblsh2_n", speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh_n
  { "mpn_addlsh_n", speed_mpn_addlsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_sublsh_n
  { "mpn_sublsh_n", speed_mpn_sublsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip1
  { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1 },
#endif
#if HAVE_NATIVE_mpn_addlsh_n_ip2
  { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2 },
#endif
#if HAVE_NATIVE_mpn_sublsh_n_ip1
  { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1 },
#endif
#if HAVE_NATIVE_mpn_rsblsh_n
  { "mpn_rsblsh_n", speed_mpn_rsblsh_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsh1add_n
  { "mpn_rsh1add_n", speed_mpn_rsh1add_n, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_rsh1sub_n
  { "mpn_rsh1sub_n", speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
#endif

  { "mpn_cnd_add_n", speed_mpn_cnd_add_n, FLAG_R_OPTIONAL },
  { "mpn_cnd_sub_n", speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL },

  { "MPN_ZERO", speed_MPN_ZERO },

  { "binvert_limb", speed_binvert_limb, FLAG_NODATA },
  { "binvert_limb_mul1", speed_binvert_limb_mul1, FLAG_NODATA },
  { "binvert_limb_loop", speed_binvert_limb_loop, FLAG_NODATA },
  { "binvert_limb_cond", speed_binvert_limb_cond, FLAG_NODATA },
  { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },

  { "malloc_free", speed_malloc_free },
  { "malloc_realloc_free", speed_malloc_realloc_free },
  { "gmp_allocate_free", speed_gmp_allocate_free },
  { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
  { "mpz_init_clear", speed_mpz_init_clear },
  { "mpq_init_clear", speed_mpq_init_clear },
  { "mpf_init_clear", speed_mpf_init_clear },
  { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear },

  { "umul_ppmm", speed_umul_ppmm, FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_umul_ppmm
  { "mpn_umul_ppmm", speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_umul_ppmm_r
  { "mpn_umul_ppmm_r", speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
#endif

  { "count_leading_zeros", speed_count_leading_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
  { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },

  { "udiv_qrnnd", speed_udiv_qrnnd, FLAG_R_OPTIONAL },
  { "udiv_qrnnd_c", speed_udiv_qrnnd_c, FLAG_R_OPTIONAL },
#if HAVE_NATIVE_mpn_udiv_qrnnd
  { "mpn_udiv_qrnnd", speed_mpn_udiv_qrnnd, FLAG_R_OPTIONAL },
#endif
#if HAVE_NATIVE_mpn_udiv_qrnnd_r
  { "mpn_udiv_qrnnd_r", speed_mpn_udiv_qrnnd_r, FLAG_R_OPTIONAL },
#endif
  { "invert_limb", speed_invert_limb, FLAG_R_OPTIONAL },

  { "operator_div", speed_operator_div, FLAG_R_OPTIONAL },
  { "operator_mod", speed_operator_mod, FLAG_R_OPTIONAL },

  { "gmp_randseed", speed_gmp_randseed, FLAG_R_OPTIONAL },
  { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
  { "mpz_urandomb", speed_mpz_urandomb, FLAG_R_OPTIONAL | FLAG_NODATA },

  /* Extra table entries expanded verbatim when a build defines them.  */
#ifdef SPEED_EXTRA_ROUTINES
  SPEED_EXTRA_ROUTINES
#endif
#ifdef SPEED_EXTRA_ROUTINES2
  SPEED_EXTRA_ROUTINES2
#endif
};
580
581
/* One routine selected for measuring, as filled in by routine_find() and
   updated by run_one() for every size.  */
struct choice_t {
  const struct routine_t *p;  /* matching entry of routine[] */
  mp_limb_t r;                /* parsed ".r" parameter, 0 when none was given */
  double scale;               /* multiplier from a "scale*name" choice, else 1.0 */
  double time;                /* value to display for the current size */
  int no_time;                /* non-zero when there is nothing to display ("n/a") */
  double prev_time;           /* scaled time stored by the previous run_one() call, -1.0 if it had none */
  const char *name;           /* the choice string as given; used as the gnuplot title */
};
/* The selected routines; run_one() and run_gnuplot() use entries
   0 .. num_choices-1.  The array is allocated outside the code visible
   here.  */
struct choice_t *choice;
int num_choices = 0;
593
594
595 void
596 data_fill (mp_ptr ptr, mp_size_t size)
597 {
598 switch (option_data) {
599 case DATA_RANDOM:
600 mpn_random (ptr, size);
601 break;
602 case DATA_RANDOM2:
603 mpn_random2 (ptr, size);
604 break;
605 case DATA_ZEROS:
606 MPN_ZERO (ptr, size);
607 break;
608 case DATA_AAS:
609 MPN_FILL (ptr, size, GMP_NUMB_0xAA);
610 break;
611 case DATA_FFS:
612 MPN_FILL (ptr, size, GMP_NUMB_MAX);
613 break;
614 case DATA_2FD:
615 MPN_FILL (ptr, size, GMP_NUMB_MAX);
616 ptr[0] -= 2;
617 break;
618 default:
619 abort();
620 /*NOTREACHED*/
621 }
622 }
623
624 /* The code here handling the various combinations of output options isn't
625 too attractive, but it works and is fairly clean. */
626
/* Divisor that turns a cycle count into "cycles per limb" for a size of N,
   selected by option_square: N*N when it is 1, N*(N+1)/2 when it is 2, and
   plain N for any other value.  N is evaluated more than once.  */
#define SIZE_TO_DIVISOR(n) \
  (option_square == 1 ? (n)*(n) \
   : option_square == 2 ? (n)*((n)+1)/2 \
   : (n))
631
632 void
633 run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
634 {
635 const char *first_open_fastest, *first_open_notfastest, *first_close;
636 int i, fastest, want_data;
637 double fastest_time;
638 TMP_DECL;
639
640 TMP_MARK;
641
642 /* allocate data, unless all routines are NODATA */
643 want_data = 0;
644 for (i = 0; i < num_choices; i++)
645 want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
646
647 if (want_data)
648 {
649 SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
650 SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
651
652 data_fill (s->xp, s->size);
653 data_fill (s->yp, s->size);
654 }
655 else
656 {
657 sp.xp = NULL;
658 sp.yp = NULL;
659 }
660
661 if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
662 {
663 first_open_fastest = "(#";
664 first_open_notfastest = " (";
665 first_close = ")";
666 }
667 else
668 {
669 first_open_fastest = "#";
670 first_open_notfastest = " ";
671 first_close = "";
672 }
673
674 fastest = -1;
675 fastest_time = -1.0;
676 for (i = 0; i < num_choices; i++)
677 {
678 s->r = choice[i].r;
679 choice[i].time = speed_measure (choice[i].p->fun, s);
680 choice[i].no_time = (choice[i].time == -1.0);
681 if (! choice[i].no_time)
682 choice[i].time *= choice[i].scale;
683
684 /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
685 is before any differences. */
686 {
687 double t;
688 t = choice[i].time;
689 if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
690 {
691 if (choice[i].prev_time == -1.0)
692 choice[i].no_time = 1;
693 else
694 choice[i].time = choice[i].time - choice[i].prev_time;
695 }
696 choice[i].prev_time = t;
697 }
698
699 if (choice[i].no_time)
700 continue;
701
702 /* Look for the fastest after CMP_DIFFPREV has been applied, but
703 before CMP_RATIO or CMP_DIFFERENCE. There's only a fastest shown
704 if there's more than one routine. */
705 if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
706 {
707 fastest = i;
708 fastest_time = choice[i].time;
709 }
710
711 if (option_cmp == CMP_DIFFPREV)
712 {
713 /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
714 if (option_unit == UNIT_CYCLES)
715 choice[i].time /= speed_cycletime;
716 else if (option_unit == UNIT_CYCLESPERLIMB)
717 {
718 if (prev_size == -1)
719 choice[i].time /= speed_cycletime;
720 else
721 choice[i].time /= (speed_cycletime
722 * (SIZE_TO_DIVISOR(s->size)
723 - SIZE_TO_DIVISOR(prev_size)));
724 }
725 }
726 else
727 {
728 if (option_unit == UNIT_CYCLES)
729 choice[i].time /= speed_cycletime;
730 else if (option_unit == UNIT_CYCLESPERLIMB)
731 choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
732
733 if (option_cmp == CMP_RATIO && i > 0)
734 {
735 /* A ratio isn't affected by the units chosen. */
736 if (choice[0].no_time || choice[0].time == 0.0)
737 choice[i].no_time = 1;
738 else
739 choice[i].time /= choice[0].time;
740 }
741 else if (option_cmp == CMP_DIFFERENCE && i > 0)
742 {
743 if (choice[0].no_time)
744 {
745 choice[i].no_time = 1;
746 continue;
747 }
748 choice[i].time -= choice[0].time;
749 }
750 }
751 }
752
753 if (option_gnuplot)
754 {
755 /* In CMP_DIFFPREV, don't print anything for the first size, start
756 with the second where an actual difference is available.
757
758 In CMP_RATIO, print the first column as 1.0.
759
760 The 9 decimals printed is much more than the expected precision of
761 the measurements actually. */
762
763 if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
764 {
765 fprintf (fp, "%-6ld ", s->size);
766 for (i = 0; i < num_choices; i++)
767 fprintf (fp, " %.9e",
768 choice[i].no_time ? 0.0
769 : (option_cmp == CMP_RATIO && i == 0) ? 1.0
770 : choice[i].time);
771 fprintf (fp, "\n");
772 }
773 }
774 else
775 {
776 fprintf (fp, "%-6ld ", s->size);
777 for (i = 0; i < num_choices; i++)
778 {
779 char buf[128];
780 int decimals;
781
782 if (choice[i].no_time)
783 {
784 fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
785 }
786 else
787 {if (option_unit == UNIT_CYCLESPERLIMB
788 || (option_cmp == CMP_RATIO && i > 0))
789 decimals = 4;
790 else if (option_unit == UNIT_CYCLES)
791 decimals = 2;
792 else
793 decimals = 9;
794
795 sprintf (buf, "%s%.*f%s",
796 i == fastest ? first_open_fastest : first_open_notfastest,
797 decimals, choice[i].time, first_close);
798 fprintf (fp, " %*s", COLUMN_WIDTH, buf);
799 }
800 }
801 fprintf (fp, "\n");
802 }
803
804 TMP_FREE;
805 }
806
807 void
808 run_all (FILE *fp)
809 {
810 mp_size_t prev_size;
811 int i;
812 TMP_DECL;
813
814 TMP_MARK;
815 SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
816 SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
817
818 data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
819 data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
820
821 for (i = 0; i < size_num; i++)
822 {
823 sp.size = size_array[i].start;
824 prev_size = -1;
825 for (;;)
826 {
827 mp_size_t step;
828
829 if (option_data == DATA_2FD && sp.size >= 2)
830 sp.xp[sp.size-1] = 2;
831
832 run_one (fp, &sp, prev_size);
833 prev_size = sp.size;
834
835 if (option_data == DATA_2FD && sp.size >= 2)
836 sp.xp[sp.size-1] = MP_LIMB_T_MAX;
837
838 if (option_factor != 0.0)
839 {
840 step = (mp_size_t) (sp.size * option_factor - sp.size);
841 if (step < 1)
842 step = 1;
843 }
844 else
845 step = 1;
846 if (step < option_step)
847 step = option_step;
848
849 sp.size += step;
850 if (sp.size > size_array[i].end)
851 break;
852 }
853 }
854
855 TMP_FREE;
856 }
857
858
/* Open FILENAME for writing and return the stream.  If it cannot be
   opened, print "Cannot create FILENAME" on stderr and exit with status 1,
   so the caller never receives NULL.  */
FILE *
fopen_for_write (const char *filename)
{
  FILE  *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "Cannot create %s\n", filename);
      exit(1);
    }

  return stream;
}
870
/* Close FP, which was being written to FILENAME.  If the stream had
   recorded an error, or closing it fails, print "Error writing FILENAME"
   on stderr and exit with status 1.  The stream is closed in either case
   before the check is made.  */
void
fclose_written (FILE *fp, const char *filename)
{
  /* the error indicator has to be read before the stream is closed */
  int  write_failed = (ferror (fp) != 0);
  int  close_failed = (fclose (fp) != 0);

  if (write_failed || close_failed)
    {
      fprintf (stderr, "Error writing %s\n", filename);
      exit(1);
    }
}
885
886
887 void
888 run_gnuplot (int argc, char *argv[])
889 {
890 char *plot_filename;
891 char *data_filename;
892 FILE *fp;
893 int i;
894
895 plot_filename = (char *) (*__gmp_allocate_func)
896 (strlen (option_gnuplot_basename) + 20);
897 data_filename = (char *) (*__gmp_allocate_func)
898 (strlen (option_gnuplot_basename) + 20);
899
900 sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
901 sprintf (data_filename, "%s.data", option_gnuplot_basename);
902
903 fp = fopen_for_write (plot_filename);
904
905 fprintf (fp, "# Generated with:\n");
906 fprintf (fp, "#");
907 for (i = 0; i < argc; i++)
908 fprintf (fp, " %s", argv[i]);
909 fprintf (fp, "\n");
910 fprintf (fp, "\n");
911
912 fprintf (fp, "reset\n");
913
914 /* Putting the key at the top left is usually good, and you can change it
915 interactively if it's not. */
916 fprintf (fp, "set key left\n");
917
918 /* write underscores, not subscripts */
919 fprintf (fp, "set termoption noenhanced\n");
920
921 /* designed to make it possible to see crossovers easily */
922 fprintf (fp, "set style data lines\n");
923
924 fprintf (fp, "plot ");
925 for (i = 0; i < num_choices; i++)
926 {
927 fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
928 fprintf (fp, " title \"%s\"", choice[i].name);
929
930 if (i != num_choices-1)
931 fprintf (fp, ", \\");
932 fprintf (fp, "\n");
933 }
934
935 fprintf (fp, "load \"-\"\n");
936 fclose_written (fp, plot_filename);
937
938 fp = fopen_for_write (data_filename);
939
940 /* Unbuffered so you can see where the program was up to if it crashes or
941 you kill it. */
942 setbuf (fp, NULL);
943
944 run_all (fp);
945 fclose_written (fp, data_filename);
946 }
947
948
/* Return a limb with the n least significant bits set and all other bits
   clear.  n == GMP_LIMB_BITS and n == 0 are answered directly; the shift
   form is only used for other values, so n is expected to lie in
   0 .. GMP_LIMB_BITS.  n is evaluated up to three times.  */

#define LIMB_ONES(n) \
  ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX \
   : (n) == 0 ? CNST_LIMB(0) \
   : (CNST_LIMB(1) << (n)) - 1)
955
956 mp_limb_t
957 r_string (const char *s)
958 {
959 const char *s_orig = s;
960 long n;
961
962 if (strcmp (s, "aas") == 0)
963 return GMP_NUMB_0xAA;
964
965 {
966 mpz_t z;
967 mp_limb_t l;
968 int set, siz;
969
970 mpz_init (z);
971 set = mpz_set_str (z, s, 0);
972 siz = SIZ(z);
973 l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
974 mpz_clear (z);
975 if (set == 0)
976 {
977 if (siz > 1 || siz < -1)
978 printf ("Warning, r parameter %s truncated to %d bits\n",
979 s_orig, GMP_LIMB_BITS);
980 return l;
981 }
982 }
983
984 if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
985 n = strtoul (s+2, (char **) &s, 16);
986 else
987 n = strtol (s, (char **) &s, 10);
988
989 if (strcmp (s, "bits") == 0)
990 {
991 mp_limb_t l;
992 if (n > GMP_LIMB_BITS)
993 {
994 fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
995 n, GMP_LIMB_BITS);
996 exit (1);
997 }
998 mpn_random (&l, 1);
999 return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
1000 }
1001 else if (strcmp (s, "ones") == 0)
1002 {
1003 if (n > GMP_LIMB_BITS)
1004 {
1005 fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
1006 n, GMP_LIMB_BITS);
1007 exit (1);
1008 }
1009 return LIMB_ONES (n);
1010 }
1011 else if (*s != '\0')
1012 {
1013 fprintf (stderr, "invalid r parameter: %s\n", s_orig);
1014 exit (1);
1015 }
1016
1017 return n;
1018 }
1019
1020
1021 void
1022 routine_find (struct choice_t *c, const char *s_orig)
1023 {
1024 const char *s;
1025 int i;
1026 size_t nlen;
1027
1028 c->name = s_orig;
1029 s = strchr (s_orig, '*');
1030 if (s != NULL)
1031 {
1032 c->scale = atof(s_orig);
1033 s++;
1034 }
1035 else
1036 {
1037 c->scale = 1.0;
1038 s = s_orig;
1039 }
1040
1041 for (i = 0; i < numberof (routine); i++)
1042 {
1043 nlen = strlen (routine[i].name);
1044 if (memcmp (s, routine[i].name, nlen) != 0)
1045 continue;
1046
1047 if (s[nlen] == '.')
1048 {
1049 /* match, with a .r parameter */
1050
1051 if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
1052 {
1053 fprintf (stderr,
1054 "Choice %s bad: doesn't take a \".<r>\" parameter\n",
1055 s_orig);
1056 exit (1);
1057 }
1058
1059 c->p = &routine[i];
1060 c->r = r_string (s + nlen + 1);
1061 return;
1062 }
1063
1064 if (s[nlen] == '\0')
1065 {
1066 /* match, with no parameter */
1067
1068 if (routine[i].flag & FLAG_R)
1069 {
1070 fprintf (stderr,
1071 "Choice %s bad: needs a \".<r>\" parameter\n",
1072 s_orig);
1073 exit (1);
1074 }
1075
1076 c->p = &routine[i];
1077 c->r = 0;
1078 return;
1079 }
1080 }
1081
1082 fprintf (stderr, "Choice %s unrecognised\n", s_orig);
1083 exit (1);
1084 }
1085
1086
1087 void
1088 usage (void)
1089 {
1090 int i;
1091
1092 speed_time_init ();
1093
1094 printf ("Usage: speed [-options] -s size <routine>...\n");
1095 printf ("Measure the speed of some routines.\n");
1096 printf ("Times are in seconds, accuracy is shown.\n");
1097 printf ("\n");
1098 printf (" -p num set precision as number of time units each routine must run\n");
1099 printf (" -s size[-end][,size[-end]]... sizes to measure\n");
1100 printf (" single sizes or ranges, sep with comma or use multiple -s\n");
1101 printf (" -t step step through sizes by given amount\n");
1102 printf (" -f factor step through sizes by given factor (eg. 1.05)\n");
1103 printf (" -r show times as ratios of the first routine\n");
1104 printf (" -d show times as difference from the first routine\n");
1105 printf (" -D show times as difference from previous size shown\n");
1106 printf (" -c show times in CPU cycles\n");
1107 printf (" -C show times in cycles per limb\n");
1108 printf (" -u print resource usage (memory) at end\n");
1109 printf (" -P name output plot files \"name.gnuplot\" and \"name.data\"\n");
1110 printf (" -a <type> use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
1111 printf (" -x, -y, -w, -W <align> specify data alignments, sources and dests\n");
1112 printf (" -o addrs print addresses of data blocks\n");
1113 printf ("\n");
1114 printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
1115 printf ("is greater.\n");
1116 printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
1117 printf ("size and the previous size.\n");
1118 printf ("\n");
1119 printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
1120 printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
1121 printf ("a log/log plot).\n");
1122 printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
1123 printf ("when viewing more than one routine, it means same axis scales for all data).\n");
1124 printf ("\n");
1125 printf ("The available routines are as follows.\n");
1126 printf ("\n");
1127
1128 for (i = 0; i < numberof (routine); i++)
1129 {
1130 if (routine[i].flag & FLAG_R)
1131 printf ("\t%s.r\n", routine[i].name);
1132 else if (routine[i].flag & FLAG_R_OPTIONAL)
1133 printf ("\t%s (optional .r)\n", routine[i].name);
1134 else
1135 printf ("\t%s\n", routine[i].name);
1136 }
1137 printf ("\n");
1138 printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
1139 printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
1140 printf ("\n");
1141 printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
1142 printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
1143 printf ("\n");
1144 printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
1145 printf ("The fastest routine at each size is marked with a # (free form output only).\n");
1146 printf ("\n");
1147 printf ("%s", speed_time_string);
1148 printf ("\n");
1149 printf ("Gnuplot home page http://www.gnuplot.info/\n");
1150 printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
1151 }
1152
1153 void
1154 check_align_option (const char *name, mp_size_t align)
1155 {
1156 if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
1157 {
1158 fprintf (stderr, "Alignment request out of range: %s %ld\n",
1159 name, (long) align);
1160 fprintf (stderr, " should be 0 to %d (limbs), inclusive\n",
1161 SPEED_TMP_ALLOC_ADJUST_MASK);
1162 exit (1);
1163 }
1164 }
1165
1166 int
1167 main (int argc, char *argv[])
1168 {
1169 int i;
1170 int opt;
1171
1172 /* Unbuffered so output goes straight out when directed to a pipe or file
1173 and isn't lost on killing the program half way. */
1174 setbuf (stdout, NULL);
1175
1176 for (;;)
1177 {
1178 opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
1179 if (opt == EOF)
1180 break;
1181
1182 switch (opt) {
1183 case 'a':
1184 if (strcmp (optarg, "random") == 0) option_data = DATA_RANDOM;
1185 else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
1186 else if (strcmp (optarg, "zeros") == 0) option_data = DATA_ZEROS;
1187 else if (strcmp (optarg, "aas") == 0) option_data = DATA_AAS;
1188 else if (strcmp (optarg, "ffs") == 0) option_data = DATA_FFS;
1189 else if (strcmp (optarg, "2fd") == 0) option_data = DATA_2FD;
1190 else
1191 {
1192 fprintf (stderr, "unrecognised data option: %s\n", optarg);
1193 exit (1);
1194 }
1195 break;
1196 case 'C':
1197 if (option_unit != UNIT_SECONDS) goto bad_unit;
1198 option_unit = UNIT_CYCLESPERLIMB;
1199 break;
1200 case 'c':
1201 if (option_unit != UNIT_SECONDS)
1202 {
1203 bad_unit:
1204 fprintf (stderr, "cannot use more than one of -c, -C\n");
1205 exit (1);
1206 }
1207 option_unit = UNIT_CYCLES;
1208 break;
1209 case 'D':
1210 if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
1211 option_cmp = CMP_DIFFPREV;
1212 break;
1213 case 'd':
1214 if (option_cmp != CMP_ABSOLUTE)
1215 {
1216 bad_cmp:
1217 fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
1218 exit (1);
1219 }
1220 option_cmp = CMP_DIFFERENCE;
1221 break;
1222 case 'E':
1223 option_square = 1;
1224 break;
1225 case 'F':
1226 option_square = 2;
1227 break;
1228 case 'f':
1229 option_factor = atof (optarg);
1230 if (option_factor <= 1.0)
1231 {
1232 fprintf (stderr, "-f factor must be > 1.0\n");
1233 exit (1);
1234 }
1235 break;
1236 case 'o':
1237 speed_option_set (optarg);
1238 break;
1239 case 'P':
1240 option_gnuplot = 1;
1241 option_gnuplot_basename = optarg;
1242 break;
1243 case 'p':
1244 speed_precision = atoi (optarg);
1245 break;
1246 case 'R':
1247 option_seed = time (NULL);
1248 break;
1249 case 'r':
1250 if (option_cmp != CMP_ABSOLUTE)
1251 goto bad_cmp;
1252 option_cmp = CMP_RATIO;
1253 break;
1254 case 's':
1255 {
1256 char *s;
1257 for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
1258 {
1259 if (size_num == size_allocnum)
1260 {
1261 size_array = (struct size_array_t *)
1262 __gmp_allocate_or_reallocate
1263 (size_array,
1264 size_allocnum * sizeof(size_array[0]),
1265 (size_allocnum+10) * sizeof(size_array[0]));
1266 size_allocnum += 10;
1267 }
1268 if (sscanf (s, "%ld-%ld",
1269 &size_array[size_num].start,
1270 &size_array[size_num].end) != 2)
1271 {
1272 size_array[size_num].start = size_array[size_num].end
1273 = atol (s);
1274 }
1275
1276 if (size_array[size_num].start < 0
1277 || size_array[size_num].end < 0
1278 || size_array[size_num].start > size_array[size_num].end)
1279 {
1280 fprintf (stderr, "invalid size parameter: %s\n", s);
1281 exit (1);
1282 }
1283
1284 size_num++;
1285 }
1286 }
1287 break;
1288 case 't':
1289 option_step = atol (optarg);
1290 if (option_step < 1)
1291 {
1292 fprintf (stderr, "-t step must be >= 1\n");
1293 exit (1);
1294 }
1295 break;
1296 case 'u':
1297 option_resource_usage = 1;
1298 break;
1299 case 'z':
1300 sp.cache = 1;
1301 break;
1302 case 'x':
1303 sp.align_xp = atol (optarg);
1304 check_align_option ("-x", sp.align_xp);
1305 break;
1306 case 'y':
1307 sp.align_yp = atol (optarg);
1308 check_align_option ("-y", sp.align_yp);
1309 break;
1310 case 'w':
1311 sp.align_wp = atol (optarg);
1312 check_align_option ("-w", sp.align_wp);
1313 break;
1314 case 'W':
1315 sp.align_wp2 = atol (optarg);
1316 check_align_option ("-W", sp.align_wp2);
1317 break;
1318 case '?':
1319 exit(1);
1320 }
1321 }
1322
1323 if (optind >= argc)
1324 {
1325 usage ();
1326 exit (1);
1327 }
1328
1329 if (size_num == 0)
1330 {
1331 fprintf (stderr, "-s <size> must be specified\n");
1332 exit (1);
1333 }
1334
1335 gmp_randinit_default (__gmp_rands);
1336 __gmp_rands_initialized = 1;
1337 gmp_randseed_ui (__gmp_rands, option_seed);
1338
1339 choice = (struct choice_t *) (*__gmp_allocate_func)
1340 ((argc - optind) * sizeof(choice[0]));
1341 for ( ; optind < argc; optind++)
1342 {
1343 struct choice_t c;
1344 routine_find (&c, argv[optind]);
1345 choice[num_choices] = c;
1346 num_choices++;
1347 }
1348
1349 if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
1350 num_choices < 2)
1351 {
1352 fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
1353 }
1354
1355 speed_time_init ();
1356 if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
1357 speed_cycletime_need_cycles ();
1358 else
1359 speed_cycletime_need_seconds ();
1360
1361 if (option_gnuplot)
1362 {
1363 run_gnuplot (argc, argv);
1364 }
1365 else
1366 {
1367 if (option_unit == UNIT_SECONDS)
1368 printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
1369 else
1370 printf ("overhead %.2f cycles",
1371 speed_measure (speed_noop, NULL) / speed_cycletime);
1372 printf (", precision %d units of %.2e secs",
1373 speed_precision, speed_unittime);
1374
1375 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
1376 printf (", CPU freq unknown\n");
1377 else
1378 printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
1379
1380 printf (" ");
1381 for (i = 0; i < num_choices; i++)
1382 printf (" %*s", COLUMN_WIDTH, choice[i].name);
1383 printf ("\n");
1384
1385 run_all (stdout);
1386 }
1387
1388 if (option_resource_usage)
1389 {
1390 #if HAVE_GETRUSAGE
1391 {
1392 /* This doesn't give data sizes on linux 2.0.x, only utime. */
1393 struct rusage r;
1394 if (getrusage (RUSAGE_SELF, &r) != 0)
1395 perror ("getrusage");
1396 else
1397 printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
1398 (long) r.ru_utime.tv_sec, (long) r.ru_utime.tv_usec,
1399 r.ru_idrss, r.ru_isrss, r.ru_ixrss);
1400 }
1401 #else
1402 printf ("getrusage() not available\n");
1403 #endif
1404
1405 /* Linux kernel. */
1406 {
1407 char buf[128];
1408 sprintf (buf, "/proc/%d/status", getpid());
1409 if (access (buf, R_OK) == 0)
1410 {
1411 sprintf (buf, "cat /proc/%d/status", getpid());
1412 system (buf);
1413 }
1414
1415 }
1416 }
1417
1418 return 0;
1419 }