(root)/
gmp-6.3.0/
tune/
time.c
       1  /* Time routines for speed measurements.
       2  
       3  Copyright 1999-2004, 2010-2012 Free Software Foundation, Inc.
       4  
       5  This file is part of the GNU MP Library.
       6  
       7  The GNU MP Library is free software; you can redistribute it and/or modify
       8  it under the terms of either:
       9  
      10    * the GNU Lesser General Public License as published by the Free
      11      Software Foundation; either version 3 of the License, or (at your
      12      option) any later version.
      13  
      14  or
      15  
      16    * the GNU General Public License as published by the Free Software
      17      Foundation; either version 2 of the License, or (at your option) any
      18      later version.
      19  
      20  or both in parallel, as here.
      21  
      22  The GNU MP Library is distributed in the hope that it will be useful, but
      23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      25  for more details.
      26  
      27  You should have received copies of the GNU General Public License and the
      28  GNU Lesser General Public License along with the GNU MP Library.  If not,
      29  see https://www.gnu.org/licenses/.  */
      30  
      31  
      32  /* Usage:
      33  
      34     The code in this file implements the lowest level of time measuring,
      35     simple one-time measuring of time between two points.
      36  
      37     void speed_starttime (void)
      38     double speed_endtime (void)
      39         Call speed_starttime to start measuring, and then call speed_endtime
      40         when done.
      41  
      42         speed_endtime returns the time taken, in seconds.  Or if the timebase
      43         is in CPU cycles and the CPU frequency is unknown then speed_endtime
      44         returns cycles.  Applications can identify the cycles return by
      45         checking for speed_cycletime (described below) equal to 1.0.
      46  
      47         If some sort of temporary glitch occurs then speed_endtime returns
      48         0.0.  Currently this is for various cases where a negative time has
      49         occurred.  This unfortunately occurs with getrusage on some systems,
      50         and with the hppa cycle counter on hpux.
      51  
      52     double speed_cycletime
      53         The time in seconds for each CPU cycle.  For example on a 100 MHz CPU
      54         this would be 1.0e-8.
      55  
      56         If the CPU frequency is unknown, then speed_cycletime is either 0.0
      57         or 1.0.  It's 0.0 when speed_endtime is returning seconds, or it's
      58         1.0 when speed_endtime is returning cycles.
      59  
      60         It may be noted that "speed_endtime() / speed_cycletime" gives a
      61         measured time in cycles, irrespective of whether speed_endtime is
      62         returning cycles or seconds.  (Assuming cycles can be had, ie. it's
      63         either cycles already or the cpu frequency is known.  See also
      64         speed_cycletime_need_cycles below.)
      65  
      66     double speed_unittime
      67         The unit of time measurement accuracy for the timing method in use.
      68         This is in seconds or cycles, as per speed_endtime.
      69  
      70     char speed_time_string[]
      71         A null-terminated string describing the time method in use.
      72  
      73     void speed_time_init (void)
      74         Initialize time measuring.  speed_starttime() does this
      75         automatically, so it's only needed if an application wants to inspect
      76         the above global variables before making a measurement.
      77  
      78     int speed_precision
      79         The intended accuracy of time measurements.  speed_measure() in
      80         common.c for instance runs target routines with enough repetitions so
      81         it takes at least "speed_unittime * speed_precision" (this expression
      82         works for both cycles or seconds from speed_endtime).
      83  
       A program can provide an option to let the user set speed_precision.
      85         If speed_precision is zero when speed_time_init or speed_starttime
      86         first run then it gets a default based on the measuring method
      87         chosen.  (More precision for higher accuracy methods.)
      88  
      89     void speed_cycletime_need_seconds (void)
      90         Call this to demand that speed_endtime will return seconds, and not
      91         cycles.  If only cycles are available then an error is printed and
      92         the program exits.
      93  
      94     void speed_cycletime_need_cycles (void)
      95         Call this to demand that speed_cycletime is non-zero, so that
      96         "speed_endtime() / speed_cycletime" will give times in cycles.
      97  
      98  
      99  
     100     Notes:
     101  
     102     Various combinations of cycle counter, read_real_time(), getrusage(),
     103     gettimeofday() and times() can arise, according to which are available
     104     and their precision.
     105  
     106  
     107     Allowing speed_endtime() to return either seconds or cycles is only a
     108     slight complication and makes it possible for the speed program to do
     109     some sensible things without demanding the CPU frequency.  If seconds are
     110     being measured then it can always print seconds, and if cycles are being
     111     measured then it can always print them without needing to know how long
     112     they are.  Also the tune program doesn't care at all what the units are.
     113  
     114     GMP_CPU_FREQUENCY can always be set when the automated methods in freq.c
     115     fail.  This will be needed if times in seconds are wanted but a cycle
     116     counter is being used, or if times in cycles are wanted but getrusage or
     117     another seconds based timer is in use.
     118  
     119     If the measuring method uses a cycle counter but supplements it with
     120     getrusage or the like, then knowing the CPU frequency is mandatory since
     121     the code compares values from the two.
     122  
     123  
     124     Not done:
     125  
     126     Solaris gethrtime() seems no more than a slow way to access the Sparc V9
     127     cycle counter.  gethrvtime() seems to be relevant only to light weight
     128     processes, it doesn't for instance give nanosecond virtual time.  So
     129     neither of these are used.
     130  
     131  
     132     Bugs:
     133  
     134     getrusage_microseconds_p is fundamentally flawed, getrusage and
     135     gettimeofday can have resolutions other than clock ticks or microseconds,
     136     for instance IRIX 5 has a tick of 10 ms but a getrusage of 1 ms.
     137  
     138  
     139     Enhancements:
     140  
     141     The SGI hardware counter has 64 bits on some machines, which could be
     142     used when available.  But perhaps 32 bits is enough range, and then rely
     143     on the getrusage supplement.
     144  
     145     Maybe getrusage (or times) should be used as a supplement for any
     146     wall-clock measuring method.  Currently a wall clock with a good range
     147     (eg. a 64-bit cycle counter) is used without a supplement.
     148  
     149     On PowerPC the timebase registers could be used, but would have to do
     150     something to find out the speed.  On 6xx chips it's normally 1/4 bus
     151     speed, on 4xx chips it's either that or an external clock.  Measuring
     152     against gettimeofday might be ok.  */
     153  
     154  
     155  #include "config.h"
     156  
     157  #include <errno.h>
     158  #include <setjmp.h>
     159  #include <signal.h>
     160  #include <stddef.h>
     161  #include <stdio.h>
     162  #include <string.h>
     163  #include <stdlib.h> /* for getenv() */
     164  
     165  #if HAVE_FCNTL_H
     166  #include <fcntl.h>  /* for open() */
     167  #endif
     168  
     169  #if HAVE_STDINT_H
     170  #include <stdint.h> /* for uint64_t */
     171  #endif
     172  
     173  #if HAVE_UNISTD_H
     174  #include <unistd.h> /* for sysconf() */
     175  #endif
     176  
     177  #include <sys/types.h>
     178  
     179  #if TIME_WITH_SYS_TIME
     180  # include <sys/time.h>  /* for struct timeval */
     181  # include <time.h>
     182  #else
     183  # if HAVE_SYS_TIME_H
     184  #  include <sys/time.h>
     185  # else
     186  #  include <time.h>
     187  # endif
     188  #endif
     189  
     190  #if HAVE_SYS_MMAN_H
     191  #include <sys/mman.h>      /* for mmap() */
     192  #endif
     193  
     194  #if HAVE_SYS_RESOURCE_H
     195  #include <sys/resource.h>  /* for struct rusage */
     196  #endif
     197  
     198  #if HAVE_SYS_SYSSGI_H
     199  #include <sys/syssgi.h>    /* for syssgi() */
     200  #endif
     201  
     202  #if HAVE_SYS_SYSTEMCFG_H
     203  #include <sys/systemcfg.h> /* for RTC_POWER on AIX */
     204  #endif
     205  
     206  #if HAVE_SYS_TIMES_H
     207  #include <sys/times.h>  /* for times() and struct tms */
     208  #endif
     209  
     210  #include "gmp-impl.h"
     211  
     212  #include "speed.h"
     213  
     214  
     215  /* strerror is only used for some stuff on newish systems, no need to have a
     216     proper replacement */
     217  #if ! HAVE_STRERROR
     218  #define strerror(n)  "<strerror not available>"
     219  #endif
     220  
     221  
/* Null-terminated description of the time method in use.  */
char    speed_time_string[256];
/* Intended accuracy of measurements.  Zero means "not set", in which case
   a default is chosen when time measuring is first initialized.  */
int     speed_precision = 0;
/* Unit of measurement accuracy for the method in use, in seconds or cycles
   as per speed_endtime.  */
double  speed_unittime;
/* Seconds per CPU cycle.  When the CPU frequency is unknown this is 0.0 if
   speed_endtime returns seconds, or 1.0 if it returns cycles.  */
double  speed_cycletime = 0.0;
     226  
     227  
/* don't rely on "unsigned" to "double" conversion, it's broken in SunOS 4
   native cc

   M_2POWU is one more than the largest "unsigned" value, as a double, built
   from INT_MAX.  NOTE(review): this assumes UINT_MAX == 2*INT_MAX + 1,
   confirm for unusual targets.  */
#define M_2POWU   (((double) INT_MAX + 1.0) * 2.0)

/* 2^32 and 2^64 as doubles */
#define M_2POW32  4294967296.0
#define M_2POW64  (M_2POW32 * M_2POW32)
     234  
     235  
     236  /* Conditionals for the time functions available are done with normal C
     237     code, which is a lot easier than wildly nested preprocessor directives.
     238  
     239     The choice of what to use is partly made at run-time, according to
     240     whether the cycle counter works and the measured accuracy of getrusage
     241     and gettimeofday.
     242  
   A routine that's not available is never called, but it is still defined,
   as an abort(), to be sure it isn't called mistakenly.
     245  
     246     It can be assumed that if a function exists then its data type will, but
     247     if the function doesn't then the data type might or might not exist, so
     248     the type can't be used unconditionally.  The "struct_rusage" etc macros
     249     provide dummies when the respective function doesn't exist. */
     250  
     251  
/* have_cycles takes the value of HAVE_SPEED_CYCLECOUNTER when that is
   configured, otherwise 0.  Without a cycle counter speed_cyclecounter()
   becomes an ASSERT_FAIL, so a mistaken call is caught rather than
   silently doing nothing.  */
#if HAVE_SPEED_CYCLECOUNTER
static const int have_cycles = HAVE_SPEED_CYCLECOUNTER;
#else
static const int have_cycles = 0;
#define speed_cyclecounter(p)  ASSERT_FAIL (speed_cyclecounter not available)
#endif
     258  
/* "stck" returns ticks since 1 Jan 1900 00:00 GMT, where each tick is 2^-12
   microseconds.  Same #ifdefs here as in longlong.h.

   STCK(timestamp) stores the clock value into "timestamp", an stck_t.
   Where the instruction can't be used the macro is an ASSERT_FAIL and
   stck_t is only a placeholder type so that declarations still compile.  */
#if defined (__GNUC__) && ! defined (NO_ASM)                            \
  && (defined (__i370__) || defined (__s390__) || defined (__mvs__))
static const int  have_stck = 1;
static const int  use_stck = 1;  /* always use when available */
typedef uint64_t  stck_t; /* gcc for s390 is quite new, always has uint64_t */
#define STCK(timestamp)                 \
  do {                                  \
    asm ("stck %0" : "=Q" (timestamp)); \
  } while (0)
#else
static const int  have_stck = 0;
static const int  use_stck = 0;
typedef unsigned long  stck_t;   /* dummy */
#define STCK(timestamp)  ASSERT_FAIL (stck instruction not available)
#endif
/* length of one stck tick, in seconds */
#define STCK_PERIOD      (1.0 / 4096e6)   /* 2^-12 microseconds */
     277  
/* mftb
   Enhancement: On 64-bit chips mftb gives a 64-bit value, no need for mftbu
   and a loop (see powerpc64.asm).

   MFTB(a) stores the timebase into the two-element array "a", the low word
   in a[0] and the high word in a[1].  The upper word is read both before
   and after the lower word, and the reads are repeated until the two upper
   values agree, so the two halves belong to the same reading.

   On non-powerpc builds MFTB zeros the array and is then an ASSERT_FAIL.  */
#if HAVE_HOST_CPU_FAMILY_powerpc
static const int  have_mftb = 1;
#if defined (__GNUC__) && ! defined (NO_ASM)
#define MFTB(a)                         \
  do {                                  \
    unsigned  __h1, __l, __h2;          \
    do {                                \
      asm volatile ("mftbu %0\n"        \
		    "mftb  %1\n"        \
		    "mftbu %2"          \
		    : "=r" (__h1),      \
		      "=r" (__l),       \
		      "=r" (__h2));     \
    } while (__h1 != __h2);             \
    a[0] = __l;                         \
    a[1] = __h1;                        \
  } while (0)
#else
/* no gcc style asm, use mftb_function (not defined in this part of the
   file) */
#define MFTB(a)   mftb_function (a)
#endif
#else /* ! powerpc */
static const int  have_mftb = 0;
#define MFTB(a)                         \
  do {                                  \
    a[0] = 0;                           \
    a[1] = 0;                           \
    ASSERT_FAIL (mftb not available);   \
  } while (0)
#endif
     310  
/* Unicos 10.X has syssgi(), but not mmap().  Both are required for
   have_sgi to be set.  */
#if HAVE_SYSSGI && HAVE_MMAP
static const int  have_sgi = 1;
#else
static const int  have_sgi = 0;
#endif
     317  
/* have_rrt is 1 when read_real_time() is available.  Otherwise
   read_real_time() and time_base_to_time() become ASSERT_FAILs, and
   stand-ins are supplied for RTC_POWER, RTC_POWER_PC and timebasestruct_t
   so that code mentioning them still compiles.  */
#if HAVE_READ_REAL_TIME
static const int have_rrt = 1;
#else
static const int have_rrt = 0;
#define read_real_time(t,s)     ASSERT_FAIL (read_real_time not available)
#define time_base_to_time(t,s)  ASSERT_FAIL (time_base_to_time not available)
/* placeholder values, only used to keep the code compiling */
#define RTC_POWER     1
#define RTC_POWER_PC  2
#define timebasestruct_t   struct timebasestruct_dummy
struct timebasestruct_dummy {
  int             flag;
  unsigned int    tb_high;
  unsigned int    tb_low;
};
#endif
     333  
/* clock_gettime.  When it's missing, struct_timespec is a dummy type and
   clock_gettime() and clock_getres() are ASSERT_FAIL expressions with
   value -1.  */
#if HAVE_CLOCK_GETTIME
static const int have_cgt = 1;
#define struct_timespec  struct timespec
#else
static const int have_cgt = 0;
#define struct_timespec       struct timespec_dummy
#define clock_gettime(id,ts)  (ASSERT_FAIL (clock_gettime not available), -1)
#define clock_getres(id,ts)   (ASSERT_FAIL (clock_getres not available), -1)
#endif

/* getrusage.  When it's missing, struct_rusage is a dummy type and
   getrusage() is an ASSERT_FAIL.  */
#if HAVE_GETRUSAGE
static const int have_grus = 1;
#define struct_rusage   struct rusage
#else
static const int have_grus = 0;
#define getrusage(n,ru)  ASSERT_FAIL (getrusage not available)
#define struct_rusage    struct rusage_dummy
#endif

/* gettimeofday.  When it's missing, struct_timeval is a dummy type and
   gettimeofday() is an ASSERT_FAIL.  */
#if HAVE_GETTIMEOFDAY
static const int have_gtod = 1;
#define struct_timeval   struct timeval
#else
static const int have_gtod = 0;
#define gettimeofday(tv,tz)  ASSERT_FAIL (gettimeofday not available)
#define struct_timeval   struct timeval_dummy
#endif

/* times.  When it's missing, struct_tms is a dummy type and times() is an
   ASSERT_FAIL.  */
#if HAVE_TIMES
static const int have_times = 1;
#define struct_tms   struct tms
#else
static const int have_times = 0;
#define times(tms)   ASSERT_FAIL (times not available)
#define struct_tms   struct tms_dummy
#endif
     370  
/* Stand-in types selected by the struct_tms, struct_timeval, struct_rusage
   and struct_timespec macros when the corresponding function doesn't
   exist.  Each declares only a few fields.  */
struct tms_dummy {
  long  tms_utime;
};
struct timeval_dummy {
  long  tv_sec;
  long  tv_usec;
};
/* ru_utime is whichever timeval type struct_timeval selects, real or
   dummy */
struct rusage_dummy {
  struct_timeval ru_utime;
};
struct timespec_dummy {
  long  tv_sec;
  long  tv_nsec;
};
     385  
/* Which measuring methods are in use.  NOTE(review): presumably chosen at
   run time during initialization, the code setting these is not in this
   part of the file.  */
static int  use_cycles;
static int  use_mftb;
static int  use_sgi;
static int  use_rrt;
static int  use_cgt;
static int  use_gtod;
static int  use_grus;
static int  use_times;
static int  use_tick_boundary;

/* One slot per measuring method, each in that method's own data type.
   NOTE(review): presumably the readings taken by speed_starttime --
   confirm.  */
static unsigned         start_cycles[2];
static stck_t           start_stck;
static unsigned         start_mftb[2];
static unsigned         start_sgi;
static timebasestruct_t start_rrt;
static struct_timespec  start_cgt;
static struct_rusage    start_grus;
static struct_timeval   start_gtod;
static struct_tms       start_times;

/* NOTE(review): 1e100 looks like an "effectively unlimited" initial value,
   confirm where cycles_limit is set and compared.  */
static double  cycles_limit = 1e100;
/* Per-method measurement units.  cgt_unittime is filled in by cgt_works_p
   from clock_getres; the others are set outside this part of the file.  */
static double  mftb_unittime;
static double  sgi_unittime;
static double  cgt_unittime;
static double  grus_unittime;
static double  gtod_unittime;
static double  times_unittime;

/* for RTC_POWER format, ie. seconds and nanoseconds */
#define TIMEBASESTRUCT_SECS(t)  ((t)->tb_high + (t)->tb_low * 1e-9)
     416  
     417  
     418  /* Return a string representing a time in seconds, nicely formatted.
     419     Eg. "10.25ms".  */
     420  char *
     421  unittime_string (double t)
     422  {
     423    static char  buf[128];
     424  
     425    const char  *unit;
     426    int         prec;
     427  
     428    /* choose units and scale */
     429    if (t < 1e-6)
     430      t *= 1e9, unit = "ns";
     431    else if (t < 1e-3)
     432      t *= 1e6, unit = "us";
     433    else if (t < 1.0)
     434      t *= 1e3, unit = "ms";
     435    else
     436      unit = "s";
     437  
     438    /* want 4 significant figures */
     439    if (t < 1.0)
     440      prec = 4;
     441    else if (t < 10.0)
     442      prec = 3;
     443    else if (t < 100.0)
     444      prec = 2;
     445    else
     446      prec = 1;
     447  
     448    sprintf (buf, "%.*f%s", prec, t, unit);
     449    return buf;
     450  }
     451  
     452  
/* Jump target for cycles_works_handler, established by the setjmp in
   cycles_works_p.  */
static jmp_buf  cycles_works_buf;

/* SIGILL handler installed by cycles_works_p while it probes
   speed_cyclecounter().  Instead of returning to the faulting instruction
   it longjmps to cycles_works_buf, making the setjmp there return 1.
   "sig" is not used.  */
static RETSIGTYPE
cycles_works_handler (int sig)
{
  longjmp (cycles_works_buf, 1);
}
     460  
     461  int
     462  cycles_works_p (void)
     463  {
     464    static int  result = -1;
     465  
     466    if (result != -1)
     467      goto done;
     468  
     469    /* FIXME: On linux, the cycle counter is not saved and restored over
     470     * context switches, making it almost useless for precise cputime
     471     * measurements. When available, it's better to use clock_gettime,
     472     * which seems to have reasonable accuracy (tested on x86_32,
     473     * linux-2.6.26, glibc-2.7). However, there are also some linux
     474     * systems where clock_gettime is broken in one way or the other,
     475     * like CLOCK_PROCESS_CPUTIME_ID not implemented (easy case) or
     476     * kind-of implemented but broken (needs code to detect that), and
     477     * on those systems a wall-clock cycle counter is the least bad
     478     * fallback.
     479     *
     480     * So we need some code to disable the cycle counter on some but not
     481     * all linux systems. */
     482  #ifdef SIGILL
     483    {
     484      RETSIGTYPE (*old_handler) (int);
     485      unsigned  cycles[2];
     486  
     487      old_handler = signal (SIGILL, cycles_works_handler);
     488      if (old_handler == SIG_ERR)
     489        {
     490  	if (speed_option_verbose)
     491  	  printf ("cycles_works_p(): SIGILL not supported, assuming speed_cyclecounter() works\n");
     492  	goto yes;
     493        }
     494      if (setjmp (cycles_works_buf))
     495        {
     496  	if (speed_option_verbose)
     497  	  printf ("cycles_works_p(): SIGILL during speed_cyclecounter(), so doesn't work\n");
     498  	result = 0;
     499  	goto done;
     500        }
     501      speed_cyclecounter (cycles);
     502      signal (SIGILL, old_handler);
     503      if (speed_option_verbose)
     504        printf ("cycles_works_p(): speed_cyclecounter() works\n");
     505    }
     506  #else
     507  
     508    if (speed_option_verbose)
     509      printf ("cycles_works_p(): SIGILL not defined, assuming speed_cyclecounter() works\n");
     510    goto yes;
     511  #endif
     512  
     513   yes:
     514    result = 1;
     515  
     516   done:
     517    return result;
     518  }
     519  
     520  
     521  /* The number of clock ticks per second, but looking at sysconf rather than
     522     just CLK_TCK, where possible.  */
     523  long
     524  clk_tck (void)
     525  {
     526    static long  result = -1L;
     527    if (result != -1L)
     528      return result;
     529  
     530  #if HAVE_SYSCONF
     531    result = sysconf (_SC_CLK_TCK);
     532    if (result != -1L)
     533      {
     534        if (speed_option_verbose)
     535  	printf ("sysconf(_SC_CLK_TCK) is %ld per second\n", result);
     536        return result;
     537      }
     538  
     539    fprintf (stderr,
     540  	   "sysconf(_SC_CLK_TCK) not working, using CLK_TCK instead\n");
     541  #endif
     542  
     543  #ifdef CLK_TCK
     544    result = CLK_TCK;
     545    if (speed_option_verbose)
     546      printf ("CLK_TCK is %ld per second\n", result);
     547    return result;
     548  #else
     549    fprintf (stderr, "CLK_TCK not defined, cannot continue\n");
     550    abort ();
     551  #endif
     552  }
     553  
     554  
     555  /* If two times can be observed less than half a clock tick apart, then
     556     assume "get" is microsecond accurate.
     557  
     558     Two times only 1 microsecond apart are not believed, since some kernels
     559     take it upon themselves to ensure gettimeofday doesn't return the same
     560     value twice, for the benefit of applications using it for a timestamp.
     561     This is obviously very stupid given the speed of CPUs these days.
     562  
     563     Making "reps" many calls to noop_1() is designed to waste some CPU, with
     564     a view to getting measurements 2 microseconds (or more) apart.  "reps" is
     565     increased progressively until such a period is seen.
     566  
     567     The outer loop "attempts" are just to allow for any random nonsense or
     568     system load upsetting the measurements (ie. making two successive calls
     569     to "get" come out as a longer interval than normal).
     570  
     571     Bugs:
     572  
     573     The assumption that any interval less than a half tick implies
     574     microsecond resolution is obviously fairly rash, the true resolution
     575     could be anything between a microsecond and that half tick.  Perhaps
     576     something special would have to be done on a system where this is the
     577     case, since there's no obvious reliable way to detect it
     578     automatically.  */
     579  
/* MICROSECONDS_P expands to the complete body of an "int f (void)"
   function returning 1 for microsecond accuracy or 0 for clock tick
   accuracy.  Parameters:

     name   string naming the time function, for verbose messages
     type   data type of a time reading
     get    macro taking an lvalue of "type" and reading the time into it
     sec    macro giving the seconds field of a "type"
     usec   macro giving the microseconds field of a "type"  */
#define MICROSECONDS_P(name, type, get, sec, usec)                      \
  {                                                                     \
    static int  result = -1;                                            \
    type      st, et;                                                   \
    long      dt, half_tick;                                            \
    unsigned  attempt, reps, i, j;                                      \
									\
    /* the test is run once, later calls reuse its answer */            \
    if (result != -1)                                                   \
      return result;                                                    \
									\
    result = 0;                                                         \
    /* microseconds in half a clock tick */                             \
    half_tick = (1000000L / clk_tck ()) / 2;                            \
									\
    for (attempt = 0; attempt < 5; attempt++)                           \
      {                                                                 \
	reps = 0;                                                       \
	for (;;)                                                        \
	  {                                                             \
	    /* readings either side of reps*100 calls to noop_1 */      \
	    get (st);                                                   \
	    for (i = 0; i < reps; i++)                                  \
	      for (j = 0; j < 100; j++)                                 \
		noop_1 (CNST_LIMB(0));                                  \
	    get (et);                                                   \
									\
	    /* interval between the two readings, in microseconds */    \
	    dt = (sec(et)-sec(st))*1000000L + usec(et)-usec(st);        \
									\
	    if (speed_option_verbose >= 2)                              \
	      printf ("%s attempt=%u, reps=%u, dt=%ld\n",               \
		      name, attempt, reps, dt);                         \
									\
	    /* an interval of 2 microseconds or more is accepted */     \
	    if (dt >= 2)                                                \
	      break;                                                    \
									\
	    /* too short, go again with 1, 2, 4, ... reps */            \
	    reps = (reps == 0 ? 1 : 2*reps);                            \
	    if (reps == 0)                                              \
	      break;  /* uint overflow, not normal */                   \
	  }                                                             \
									\
	/* under half a tick is taken to mean microsecond accuracy */   \
	if (dt < half_tick)                                             \
	  {                                                             \
	    result = 1;                                                 \
	    break;                                                      \
	  }                                                             \
      }                                                                 \
									\
    if (speed_option_verbose)                                           \
      {                                                                 \
	if (result)                                                     \
	  printf ("%s is microsecond accurate\n", name);                \
	else                                                            \
	  printf ("%s is only %s clock tick accurate\n",                \
		  name, unittime_string (1.0/clk_tck()));               \
      }                                                                 \
    return result;                                                      \
  }
     635  
     636  
/* Return 1 if gettimeofday appears to be microsecond accurate, or 0 if it
   looks to be only clock tick accurate.  The whole body comes from
   MICROSECONDS_P, which runs the test on the first call and returns the
   remembered answer afterwards.  */
int
gettimeofday_microseconds_p (void)
{
  /* adaptors giving MICROSECONDS_P access to a struct_timeval */
#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
#define timeval_tv_sec(t)      ((t).tv_sec)
#define timeval_tv_usec(t)     ((t).tv_usec)
  MICROSECONDS_P ("gettimeofday", struct_timeval,
		  call_gettimeofday, timeval_tv_sec, timeval_tv_usec);
}
     646  
/* Return 1 if getrusage appears to be microsecond accurate, or 0 if it
   looks to be only clock tick accurate.  Only the ru_utime field is
   examined.  The whole body comes from MICROSECONDS_P, which runs the test
   on the first call and returns the remembered answer afterwards.  */
int
getrusage_microseconds_p (void)
{
  /* adaptors giving MICROSECONDS_P access to ru_utime in a struct_rusage.
     NOTE(review): the 0 passed to getrusage is presumably RUSAGE_SELF --
     confirm.  */
#define call_getrusage(t)   getrusage (0, &(t))
#define rusage_tv_sec(t)    ((t).ru_utime.tv_sec)
#define rusage_tv_usec(t)   ((t).ru_utime.tv_usec)
  MICROSECONDS_P ("getrusage", struct_rusage,
		  call_getrusage, rusage_tv_sec, rusage_tv_usec);
}
     656  
     657  /* Test whether getrusage goes backwards, return non-zero if it does
     658     (suggesting it's flawed).
     659  
     660     On a macintosh m68040-unknown-netbsd1.4.1 getrusage looks like it's
     661     microsecond accurate, but has been seen remaining unchanged after many
     662     microseconds have elapsed.  It also regularly goes backwards by 1000 to
     663     5000 usecs, this has been seen after between 500 and 4000 attempts taking
     664     perhaps 0.03 seconds.  We consider this too broken for good measuring.
     665     We used to have configure pretend getrusage didn't exist on this system,
     666     but a runtime test should be more reliable, since we imagine the problem
     667     is not confined to just this exact system tuple.  */
     668  
     669  int
     670  getrusage_backwards_p (void)
     671  {
     672    static int result = -1;
     673    struct rusage  start, prev, next;
     674    long  d;
     675    int   i;
     676  
     677    if (result != -1)
     678      return result;
     679  
     680    getrusage (0, &start);
     681    memcpy (&next, &start, sizeof (next));
     682  
     683    result = 0;
     684    i = 0;
     685    for (;;)
     686      {
     687        memcpy (&prev, &next, sizeof (prev));
     688        getrusage (0, &next);
     689  
     690        if (next.ru_utime.tv_sec < prev.ru_utime.tv_sec
     691  	  || (next.ru_utime.tv_sec == prev.ru_utime.tv_sec
     692  	      && next.ru_utime.tv_usec < prev.ru_utime.tv_usec))
     693  	{
     694  	  if (speed_option_verbose)
     695  	    printf ("getrusage went backwards (attempt %d: %ld.%06ld -> %ld.%06ld)\n",
     696  		    i,
     697  		    (long) prev.ru_utime.tv_sec, (long) prev.ru_utime.tv_usec,
     698  		    (long) next.ru_utime.tv_sec, (long) next.ru_utime.tv_usec);
     699  	  result = 1;
     700  	  break;
     701  	}
     702  
     703        /* minimum 1000 attempts, then stop after either 0.1 seconds or 50000
     704  	 attempts, whichever comes first */
     705        d = 1000000 * (next.ru_utime.tv_sec - start.ru_utime.tv_sec)
     706  	+ (next.ru_utime.tv_usec - start.ru_utime.tv_usec);
     707        i++;
     708        if (i > 50000 || (i > 1000 && d > 100000))
     709  	break;
     710      }
     711  
     712    return result;
     713  }
     714  
     715  /* CLOCK_PROCESS_CPUTIME_ID looks like it's going to be in a future version
     716     of glibc (some time post 2.2).
     717  
     718     CLOCK_VIRTUAL is process time, available in BSD systems (though sometimes
     719     defined, but returning -1 for an error).  */
     720  
/* CGT_ID is the clock id given to clock_gettime and clock_getres:
   CLOCK_PROCESS_CPUTIME_ID when defined, otherwise CLOCK_VIRTUAL when
   that's defined.  */
#ifdef CLOCK_PROCESS_CPUTIME_ID
# define CGT_ID        CLOCK_PROCESS_CPUTIME_ID
#else
# ifdef CLOCK_VIRTUAL
#  define CGT_ID       CLOCK_VIRTUAL
# endif
#endif
/* have_cgt_id says whether a clock id was found.  If not, CGT_ID is an
   ASSERT_FAIL expression with value -1.
   NOTE(review): have_cgt_id is not "static", unlike the other have_*
   constants in this file -- confirm whether external linkage is wanted.  */
#ifdef CGT_ID
const int  have_cgt_id = 1;
#else
const int  have_cgt_id = 0;
# define CGT_ID       (ASSERT_FAIL (CGT_ID not determined), -1)
#endif

/* Iterations of the delay loop cgt_works_p runs between two clock_gettime
   readings when checking a claimed resolution finer than 10 ns.  */
#define CGT_DELAY_COUNT 1000
     736  
     737  int
     738  cgt_works_p (void)
     739  {
     740    static int  result = -1;
     741    struct_timespec  unit;
     742  
     743    if (! have_cgt)
     744      return 0;
     745  
     746    if (! have_cgt_id)
     747      {
     748        if (speed_option_verbose)
     749  	printf ("clock_gettime don't know what ID to use\n");
     750        result = 0;
     751        return result;
     752      }
     753  
     754    if (result != -1)
     755      return result;
     756  
     757    /* trial run to see if it works */
     758    if (clock_gettime (CGT_ID, &unit) != 0)
     759      {
     760        if (speed_option_verbose)
     761  	printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
     762        result = 0;
     763        return result;
     764      }
     765  
     766    /* get the resolution */
     767    if (clock_getres (CGT_ID, &unit) != 0)
     768      {
     769        if (speed_option_verbose)
     770  	printf ("clock_getres id=%d error: %s\n", CGT_ID, strerror (errno));
     771        result = 0;
     772        return result;
     773      }
     774  
     775    cgt_unittime = unit.tv_sec + unit.tv_nsec * 1e-9;
     776    if (speed_option_verbose)
     777      printf ("clock_gettime is %s accurate\n", unittime_string (cgt_unittime));
     778  
     779    if (cgt_unittime < 10e-9)
     780      {
     781        /* Do we believe this? */
     782        struct timespec start, end;
     783        static volatile int counter;
     784        double duration;
     785        if (clock_gettime (CGT_ID, &start))
     786  	{
     787  	  if (speed_option_verbose)
     788  	    printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
     789  	  result = 0;
     790  	  return result;
     791  	}
     792        /* Loop of at least 1000 memory accesses, ought to take at
     793  	 least 100 ns*/
     794        for (counter = 0; counter < CGT_DELAY_COUNT; counter++)
     795  	;
     796        if (clock_gettime (CGT_ID, &end))
     797  	{
     798  	  if (speed_option_verbose)
     799  	    printf ("clock_gettime id=%d error: %s\n", CGT_ID, strerror (errno));
     800  	  result = 0;
     801  	  return result;
     802  	}
     803        duration = (end.tv_sec + end.tv_nsec * 1e-9
     804  		  - start.tv_sec - start.tv_nsec * 1e-9);
     805        if (speed_option_verbose)
     806  	printf ("delay loop of %d rounds took %s (according to clock_gettime)\n",
     807  		CGT_DELAY_COUNT, unittime_string (duration));
     808        if (duration < 100e-9)
     809  	{
     810  	  if (speed_option_verbose)
     811  	    printf ("clock_gettime id=%d not believable\n", CGT_ID);
     812  	  result = 0;
     813  	  return result;
     814  	}
     815      }
     816    result = 1;
     817    return result;
     818  }
     819  
     820  
/* One measurement of the mftb period against gettimeofday.  This is the
   callback mftb_works_p hands to freq_measure.  The whole body comes from
   the FREQ_MEASURE_ONE macro (defined outside this part of the file); the
   three helper macros below adapt gettimeofday and struct timeval fields
   to the accessor arguments that macro takes.  */
static double
freq_measure_mftb_one (void)
{
#define call_gettimeofday(t)   gettimeofday (&(t), NULL)
#define timeval_tv_sec(t)      ((t).tv_sec)
#define timeval_tv_usec(t)     ((t).tv_usec)
  FREQ_MEASURE_ONE ("mftb", struct_timeval,
		    call_gettimeofday, MFTB,
		    timeval_tv_sec, timeval_tv_usec);
}
     831  
     832  
/* Jump target set by mftb_works_p with setjmp before it tries MFTB.  */
static jmp_buf  mftb_works_buf;

/* Signal handler installed for SIGILL by mftb_works_p.  Ignores "sig" and
   jumps back to the setjmp point in mftb_works_p, making that setjmp
   return 1.  */
static RETSIGTYPE
mftb_works_handler (int sig)
{
  longjmp (mftb_works_buf, 1);
}
     840  
     841  int
     842  mftb_works_p (void)
     843  {
     844    unsigned   a[2];
     845    RETSIGTYPE (*old_handler) (int);
     846    double     cycletime;
     847  
     848    /* suppress a warning about a[] unused */
     849    a[0] = 0;
     850  
     851    if (! have_mftb)
     852      return 0;
     853  
     854  #ifdef SIGILL
     855    old_handler = signal (SIGILL, mftb_works_handler);
     856    if (old_handler == SIG_ERR)
     857      {
     858        if (speed_option_verbose)
     859  	printf ("mftb_works_p(): SIGILL not supported, assuming mftb works\n");
     860        return 1;
     861      }
     862    if (setjmp (mftb_works_buf))
     863      {
     864        if (speed_option_verbose)
     865  	printf ("mftb_works_p(): SIGILL during mftb, so doesn't work\n");
     866        return 0;
     867      }
     868    MFTB (a);
     869    signal (SIGILL, old_handler);
     870    if (speed_option_verbose)
     871      printf ("mftb_works_p(): mftb works\n");
     872  #else
     873  
     874    if (speed_option_verbose)
     875      printf ("mftb_works_p(): SIGILL not defined, assuming mftb works\n");
     876  #endif
     877  
     878  #if ! HAVE_GETTIMEOFDAY
     879    if (speed_option_verbose)
     880      printf ("mftb_works_p(): no gettimeofday available to measure mftb\n");
     881    return 0;
     882  #endif
     883  
     884    /* The time base is normally 1/4 of the bus speed on 6xx and 7xx chips, on
     885       other chips it can be driven from an external clock. */
     886    cycletime = freq_measure ("mftb", freq_measure_mftb_one);
     887    if (cycletime == -1.0)
     888      {
     889        if (speed_option_verbose)
     890  	printf ("mftb_works_p(): cannot measure mftb period\n");
     891        return 0;
     892      }
     893  
     894    mftb_unittime = cycletime;
     895    return 1;
     896  }
     897  
     898  
     899  volatile unsigned  *sgi_addr;
     900  
     901  int
     902  sgi_works_p (void)
     903  {
     904  #if HAVE_SYSSGI && HAVE_MMAP
     905    static int  result = -1;
     906  
     907    size_t          pagesize, offset;
     908    __psunsigned_t  phys, physpage;
     909    void            *virtpage;
     910    unsigned        period_picoseconds;
     911    int             size, fd;
     912  
     913    if (result != -1)
     914      return result;
     915  
     916    phys = syssgi (SGI_QUERY_CYCLECNTR, &period_picoseconds);
     917    if (phys == (__psunsigned_t) -1)
     918      {
     919        /* ENODEV is the error when a counter is not available */
     920        if (speed_option_verbose)
     921  	printf ("syssgi SGI_QUERY_CYCLECNTR error: %s\n", strerror (errno));
     922        result = 0;
     923        return result;
     924      }
     925    sgi_unittime = period_picoseconds * 1e-12;
     926  
     927    /* IRIX 5 doesn't have SGI_CYCLECNTR_SIZE, assume 32 bits in that case.
     928       Challenge/ONYX hardware has a 64 bit byte counter, but there seems no
     929       obvious way to identify that without SGI_CYCLECNTR_SIZE.  */
     930  #ifdef SGI_CYCLECNTR_SIZE
     931    size = syssgi (SGI_CYCLECNTR_SIZE);
     932    if (size == -1)
     933      {
     934        if (speed_option_verbose)
     935  	{
     936  	  printf ("syssgi SGI_CYCLECNTR_SIZE error: %s\n", strerror (errno));
     937  	  printf ("    will assume size==4\n");
     938  	}
     939        size = 32;
     940      }
     941  #else
     942    size = 32;
     943  #endif
     944  
     945    if (size < 32)
     946      {
     947        printf ("syssgi SGI_CYCLECNTR_SIZE gives %d, expected 32 or 64\n", size);
     948        result = 0;
     949        return result;
     950      }
     951  
     952    pagesize = getpagesize();
     953    offset = (size_t) phys & (pagesize-1);
     954    physpage = phys - offset;
     955  
     956    /* shouldn't cross over a page boundary */
     957    ASSERT_ALWAYS (offset + size/8 <= pagesize);
     958  
     959    fd = open("/dev/mmem", O_RDONLY);
     960    if (fd == -1)
     961      {
     962        if (speed_option_verbose)
     963  	printf ("open /dev/mmem: %s\n", strerror (errno));
     964        result = 0;
     965        return result;
     966      }
     967  
     968    virtpage = mmap (0, pagesize, PROT_READ, MAP_PRIVATE, fd, (off_t) physpage);
     969    if (virtpage == (void *) -1)
     970      {
     971        if (speed_option_verbose)
     972  	printf ("mmap /dev/mmem: %s\n", strerror (errno));
     973        result = 0;
     974        return result;
     975      }
     976  
     977    /* address of least significant 4 bytes, knowing mips is big endian */
     978    sgi_addr = (unsigned *) ((char *) virtpage + offset
     979  			   + size/8 - sizeof(unsigned));
     980    result = 1;
     981    return result;
     982  
     983  #else /* ! (HAVE_SYSSGI && HAVE_MMAP) */
     984    return 0;
     985  #endif
     986  }
     987  
     988  
/* Set "var" to "n" if "var" is currently zero, otherwise leave it alone.
   "var" is evaluated more than once, so it should be a plain lvalue.  */
#define DEFAULT(var,n)  \
  do {                  \
    if (! (var))        \
      (var) = (n);      \
  } while (0)
     994  
     995  void
     996  speed_time_init (void)
     997  {
     998    double supplement_unittime = 0.0;
     999  
    1000    static int  speed_time_initialized = 0;
    1001    if (speed_time_initialized)
    1002      return;
    1003    speed_time_initialized = 1;
    1004  
    1005    speed_cycletime_init ();
    1006  
    1007    if (!speed_option_cycles_broken && have_cycles && cycles_works_p ())
    1008      {
    1009        use_cycles = 1;
    1010        DEFAULT (speed_cycletime, 1.0);
    1011        speed_unittime = speed_cycletime;
    1012        DEFAULT (speed_precision, 10000);
    1013        strcpy (speed_time_string, "CPU cycle counter");
    1014  
    1015        /* only used if a supplementary method is chosen below */
    1016        cycles_limit = (have_cycles == 1 ? M_2POW32 : M_2POW64) / 2.0
    1017  	* speed_cycletime;
    1018  
    1019        if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p())
    1020  	{
    1021  	  /* this is a good combination */
    1022  	  use_grus = 1;
    1023  	  supplement_unittime = grus_unittime = 1.0e-6;
    1024  	  strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond getrusage()");
    1025  	}
    1026        else if (have_cycles == 1)
    1027  	{
    1028  	  /* When speed_cyclecounter has a limited range, look for something
    1029  	     to supplement it. */
    1030  	  if (have_gtod && gettimeofday_microseconds_p())
    1031  	    {
    1032  	      use_gtod = 1;
    1033  	      supplement_unittime = gtod_unittime = 1.0e-6;
    1034  	      strcpy (speed_time_string, "CPU cycle counter, supplemented by microsecond gettimeofday()");
    1035  	    }
    1036  	  else if (have_grus)
    1037  	    {
    1038  	      use_grus = 1;
    1039  	      supplement_unittime = grus_unittime = 1.0 / (double) clk_tck ();
    1040  	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick getrusage()", unittime_string (supplement_unittime));
    1041  	    }
    1042  	  else if (have_times)
    1043  	    {
    1044  	      use_times = 1;
    1045  	      supplement_unittime = times_unittime = 1.0 / (double) clk_tck ();
    1046  	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick times()", unittime_string (supplement_unittime));
    1047  	    }
    1048  	  else if (have_gtod)
    1049  	    {
    1050  	      use_gtod = 1;
    1051  	      supplement_unittime = gtod_unittime = 1.0 / (double) clk_tck ();
    1052  	      sprintf (speed_time_string, "CPU cycle counter, supplemented by %s clock tick gettimeofday()", unittime_string (supplement_unittime));
    1053  	    }
    1054  	  else
    1055  	    {
    1056  	      fprintf (stderr, "WARNING: cycle counter is 32 bits and there's no other functions.\n");
    1057  	      fprintf (stderr, "    Wraparounds may produce bad results on long measurements.\n");
    1058  	    }
    1059  	}
    1060  
    1061        if (use_grus || use_times || use_gtod)
    1062  	{
    1063  	  /* must know cycle period to compare cycles to other measuring
    1064  	     (via cycles_limit) */
    1065  	  speed_cycletime_need_seconds ();
    1066  
    1067  	  if (speed_precision * supplement_unittime > cycles_limit)
    1068  	    {
    1069  	      fprintf (stderr, "WARNING: requested precision can't always be achieved due to limited range\n");
    1070  	      fprintf (stderr, "    cycle counter and limited precision supplemental method\n");
    1071  	      fprintf (stderr, "    (%s)\n", speed_time_string);
    1072  	    }
    1073  	}
    1074      }
    1075    else if (have_stck)
    1076      {
    1077        strcpy (speed_time_string, "STCK timestamp");
    1078        /* stck is in units of 2^-12 microseconds, which is very likely higher
    1079  	 resolution than a cpu cycle */
    1080        if (speed_cycletime == 0.0)
    1081  	speed_cycletime_fail
    1082  	  ("Need to know CPU frequency for effective stck unit");
    1083        speed_unittime = MAX (speed_cycletime, STCK_PERIOD);
    1084        DEFAULT (speed_precision, 10000);
    1085      }
    1086    else if (have_mftb && mftb_works_p ())
    1087      {
    1088        use_mftb = 1;
    1089        DEFAULT (speed_precision, 10000);
    1090        speed_unittime = mftb_unittime;
    1091        sprintf (speed_time_string, "mftb counter (%s)",
    1092  	       unittime_string (speed_unittime));
    1093      }
    1094    else if (have_sgi && sgi_works_p ())
    1095      {
    1096        use_sgi = 1;
    1097        DEFAULT (speed_precision, 10000);
    1098        speed_unittime = sgi_unittime;
    1099        sprintf (speed_time_string, "syssgi() mmap counter (%s), supplemented by millisecond getrusage()",
    1100  	       unittime_string (speed_unittime));
    1101        /* supplemented with getrusage, which we assume to have 1ms resolution */
    1102        use_grus = 1;
    1103        supplement_unittime = 1e-3;
    1104      }
    1105    else if (have_rrt)
    1106      {
    1107        timebasestruct_t  t;
    1108        use_rrt = 1;
    1109        DEFAULT (speed_precision, 10000);
    1110        read_real_time (&t, sizeof(t));
    1111        switch (t.flag) {
    1112        case RTC_POWER:
    1113  	/* FIXME: What's the actual RTC resolution? */
    1114  	speed_unittime = 1e-7;
    1115  	strcpy (speed_time_string, "read_real_time() power nanoseconds");
    1116  	break;
    1117        case RTC_POWER_PC:
    1118  	t.tb_high = 1;
    1119  	t.tb_low = 0;
    1120  	time_base_to_time (&t, sizeof(t));
    1121  	speed_unittime = TIMEBASESTRUCT_SECS(&t) / M_2POW32;
    1122  	sprintf (speed_time_string, "%s read_real_time() powerpc ticks",
    1123  		 unittime_string (speed_unittime));
    1124  	break;
    1125        default:
    1126  	fprintf (stderr, "ERROR: Unrecognised timebasestruct_t flag=%d\n",
    1127  		 t.flag);
    1128  	abort ();
    1129        }
    1130      }
    1131    else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5e-6)
    1132      {
    1133        /* use clock_gettime if microsecond or better resolution */
    1134      choose_cgt:
    1135        use_cgt = 1;
    1136        speed_unittime = cgt_unittime;
    1137        DEFAULT (speed_precision, (cgt_unittime <= 0.1e-6 ? 10000 : 1000));
    1138        strcpy (speed_time_string, "microsecond accurate clock_gettime()");
    1139      }
    1140    else if (have_times && clk_tck() > 1000000)
    1141      {
    1142        /* Cray vector systems have times() which is clock cycle resolution
    1143  	 (eg. 450 MHz).  */
    1144        DEFAULT (speed_precision, 10000);
    1145        goto choose_times;
    1146      }
    1147    else if (have_grus && getrusage_microseconds_p() && ! getrusage_backwards_p())
    1148      {
    1149        use_grus = 1;
    1150        speed_unittime = grus_unittime = 1.0e-6;
    1151        DEFAULT (speed_precision, 1000);
    1152        strcpy (speed_time_string, "microsecond accurate getrusage()");
    1153      }
    1154    else if (have_gtod && gettimeofday_microseconds_p())
    1155      {
    1156        use_gtod = 1;
    1157        speed_unittime = gtod_unittime = 1.0e-6;
    1158        DEFAULT (speed_precision, 1000);
    1159        strcpy (speed_time_string, "microsecond accurate gettimeofday()");
    1160      }
    1161    else if (have_cgt && cgt_works_p() && cgt_unittime < 1.5/clk_tck())
    1162      {
    1163        /* use clock_gettime if 1 tick or better resolution */
    1164        goto choose_cgt;
    1165      }
    1166    else if (have_times)
    1167      {
    1168        use_tick_boundary = 1;
    1169        DEFAULT (speed_precision, 200);
    1170      choose_times:
    1171        use_times = 1;
    1172        speed_unittime = times_unittime = 1.0 / (double) clk_tck ();
    1173        sprintf (speed_time_string, "%s clock tick times()",
    1174  	       unittime_string (speed_unittime));
    1175      }
    1176    else if (have_grus)
    1177      {
    1178        use_grus = 1;
    1179        use_tick_boundary = 1;
    1180        speed_unittime = grus_unittime = 1.0 / (double) clk_tck ();
    1181        DEFAULT (speed_precision, 200);
    1182        sprintf (speed_time_string, "%s clock tick getrusage()\n",
    1183  	       unittime_string (speed_unittime));
    1184      }
    1185    else if (have_gtod)
    1186      {
    1187        use_gtod = 1;
    1188        use_tick_boundary = 1;
    1189        speed_unittime = gtod_unittime = 1.0 / (double) clk_tck ();
    1190        DEFAULT (speed_precision, 200);
    1191        sprintf (speed_time_string, "%s clock tick gettimeofday()",
    1192  	       unittime_string (speed_unittime));
    1193      }
    1194    else
    1195      {
    1196        fprintf (stderr, "No time measuring method available\n");
    1197        fprintf (stderr, "None of: speed_cyclecounter(), STCK(), getrusage(), gettimeofday(), times()\n");
    1198        abort ();
    1199      }
    1200  
    1201    if (speed_option_verbose)
    1202      {
    1203        printf ("speed_time_init: %s\n", speed_time_string);
    1204        printf ("    speed_precision     %d\n", speed_precision);
    1205        printf ("    speed_unittime      %.2g\n", speed_unittime);
    1206        if (supplement_unittime)
    1207  	printf ("    supplement_unittime %.2g\n", supplement_unittime);
    1208        printf ("    use_tick_boundary   %d\n", use_tick_boundary);
    1209        if (have_cycles)
    1210  	printf ("    cycles_limit        %.2g seconds\n", cycles_limit);
    1211      }
    1212  }
    1213  
    1214  
    1215  
    1216  /* Burn up CPU until a clock tick boundary, for greater accuracy.  Set the
    1217     corresponding "start_foo" appropriately too. */
    1218  
    1219  void
    1220  grus_tick_boundary (void)
    1221  {
    1222    struct_rusage  prev;
    1223    getrusage (0, &prev);
    1224    do {
    1225      getrusage (0, &start_grus);
    1226    } while (start_grus.ru_utime.tv_usec == prev.ru_utime.tv_usec);
    1227  }
    1228  
    1229  void
    1230  gtod_tick_boundary (void)
    1231  {
    1232    struct_timeval  prev;
    1233    gettimeofday (&prev, NULL);
    1234    do {
    1235      gettimeofday (&start_gtod, NULL);
    1236    } while (start_gtod.tv_usec == prev.tv_usec);
    1237  }
    1238  
    1239  void
    1240  times_tick_boundary (void)
    1241  {
    1242    struct_tms  prev;
    1243    times (&prev);
    1244    do
    1245      times (&start_times);
    1246    while (start_times.tms_utime == prev.tms_utime);
    1247  }
    1248  
    1249  
    1250  /* "have_" values are tested to let unused code go dead.  */
    1251  
    1252  void
    1253  speed_starttime (void)
    1254  {
    1255    speed_time_init ();
    1256  
    1257    if (have_grus && use_grus)
    1258      {
    1259        if (use_tick_boundary)
    1260  	grus_tick_boundary ();
    1261        else
    1262  	getrusage (0, &start_grus);
    1263      }
    1264  
    1265    if (have_gtod && use_gtod)
    1266      {
    1267        if (use_tick_boundary)
    1268  	gtod_tick_boundary ();
    1269        else
    1270  	gettimeofday (&start_gtod, NULL);
    1271      }
    1272  
    1273    if (have_times && use_times)
    1274      {
    1275        if (use_tick_boundary)
    1276  	times_tick_boundary ();
    1277        else
    1278  	times (&start_times);
    1279      }
    1280  
    1281    if (have_cgt && use_cgt)
    1282      clock_gettime (CGT_ID, &start_cgt);
    1283  
    1284    if (have_rrt && use_rrt)
    1285      read_real_time (&start_rrt, sizeof(start_rrt));
    1286  
    1287    if (have_sgi && use_sgi)
    1288      start_sgi = *sgi_addr;
    1289  
    1290    if (have_mftb && use_mftb)
    1291      MFTB (start_mftb);
    1292  
    1293    if (have_stck && use_stck)
    1294      STCK (start_stck);
    1295  
    1296    /* Cycles sampled last for maximum accuracy. */
    1297    if (have_cycles && use_cycles)
    1298      speed_cyclecounter (start_cycles);
    1299  }
    1300  
    1301  
    1302  /* Calculate the difference between two cycle counter samples, as a "double"
    1303     counter of cycles.
    1304  
    1305     The start and end values are allowed to cancel in integers in case the
    1306     counter values are bigger than the 53 bits that normally fit in a double.
    1307  
    1308     This works even if speed_cyclecounter() puts a value bigger than 32-bits
    1309     in the low word (the high word always gets a 2**32 multiplier though). */
    1310  
    1311  double
    1312  speed_cyclecounter_diff (const unsigned end[2], const unsigned start[2])
    1313  {
    1314    unsigned  d;
    1315    double    t;
    1316  
    1317    if (have_cycles == 1)
    1318      {
    1319        t = (end[0] - start[0]);
    1320      }
    1321    else
    1322      {
    1323        d = end[0] - start[0];
    1324        t = d - (d > end[0] ? M_2POWU : 0.0);
    1325        t += (end[1] - start[1]) * M_2POW32;
    1326      }
    1327    return t;
    1328  }
    1329  
    1330  
    1331  double
    1332  speed_mftb_diff (const unsigned end[2], const unsigned start[2])
    1333  {
    1334    unsigned  d;
    1335    double    t;
    1336  
    1337    d = end[0] - start[0];
    1338    t = (double) d - (d > end[0] ? M_2POW32 : 0.0);
    1339    t += (end[1] - start[1]) * M_2POW32;
    1340    return t;
    1341  }
    1342  
    1343  
    1344  /* Calculate the difference between "start" and "end" using fields "sec" and
    1345     "psec", where each "psec" is a "punit" of a second.
    1346  
    1347     The seconds parts are allowed to cancel before being combined with the
    1348     psec parts, in case a simple "sec+psec*punit" exceeds the precision of a
    1349     double.
    1350  
    1351     Total time is only calculated in a "double" since an integer count of
    1352     psecs might overflow.  2^32 microseconds is only a bit over an hour, or
    1353     2^32 nanoseconds only about 4 seconds.
    1354  
    1355     The casts to "long" are for the benefit of timebasestruct_t, where the
    1356     fields are only "unsigned int", but we want a signed difference.  */
    1357  
/* Expands to a complete function body, braces included.  The enclosing
   function must have pointer parameters named "end" and "start"; the
   result returned is (end - start) in seconds as a double.  */
#define DIFF_SECS_ROUTINE(sec, psec, punit)                     \
  {                                                             \
    long  sec_diff, psec_diff;                                  \
    sec_diff = (long) end->sec - (long) start->sec;             \
    psec_diff = (long) end->psec - (long) start->psec;          \
    return (double) sec_diff + punit * (double) psec_diff;      \
  }
    1365  
/* Return end minus start in seconds, from the tv_sec and tv_usec
   (microseconds) fields.  */
double
timeval_diff_secs (const struct_timeval *end, const struct_timeval *start)
{
  DIFF_SECS_ROUTINE (tv_sec, tv_usec, 1e-6);
}
    1371  
/* Return end minus start in seconds, looking only at the ru_utime field
   (tv_sec and microseconds tv_usec) of each.  */
double
rusage_diff_secs (const struct_rusage *end, const struct_rusage *start)
{
  DIFF_SECS_ROUTINE (ru_utime.tv_sec, ru_utime.tv_usec, 1e-6);
}
    1377  
/* Return end minus start in seconds, from the tv_sec and tv_nsec
   (nanoseconds) fields.  */
double
timespec_diff_secs (const struct_timespec *end, const struct_timespec *start)
{
  DIFF_SECS_ROUTINE (tv_sec, tv_nsec, 1e-9);
}
    1383  
/* This is for use after time_base_to_time, ie. for seconds and nanoseconds.
   Return end minus start in seconds, treating tb_high as seconds and tb_low
   as nanoseconds.  */
double
timebasestruct_diff_secs (const timebasestruct_t *end,
			  const timebasestruct_t *start)
{
  DIFF_SECS_ROUTINE (tb_high, tb_low, 1e-9);
}
    1391  
    1392  
    1393  double
    1394  speed_endtime (void)
    1395  {
    1396  #define END_USE(name,value)                             \
    1397    do {                                                  \
    1398      if (speed_option_verbose >= 3)                      \
    1399        printf ("speed_endtime(): used %s\n", name);      \
    1400      result = value;                                     \
    1401      goto done;                                          \
    1402    } while (0)
    1403  
    1404  #define END_ENOUGH(name,value)                                          \
    1405    do {                                                                  \
    1406      if (speed_option_verbose >= 3)                                      \
    1407        printf ("speed_endtime(): %s gives enough precision\n", name);    \
    1408      result = value;                                                     \
    1409      goto done;                                                          \
    1410    } while (0)
    1411  
    1412  #define END_EXCEED(name,value)                                            \
    1413    do {                                                                    \
    1414      if (speed_option_verbose >= 3)                                        \
    1415        printf ("speed_endtime(): cycle counter limit exceeded, used %s\n", \
    1416  	      name);                                                      \
    1417      result = value;                                                       \
    1418      goto done;                                                            \
    1419    } while (0)
    1420  
    1421    unsigned          end_cycles[2];
    1422    stck_t            end_stck;
    1423    unsigned          end_mftb[2];
    1424    unsigned          end_sgi;
    1425    timebasestruct_t  end_rrt;
    1426    struct_timespec   end_cgt;
    1427    struct_timeval    end_gtod;
    1428    struct_rusage     end_grus;
    1429    struct_tms        end_times;
    1430    double            t_gtod, t_grus, t_times, t_cgt;
    1431    double            t_rrt, t_sgi, t_mftb, t_stck, t_cycles;
    1432    double            result;
    1433  
    1434    /* Cycles sampled first for maximum accuracy.
    1435       "have_" values tested to let unused code go dead.  */
    1436  
    1437    if (have_cycles && use_cycles)  speed_cyclecounter (end_cycles);
    1438    if (have_stck   && use_stck)    STCK (end_stck);
    1439    if (have_mftb   && use_mftb)    MFTB (end_mftb);
    1440    if (have_sgi    && use_sgi)     end_sgi = *sgi_addr;
    1441    if (have_rrt    && use_rrt)     read_real_time (&end_rrt, sizeof(end_rrt));
    1442    if (have_cgt    && use_cgt)     clock_gettime (CGT_ID, &end_cgt);
    1443    if (have_gtod   && use_gtod)    gettimeofday (&end_gtod, NULL);
    1444    if (have_grus   && use_grus)    getrusage (0, &end_grus);
    1445    if (have_times  && use_times)   times (&end_times);
    1446  
    1447    result = -1.0;
    1448  
    1449    if (speed_option_verbose >= 4)
    1450      {
    1451        printf ("speed_endtime():\n");
    1452        if (use_cycles)
    1453  	printf ("   cycles  0x%X,0x%X -> 0x%X,0x%X\n",
    1454  		start_cycles[1], start_cycles[0],
    1455  		end_cycles[1], end_cycles[0]);
    1456  
    1457        if (use_stck)
    1458  	printf ("   stck  0x%lX -> 0x%lX\n", start_stck, end_stck);
    1459  
    1460        if (use_mftb)
    1461  	printf ("   mftb  0x%X,%08X -> 0x%X,%08X\n",
    1462  		start_mftb[1], start_mftb[0],
    1463  		end_mftb[1], end_mftb[0]);
    1464  
    1465        if (use_sgi)
    1466  	printf ("   sgi  0x%X -> 0x%X\n", start_sgi, end_sgi);
    1467  
    1468        if (use_rrt)
    1469  	printf ("   read_real_time  (%d)%u,%u -> (%d)%u,%u\n",
    1470  		start_rrt.flag, start_rrt.tb_high, start_rrt.tb_low,
    1471  		end_rrt.flag, end_rrt.tb_high, end_rrt.tb_low);
    1472  
    1473        if (use_cgt)
    1474  	printf ("   clock_gettime  %ld.%09ld -> %ld.%09ld\n",
    1475  		(long) start_cgt.tv_sec, (long) start_cgt.tv_nsec,
    1476  		(long) end_cgt.tv_sec, (long) end_cgt.tv_nsec);
    1477  
    1478        if (use_gtod)
    1479  	printf ("   gettimeofday  %ld.%06ld -> %ld.%06ld\n",
    1480  		(long) start_gtod.tv_sec, (long) start_gtod.tv_usec,
    1481  		(long) end_gtod.tv_sec, (long) end_gtod.tv_usec);
    1482  
    1483        if (use_grus)
    1484  	printf ("   getrusage  %ld.%06ld -> %ld.%06ld\n",
    1485  		(long) start_grus.ru_utime.tv_sec,
    1486  		(long) start_grus.ru_utime.tv_usec,
    1487  		(long) end_grus.ru_utime.tv_sec,
    1488  		(long) end_grus.ru_utime.tv_usec);
    1489  
    1490        if (use_times)
    1491  	printf ("   times  %ld -> %ld\n",
    1492  		start_times.tms_utime, end_times.tms_utime);
    1493      }
    1494  
    1495    if (use_rrt)
    1496      {
    1497        time_base_to_time (&start_rrt, sizeof(start_rrt));
    1498        time_base_to_time (&end_rrt, sizeof(end_rrt));
    1499        t_rrt = timebasestruct_diff_secs (&end_rrt, &start_rrt);
    1500        END_USE ("read_real_time()", t_rrt);
    1501      }
    1502  
    1503    if (use_cgt)
    1504      {
    1505        t_cgt = timespec_diff_secs (&end_cgt, &start_cgt);
    1506        END_USE ("clock_gettime()", t_cgt);
    1507      }
    1508  
    1509    if (use_grus)
    1510      {
    1511        t_grus = rusage_diff_secs (&end_grus, &start_grus);
    1512  
    1513        /* Use getrusage() if the cycle counter limit would be exceeded, or if
    1514  	 it provides enough accuracy already. */
    1515        if (use_cycles)
    1516  	{
    1517  	  if (t_grus >= speed_precision*grus_unittime)
    1518  	    END_ENOUGH ("getrusage()", t_grus);
    1519  	  if (t_grus >= cycles_limit)
    1520  	    END_EXCEED ("getrusage()", t_grus);
    1521  	}
    1522      }
    1523  
    1524    if (use_times)
    1525      {
    1526        t_times = (end_times.tms_utime - start_times.tms_utime) * times_unittime;
    1527  
    1528        /* Use times() if the cycle counter limit would be exceeded, or if
    1529  	 it provides enough accuracy already. */
    1530        if (use_cycles)
    1531  	{
    1532  	  if (t_times >= speed_precision*times_unittime)
    1533  	    END_ENOUGH ("times()", t_times);
    1534  	  if (t_times >= cycles_limit)
    1535  	    END_EXCEED ("times()", t_times);
    1536  	}
    1537      }
    1538  
    1539    if (use_gtod)
    1540      {
    1541        t_gtod = timeval_diff_secs (&end_gtod, &start_gtod);
    1542  
    1543        /* Use gettimeofday() if it measured a value bigger than the cycle
    1544  	 counter can handle.  */
    1545        if (use_cycles)
    1546  	{
    1547  	  if (t_gtod >= cycles_limit)
    1548  	    END_EXCEED ("gettimeofday()", t_gtod);
    1549  	}
    1550      }
    1551  
    1552    if (use_mftb)
    1553      {
    1554        t_mftb = speed_mftb_diff (end_mftb, start_mftb) * mftb_unittime;
    1555        END_USE ("mftb", t_mftb);
    1556      }
    1557  
    1558    if (use_stck)
    1559      {
    1560        t_stck = (end_stck - start_stck) * STCK_PERIOD;
    1561        END_USE ("stck", t_stck);
    1562      }
    1563  
    1564    if (use_sgi)
    1565      {
    1566        t_sgi = (end_sgi - start_sgi) * sgi_unittime;
    1567        END_USE ("SGI hardware counter", t_sgi);
    1568      }
    1569  
    1570    if (use_cycles)
    1571      {
    1572        t_cycles = speed_cyclecounter_diff (end_cycles, start_cycles)
    1573  	* speed_cycletime;
    1574        END_USE ("cycle counter", t_cycles);
    1575      }
    1576  
    1577    if (use_grus && getrusage_microseconds_p())
    1578      END_USE ("getrusage()", t_grus);
    1579  
    1580    if (use_gtod && gettimeofday_microseconds_p())
    1581      END_USE ("gettimeofday()", t_gtod);
    1582  
    1583    if (use_times)  END_USE ("times()",        t_times);
    1584    if (use_grus)   END_USE ("getrusage()",    t_grus);
    1585    if (use_gtod)   END_USE ("gettimeofday()", t_gtod);
    1586  
    1587    fprintf (stderr, "speed_endtime(): oops, no time method available\n");
    1588    abort ();
    1589  
    1590   done:
    1591    if (result < 0.0)
    1592      {
    1593        if (speed_option_verbose >= 2)
    1594  	fprintf (stderr, "speed_endtime(): warning, treating negative time as zero: %.9f\n", result);
    1595        result = 0.0;
    1596      }
    1597    return result;
    1598  }