(root)/
binutils-2.41/
gprofng/
common/
hwcdrv.c
       1  /* Copyright (C) 2021-2023 Free Software Foundation, Inc.
       2     Contributed by Oracle.
       3  
       4     This file is part of GNU Binutils.
       5  
       6     This program is free software; you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3, or (at your option)
       9     any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program; if not, write to the Free Software
      18     Foundation, 51 Franklin Street - Fifth Floor, Boston,
      19     MA 02110-1301, USA.  */
      20  
      21  #include <errno.h>
      22  #include <unistd.h>
      23  #include <fcntl.h>
      24  #include <sys/mman.h>
      25  #include <sys/ioctl.h>
      26  #include <sys/syscall.h>
      27  #include <linux/perf_event.h>
      28  
      29  #include "hwcdrv.h"
      30  
      31  /*---------------------------------------------------------------------------*/
      32  /* macros */
      33  #define IS_GLOBAL /* Mark global symbols */
      34  
      35  #include "cpuid.c" /* ftns for identifying a chip */
      36  
      37  static hdrv_pcbe_api_t hdrv_pcbe_core_api;
      38  static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
      39  static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
      40    &hdrv_pcbe_core_api,
      41    &hdrv_pcbe_opteron_api,
      42    NULL
      43  };
      44  #include "opteron_pcbe.c" /* CPU-specific code */
      45  #include "core_pcbe.c" /* CPU-specific code  */
      46  
      47  extern hwcdrv_api_t hwcdrv_pcl_api;
      48  IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
      49    &hwcdrv_pcl_api,
      50    NULL
      51  };
      52  
      53  /*---------------------------------------------------------------------------*/
      54  
      55  /* utils for drivers */
      56  IS_GLOBAL int
      57  hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
      58  {
      59    unsigned int pmc_assigned[MAX_PICS];
      60    unsigned idx;
      61    for (int ii = 0; ii < MAX_PICS; ii++)
      62      pmc_assigned[ii] = 0;
      63  
      64    /* assign the HWCs that we already know about */
      65    for (idx = 0; idx < numctrs; idx++)
      66      {
      67        regno_t regno = entries[idx]->reg_num;
      68        if (regno == REGNO_ANY)
      69  	{
      70  	  /* check to see if list of possible registers only contains one entry */
      71  	  regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
      72  	}
      73        if (regno != REGNO_ANY)
      74  	{
      75  	  if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
      76  	    {
      77  	      logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
      78  	      return HWCFUNCS_ERROR_HWCARGS;
      79  	    }
      80  	  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
      81  	  entries[idx]->reg_num = regno; /* assigning back to entries */
      82  	  pmc_assigned[regno] = 1;
      83  	}
      84      }
      85  
      86    /* assign HWCs that are currently REGNO_ANY */
      87    for (idx = 0; idx < numctrs; idx++)
      88      {
      89        if (entries[idx]->reg_num == REGNO_ANY)
      90  	{
      91  	  int assigned = 0;
      92  	  regno_t *reg_list = entries[idx]->reg_list;
      93  	  for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
      94  	    {
      95  	      regno_t regno = *reg_list;
      96  	      if (regno < 0 || regno >= MAX_PICS)
      97  		{
      98  		  logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
      99  		  return HWCFUNCS_ERROR_HWCARGS;
     100  		}
     101  	      if (pmc_assigned[regno] == 0)
     102  		{
     103  		  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned:   idx=%d, regno=%d\n", idx, regno);
     104  		  entries[idx]->reg_num = regno; /* assigning back to entries */
     105  		  pmc_assigned[regno] = 1;
     106  		  assigned = 1;
     107  		  break;
     108  		}
     109  	    }
     110  	  if (!assigned)
     111  	    {
     112  	      logerr (GTXT ("Counter '%s' could not be bound to a register\n"),
     113  		      entries[idx]->name ? entries[idx]->name : "<NULL>");
     114  	      return HWCFUNCS_ERROR_HWCARGS;
     115  	    }
     116  	}
     117      }
     118    return 0;
     119  }
     120  
     121  IS_GLOBAL int
     122  hwcdrv_lookup_cpuver (const char * cpcN_cciname)
     123  {
     124    libcpc2_cpu_lookup_t *plookup;
     125    static libcpc2_cpu_lookup_t cpu_table[] = {
     126      LIBCPC2_CPU_LOOKUP_LIST
     127    };
     128    if (cpcN_cciname == NULL)
     129      return CPUVER_UNDEFINED;
     130  
     131    /* search table for name */
     132    for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
     133      {
     134        int n = strlen (plookup->cpc2_cciname);
     135        if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
     136  	return plookup->cpc2_cpuver;
     137      }
     138    /* unknown, but does have a descriptive string */
     139    TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
     140  	    "could not be determined\n",
     141  	    cpcN_cciname);
     142    return CPUVER_GENERIC;
     143  }
     144  
     145  /*---------------------------------------------------------------------------*/
     146  /* utils to generate x86 register definitions on Linux */
     147  
     148  /*
     149   *  This code is structured as though we're going to initialize the
     150   *  HWC by writing the Intel MSR register directly.  That is, we
     151   *  assume the lowest 16 bits of the event number will have the event
     152   *  and that higher bits will set attributes.
     153   *
     154   *  While SPARC is different, we can nonetheless use basically the
     155   *  same "x86"-named functions:
     156   *
     157   *  - The event code will still be 16 bits.  It will still
     158   *    be in the lowest 16 bits of the event number.  Though
     159   *    perf_event_code() on SPARC will expect those bits to be
     160   *    shifted, hwcdrv_pcl.c can easily perform that shift.
     161   *
     162   *  - On SPARC we support only two attributes, "user" and "system",
     163   *    which hwcdrv_pcl.c already converts to the "exclude_user"
     164   *    and "exclude_kernel" fields expected by perf_event_open().
     165   *    "user" and "system" are stored in event bits 16 and 17.
     166   *    For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
     167   */
     168  
     169  IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0;
     170  
     171  static const attr_info_t perfctr_sparc_attrs[] = {
     172    {NTXT ("user"),   0, 0x01, 16}, //usr
     173    {NTXT ("system"), 0, 0x01, 17}, //os
     174    {NULL, 0, 0x00, 0},
     175  };
     176  static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */
     177    {NTXT ("umask"),  0, 0xff, 8},
     178    {NTXT ("user"),   0, 0x01, 16}, //usr
     179    //{NTXT("nouser"),  1, 0x01, 16}, //usr (inverted)
     180    {NTXT ("system"), 0, 0x01, 17}, //os
     181    {NTXT ("edge"),   0, 0x01, 18},
     182    {NTXT ("pc"),     0, 0x01, 19},
     183    {NTXT ("inv"),    0, 0x01, 23},
     184    {NTXT ("cmask"),  0, 0xff, 24},
     185    {NULL, 0, 0x00, 0},
     186  };
     187  const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;
     188  
     189  static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
     190      // (0xff <<  0) |   /* event*/
     191      // (0xff <<  8) |   /* umask */
     192      // (0x01 << 17) |   /* os */
     193      // (0x01 << 18) |   /* edge */
     194      // (0x01 << 19) |   /* pc */
     195      (0x01 << 20) |      /* int */
     196      // (0x01 << 21) |   /* reserved */
     197      (0x01 << 22) |      /* enable */
     198      // (0x01 << 23) |   /* inv */
     199      // (0xff << 24) |   /* cmask */
     200      0;
     201  
     202  static int
     203  myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
     204  			    eventsel_t *eventsel, eventsel_t *valid_umask,
     205  			    uint_t *pmc_sel)
     206  {
     207    if (hwcdrv_get_x86_eventnum &&
     208        !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel))
     209      return 0;
     210  
     211    /* check for numerically-specified counters */
     212    char * endptr;
     213    uint64_t num = strtoull (eventname, &endptr, 0);
     214    if (*eventname && !*endptr)
     215      {
     216        *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
     217        *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
     218        *pmc_sel = pmc;
     219        return 0;
     220      }
     221  
     222    /* name does not specify a numeric value */
     223    *eventsel = (eventsel_t) - 1;
     224    *valid_umask = 0x0;
     225    *pmc_sel = pmc;
     226    return -1;
     227  }
     228  
     229  static int
     230  mask_shift_set (eventsel_t *presult, eventsel_t invalue,
     231  		eventsel_t mask, eventsel_t shift)
     232  {
     233    if (invalue & ~mask)
     234      return -1; /* invalue attempts to set bits outside of mask */
     235    *presult &= ~(mask << shift); /* clear all the mask bits */
     236    *presult |= (invalue << shift); /* set bits according to invalue */
     237    return 0;
     238  }
     239  
     240  static int
     241  set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
     242  		   hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
     243  {
     244    eventsel_t evntsel = *result_mask;
     245    for (int ii = 0; ii < (int) nattrs; ii++)
     246      {
     247        const char *attrname = attrs[ii].ca_name;
     248        eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
     249        const char *tmpname;
     250        int attr_found = 0;
     251        for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
     252  	{
     253  	  if (strcmp (attrname, tmpname) == 0)
     254  	    {
     255  	      if (strcmp (attrname, "umask") == 0)
     256  		{
     257  		  if (attrval & ~evnt_valid_umask)
     258  		    {
     259  		      logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
     260  			      nameOnly, (long long) evnt_valid_umask);
     261  		      return -1;
     262  		    }
     263  		}
     264  	      if (mask_shift_set (&evntsel,
     265  				  perfctr_attrs_table[jj].is_inverted ? (attrval^1) : attrval,
     266  				  perfctr_attrs_table[jj].mask,
     267  				  perfctr_attrs_table[jj].shift))
     268  		{
     269  		  logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
     270  			  nameOnly, attrname, (long long) attrval);
     271  		  return -1;
     272  		}
     273  	      TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
     274  			nameOnly, attrname, (long long) attrval);
     275  	      attr_found = 1;
     276  	      break;
     277  	    }
     278  	}
     279        if (!attr_found)
     280  	{
     281  	  logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
     282  	  return -1;
     283  	}
     284      }
     285    *result_mask = evntsel;
     286    return 0;
     287  }
     288  
     289  IS_GLOBAL int
     290  hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name,
     291  			   eventsel_t *return_event, uint_t *return_pmc_sel)
     292  {
     293    hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
     294    unsigned nattrs = 0;
     295    char *nameOnly = NULL;
     296    eventsel_t evntsel = 0; // event number
     297    eventsel_t evnt_valid_umask = 0;
     298    uint_t pmc_sel = 0;
     299    int rc = -1;
     300    *return_event = 0;
     301    *return_pmc_sel = 0;
     302    void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS,
     303  				   &nattrs, NULL);
     304    if (!attr_mem)
     305      {
     306        logerr (GTXT ("out of memory, could not parse attributes\n"));
     307        return -1;
     308      }
     309    hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL);
     310    if (regno == REGNO_ANY)
     311      {
     312        logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly);
     313        goto attr_wrapup;
     314      }
     315  
     316    /* look up evntsel */
     317    if (myperfctr_get_x86_eventnum (nameOnly, regno,
     318  				  &evntsel, &evnt_valid_umask, &pmc_sel))
     319      {
     320        logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
     321        goto attr_wrapup;
     322      }
     323    TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
     324  	    (long long) evntsel, pmc_sel, nameOnly, nattrs);
     325  
     326    /* determine event attributes */
     327    eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
     328    if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
     329      goto attr_wrapup;
     330    if (evntsel & evnt_attrs)
     331      TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
     332  	      (long long) evntsel, (long long) evnt_attrs,
     333  	      (long long) (evntsel & evnt_attrs));
     334    *return_event = evntsel | evnt_attrs;
     335    *return_pmc_sel = pmc_sel;
     336    rc = 0;
     337  
     338  attr_wrapup:
     339    free (attr_mem);
     340    free (nameOnly);
     341    return rc;
     342  }
     343  
     344  #ifdef __x86_64__
     345  #define syscall_instr          "syscall"
     346  #define syscall_clobber        "rcx", "r11", "memory"
     347  #endif
     348  #ifdef __i386__
     349  #define syscall_instr          "int $0x80"
     350  #define syscall_clobber        "memory"
     351  #endif
     352  
     353  static inline int
     354  perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
     355  		 int cpu, int group_fd, unsigned long flags)
     356  {
     357    /* It seems that perf_event_open() sometimes fails spuriously,
     358     * even while an immediate retry succeeds.
     359     * So, let's try a few retries if the call fails just to be sure.
     360     */
     361    int rc;
     362    for (int retry = 0; retry < 5; retry++)
     363      {
     364        rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
     365        if (rc != -1)
     366  	return rc;
     367      }
     368    return rc;
     369  }
     370  
     371  /*---------------------------------------------------------------------------*/
     372  /* macros & fwd prototypes */
     373  
     374  #define HWCDRV_API      static /* Mark functions used by hwcdrv API */
     375  
     376  HWCDRV_API int hwcdrv_start (void);
     377  HWCDRV_API int hwcdrv_free_counters ();
     378  
     379  static pid_t
     380  hwcdrv_gettid (void)
     381  {
     382  #ifndef LIBCOLLECTOR_SRC
     383    return syscall (__NR_gettid);
     384  #elif defined(intel)
     385    pid_t r;
     386    __asm__ __volatile__(syscall_instr
     387  		       : "=a" (r) : "0" (__NR_gettid)
     388  		       : syscall_clobber);
     389    return r;
     390  #else
     391    return syscall (__NR_gettid); // FIXUP_XXX_SPARC_LINUX // write gettid in asm
     392  #endif
     393  }
     394  
     395  /*---------------------------------------------------------------------------*/
     396  /* types */
     397  
     398  #define NPAGES_PER_BUF  1 // number of pages to be used for perf_event samples
     399  // must be a power of 2
     400  
     401  /*---------------------------------------------------------------------------*/
     402  
     403  /* typedefs */
     404  
     405  typedef struct
     406  { // event (hwc) definition
     407    unsigned int reg_num; // PMC assignment, potentially for detecting conflicts
     408    eventsel_t eventsel;          // raw event bits (Intel/AMD)
     409    uint64_t counter_preload;     // number of HWC events before signal
     410    struct perf_event_attr hw;    // perf_event definition
     411    hrtime_t min_time;            // minimum time we're targeting between events
     412    char *name;
     413  } perf_event_def_t;
     414  
     415  typedef struct
     416  { // runtime state of perf_event buffer
     417    void *buf;                    // pointer to mmapped buffer
     418    size_t pagesz;                // size of pages
     419  } buffer_state_t;
     420  
     421  typedef struct
     422  { // runtime state of counter values
     423    uint64_t prev_ena_ts;         // previous perf_event "enabled" time
     424    uint64_t prev_run_ts;         // previous perf_event "running" time
     425    uint64_t prev_value;          // previous HWC value
     426  } counter_value_state_t;
     427  
     428  typedef struct
     429  { // per-counter information
     430    perf_event_def_t *ev_def;     // global HWC definition for one counter
     431    int fd;                       // perf_event fd
     432    buffer_state_t buf_state;     // perf_event buffer's state
     433    counter_value_state_t value_state; // counter state
     434    int needs_restart;            // workaround for dbx failure to preserve si_fd
     435    uint64_t last_overflow_period;
     436    hrtime_t last_overflow_time;
     437  } counter_state_t;
     438  
     439  typedef struct
     440  { // per-thread context
     441    counter_state_t *ctr_list;
     442    int signal_fd;                // fd that caused the most recent signal
     443    pid_t tid;			// for debugging signal delivery problems
     444  } hdrv_pcl_ctx_t;
     445  
     446  /*---------------------------------------------------------------------------*/
     447  
     448  /* static variables */
     449  static struct
     450  {
     451    int library_ok;
     452    int internal_open_called;
     453    hwcfuncs_tsd_get_fn_t find_vpc_ctx;
     454    unsigned hwcdef_cnt;      /* number of *active* hardware counters */
     455    hwcdrv_get_events_fn_t *get_events;
     456  } hdrv_pcl_state;
     457  
     458  static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
     459  static perf_event_def_t global_perf_event_def[MAX_PICS];
     460  
     461  #define COUNTERS_ENABLED()      (hdrv_pcl_state.hwcdef_cnt)
     462  
     463  
     464  /* perf_event buffer formatting and handling */
     465  static void
     466  reset_buf (buffer_state_t *bufstate)
     467  {
     468    TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
     469    struct perf_event_mmap_page *metadata = bufstate->buf;
     470    if (metadata)
     471      metadata->data_tail = metadata->data_head;
     472  }
     473  
     474  static int
     475  skip_buf (buffer_state_t *bufstate, size_t sz)
     476  {
     477    TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
     478    struct perf_event_mmap_page *metadata = bufstate->buf;
     479    if (metadata == NULL)
     480      return -1;
     481    size_t pgsz = bufstate->pagesz;
     482    size_t bufsz = NPAGES_PER_BUF*pgsz;
     483    uint64_t d_tail = metadata->data_tail;
     484    uint64_t d_head = metadata->data_head;
     485  
     486    // validate request size
     487    if (sz > d_head - d_tail || sz >= bufsz)
     488      {
     489        reset_buf (bufstate);
     490        return -1;
     491      }
     492    metadata->data_tail = d_tail + sz; // advance tail
     493    return 0;
     494  }
     495  
     496  static int
     497  read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
     498  {
     499    struct perf_event_mmap_page *metadata = bufstate->buf;
     500    if (metadata == NULL)
     501      return -1;
     502    size_t pgsz = bufstate->pagesz;
     503    size_t bufsz = NPAGES_PER_BUF*pgsz;
     504    uint64_t d_tail = metadata->data_tail;
     505    uint64_t d_head = metadata->data_head;
     506  
     507    // validate request size
     508    if (sz > d_head - d_tail || sz >= bufsz)
     509      {
     510        reset_buf (bufstate);
     511        return -1;
     512      }
     513    char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
     514    uint64_t start_pos = d_tail & (bufsz - 1); // char offset into data buffer
     515    size_t nbytes = sz;
     516    if (start_pos + sz > bufsz)
     517      {
     518        // will wrap past end of buffer
     519        nbytes = bufsz - start_pos;
     520        memcpy (buf, buf_base + start_pos, nbytes);
     521        start_pos = 0; // wrap to start
     522        buf = (void *) (((char *) buf) + nbytes);
     523        nbytes = sz - nbytes;
     524      }
     525    memcpy (buf, buf_base + start_pos, nbytes);
     526    metadata->data_tail += sz;
     527    return 0;
     528  }
     529  
     530  static int
     531  read_u64 (buffer_state_t *bufstate, uint64_t *value)
     532  {
     533    return read_buf (bufstate, value, sizeof (uint64_t));
     534  }
     535  
     536  static int
     537  read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
     538  	     uint64_t *rlost)
     539  {
     540    // returns count of bytes read
     541    buffer_state_t *bufstate = &ctr_state->buf_state;
     542    counter_value_state_t *cntstate = &ctr_state->value_state;
     543    int readsz = 0;
     544  
     545    // PERF_SAMPLE_IP
     546    uint64_t ipc = 0;
     547    int rc = read_u64 (bufstate, &ipc);
     548    if (rc)
     549      return -1;
     550    readsz += sizeof (uint64_t);
     551  
     552    // PERF_SAMPLE_READ: value
     553    uint64_t value = 0;
     554    rc = read_u64 (bufstate, &value);
     555    if (rc)
     556      return -2;
     557    readsz += sizeof (uint64_t);
     558  
     559    /* Bug 20806896
     560     * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
     561     * running times in the sample data that correspond to the metadata times
     562     *     metadata->time_enabled
     563     *     metadata->time_running
     564     * from the PREVIOUS (not current) sample.  Probably just ignore this bug
     565     * since it's on old kernels and we only use the enabled and running times
     566     * to construct loss_estimate.
     567     */
     568    // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
     569    uint64_t enabled_time = 0;
     570    rc = read_u64 (bufstate, &enabled_time);
     571    if (rc)
     572      return -3;
     573    readsz += sizeof (uint64_t);
     574  
     575    // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
     576    uint64_t running_time = 0;
     577    rc = read_u64 (bufstate, &running_time);
     578    if (rc)
     579      return -4;
     580    readsz += sizeof (uint64_t);
     581  
     582    uint64_t value_delta = value - cntstate->prev_value;
     583    uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
     584    uint64_t running_delta = running_time - cntstate->prev_run_ts;
     585    cntstate->prev_value = value;
     586    cntstate->prev_ena_ts = enabled_time;
     587    cntstate->prev_run_ts = running_time;
     588  
     589    // 24830461 need workaround for Linux anomalous HWC skid overrun
     590    int set_error_flag = 0;
     591    if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
     592      set_error_flag = 1;
     593  
     594    uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
     595    if (running_delta == enabled_delta)
     596      {
     597        // counter was running 100% of time, no multiplexing
     598      }
     599    else if (running_delta == 0)
     600      loss_estimate = 1; // token amount to aid in debugging perfctr oddities
     601    else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
     602      {
     603        // running should be smaller than enabled, can't estimate
     604        /*
     605         * 21418391 HWC can have a negative count
     606         *
     607         * We've also seen enabled not only be smaller than running
     608         * but in fact go negative.  Guard against this.
     609         */
     610        loss_estimate = 2; // token amount to aid in debugging perfctr oddities
     611      }
     612    else
     613      {
     614        // counter was running less than 100% of time
     615        // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
     616        uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
     617        value_delta = scaled_delta;
     618  #if 0
     619        // We should perhaps warn the user that multiplexing is going on,
     620        // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
     621        // For now we simply don't report.
     622        // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
     623        // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
     624        collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
     625  				     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
     626  				     ctr_list[idx].last_overflow_period, new_period);
     627  #endif
     628      }
     629    TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
     630  	    "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
     631  	    "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
     632  	    ctr_state->ev_def->name, (long long) ipc,
     633  	    (long long) enabled_delta, (long long) running_delta,
     634  	    (long long) value_delta, (long long) value_delta,
     635  	    (unsigned long long) loss_estimate,
     636  	    loss_estimate ? ", WARNING - SCALED" : "",
     637  	    set_error_flag ? ", ERRORFLAG" : "");
     638    if (set_error_flag == 1)
     639      value_delta |= (1ULL << 63)     /* HWCVAL_ERR_FLAG */;
     640    *rvalue = value_delta;
     641    *rlost = loss_estimate;
     642    if (readsz != msgsz)
     643      {
     644        TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
     645        return -5;
     646      }
     647    return 0;
     648  }
     649  
     650  static void
     651  dump_perf_event_attr (struct perf_event_attr *at)
     652  {
     653    TprintfT (DBG_LT2, "dump_perf_event_attr:  size=%d  type=%d  sample_period=%lld\n"
     654  	    "  config=0x%llx  config1=0x%llx  config2=0x%llx  wakeup_events=%lld __reserved_1=%lld\n",
     655  	    (int) at->size, (int) at->type, (unsigned long long) at->sample_period,
     656  	    (unsigned long long) at->config, (unsigned long long) at->config1,
     657  	    (unsigned long long) at->config2, (unsigned long long) at->wakeup_events,
     658  	    (unsigned long long) at->__reserved_1);
     659  #define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, "  %-10s : %lld\n", #fld, (long long) at->fld)
     660    DUMP_F (disabled);
     661    DUMP_F (inherit);
     662    DUMP_F (pinned);
     663    DUMP_F (exclusive);
     664    DUMP_F (exclude_user);
     665    DUMP_F (exclude_kernel);
     666    DUMP_F (exclude_hv);
     667    DUMP_F (exclude_idle);
     668    //    DUMP_F(xmmap);
     669    DUMP_F (comm);
     670    DUMP_F (freq);
     671    DUMP_F (inherit_stat);
     672    DUMP_F (enable_on_exec);
     673    DUMP_F (task);
     674    DUMP_F (watermark);
     675  }
     676  
     677  static void
     678  init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period)
     679  {
     680    memset (hw, 0, sizeof (struct perf_event_attr));
     681    hw->size = sizeof (struct perf_event_attr); // fwd/bwd compat
     682  
     683  #if defined(__i386__) || defined(__x86_64)
     684    //note: Nehalem/Westmere OFFCORE_RESPONSE in upper 32 bits
     685    hw->config = event;
     686    hw->type = PERF_TYPE_RAW;     // hw/sw/trace/raw...
     687  #elif defined(__aarch64__)
     688    hw->type = (event >> 24) & 7;
     689    hw->config = event & 0xff;
     690  #elif defined(sparc)
     691    //SPARC needs to be shifted up 16 bits
     692    hw->config = (event & 0xFFFF) << 16;  // uint64_t event
     693    uint64_t regs = (event >> 20) & 0xf;  // see sparc_pcbe.c
     694    hw->config |= regs << 4;  // for M8, supported PICs need to be placed at bits [7:4]
     695    hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
     696  #endif
     697  
     698    hw->sample_period = period;
     699    hw->sample_type = PERF_SAMPLE_IP |
     700  	  // PERF_SAMPLE_TID		|
     701  	  // PERF_SAMPLE_TIME		| // possibly interesting
     702  	  // PERF_SAMPLE_ADDR		|
     703  	  PERF_SAMPLE_READ | // HWC value
     704  	  // PERF_SAMPLE_CALLCHAIN	| // interesting
     705  	  // PERF_SAMPLE_ID		|
     706  	  // PERF_SAMPLE_CPU		| // possibly interesting
     707  	  // PERF_SAMPLE_PERIOD		|
     708  	  // PERF_SAMPLE_STREAM_ID	|
     709  	  // PERF_SAMPLE_RAW		|
     710  	  0;
     711    hw->read_format =
     712  	  PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled
     713  	  PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled
     714  	  // PERF_FORMAT_ID		|
     715  	  // PERF_FORMAT_GROUP		|
     716  	  0;
     717    hw->disabled = 1; /* off by default */
     718  
     719    // Note: the following override config.priv bits!
     720    hw->exclude_user = (event & (1 << 16)) == 0;      /* don't count user */
     721    hw->exclude_kernel = (event & (1 << 17)) == 0;    /* ditto kernel */
     722    hw->exclude_hv = 1;       /* ditto hypervisor */
     723    hw->wakeup_events = 1;    /* wakeup every n events */
     724    dump_perf_event_attr (hw);
     725  }
     726  
     727  static int
     728  start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
     729  {
     730    // pe_attr should have been initialized in hwcdrv_create_counters()
     731    struct perf_event_attr pe_attr;
     732    memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));
     733  
     734    // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
     735    pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;
     736  
     737    int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
     738    if (hwc_fd == -1)
     739      {
     740        TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
     741  		error_string, ii, errno);
     742        return 1;
     743      }
     744  
     745    size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
     746    void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
     747  		     PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
     748    if (buf == MAP_FAILED)
     749      {
     750        TprintfT (0, "sz = %ld, pgsz = %ld\n  err=%s idx=%d mmap failed: %s\n",
     751  		(long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
     752        return 1;
     753      }
     754    pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def?  we never seem to use it
     755    pctx->ctr_list[ii].fd = hwc_fd;
     756    pctx->ctr_list[ii].buf_state.buf = buf;
     757    pctx->ctr_list[ii].buf_state.pagesz = pgsz;
     758    pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
     759    pctx->ctr_list[ii].value_state.prev_run_ts = 0;
     760    pctx->ctr_list[ii].value_state.prev_value = 0;
     761    pctx->ctr_list[ii].last_overflow_time = gethrtime ();
     762  
     763    /* set async mode */
     764    long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
     765    int rc = fcntl (hwc_fd, F_SETFL, flags);
     766    if (rc == -1)
     767      {
     768        TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
     769        return 1;
     770      }
     771  
     772    /*
     773     * set lwp ownership of the fd
     774     * See BUGS section of "man perf_event_open":
     775     *     The F_SETOWN_EX option to fcntl(2) is needed to properly get
     776     *     overflow signals in threads.  This was introduced in Linux 2.6.32.
     777     * Legacy references:
     778     *     see http://lkml.org/lkml/2009/8/4/128
     779     *     google man fcntl F_SETOWN_EX -conflict
     780     *       "From Linux 2.6.32 onward, use F_SETOWN_EX to target
     781     *       SIGIO and SIGURG signals at a particular thread."
     782     *     http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
     783     *     See 2010 CSCADS presentation by Eranian
     784     */
     785    struct f_owner_ex fowner_ex;
     786    fowner_ex.type = F_OWNER_TID;
     787    fowner_ex.pid = pctx->tid;
     788    rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
     789    if (rc == -1)
     790      {
     791        TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
     792        return 1;
     793      }
     794  
     795    /* Use sigio so handler can determine FD via siginfo->si_fd. */
     796    rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
     797    if (rc == -1)
     798      {
     799        TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
     800        return 1;
     801      }
     802    return 0;
     803  }
     804  
     805  static int
     806  stop_one_ctr (int ii, counter_state_t *ctr_list)
     807  {
     808    int hwc_rc = 0;
     809    if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
     810      {
     811        TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
     812        hwc_rc = HWCFUNCS_ERROR_GENERIC;
     813      }
     814    void *buf = ctr_list[ii].buf_state.buf;
     815    if (buf)
     816      {
     817        size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
     818        ctr_list[ii].buf_state.buf = NULL;
     819        int tmprc = munmap (buf, bufsz);
     820        if (tmprc)
     821  	{
     822  	  TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
     823  	  hwc_rc = HWCFUNCS_ERROR_GENERIC;
     824  	}
     825      }
     826    if (-1 == close (ctr_list[ii].fd))
     827      {
     828        TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
     829        hwc_rc = HWCFUNCS_ERROR_GENERIC;
     830      }
     831    return hwc_rc;
     832  }
     833  
     834  /* HWCDRV_API for thread-specific actions */
     835  HWCDRV_API int
     836  hwcdrv_lwp_init (void)
     837  {
     838    return hwcdrv_start ();
     839  }
     840  
     841  HWCDRV_API void
     842  hwcdrv_lwp_fini (void)
     843  {
     844    hwcdrv_free_counters ();  /* also sets pctx->ctr_list=NULL; */
     845  }
     846  
     847  /* open */
     848  static int
     849  hdrv_pcl_internal_open ()
     850  {
     851    if (hdrv_pcl_state.internal_open_called)
     852      {
     853        TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
     854        return HWCFUNCS_ERROR_ALREADY_CALLED;
     855      }
     856  
     857    // determine if PCL is available
     858    perf_event_def_t tmp_event_def;
     859    memset (&tmp_event_def, 0, sizeof (tmp_event_def));
     860    struct perf_event_attr *pe_attr = &tmp_event_def.hw;
     861    init_perf_event (pe_attr, 0, 0);
     862    pe_attr->type = PERF_TYPE_HARDWARE; // specify abstracted HW event
     863    pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
     864    int hwc_fd = perf_event_open (pe_attr,
     865  				0, // pid/tid, 0 is self
     866  				-1, // cpu, -1 is per-thread mode
     867  				-1, // group_fd, -1 is root
     868  				0); // flags
     869    if (hwc_fd == -1)
     870      {
     871        TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
     872  		" perf_event_open() failed, errno=%d\n", errno);
     873        goto internal_open_error;
     874      }
     875  
     876    /* see if the PCL is new enough to know about F_SETOWN_EX */
     877    struct f_owner_ex fowner_ex;
     878    fowner_ex.type = F_OWNER_TID;
     879    fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
     880    if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
     881      {
     882        TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
     883  		"F_SETOWN failed, errno=%d\n", errno);
     884        close (hwc_fd);
     885        goto internal_open_error;
     886      }
     887    close (hwc_fd);
     888  
     889    hdrv_pcl_state.internal_open_called = 1;
     890    hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
     891    hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
     892    TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
     893    for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
     894      {
     895        hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
     896        if (!ppcbe->hdrv_pcbe_init ())
     897  	{
     898  	  hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
     899  	  hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
     900  	  if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
     901  	    goto internal_open_error;
     902  	  hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
     903  	  hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
     904  	  hdrv_pcl_state.get_events = ppcbe->hdrv_pcbe_get_events;
     905  	  hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum;
     906  	  break;
     907  	}
     908      }
     909    if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
     910      {
     911        TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
     912  		" reducing number of HWCs from %u to %u on processor '%s'\n",
     913  		hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
     914        hdrv_pcl_about.cpcN_npics = MAX_PICS;
     915      }
     916    TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
     917  	    " perf_event cpuver=%d, name='%s'\n",
     918  	    hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
     919    return 0;
     920  
     921  internal_open_error:
     922    hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
     923    hdrv_pcl_about.cpcN_npics = 0;
     924    hdrv_pcl_about.cpcN_docref = NULL;
     925    hdrv_pcl_about.cpcN_cciname = NULL;
     926    return HWCFUNCS_ERROR_NOT_SUPPORTED;
     927  }
     928  
     929  static void *
     930  single_thread_tsd_ftn ()
     931  {
     932    static hdrv_pcl_ctx_t tsd_context;
     933    return &tsd_context;
     934  }
     935  
     936  /* HWCDRV_API */
     937  HWCDRV_API int
     938  hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
     939  {
     940    hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
     941    if (tsd_sz)
     942      *tsd_sz = sizeof (hdrv_pcl_ctx_t);
     943  
     944    if (hdrv_pcl_state.internal_open_called)
     945      return HWCFUNCS_ERROR_ALREADY_CALLED;
     946    return hdrv_pcl_internal_open ();
     947  }
     948  
     949  HWCDRV_API void
     950  hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
     951  		 const char **docref, uint64_t *support)
     952  {
     953    if (cpuver)
     954      *cpuver = hdrv_pcl_about.cpcN_cpuver;
     955    if (cciname)
     956      *cciname = hdrv_pcl_about.cpcN_cciname;
     957    if (npics)
     958      *npics = hdrv_pcl_about.cpcN_npics;
     959    if (docref)
     960      *docref = hdrv_pcl_about.cpcN_docref;
     961    if (support)
     962      *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
     963  }
     964  
     965  HWCDRV_API int
     966  hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
     967  {
     968    if (tsd_ftn)
     969      hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
     970    else
     971      {
     972        TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
     973        return HWCFUNCS_ERROR_UNAVAIL;
     974      }
     975    return 0;
     976  }
     977  
     978  HWCDRV_API int
     979  hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb)
     980  {
     981    int count = 0;
     982    if (hwc_cb && hdrv_pcl_state.get_events)
     983      count = hdrv_pcl_state.get_events (hwc_cb);
     984    if (attr_cb)
     985      for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
     986        attr_cb (perfctr_attrs_table[ii].attrname);
     987    if (!count)
     988      return -1;
     989    return 0;
     990  }
     991  
     992  HWCDRV_API int
     993  hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
     994  {
     995    return hwcdrv_assign_all_regnos (entries, numctrs);
     996  }
     997  
     998  static int
     999  internal_hwc_start (int fd)
    1000  {
    1001    int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
    1002    if (rc == -1)
    1003      {
    1004        TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
    1005  		" PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
    1006        return HWCFUNCS_ERROR_UNAVAIL;
    1007      }
    1008    TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
    1009    return 0;
    1010  }
    1011  
/* Process the overflow notification described by SI.
 *
 *   si          - siginfo of the notification; si_code and si_fd are used to
 *                 work out which counter descriptor raised it.
 *   eventp      - out: for each configured counter, the value of the last
 *                 sample read in this call (0 if none); ce_hrt is set to the
 *                 time at which this function sampled gethrtime().
 *   lost_events - out: for each configured counter, the "lost" figure
 *                 reported by read_sample() plus the value of any sample that
 *                 was superseded by a later one in the same call.  Small
 *                 marker values (1..4) are also stored here when something
 *                 unexpected happens, see the code below.
 *
 * Every counter's buffer is drained, not only the one that signalled.  Any
 * counter for which at least one record was found is re-armed here, after an
 * optional enlargement of its sampling period when overflows arrive faster
 * than its configured min_time.
 *
 * Returns 0 when eventp/lost_events have been filled in,
 * HWCFUNCS_ERROR_GENERIC for an unexpected si_code, and
 * HWCFUNCS_ERROR_UNEXPECTED when the thread context or its counter list is
 * missing.
 */
HWCDRV_API int
hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
{
  /* set expired counters to overflow value and all others to 0 */
  /* return 0: OK, counters should be restarted */
  /* return non-zero: eventp not set, counters should not be restarted */
  /* clear return values */
  int ii;
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    {
      eventp->ce_pic[ii] = 0;
      lost_events->ce_pic[ii] = 0;
    }
  hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
  eventp->ce_hrt = sig_ts;
  lost_events->ce_hrt = sig_ts;

  /* determine source signal */
  int signal_fd = -1; /* stays -1 when the originating fd is unknown */
  switch (si->si_code)
    {
    case POLL_HUP: /* expected value from pcl */
      /* According to Stephane Eranian:
       * "expect POLL_HUP instead of POLL_IN because we are
       * in one-shot mode (IOC_REFRESH)"
       */
      signal_fd = si->si_fd;
      break;
    case SI_TKILL: /* event forwarded by tkill */
      /* DBX can only forward SI_TKILL when it detects POLL_HUP
       * unfortunately, this means that si->si_fd has been lost...
       * We need to process the buffers, but we don't know the fd!
       */
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
		" SI_TKILL detected\n", sig_ts);
      break;
    default:
      // "sometimes we see a POLL_IN (1) with very high event rates,"
      // according to eranian(?)
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
		" unexpected si_code 0x%x\n", sig_ts, si->si_code);
      return HWCFUNCS_ERROR_GENERIC;
    }

  /* locate this thread's context and its per-counter state */
  hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
  if (!pctx)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
		" tsd context is NULL\n", sig_ts);
      return HWCFUNCS_ERROR_UNEXPECTED;
    }
  counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
  if (!ctr_list)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
		" ctr_list is NULL\n", sig_ts);
      return HWCFUNCS_ERROR_UNEXPECTED;
    }

  /* clear needs_restart flag */
  for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    ctr_list[ii].needs_restart = 0;

  /* attempt to identify the counter to read */
  int signal_idx = -1;
  pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
  if (signal_fd != -1)
    {
      for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
	{
	  if (ctr_list[ii].fd == signal_fd)
	    {
	      signal_idx = ii;
	      break;
	    }
	}
    }

  /* Not fatal: the buffers of all counters are drained below regardless of
     whether the signalling counter could be identified.  */
  if (signal_idx < 0)
    {
      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
		" pmc not determined!\n", sig_ts);
      lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
      // note: bogus value may get overwritten in loop below
    }

  /* capture sample(s).  In addition to signal_idx, check other counters. */
  struct perf_event_header sheader;
  int idx;
  for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
    {
      int num_recs = 0; /* records of any type consumed for this counter */
      while (1)
	{
	  /* check for samples */
	  struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
	  if (metadata == NULL)
	    break; // empty
	  if (metadata->data_tail == metadata->data_head)
	    break; // empty

	  /* read header */
	  /* a non-zero return from read_buf() is treated as "stop draining" */
	  if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
	    break;
	  num_recs++;

	  /* check for PERF_RECORD_SAMPLE */
	  /* datasz = payload bytes that follow the header in this record */
	  size_t datasz = sheader.size - sizeof (struct perf_event_header);
	  if (sheader.type != PERF_RECORD_SAMPLE)
	    {
	      TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
			" unexpected recd type=%d\n",
			sig_ts, sheader.type);
	      if (skip_buf (&ctr_list[idx].buf_state, datasz))
		{
		  TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
			    " skip recd type=%d failed\n", sig_ts, sheader.type);
		  lost_events->ce_pic[idx] = 4; /* record a bogus value */
		  break; // failed to skip buffer??
		}
	      lost_events->ce_pic[idx] = 2; /* record a bogus value */
	      continue; // advance to next record
	    }

	  /* type is PERF_RECORD_SAMPLE */
	  uint64_t value, lostv;
	  if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
	    {
	      TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
			" read_sample() failed\n", sig_ts);
	      lost_events->ce_pic[idx] = 3; // record a bogus value
	      break;                        // failed to read sample data??
	    }
	  TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
		    " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
		    idx, (unsigned long long) value, (unsigned long long) lostv);
	  /* Only one value per counter can be returned: if an earlier sample
	     was already stored for this counter, account for it as lost and
	     keep the newest one.  */
	  if (eventp->ce_pic[idx])
	    {
	      TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
			" idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
	      lost_events->ce_pic[idx] += eventp->ce_pic[idx];
	    }
	  eventp->ce_pic[idx] = value;
	  lost_events->ce_pic[idx] += lostv;
	}

      /* debug output for unexpected (but common) cases */
      if (idx == signal_idx)
	{
	  if (num_recs != 1)
	    TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
		      " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
	}
      else if (num_recs)
	TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
		  " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
		  sig_ts, num_recs, idx, signal_idx);

      /* trigger counter restart whenever records were found */
      if (num_recs)
	{
	  /* check whether to adapt the overflow interval */
	  /* This is the Linux version.
	   * The Solaris version is in hwprofile.c collector_update_overflow_counters().
	   */
	  hrtime_t min_time = global_perf_event_def[idx].min_time;
	  if (min_time > 0 // overflow interval is adaptive
	      && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
	    {
	      /* pick a new overflow interval */
	      /* roughly doubled, but add funny numbers */
	      /* hopefully the result is prime or not a multiple of some # of ops/loop */
	      uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
#if 0
	      // On Solaris, we report the adjustment to the log file.
	      // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
	      // For now we simply don't report.
	      collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
					     SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
					     ctr_list[idx].last_overflow_period, new_period);
#endif
	      /* There are a variety of ways of resetting the period on Linux.
	       * The most elegant is
	       *     ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
	       * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
	       *     > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
	       *     > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
	       *         until after the next overflow.
	       * So we're kind of stuck shutting the fd down and restarting it with the new period.
	       */
	      /* NOTE(review): failures of the stop/start pair are currently
	       * ignored; if start_one_ctr() fails, the re-arm further down is
	       * still issued on whatever ctr_list[idx].fd holds at that point.
	       */
	      if (stop_one_ctr (idx, ctr_list))
		{
		  // EUGENE figure out what to do on error
		}
	      ctr_list[idx].last_overflow_period = new_period;
	      if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
		{
		  // EUGENE figure out what to do on error
		}
	    }
	  ctr_list[idx].last_overflow_time = sig_ts;
#if 0
	  ctr_list[idx].needs_restart = 1;
#else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
	  internal_hwc_start (ctr_list[idx].fd);
#endif
	}
    }
  return 0; // OK to restart counters
}
    1222  
    1223  HWCDRV_API int
    1224  hwcdrv_sighlr_restart (const hwc_event_t *pp)
    1225  {
    1226  #if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
    1227    hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
    1228    if (!pctx)
    1229      {
    1230        TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
    1231        return -1;
    1232      }
    1233    counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
    1234    if (!ctr_list)
    1235      {
    1236        TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
    1237        return -1;
    1238      }
    1239    int errors = 0;
    1240    for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    1241      {
    1242        if (ctr_list[ii].needs_restart)
    1243  	errors |= internal_hwc_start (ctr_list[ii].fd);
    1244        ctr_list[ii].needs_restart = 0;
    1245      }
    1246    return errors;
    1247  #else
    1248    return 0;
    1249  #endif
    1250  }
    1251  
    1252  /* create counters based on hwcdef[] */
    1253  HWCDRV_API int
    1254  hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
    1255  {
    1256    if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
    1257      {
    1258        logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
    1259        return HWCFUNCS_ERROR_HWCARGS;
    1260      }
    1261    if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
    1262      {
    1263        logerr (GTXT ("Processor not supported\n"));
    1264        return HWCFUNCS_ERROR_HWCARGS;
    1265      }
    1266  
    1267    /* add counters */
    1268    for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
    1269      {
    1270        perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
    1271        memset (glb_event_def, 0, sizeof (perf_event_def_t));
    1272        unsigned int pmc_sel;
    1273        eventsel_t evntsel;
    1274        if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num,
    1275  				     hwcdef[idx].int_name, &evntsel, &pmc_sel))
    1276  	{
    1277  	  TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
    1278  	  return HWCFUNCS_ERROR_HWCARGS;
    1279  	}
    1280        glb_event_def->reg_num = pmc_sel;
    1281        glb_event_def->eventsel = evntsel;
    1282        glb_event_def->counter_preload = hwcdef[idx].val;
    1283        glb_event_def->min_time = hwcdef[idx].min_time;
    1284        glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
    1285        init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
    1286  		       glb_event_def->counter_preload);
    1287        TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
    1288  		"(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
    1289  		idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
    1290  		(long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
    1291  		(long long) glb_event_def->eventsel,
    1292  		(long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
    1293  		(long long) glb_event_def->hw.exclude_user,
    1294  		(long long) glb_event_def->hw.exclude_kernel);
    1295      }
    1296  
    1297    hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
    1298    return 0;
    1299  }
    1300  
    1301  HWCDRV_API int
    1302  hwcdrv_free_counters () // note: only performs shutdown for this thread
    1303  {
    1304    hdrv_pcl_ctx_t * pctx;
    1305    if (!COUNTERS_ENABLED ())
    1306      return 0;
    1307    pctx = hdrv_pcl_state.find_vpc_ctx ();
    1308    if (!pctx)
    1309      {
    1310        TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
    1311        return HWCFUNCS_ERROR_GENERIC;
    1312      }
    1313    counter_state_t *ctr_list = pctx->ctr_list;
    1314    if (!ctr_list)
    1315      {
    1316        // fork child: prolog suspends hwcs, then epilog frees them
    1317        TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
    1318        return 0;
    1319      }
    1320    int hwc_rc = 0;
    1321    for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    1322      if (stop_one_ctr (ii, ctr_list))
    1323        hwc_rc = HWCFUNCS_ERROR_GENERIC;
    1324    TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", (long) pctx->tid);
    1325    pctx->ctr_list = NULL;
    1326    return hwc_rc;
    1327  }
    1328  
    1329  HWCDRV_API int
    1330  hwcdrv_start (void) /* must be called from each thread ? */
    1331  {
    1332    hdrv_pcl_ctx_t *pctx = NULL;
    1333    if (!COUNTERS_ENABLED ())
    1334      {
    1335        TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
    1336        return 0;
    1337      }
    1338    if (!hdrv_pcl_state.library_ok)
    1339      {
    1340        TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
    1341        return HWCFUNCS_ERROR_NOT_SUPPORTED;
    1342      }
    1343  
    1344    /*
    1345     * set up per-thread context
    1346     */
    1347    pctx = hdrv_pcl_state.find_vpc_ctx ();
    1348    if (!pctx)
    1349      {
    1350        TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
    1351        return HWCFUNCS_ERROR_UNEXPECTED;
    1352      }
    1353    pctx->tid = hwcdrv_gettid ();
    1354    TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", (long) pctx->tid);
    1355  
    1356    /*
    1357     * create per-thread counter list
    1358     */
    1359    counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
    1360  							  sizeof (counter_state_t));
    1361    if (!ctr_list)
    1362      {
    1363        TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
    1364        return HWCFUNCS_ERROR_MEMORY;
    1365      }
    1366    int ii;
    1367    for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    1368      ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
    1369    pctx->ctr_list = ctr_list;
    1370  
    1371    /*
    1372     * bind the counters
    1373     */
    1374    size_t pgsz = sysconf (_SC_PAGESIZE);
    1375    for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    1376      {
    1377        ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
    1378        if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:")) goto hwcdrv_start_cleanup;
    1379      }
    1380  
    1381    /*
    1382     * start the counters
    1383     */
    1384    for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
    1385      {
    1386        int rc = internal_hwc_start (ctr_list[ii].fd);
    1387        if (rc < 0)
    1388  	goto hwcdrv_start_cleanup;
    1389      }
    1390    return 0;
    1391  
    1392  hwcdrv_start_cleanup:
    1393    hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
    1394    return HWCFUNCS_ERROR_UNAVAIL;
    1395  }
    1396  
    1397  HWCDRV_API int
    1398  hwcdrv_lwp_suspend (void) /* must be called from each thread */
    1399  {
    1400    if (!COUNTERS_ENABLED ())
    1401      {
    1402        TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
    1403        return 0;
    1404      }
    1405    TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
    1406    return hwcdrv_free_counters ();
    1407  }
    1408  
    1409  HWCDRV_API int
    1410  hwcdrv_lwp_resume (void) /* must be called from each thread */
    1411  {
    1412    if (!COUNTERS_ENABLED ())
    1413      {
    1414        TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
    1415        return 0;
    1416      }
    1417    TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
    1418    return hwcdrv_start ();
    1419  }
    1420  
    1421  HWCDRV_API int
    1422  hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
    1423  {
    1424    overflow_data->ce_hrt = 0;
    1425    for (int i = 0; i < MAX_PICS; i++)
    1426      {
    1427        overflow_data->ce_pic[i] = 0;
    1428        if (sampled_data)
    1429  	HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
    1430      }
    1431    return 0;
    1432  }
    1433  
    1434  /*---------------------------------------------------------------------------*/
    1435  /* HWCDRV_API */
    1436  
/* Dispatch table exporting this file's perf_event based implementation.
 *
 * This is a positional initializer: each entry is matched to a member of
 * hwcdrv_api_t purely by its position, so the order below must follow the
 * member order of that struct (its declaration is not in this file chunk).
 * Adding, removing or reordering members there requires the same change here.
 */
hwcdrv_api_t hwcdrv_pcl_api = {
  hwcdrv_init,
  hwcdrv_get_info,
  hwcdrv_enable_mt,
  hwcdrv_get_descriptions,
  hwcdrv_assign_regnos,
  hwcdrv_create_counters,
  hwcdrv_start,
  hwcdrv_overflow,
  hwcdrv_read_events,
  hwcdrv_sighlr_restart,
  hwcdrv_lwp_suspend,
  hwcdrv_lwp_resume,
  hwcdrv_free_counters,
  hwcdrv_lwp_init,
  hwcdrv_lwp_fini,
    -1                      // hwcdrv_init_status; NOTE(review): -1 presumably means "init not attempted yet" -- confirm against hwcdrv_api_t
};