(root)/
glibc-2.38/
benchtests/
bench-pthread-locks.c
       1  /* Measure various lock acquisition times for empty critical sections.
       2     Copyright (C) 2020-2023 Free Software Foundation, Inc.
       3     This file is part of the GNU C Library.
       4  
       5     The GNU C Library is free software; you can redistribute it and/or
       6     modify it under the terms of the GNU Lesser General Public
       7     License as published by the Free Software Foundation; either
       8     version 2.1 of the License, or (at your option) any later version.
       9  
      10     The GNU C Library is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      13     Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public
      16     License along with the GNU C Library; if not, see
      17     <https://www.gnu.org/licenses/>.  */
      18  
      19  #define TEST_MAIN
      20  #define TEST_NAME "pthread-locks"
      21  
      22  #include <stdio.h>
      23  #include <string.h>
      24  #include <limits.h>
      25  #include <stdlib.h>
      26  #include <pthread.h>
      27  #include <semaphore.h>
      28  #include <stdatomic.h>
      29  #include <sys/time.h>
      30  #include <math.h>
      31  #include "bench-timing.h"
      32  #include "json-lib.h"
      33  
      34  /* The point of this benchmark is to measure the overhead of an empty
      35     critical section or a small critical section.  This is never going
      36     to be indicative of real application performance.  Instead we are
      37     trying to benchmark the effects of the compiler and the runtime
      38     coupled with a particular set of hardware atomic operations.
   The numbers from this benchmark should be taken with a massive grain
   of salt and viewed through the eyes of expert reviewers.  */
      41  
      42  static pthread_mutex_t m;
      43  static pthread_rwlock_t rw;
      44  static pthread_cond_t cv;
      45  static pthread_cond_t consumer_c, producer_c;
      46  static int cv_done;
      47  static pthread_spinlock_t sp;
      48  static sem_t sem;
      49  
      50  typedef timing_t (*test_t)(long, int);
      51  
      52  #define START_ITERS 1000
      53  
      54  #define FILLER_GOES_HERE \
      55    if (filler) \
      56      do_filler ();
      57  
      58  /* Everyone loves a good fibonacci series.  This isn't quite one of
      59     them because we need larger values in fewer steps, in a way that
      60     won't be optimized away.  We're looking to approximately double the
      61     total time each test iteration takes, so as to not swamp the useful
      62     timings.  */
      63  
      64  #pragma GCC push_options
      65  #pragma GCC optimize(1)
      66  
/* Busy-work generator for the "filler" variant of each test.  Kept in
   the optimize(1) region above, with noinline and an empty asm, so the
   compiler cannot fold it to a constant or inline it away.  Grows
   roughly exponentially with I, giving us coarse control over how much
   time the filler burns.  */
static int __attribute__((noinline))
fibonacci (int i)
{
  asm("");  /* Opaque to the optimizer; keeps the recursion intact.  */
  if (i > 2)
    return fibonacci (i-1) + fibonacci (i-2);
  return 10+i;  /* Base values are inflated so results grow quickly.  */
}
      75  
/* The small "critical section" body used when a test runs with
   FILLER enabled: a data-dependent memcpy whose length comes from
   fibonacci(), so the work cannot be optimized away.  Also inside the
   optimize(1) region above.  */
static void
do_filler (void)
{
  static char buf1[512], buf2[512];
  int f = fibonacci (5);  /* Small, fixed amount of busy work.  */
  memcpy (buf1, buf2, f);
}
      83  
      84  #pragma GCC pop_options
      85  
      86  static timing_t
      87  test_mutex (long iters, int filler)
      88  {
      89    timing_t start, stop, cur;
      90  
      91    pthread_mutex_init (&m, NULL);
      92  
      93    TIMING_NOW (start);
      94    for (long j = iters; j >= 0; --j)
      95      {
      96        pthread_mutex_lock (&m);
      97        FILLER_GOES_HERE;
      98        pthread_mutex_unlock (&m);
      99      }
     100    TIMING_NOW (stop);
     101    TIMING_DIFF (cur, start, stop);
     102  
     103    return cur;
     104  }
     105  
     106  static timing_t
     107  test_mutex_trylock (long iters, int filler)
     108  {
     109    timing_t start, stop, cur;
     110  
     111    pthread_mutex_init (&m, NULL);
     112    pthread_mutex_lock (&m);
     113  
     114    TIMING_NOW (start);
     115    for (long j = iters; j >= 0; --j)
     116      {
     117        pthread_mutex_trylock (&m);
     118        FILLER_GOES_HERE;
     119      }
     120    TIMING_NOW (stop);
     121    TIMING_DIFF (cur, start, stop);
     122  
     123    pthread_mutex_unlock (&m);
     124    return cur;
     125  }
     126  
     127  static timing_t
     128  test_rwlock_read (long iters, int filler)
     129  {
     130    timing_t start, stop, cur;
     131  
     132    pthread_rwlock_init (&rw, NULL);
     133  
     134    TIMING_NOW (start);
     135    for (long j = iters; j >= 0; --j)
     136      {
     137        pthread_rwlock_rdlock (&rw);
     138        FILLER_GOES_HERE;
     139        pthread_rwlock_unlock (&rw);
     140      }
     141    TIMING_NOW (stop);
     142    TIMING_DIFF (cur, start, stop);
     143  
     144    return cur;
     145  }
     146  
     147  static timing_t
     148  test_rwlock_tryread (long iters, int filler)
     149  {
     150    timing_t start, stop, cur;
     151  
     152    pthread_rwlock_init (&rw, NULL);
     153    pthread_rwlock_wrlock (&rw);
     154  
     155    TIMING_NOW (start);
     156    for (long j = iters; j >= 0; --j)
     157      {
     158        pthread_rwlock_tryrdlock (&rw);
     159        FILLER_GOES_HERE;
     160      }
     161    TIMING_NOW (stop);
     162    TIMING_DIFF (cur, start, stop);
     163  
     164    pthread_rwlock_unlock (&rw);
     165    return cur;
     166  }
     167  
     168  static timing_t
     169  test_rwlock_write (long iters, int filler)
     170  {
     171    timing_t start, stop, cur;
     172  
     173    pthread_rwlock_init (&rw, NULL);
     174  
     175    TIMING_NOW (start);
     176    for (long j = iters; j >= 0; --j)
     177      {
     178        pthread_rwlock_wrlock (&rw);
     179        FILLER_GOES_HERE;
     180        pthread_rwlock_unlock (&rw);
     181      }
     182    TIMING_NOW (stop);
     183    TIMING_DIFF (cur, start, stop);
     184  
     185    return cur;
     186  }
     187  
     188  static timing_t
     189  test_rwlock_trywrite (long iters, int filler)
     190  {
     191    timing_t start, stop, cur;
     192  
     193    pthread_rwlock_init (&rw, NULL);
     194    pthread_rwlock_rdlock (&rw);
     195  
     196    TIMING_NOW (start);
     197    for (long j = iters; j >= 0; --j)
     198      {
     199        pthread_rwlock_trywrlock (&rw);
     200        FILLER_GOES_HERE;
     201      }
     202    TIMING_NOW (stop);
     203    TIMING_DIFF (cur, start, stop);
     204  
     205    pthread_rwlock_unlock (&rw);
     206    return cur;
     207  }
     208  
     209  static timing_t
     210  test_spin_lock (long iters, int filler)
     211  {
     212    timing_t start, stop, cur;
     213  
     214    pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
     215  
     216    TIMING_NOW (start);
     217    for (long j = iters; j >= 0; --j)
     218      {
     219        pthread_spin_lock (&sp);
     220        FILLER_GOES_HERE;
     221        pthread_spin_unlock (&sp);
     222      }
     223    TIMING_NOW (stop);
     224    TIMING_DIFF (cur, start, stop);
     225  
     226    return cur;
     227  }
     228  
     229  static timing_t
     230  test_spin_trylock (long iters, int filler)
     231  {
     232    timing_t start, stop, cur;
     233  
     234    pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
     235    pthread_spin_lock (&sp);
     236  
     237    TIMING_NOW (start);
     238    for (long j = iters; j >= 0; --j)
     239      {
     240        pthread_spin_trylock (&sp);
     241        FILLER_GOES_HERE;
     242      }
     243    TIMING_NOW (stop);
     244    TIMING_DIFF (cur, start, stop);
     245  
     246    pthread_spin_unlock (&sp);
     247    return cur;
     248  }
     249  
     250  static timing_t
     251  test_sem_wait (long iters, int filler)
     252  {
     253    timing_t start, stop, cur;
     254  
     255    sem_init (&sem, 0, 1);
     256  
     257    TIMING_NOW (start);
     258    for (long j = iters; j >= 0; --j)
     259      {
     260        sem_post (&sem);
     261        FILLER_GOES_HERE;
     262        sem_wait (&sem);
     263      }
     264    TIMING_NOW (stop);
     265    TIMING_DIFF (cur, start, stop);
     266  
     267    return cur;
     268  }
     269  
     270  static timing_t
     271  test_sem_trywait (long iters, int filler)
     272  {
     273    timing_t start, stop, cur;
     274  
     275    sem_init (&sem, 0, 0);
     276  
     277    TIMING_NOW (start);
     278    for (long j = iters; j >= 0; --j)
     279      {
     280        sem_trywait (&sem);
     281        FILLER_GOES_HERE;
     282      }
     283    TIMING_NOW (stop);
     284    TIMING_DIFF (cur, start, stop);
     285  
     286    return cur;
     287  }
     288  
     289  static void *
     290  test_condvar_helper (void *v)
     291  {
     292    /* This is wasteful, but the alternative is to add the overhead of a
     293       mutex lock/unlock to the overall iteration (both threads) and we
     294       don't want that.  Ideally, this thread would run on an
     295       independent processing core anyway.  The ONLY goal here is to
     296       minimize the time the other thread spends waiting for us.  */
     297    while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
     298      pthread_cond_signal (&cv);
     299  
     300    return NULL;
     301  }
     302  
     303  static timing_t
     304  test_condvar (long iters, int filler)
     305  {
     306    timing_t start, stop, cur;
     307    pthread_t helper_id;
     308  
     309    pthread_mutex_init (&m, NULL);
     310    pthread_cond_init (&cv, NULL);
     311    pthread_mutex_lock (&m);
     312  
     313    __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
     314    pthread_create (&helper_id, NULL, test_condvar_helper, &iters);
     315  
     316    TIMING_NOW (start);
     317    for (long j = iters; j >= 0; --j)
     318      {
     319        pthread_cond_wait (&cv, &m);
     320        FILLER_GOES_HERE;
     321      }
     322    TIMING_NOW (stop);
     323    TIMING_DIFF (cur, start, stop);
     324  
     325    pthread_mutex_unlock (&m);
     326    __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);
     327  
     328    pthread_join (helper_id, NULL);
     329    return cur;
     330  }
     331  
     332  /* How many items are "queued" in our pretend queue.  */
     333  static int queued = 0;
     334  
     335  typedef struct Producer_Params {
     336    long iters;
     337    int filler;
     338  } Producer_Params;
     339  
     340  /* We only benchmark the consumer thread, but both threads are doing
     341     essentially the same thing, and never run in parallel due to the
     342     locks.  Thus, even if they run on separate processing cores, we
     343     count the time for both threads.  */
     344  static void *
     345  test_producer_thread (void *v)
     346  {
     347    Producer_Params *p = (Producer_Params *) v;
     348    long iters = p->iters;
     349    int filler = p->filler;
     350    long j;
     351  
     352    for (j = iters; j >= 0; --j)
     353      {
     354        /* Acquire lock on the queue.  */
     355        pthread_mutex_lock (&m);
     356        /* if something's already there, wait.  */
     357        while (queued > 0)
     358  	pthread_cond_wait (&consumer_c, &m);
     359  
     360        /* Put something on the queue */
     361        FILLER_GOES_HERE;
     362        ++ queued;
     363        pthread_cond_signal (&producer_c);
     364  
     365        /* Give the other thread a chance to run.  */
     366        pthread_mutex_unlock (&m);
     367      }
     368  
     369    return NULL;
     370  }
     371  
     372  static timing_t
     373  test_consumer_producer (long iters, int filler)
     374  {
     375    timing_t start, stop, cur;
     376    pthread_t helper_id;
     377    Producer_Params p;
     378  
     379    p.iters = iters;
     380    p.filler = filler;
     381  
     382    pthread_mutex_init (&m, NULL);
     383    pthread_cond_init (&cv, NULL);
     384  
     385    pthread_create (&helper_id, NULL, test_producer_thread, &p);
     386  
     387    TIMING_NOW (start);
     388  
     389    for (long j = iters; j >= 0; --j)
     390      {
     391        /* Acquire lock on the queue.  */
     392        pthread_mutex_lock (&m);
     393        /* Wait for something to be on the queue.  */
     394        while (queued == 0)
     395  	pthread_cond_wait (&producer_c, &m);
     396  
     397        /* Take if off. */
     398        FILLER_GOES_HERE;
     399        -- queued;
     400        pthread_cond_signal (&consumer_c);
     401  
     402        /* Give the other thread a chance to run.  */
     403        pthread_mutex_unlock (&m);
     404      }
     405  
     406    TIMING_NOW (stop);
     407    TIMING_DIFF (cur, start, stop);
     408  
     409  
     410    pthread_join (helper_id, NULL);
     411    return cur;
     412  }
     413  
     414  /* Number of runs we use for computing mean and standard deviation.
     415     We actually do two additional runs and discard the outliers.  */
     416  #define RUN_COUNT 10
     417  
/* Run one benchmark variant (NAME with FILLER on or off): calibrate an
   iteration count so a single run takes at least 0.01 s, then run FUNC
   RUN_COUNT+2 times, discard the fastest and slowest runs as outliers,
   and emit per-iteration statistics as a JSON object into JS.  Always
   returns 0.  */
static int
do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
{
  timing_t cur;
  struct timeval ts, te;
  double tsd, ted, td;
  long iters, iters_limit;
  /* RUN_COUNT measured runs plus the two outliers we discard.  */
  timing_t curs[RUN_COUNT + 2];
  int i, j;
  double mean, stdev;

  iters = START_ITERS;
  /* Guard against overflow when multiplying iters by 10 below.  */
  iters_limit = LONG_MAX / 100;

  while (1) {
    gettimeofday (&ts, NULL);
    cur = func(iters, filler);
    gettimeofday (&te, NULL);

    /* We want a test to take at least 0.01 seconds, and try
       increasingly larger iteration counts until it does.  This
       allows for approximately constant-time tests regardless of
       hardware speed, without the overhead of checking the time
       inside the test loop itself.  We stop at a million iterations
       as that should be precise enough.  Once we determine a suitable
       iteration count, we run the test multiple times to calculate
       mean and standard deviation.  */

    /* Note that this also primes the CPU cache and triggers faster
       MHz, we hope.  */
    tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
    ted = te.tv_sec + te.tv_usec / 1000000.0;
    td = ted - tsd;
    if (td >= 0.01
	|| iters >= iters_limit
	|| iters >= 1000000)
      break;

    iters *= 10;
  }

  /* Reuse the final calibration run as the first sample.  */
  curs[0] = cur;
  for (i = 1; i < RUN_COUNT + 2; i ++)
    curs[i] = func(iters, filler);

  /* We sort the results so we can discard the fastest and slowest
     times as outliers.  In theory we should keep the fastest time,
     but IMHO this is more fair.  A simple bubble sort suffices.  */

  for (i = 0; i < RUN_COUNT + 1; i ++)
    for (j = i + 1; j < RUN_COUNT + 2; j ++)
      if (curs[i] > curs[j])
	{
	  timing_t temp = curs[i];
	  curs[i] = curs[j];
	  curs[j] = temp;
	}

  /* Now calculate mean and standard deviation, skipping the outliers.  */
  mean = 0.0;
  for (i = 1; i<RUN_COUNT + 1; i ++)
    mean += (double) curs[i] / (double) iters;
  mean /= RUN_COUNT;

  /* Sample standard deviation (Bessel's correction: divide by N-1).  */
  stdev = 0.0;
  for (i = 1; i < RUN_COUNT + 1; i ++)
    {
      double s = (double) curs[i] / (double) iters - mean;
      stdev += s * s;
    }
  stdev = sqrt (stdev / (RUN_COUNT - 1));

  /* JSON key is e.g. "mutex-empty" or "mutex-filler".  */
  char buf[128];
  snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");

  json_attr_object_begin (js, buf);

  /* All per-iteration values; "duration" and "wall-sec" describe the
     final calibration run as a whole.  */
  json_attr_double (js, "duration", (double) cur);
  json_attr_double (js, "iterations", (double) iters);
  json_attr_double (js, "wall-sec", (double) td);
  json_attr_double (js, "mean", mean);
  json_attr_double (js, "stdev", stdev);
  json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
  json_attr_double (js, "min", (double) curs[1] / (double) iters);
  json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
  json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);

  json_attr_object_end (js);

  return 0;
}
     509  
     510  static int
     511  do_bench_1 (const char *name, test_t func, json_ctx_t *js)
     512  {
     513    int rv = 0;
     514  
     515    rv += do_bench_2 (name, func, 0, js);
     516    rv += do_bench_2 (name, func, 1, js);
     517  
     518    return rv;
     519  }
     520  
     521  int
     522  do_bench (void)
     523  {
     524    int rv = 0;
     525    json_ctx_t json_ctx;
     526  
     527    json_init (&json_ctx, 2, stdout);
     528    json_attr_object_begin (&json_ctx, "pthread_locks");
     529  
     530  #define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)
     531  
     532    BENCH (mutex);
     533    BENCH (mutex_trylock);
     534    BENCH (rwlock_read);
     535    BENCH (rwlock_tryread);
     536    BENCH (rwlock_write);
     537    BENCH (rwlock_trywrite);
     538    BENCH (spin_lock);
     539    BENCH (spin_trylock);
     540    BENCH (sem_wait);
     541    BENCH (sem_trywait);
     542    BENCH (condvar);
     543    BENCH (consumer_producer);
     544  
     545    json_attr_object_end (&json_ctx);
     546  
     547    return rv;
     548  }
     549  
     550  
     551  #define TEST_FUNCTION do_bench ()
     552  
     553  #include "../test-skeleton.c"