benchtests/bench-pthread-locks.c

   1 /* Measure various lock acquisition times for empty critical sections.
   2    Copyright (C) 2020-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #define TEST_MAIN
  20 #define TEST_NAME "pthread-locks"
  21
  22 #include <stdio.h>
  23 #include <string.h>
  24 #include <limits.h>
  25 #include <stdlib.h>
  26 #include <pthread.h>
  27 #include <semaphore.h>
  28 #include <stdatomic.h>
  29 #include <sys/time.h>
  30 #include <math.h>
  31 #include "bench-timing.h"
  32 #include "json-lib.h"
  33
  34 /* The point of this benchmark is to measure the overhead of an empty
  35    critical section or a small critical section.  This is never going
  36    to be indicative of real application performance.  Instead we are
  37    trying to benchmark the effects of the compiler and the runtime
  38    coupled with a particular set of hardware atomic operations.
   The numbers from this benchmark should be taken with a massive grain
  40    of salt and viewed through the eyes of expert reviewers.  */
  41
/* Synchronization objects shared by the test functions below.  Each
   test initializes the objects it uses before timing anything.  */

/* Mutex used by the mutex tests, the condvar test and the
   consumer/producer test.  */
static pthread_mutex_t m;
/* Read-write lock used by the rwlock tests.  */
static pthread_rwlock_t rw;
/* Condition variable waited on by test_condvar and signalled by its
   helper thread.  */
static pthread_cond_t cv;
/* consumer_c is signalled by the consumer and waited on by the
   producer; producer_c is signalled by the producer and waited on by
   the consumer.  */
static pthread_cond_t consumer_c, producer_c;
/* Set to nonzero by test_condvar to make its helper thread exit.  */
static int cv_done;
/* Spin lock used by the spin lock tests.  */
static pthread_spinlock_t sp;
/* Semaphore used by the semaphore tests.  */
static sem_t sem;

/* Signature of a test: takes the iteration count and a flag saying
   whether to run the filler, and returns the measured time.  */
typedef timing_t (*test_t)(long, int);

/* Iteration count that do_bench_2 starts its calibration with.  */
#define START_ITERS 1000
  53
  54 #define FILLER_GOES_HERE \
  55   if (filler) \
  56     do_filler ();
  57
  58 /* Everyone loves a good fibonacci series.  This isn't quite one of
  59    them because we need larger values in fewer steps, in a way that
  60    won't be optimized away.  We're looking to approximately double the
  61    total time each test iteration takes, so as to not swamp the useful
  62    timings.  */
  63
  64 #pragma GCC push_options
  65 #pragma GCC optimize(1)
  66
  67 static int __attribute__((noinline))
  68 fibonacci (int i)
  69 {
  70   asm("");
  71   if (i > 2)
  72     return fibonacci (i-1) + fibonacci (i-2);
  73   return 10+i;
  74 }
  75
  76 static void
  77 do_filler (void)
  78 {
  79   static char buf1[512], buf2[512];
  80   int f = fibonacci (5);
  81   memcpy (buf1, buf2, f);
  82 }
  83
  84 #pragma GCC pop_options
  85
  86 static timing_t
  87 test_mutex (long iters, int filler)
  88 {
  89   timing_t start, stop, cur;
  90
  91   pthread_mutex_init (&m, NULL);
  92
  93   TIMING_NOW (start);
  94   for (long j = iters; j >= 0; --j)
  95     {
  96       pthread_mutex_lock (&m);
  97       FILLER_GOES_HERE;
  98       pthread_mutex_unlock (&m);
  99     }
 100   TIMING_NOW (stop);
 101   TIMING_DIFF (cur, start, stop);
 102
 103   return cur;
 104 }
 105
 106 static timing_t
 107 test_mutex_trylock (long iters, int filler)
 108 {
 109   timing_t start, stop, cur;
 110
 111   pthread_mutex_init (&m, NULL);
 112   pthread_mutex_lock (&m);
 113
 114   TIMING_NOW (start);
 115   for (long j = iters; j >= 0; --j)
 116     {
 117       pthread_mutex_trylock (&m);
 118       FILLER_GOES_HERE;
 119     }
 120   TIMING_NOW (stop);
 121   TIMING_DIFF (cur, start, stop);
 122
 123   pthread_mutex_unlock (&m);
 124   return cur;
 125 }
 126
 127 static timing_t
 128 test_rwlock_read (long iters, int filler)
 129 {
 130   timing_t start, stop, cur;
 131
 132   pthread_rwlock_init (&rw, NULL);
 133
 134   TIMING_NOW (start);
 135   for (long j = iters; j >= 0; --j)
 136     {
 137       pthread_rwlock_rdlock (&rw);
 138       FILLER_GOES_HERE;
 139       pthread_rwlock_unlock (&rw);
 140     }
 141   TIMING_NOW (stop);
 142   TIMING_DIFF (cur, start, stop);
 143
 144   return cur;
 145 }
 146
 147 static timing_t
 148 test_rwlock_tryread (long iters, int filler)
 149 {
 150   timing_t start, stop, cur;
 151
 152   pthread_rwlock_init (&rw, NULL);
 153   pthread_rwlock_wrlock (&rw);
 154
 155   TIMING_NOW (start);
 156   for (long j = iters; j >= 0; --j)
 157     {
 158       pthread_rwlock_tryrdlock (&rw);
 159       FILLER_GOES_HERE;
 160     }
 161   TIMING_NOW (stop);
 162   TIMING_DIFF (cur, start, stop);
 163
 164   pthread_rwlock_unlock (&rw);
 165   return cur;
 166 }
 167
 168 static timing_t
 169 test_rwlock_write (long iters, int filler)
 170 {
 171   timing_t start, stop, cur;
 172
 173   pthread_rwlock_init (&rw, NULL);
 174
 175   TIMING_NOW (start);
 176   for (long j = iters; j >= 0; --j)
 177     {
 178       pthread_rwlock_wrlock (&rw);
 179       FILLER_GOES_HERE;
 180       pthread_rwlock_unlock (&rw);
 181     }
 182   TIMING_NOW (stop);
 183   TIMING_DIFF (cur, start, stop);
 184
 185   return cur;
 186 }
 187
 188 static timing_t
 189 test_rwlock_trywrite (long iters, int filler)
 190 {
 191   timing_t start, stop, cur;
 192
 193   pthread_rwlock_init (&rw, NULL);
 194   pthread_rwlock_rdlock (&rw);
 195
 196   TIMING_NOW (start);
 197   for (long j = iters; j >= 0; --j)
 198     {
 199       pthread_rwlock_trywrlock (&rw);
 200       FILLER_GOES_HERE;
 201     }
 202   TIMING_NOW (stop);
 203   TIMING_DIFF (cur, start, stop);
 204
 205   pthread_rwlock_unlock (&rw);
 206   return cur;
 207 }
 208
 209 static timing_t
 210 test_spin_lock (long iters, int filler)
 211 {
 212   timing_t start, stop, cur;
 213
 214   pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
 215
 216   TIMING_NOW (start);
 217   for (long j = iters; j >= 0; --j)
 218     {
 219       pthread_spin_lock (&sp);
 220       FILLER_GOES_HERE;
 221       pthread_spin_unlock (&sp);
 222     }
 223   TIMING_NOW (stop);
 224   TIMING_DIFF (cur, start, stop);
 225
 226   return cur;
 227 }
 228
 229 static timing_t
 230 test_spin_trylock (long iters, int filler)
 231 {
 232   timing_t start, stop, cur;
 233
 234   pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
 235   pthread_spin_lock (&sp);
 236
 237   TIMING_NOW (start);
 238   for (long j = iters; j >= 0; --j)
 239     {
 240       pthread_spin_trylock (&sp);
 241       FILLER_GOES_HERE;
 242     }
 243   TIMING_NOW (stop);
 244   TIMING_DIFF (cur, start, stop);
 245
 246   pthread_spin_unlock (&sp);
 247   return cur;
 248 }
 249
 250 static timing_t
 251 test_sem_wait (long iters, int filler)
 252 {
 253   timing_t start, stop, cur;
 254
 255   sem_init (&sem, 0, 1);
 256
 257   TIMING_NOW (start);
 258   for (long j = iters; j >= 0; --j)
 259     {
 260       sem_post (&sem);
 261       FILLER_GOES_HERE;
 262       sem_wait (&sem);
 263     }
 264   TIMING_NOW (stop);
 265   TIMING_DIFF (cur, start, stop);
 266
 267   return cur;
 268 }
 269
 270 static timing_t
 271 test_sem_trywait (long iters, int filler)
 272 {
 273   timing_t start, stop, cur;
 274
 275   sem_init (&sem, 0, 0);
 276
 277   TIMING_NOW (start);
 278   for (long j = iters; j >= 0; --j)
 279     {
 280       sem_trywait (&sem);
 281       FILLER_GOES_HERE;
 282     }
 283   TIMING_NOW (stop);
 284   TIMING_DIFF (cur, start, stop);
 285
 286   return cur;
 287 }
 288
 289 static void *
 290 test_condvar_helper (void *v)
 291 {
 292   /* This is wasteful, but the alternative is to add the overhead of a
 293      mutex lock/unlock to the overall iteration (both threads) and we
 294      don't want that.  Ideally, this thread would run on an
 295      independent processing core anyway.  The ONLY goal here is to
 296      minimize the time the other thread spends waiting for us.  */
 297   while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
 298     pthread_cond_signal (&cv);
 299
 300   return NULL;
 301 }
 302
 303 static timing_t
 304 test_condvar (long iters, int filler)
 305 {
 306   timing_t start, stop, cur;
 307   pthread_t helper_id;
 308
 309   pthread_mutex_init (&m, NULL);
 310   pthread_cond_init (&cv, NULL);
 311   pthread_mutex_lock (&m);
 312
 313   __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
 314   pthread_create (&helper_id, NULL, test_condvar_helper, &iters);
 315
 316   TIMING_NOW (start);
 317   for (long j = iters; j >= 0; --j)
 318     {
 319       pthread_cond_wait (&cv, &m);
 320       FILLER_GOES_HERE;
 321     }
 322   TIMING_NOW (stop);
 323   TIMING_DIFF (cur, start, stop);
 324
 325   pthread_mutex_unlock (&m);
 326   __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);
 327
 328   pthread_join (helper_id, NULL);
 329   return cur;
 330 }
 331
 332 /* How many items are "queued" in our pretend queue.  */
 333 static int queued = 0;
 334
 335 typedef struct Producer_Params {
 336   long iters;
 337   int filler;
 338 } Producer_Params;
 339
 340 /* We only benchmark the consumer thread, but both threads are doing
 341    essentially the same thing, and never run in parallel due to the
 342    locks.  Thus, even if they run on separate processing cores, we
 343    count the time for both threads.  */
 344 static void *
 345 test_producer_thread (void *v)
 346 {
 347   Producer_Params *p = (Producer_Params *) v;
 348   long iters = p->iters;
 349   int filler = p->filler;
 350   long j;
 351
 352   for (j = iters; j >= 0; --j)
 353     {
 354       /* Aquire lock on the queue.  */
 355       pthread_mutex_lock (&m);
 356       /* if something's already there, wait.  */
 357       while (queued > 0)
 358         pthread_cond_wait (&consumer_c, &m);
 359
 360       /* Put something on the queue */
 361       FILLER_GOES_HERE;
 362       ++ queued;
 363       pthread_cond_signal (&producer_c);
 364
 365       /* Give the other thread a chance to run.  */
 366       pthread_mutex_unlock (&m);
 367     }
 368
 369   return NULL;
 370 }
 371
 372 static timing_t
 373 test_consumer_producer (long iters, int filler)
 374 {
 375   timing_t start, stop, cur;
 376   pthread_t helper_id;
 377   Producer_Params p;
 378
 379   p.iters = iters;
 380   p.filler = filler;
 381
 382   pthread_mutex_init (&m, NULL);
 383   pthread_cond_init (&cv, NULL);
 384
 385   pthread_create (&helper_id, NULL, test_producer_thread, &p);
 386
 387   TIMING_NOW (start);
 388
 389   for (long j = iters; j >= 0; --j)
 390     {
 391       /* Aquire lock on the queue.  */
 392       pthread_mutex_lock (&m);
 393       /* Wait for something to be on the queue.  */
 394       while (queued == 0)
 395         pthread_cond_wait (&producer_c, &m);
 396
 397       /* Take if off. */
 398       FILLER_GOES_HERE;
 399       -- queued;
 400       pthread_cond_signal (&consumer_c);
 401
 402       /* Give the other thread a chance to run.  */
 403       pthread_mutex_unlock (&m);
 404     }
 405
 406   TIMING_NOW (stop);
 407   TIMING_DIFF (cur, start, stop);
 408
 409
 410   pthread_join (helper_id, NULL);
 411   return cur;
 412 }
 413
 414 /* Number of runs we use for computing mean and standard deviation.
 415    We actually do two additional runs and discard the outliers.  */
 416 #define RUN_COUNT 10
 417
 418 static int
 419 do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
 420 {
 421   timing_t cur;
 422   struct timeval ts, te;
 423   double tsd, ted, td;
 424   long iters, iters_limit;
 425   timing_t curs[RUN_COUNT + 2];
 426   int i, j;
 427   double mean, stdev;
 428
 429   iters = START_ITERS;
 430   iters_limit = LONG_MAX / 100;
 431
 432   while (1) {
 433     gettimeofday (&ts, NULL);
 434     cur = func(iters, filler);
 435     gettimeofday (&te, NULL);
 436
 437     /* We want a test to take at least 0.01 seconds, and try
 438        increasingly larger iteration counts until it does.  This
 439        allows for approximately constant-time tests regardless of
 440        hardware speed, without the overhead of checking the time
 441        inside the test loop itself.  We stop at a million iterations
 442        as that should be precise enough.  Once we determine a suitable
 443        iteration count, we run the test multiple times to calculate
 444        mean and standard deviation.  */
 445
 446     /* Note that this also primes the CPU cache and triggers faster
 447        MHz, we hope.  */
 448     tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
 449     ted = te.tv_sec + te.tv_usec / 1000000.0;
 450     td = ted - tsd;
 451     if (td >= 0.01
 452         || iters >= iters_limit
 453         || iters >= 1000000)
 454       break;
 455
 456     iters *= 10;
 457   }
 458
 459   curs[0] = cur;
 460   for (i = 1; i < RUN_COUNT + 2; i ++)
 461     curs[i] = func(iters, filler);
 462
 463   /* We sort the results so we can discard the fastest and slowest
 464      times as outliers.  In theory we should keep the fastest time,
 465      but IMHO this is more fair.  A simple bubble sort suffices.  */
 466
 467   for (i = 0; i < RUN_COUNT + 1; i ++)
 468     for (j = i + 1; j < RUN_COUNT + 2; j ++)
 469       if (curs[i] > curs[j])
 470         {
 471           timing_t temp = curs[i];
 472           curs[i] = curs[j];
 473           curs[j] = temp;
 474         }
 475
 476   /* Now calculate mean and standard deviation, skipping the outliers.  */
 477   mean = 0.0;
 478   for (i = 1; i<RUN_COUNT + 1; i ++)
 479     mean += (double) curs[i] / (double) iters;
 480   mean /= RUN_COUNT;
 481
 482   stdev = 0.0;
 483   for (i = 1; i < RUN_COUNT + 1; i ++)
 484     {
 485       double s = (double) curs[i] / (double) iters - mean;
 486       stdev += s * s;
 487     }
 488   stdev = sqrt (stdev / (RUN_COUNT - 1));
 489
 490   char buf[128];
 491   snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");
 492
 493   json_attr_object_begin (js, buf);
 494
 495   json_attr_double (js, "duration", (double) cur);
 496   json_attr_double (js, "iterations", (double) iters);
 497   json_attr_double (js, "wall-sec", (double) td);
 498   json_attr_double (js, "mean", mean);
 499   json_attr_double (js, "stdev", stdev);
 500   json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
 501   json_attr_double (js, "min", (double) curs[1] / (double) iters);
 502   json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
 503   json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);
 504
 505   json_attr_object_end (js);
 506
 507   return 0;
 508 }
 509
 510 static int
 511 do_bench_1 (const char *name, test_t func, json_ctx_t *js)
 512 {
 513   int rv = 0;
 514
 515   rv += do_bench_2 (name, func, 0, js);
 516   rv += do_bench_2 (name, func, 1, js);
 517
 518   return rv;
 519 }
 520
 521 int
 522 do_bench (void)
 523 {
 524   int rv = 0;
 525   json_ctx_t json_ctx;
 526
 527   json_init (&json_ctx, 2, stdout);
 528   json_attr_object_begin (&json_ctx, "pthread_locks");
 529
 530 #define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)
 531
 532   BENCH (mutex);
 533   BENCH (mutex_trylock);
 534   BENCH (rwlock_read);
 535   BENCH (rwlock_tryread);
 536   BENCH (rwlock_write);
 537   BENCH (rwlock_trywrite);
 538   BENCH (spin_lock);
 539   BENCH (spin_trylock);
 540   BENCH (sem_wait);
 541   BENCH (sem_trywait);
 542   BENCH (condvar);
 543   BENCH (consumer_producer);
 544
 545   json_attr_object_end (&json_ctx);
 546
 547   return rv;
 548 }
 549
 550
 551 #define TEST_FUNCTION do_bench ()
 552
 553 #include "../test-skeleton.c"