pthread_workqueue-0.8.2/testing/latency/latency.c

   1 /*
   2  * Copyright (c) 2011 Joakim Johansson <jocke@tbricks.com>.
   3  *
   4  * @APPLE_APACHE_LICENSE_HEADER_START@
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  *
  18  * @APPLE_APACHE_LICENSE_HEADER_END@
  19  */
  20
  21 #include <stdio.h>
  22 #include <stdlib.h>
  23 #include <ctype.h>
  24 #include <string.h>
  25 #include <errno.h>
  26 #include <time.h>
  27
  28 #ifndef _WIN32
  29 # include <unistd.h>
  30 # include <pthread.h>
  31 # include <sys/time.h>
  32 #endif
  33
  34 #include "latency.h"
  35
  36 pthread_workqueue_t workqueues[WORKQUEUE_COUNT];
  37 struct wq_statistics workqueue_statistics[WORKQUEUE_COUNT];
  38 struct wq_event_generator workqueue_generator[GENERATOR_WORKQUEUE_COUNT];
  39
  40 struct wq_statistics global_statistics;
  41 unsigned int global_stats_used = 0;
  42
  43 pthread_mutex_t generator_mutex;
  44 pthread_cond_t generator_condition;
  45 static unsigned int events_processed;
  46
  47 #define PERCENTILE_COUNT 8
  48 double percentiles[PERCENTILE_COUNT] = {50.0, 80.0, 98.0, 99.0, 99.5, 99.8, 99.9, 99.99};
  49 mytime_t real_start, real_end;
  50
  51 #ifdef __APPLE__
  52
  53 #include <assert.h>
  54 #include <CoreServices/CoreServices.h>
  55 #include <mach/mach.h>
  56 #include <mach/mach_time.h>
  57
  58 static mach_timebase_info_data_t    sTimebaseInfo;
  59
  60 // From http://developer.apple.com/library/mac/#qa/qa2004/qa1398.html
  61 unsigned long gettime(void)
  62 {
  63     return (mach_absolute_time() * sTimebaseInfo.numer / sTimebaseInfo.denom);
  64 }
  65
  66 #else
  67
  68 static mytime_t gettime(void)
  69 {
  70 #ifdef __linux__
  71         struct timespec ts;
  72     if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
  73                         fprintf(stderr, "Failed to get high resolution clock! errno = %d\n", errno);
  74     return ((ts.tv_sec * NANOSECONDS_PER_SECOND) + ts.tv_nsec);
  75 #elif defined(_WIN32)
  76         LARGE_INTEGER now;
  77         LARGE_INTEGER freq;
  78         if (!QueryPerformanceCounter(&now) )
  79                 fprintf(stderr, "Failed to get performance counter!\n");
  80         if (!QueryPerformanceFrequency(&freq) )
  81                 fprintf(stderr, "Failed to get performance frequency!\n");
  82
  83         return (mytime_t)(now.QuadPart * NANOSECONDS_PER_SECOND / freq.QuadPart);
  84 #else
  85     struct timespec ts;
  86     if (clock_gettime(CLOCK_HIGHRES, &ts) != 0)
  87                         fprintf(stderr, "Failed to get high resolution clock! errno = %d\n", errno);
  88     return ((ts.tv_sec * NANOSECONDS_PER_SECOND) + ts.tv_nsec);
  89 #endif
  90 }
  91
  92 #endif
  93
  94 #ifdef _WIN32
  95
  96 static void my_sleep(unsigned long nanoseconds) {
  97         LARGE_INTEGER start, end;
  98         LARGE_INTEGER freq;
  99
 100         QueryPerformanceCounter(&start);
 101         QueryPerformanceFrequency(&freq);
 102
 103         // sleep with ms resolution ...
 104         Sleep(nanoseconds / 1000000);
 105
 106         // ... and busy-wait afterwards, until the requested delay was reached
 107         QueryPerformanceCounter(&end);
 108         while( (end.QuadPart - start.QuadPart) * NANOSECONDS_PER_SECOND / freq.QuadPart < nanoseconds ){
 109                 YieldProcessor();
 110                 QueryPerformanceCounter(&end);
 111         }
 112
 113 }
 114
 115 #else
 116
 117 // real resolution on solaris is at best system clock tick, i.e. 100Hz unless having the
 118 // high res system clock (1000Hz in that case)
 119
 120 static void my_sleep(unsigned long nanoseconds)
 121 {
 122         struct timespec timeout0;
 123         struct timespec timeout1;
 124         struct timespec* tmp;
 125         struct timespec* t0 = &timeout0;
 126         struct timespec* t1 = &timeout1;
 127
 128         t0->tv_sec = nanoseconds / NANOSECONDS_PER_SECOND;
 129         t0->tv_nsec = nanoseconds % NANOSECONDS_PER_SECOND;
 130
 131         while ((nanosleep(t0, t1) == (-1)) && (errno == EINTR))
 132         {
 133                 tmp = t0;
 134                 t0 = t1;
 135                 t1 = tmp;
 136         }
 137
 138     return;
 139 }
 140
 141 #endif
 142
 143 static void _process_data(void* context)
 144 {
 145         struct wq_event *event = (struct wq_event *) context;
 146         mytime_t elapsed_time;
 147
 148         elapsed_time = gettime() - event->start_time;
 149
 150         workqueue_statistics[event->queue_index].avg = ((workqueue_statistics[event->queue_index].count * workqueue_statistics[event->queue_index].avg) + elapsed_time) / (workqueue_statistics[event->queue_index].count + 1);
 151         workqueue_statistics[event->queue_index].total += elapsed_time;
 152         workqueue_statistics[event->queue_index].count += 1;
 153
 154     if (elapsed_time < workqueue_statistics[event->queue_index].min ||
 155         workqueue_statistics[event->queue_index].min == 0)
 156         workqueue_statistics[event->queue_index].min = elapsed_time;
 157
 158     if (elapsed_time > workqueue_statistics[event->queue_index].max)
 159         workqueue_statistics[event->queue_index].max = elapsed_time;
 160
 161     if ((elapsed_time / 1000) < DISTRIBUTION_BUCKETS)
 162         workqueue_statistics[event->queue_index].distribution[(int)(elapsed_time / 1000)] += 1;
 163     else
 164         workqueue_statistics[event->queue_index].distribution[DISTRIBUTION_BUCKETS-1] += 1;
 165
 166     // allow generator thread to continue when all events have been processed
 167     if (atomic_dec_nv(&events_processed) == 0)
 168     {
 169         pthread_mutex_lock(&generator_mutex);
 170         pthread_cond_signal(&generator_condition);
 171         pthread_mutex_unlock(&generator_mutex);
 172     }
 173     return;
 174 }
 175
 176 // Perform a small microburst for this tick
 177 static void _event_tick(void* context)
 178 {
 179         struct wq_event *current_event;
 180         long i, generator_workqueue = (long) context;
 181
 182     for (i = 0; i < EVENTS_GENERATED_PER_TICK; i++)
 183     {
 184         current_event = &workqueue_generator[generator_workqueue].wq_events[i];
 185         current_event->start_time = gettime();
 186         current_event->queue_index = (current_event->start_time % WORKQUEUE_COUNT);
 187
 188         (void) pthread_workqueue_additem_np(workqueues[current_event->queue_index], _process_data, current_event, NULL, NULL);
 189     }
 190
 191     return;
 192 }
 193
 194 static void _generate_simulated_events()
 195 {
 196         unsigned long i = 0, tick;
 197         mytime_t overhead;
 198     mytime_t start, current, overhead_start = 0, overhead_end = 0;
 199
 200     start = current = gettime();
 201
 202         for (tick = 0; tick < TOTAL_TICKS_TO_RUN; tick++)
 203         {
 204         start = current = overhead_end;
 205         overhead = overhead_end - overhead_start;
 206
 207         // wait until we have waited proper amount of time for current rate
 208         // we should remove overhead of previous lap to not lag behind in data rate
 209         // one call to gethrtime() alone is around 211ns on Nehalem 2.93
 210         // use busy waiting in case the frequency is higher than the supported resolution of nanosleep()
 211
 212         if (overhead > EVENT_TIME_SLICE)
 213         {
 214             printf("Warning: Event processing overhead > event time slice, readjust test parameters.\n");
 215         }
 216         else
 217         if ((EVENT_GENERATION_FREQUENCY > SYSTEM_CLOCK_RESOLUTION) || FORCE_BUSY_LOOP)
 218         {
 219             while ((current - start) < (EVENT_TIME_SLICE - overhead))
 220                 current = gettime();
 221         }
 222         else
 223         {
 224             my_sleep(EVENT_TIME_SLICE - overhead);
 225         }
 226
 227         overhead_start = gettime();
 228
 229         events_processed = GENERATOR_WORKQUEUE_COUNT * EVENTS_GENERATED_PER_TICK; // number of items that will be processed
 230
 231 #if (LATENCY_RUN_GENERATOR_IN_MAIN_THREAD == 0)
 232         for (i = 0; i < GENERATOR_WORKQUEUE_COUNT; i++)
 233             (void) pthread_workqueue_additem_np(workqueue_generator[i].wq, _event_tick, (void *) i, NULL, NULL);
 234 #else
 235         _event_tick((void *)i);
 236 #endif
 237
 238         // wait for all events to be processed
 239         pthread_mutex_lock(&generator_mutex);
 240         while (events_processed > 0)
 241             pthread_cond_wait(&generator_condition, &generator_mutex);
 242         pthread_mutex_unlock(&generator_mutex);
 243
 244         overhead_end = gettime();
 245         }
 246
 247         return;
 248 }
 249
 250 static void _gather_statistics(unsigned long queue_index)
 251 {
 252     unsigned long i;
 253
 254     if (workqueue_statistics[queue_index].count > 0)
 255     {
 256         global_stats_used ++;
 257
 258         global_statistics.avg = ((global_statistics.count * global_statistics.avg) + (workqueue_statistics[queue_index].avg * workqueue_statistics[queue_index].count)) / (global_statistics.count + workqueue_statistics[queue_index].count);
 259         global_statistics.total += workqueue_statistics[queue_index].total;
 260         global_statistics.count += workqueue_statistics[queue_index].count;
 261
 262         if (workqueue_statistics[queue_index].min < global_statistics.min || global_statistics.min == 0)
 263             global_statistics.min = workqueue_statistics[queue_index].min;
 264
 265         if (workqueue_statistics[queue_index].max > global_statistics.max)
 266             global_statistics.max = workqueue_statistics[queue_index].max;
 267
 268         for (i = 0; i < DISTRIBUTION_BUCKETS; i++)
 269             global_statistics.distribution[i] += workqueue_statistics[queue_index].distribution[i];
 270     }
 271
 272         return;
 273 }
 274
 275 void _print_statistics()
 276 {
 277         unsigned long i, j, total_events = 0, last_percentile = 0, accumulated_percentile = 0;
 278
 279         printf("Collecting statistics...\n");
 280
 281         for (i = 0; i < WORKQUEUE_COUNT; i++)
 282         _gather_statistics(i);
 283
 284         printf("Test is done, run time was %.3f seconds, %.1fM events generated and processed.\n", (double)((double)(real_end - real_start) / (double) NANOSECONDS_PER_SECOND), total_events/1000000.0);
 285
 286     //FIXME - casting from mytime_t (u_long) to int will truncate the result
 287         printf("Global dispatch queue aggregate statistics for %d queues: %dM events, min = %d ns, avg = %.1f ns, max = %d ns\n",
 288            global_stats_used, global_statistics.count/1000000, (int) global_statistics.min, global_statistics.avg, (int) global_statistics.max);
 289
 290     printf("\nDistribution:\n");
 291     for (i = 0; i < DISTRIBUTION_BUCKETS; i++)
 292     {
 293         printf("%3ld us: %d ", i, global_statistics.distribution[i]);
 294         for (j=0; j<(((double) global_statistics.distribution[i] / (double) global_statistics.count) * 400.0); j++)
 295             printf("*");
 296         printf("\n");
 297     }
 298
 299     printf("\nPercentiles:\n");
 300
 301     for (i = 0; i < DISTRIBUTION_BUCKETS; i++)
 302     {
 303         while ((last_percentile < PERCENTILE_COUNT) && ((100.0 * ((double) accumulated_percentile / (double) global_statistics.count)) > percentiles[last_percentile]))
 304         {
 305             printf("%.2f < %ld us\n", percentiles[last_percentile], i-1);
 306             last_percentile++;
 307         }
 308         accumulated_percentile += global_statistics.distribution[i];
 309     }
 310
 311     while ((last_percentile < PERCENTILE_COUNT) && ((100.0 * ((double) accumulated_percentile / (double) global_statistics.count)) > percentiles[last_percentile]))
 312     {
 313         printf("%.2f > %d us\n", percentiles[last_percentile], DISTRIBUTION_BUCKETS-1);
 314         last_percentile++;
 315     }
 316
 317         return;
 318 }
 319
 320 int main(void)
 321 {
 322         int i;
 323     pthread_workqueue_attr_t attr;
 324
 325 #ifdef __APPLE__
 326     (void) mach_timebase_info(&sTimebaseInfo);
 327 #endif
 328
 329 #ifdef MAKE_STATIC
 330         pthread_workqueue_init_np();
 331 #endif
 332
 333         memset(&workqueues, 0, sizeof(workqueues));
 334         memset(&workqueue_statistics, 0, sizeof(workqueue_statistics));
 335         memset(&global_statistics, 0, sizeof(global_statistics));
 336         memset(&workqueue_generator, 0, sizeof(workqueue_generator));
 337
 338     pthread_mutex_init(&generator_mutex, NULL);
 339     pthread_cond_init(&generator_condition, NULL);
 340
 341     if (pthread_workqueue_attr_init_np(&attr) != 0)
 342         fprintf(stderr, "Failed to set workqueue attributes\n");
 343
 344     for (i = 0; i < GENERATOR_WORKQUEUE_COUNT; i++)
 345     {
 346         if (pthread_workqueue_attr_setqueuepriority_np(&attr, i) != 0)
 347             fprintf(stderr, "Failed to set workqueue priority\n");
 348
 349         if (pthread_workqueue_attr_setovercommit_np(&attr, 1) != 0)
 350             fprintf(stderr, "Failed to set workqueue overcommit\n");
 351
 352         workqueue_generator[i].wq_events = malloc(sizeof(struct wq_event) * EVENTS_GENERATED_PER_TICK);
 353         memset(workqueue_generator[i].wq_events, 0, (sizeof(struct wq_event) * EVENTS_GENERATED_PER_TICK));
 354
 355         if (pthread_workqueue_create_np(&workqueue_generator[i].wq, &attr) != 0)
 356             fprintf(stderr, "Failed to create workqueue\n");
 357     }
 358
 359         for (i = 0; i < WORKQUEUE_COUNT; i++)
 360         {
 361         if (pthread_workqueue_attr_init_np(&attr) != 0)
 362             fprintf(stderr, "Failed to set workqueue attributes\n");
 363
 364         if (pthread_workqueue_attr_setqueuepriority_np(&attr, i) != 0)
 365             fprintf(stderr, "Failed to set workqueue priority\n");
 366
 367         if (pthread_workqueue_create_np(&workqueues[i], &attr) != 0)
 368             fprintf(stderr, "Failed to create workqueue\n");
 369         }
 370
 371         if (SLEEP_BEFORE_START > 0)
 372         {
 373                 printf("Sleeping for %d seconds to allow for processor set configuration...\n",SLEEP_BEFORE_START);
 374                 sleep(SLEEP_BEFORE_START);
 375         }
 376
 377     printf("%d workqueues, running for %d seconds at %d Hz, %d events per tick.\n",WORKQUEUE_COUNT, SECONDS_TO_RUN, EVENT_GENERATION_FREQUENCY, EVENTS_GENERATED_PER_TICK);
 378
 379         printf("Running %d generator threads at %dK events/s, the aggregated data rate is %dK events/s. %.2f MB is used for %.2fK events.\n",
 380            GENERATOR_WORKQUEUE_COUNT,AGGREGATE_DATA_RATE_PER_SECOND/1000, TOTAL_DATA_PER_SECOND/1000,
 381            (double) GENERATOR_WORKQUEUE_COUNT * ((sizeof(struct wq_event) * EVENTS_GENERATED_PER_TICK + sizeof(workqueues))/(1024.0*1024.0)),
 382            GENERATOR_WORKQUEUE_COUNT * EVENTS_GENERATED_PER_TICK/1000.0);
 383
 384     real_start = gettime();
 385
 386     _generate_simulated_events();
 387
 388     real_end = gettime();
 389
 390     _print_statistics();
 391
 392         return 0;
 393 }