2 * Copyright (c) 2008-2009 Apple Inc. All rights reserved.
4 * @APPLE_APACHE_LICENSE_HEADER_START@
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
18 * @APPLE_APACHE_LICENSE_HEADER_END@
21 #include <Foundation/Foundation.h>
22 #include <libkern/OSAtomic.h>
23 #include <sys/sysctl.h>
24 #include <mach/mach.h>
25 #include <mach/mach_time.h>
38 #include <dispatch/dispatch.h>
39 #include <dispatch/private.h>
/* Benchmark call targets defined elsewhere in this file.
 * `func` and `block` are presumably empty bodies -- they back the
 * "Empty function call:" / "Empty block call:" measurements below
 * (TODO confirm; definitions not visible in this chunk). */
42 __private_extern__ void func(void);
44 __private_extern__ void (^block)(void);
/* Helpers for the dispatch_async_f "double backflip" measurement that
 * ends main(): backflip re-enqueues work on the main queue, and
 * backflip_done prints the final result. */
46 static void backflip(void *ctxt);
47 static void backflip_done(void);
/* NOTE(review): the following four lines are interleaved fragments of
 * several definitions whose surrounding lines are missing from this
 * chunk: an ObjC class used by the "Empty ObjC call:" benchmark, a C++
 * virtual method used by the "Empty C++ virtual call:" benchmark, and
 * the pthread entry point spawned early in main() so malloc switches
 * to its multi-threaded code path. Do not edit without the full file. */
50 @interface BasicObject : NSObject
56 @implementation BasicObject
64 virtual void virtfunc(void) {
69 force_a_thread(void *arg)
76 static volatile int32_t global; /* shared word targeted by the atomic-op and spinlock benchmarks */
78 static const size_t cnt = 10000000; /* iterations for cheap operations (reported via print_result) */
79 static const size_t cnt2 = 100000; /* iterations for expensive operations (reported via print_result2) */
82 static long double loop_cost; /* measured cost of the empty loop; subtracted from later results */
83 static long double cycles_per_nanosecond; /* derived from the hw.cpufrequency sysctl in main() */
84 static mach_timebase_info_data_t tbi; /* mach_absolute_time() -> nanoseconds conversion ratio */
86 //static void func2(void *, dispatch_item_t di);
/* Report the average per-iteration cost of a `cnt`-iteration benchmark
 * that started at mach time `s`, labeled with `str`.  Converts mach
 * time units to nanoseconds when tbi.numer != tbi.denom, divides by
 * the iteration count, and scales by cycles_per_nanosecond so the
 * output is in CPU cycles.  The `loop_cost == 0.0` branch presumably
 * records the empty-loop baseline on the first call, which later calls
 * subtract -- TODO confirm; its body is missing from this chunk.
 * noinline keeps the timing epilogue out of the measured loops. */
88 static void __attribute__((noinline))
89 print_result(uint64_t s, const char *str)
91 uint64_t d, e = mach_absolute_time();
96 if (tbi.numer != tbi.denom) {
101 dd = (typeof(dd))d / (typeof(dd))cnt;
105 if (loop_cost == 0.0) {
109 dd *= cycles_per_nanosecond;
111 printf("%-45s%15.3Lf cycles\n", str, dd);
/* Same reporting as print_result(), but divides by the smaller
 * iteration count `cnt2` (used for the expensive benchmarks: thread
 * create/join, Mach semaphores, ObjC -description).  Unlike
 * print_result(), the visible lines show no loop_cost baseline
 * handling. */
114 static void __attribute__((noinline))
115 print_result2(uint64_t s, const char *str)
117 uint64_t d, e = mach_absolute_time();
122 if (tbi.numer != tbi.denom) {
127 dd = (typeof(dd))d / (typeof(dd))cnt2;
130 dd *= cycles_per_nanosecond;
132 printf("%-45s%15.3Lf cycles\n", str, dd);
135 #if defined(__i386__) || defined(__x86_64__)
/* Read the x86 time-stamp counter: RDTSC returns the 64-bit counter
 * split across EDX:EAX, recombined here into one uint64_t.  (The
 * function-name line is missing from this chunk -- presumably
 * `rdtsc(void)`, matching the "rdtsc():" benchmark in main().) */
136 static inline uint64_t
141 asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
143 return (uint64_t)hi << 32 | lo;
/* Intrusive singly-linked LIFO free list of fixed 32-byte allocations,
 * used by the "per-thread/fixed free(malloc(32))" benchmark.  The
 * struct's opening line is missing from this chunk. */
148 struct fml *fml_next;
149 } *fixed_malloc_lifo_head;
/* noinline was considered (commented out) to keep these comparable to
 * a real malloc/free call. */
151 struct fml *fixed_malloc_lifo(void);// __attribute__((noinline));
152 void fixed_free_lifo(struct fml *fml);// __attribute__((noinline));
/* Allocate a 32-byte node: pop the head of the LIFO free list when one
 * is cached, otherwise fall through to malloc(32).  Not thread-safe --
 * the list head is a plain global (fine for this single-threaded
 * benchmark loop). */
155 fixed_malloc_lifo(void)
157 struct fml *fml_r = fixed_malloc_lifo_head;
160 fixed_malloc_lifo_head = fml_r->fml_next;
163 return (struct fml *)malloc(32);
/* Return a node to the LIFO free list by pushing it at the head.
 * Memory is never handed back to malloc -- it is recycled by the next
 * fixed_malloc_lifo() call. */
168 fixed_free_lifo(struct fml *fml)
170 fml->fml_next = fixed_malloc_lifo_head;
171 fixed_malloc_lifo_head = fml;
/* NOTE(review): body of main() -- the `int main` line and many interior
 * lines (loop bodies, closing braces, #else/#endif) are missing from
 * this chunk; comments below describe only what the visible lines
 * establish. */

/* --- Setup: pools, locks, queues ------------------------------------ */
177 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
178 pthread_mutex_t plock = PTHREAD_MUTEX_INITIALIZER;
179 OSSpinLock slock = OS_SPINLOCK_INIT;
182 pthread_t pthr_pause;
183 dispatch_queue_t q, mq;
/* Derive cycles/ns from the CPU frequency sysctl so results can be
 * reported in cycles regardless of the timebase. */
188 size_t freq_len = sizeof(freq);
193 r = sysctlbyname("hw.cpufrequency", &freq, &freq_len, NULL, 0);
195 assert(freq_len == sizeof(freq));
197 cycles_per_nanosecond = (long double)freq / (long double)NSEC_PER_SEC;
201 /* Malloc has different logic for threaded apps. */
202 r = pthread_create(&pthr_pause, NULL, force_a_thread, NULL);
205 kr = mach_timebase_info(&tbi);
207 #if defined(__i386__) || defined(__x86_64__)
208 assert(tbi.numer == tbi.denom); /* This will fail on PowerPC. */
/* Instances used by the empty ObjC / C++ virtual call benchmarks. */
211 bo = [[BasicObject alloc] init];
214 bc = new BasicClass();
217 q = dispatch_queue_create("com.apple.bench-dispatch", NULL);
220 mq = dispatch_get_main_queue();
223 printf("%-45s%15Lf\n\n", "Cycles per nanosecond:", cycles_per_nanosecond);

/* --- Baseline: empty loop (its cost is subtracted from all later
 *     print_result output) ------------------------------------------ */
225 s = mach_absolute_time();
226 for (i = cnt; i; i--) {
229 print_result(s, "Empty loop:");
231 printf("\nLoop cost subtracted from the following:\n\n");

/* --- Timer primitives ----------------------------------------------- */
233 s = mach_absolute_time();
234 for (i = cnt; i; i--) {
235 mach_absolute_time();
237 print_result(s, "mach_absolute_time():");
239 #if defined(__i386__) || defined(__x86_64__)
240 s = mach_absolute_time();
241 for (i = cnt; i; i--) {
244 print_result(s, "rdtsc():");

/* --- Thread and Mach semaphore costs (cnt2 iterations) -------------- */
247 s = mach_absolute_time();
248 for (i = cnt2; i; i--) {
252 r = pthread_create(&pthr, NULL, (void *(*)(void *))func, NULL);
254 r = pthread_join(pthr, &pr);
257 print_result2(s, "pthread create+join:");
259 s = mach_absolute_time();
260 for (i = cnt2; i; i--) {
261 kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
263 kr = semaphore_destroy(mach_task_self(), sem);
266 print_result2(s, "Mach semaphore create/destroy:");
268 kr = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
270 s = mach_absolute_time();
271 for (i = cnt2; i; i--) {
272 kr = semaphore_signal(sem);
275 print_result2(s, "Mach semaphore signal:");
276 kr = semaphore_destroy(mach_task_self(), sem);
279 s = mach_absolute_time();
280 for (i = cnt; i; i--) {
283 print_result(s, "pthread_yield_np():");

/* --- Allocator costs ------------------------------------------------ */
285 s = mach_absolute_time();
286 for (i = cnt; i; i--) {
289 print_result(s, "free(malloc(32)):");
/* Two live allocations per iteration defeat malloc's most-recently-used
 * small-block cache. */
291 s = mach_absolute_time();
292 for (i = cnt / 2; i; i--) {
293 void *m1 = malloc(32);
294 void *m2 = malloc(32);
298 print_result(s, "Avoiding the MRU cache of free(malloc(32)):");
300 s = mach_absolute_time();
301 for (i = cnt; i; i--) {
302 fixed_free_lifo(fixed_malloc_lifo());
304 print_result(s, "per-thread/fixed free(malloc(32)):");
306 s = mach_absolute_time();
307 for (i = cnt; i; i--) {
308 assert(strtoull("18446744073709551615", NULL, 0) == ~0ull);
310 print_result(s, "strtoull(\"18446744073709551615\") == ~0ull:");

/* --- Call dispatch costs: C, block, ObjC, C++ virtual --------------- */
312 s = mach_absolute_time();
313 for (i = cnt; i; i--) {
316 print_result(s, "Empty function call:");
319 s = mach_absolute_time();
320 for (i = cnt; i; i--) {
323 print_result(s, "Empty block call:");
326 s = mach_absolute_time();
327 for (i = cnt; i; i--) {
330 print_result(s, "Empty ObjC call:");
332 s = mach_absolute_time();
333 for (i = cnt; i; i--) {
336 print_result(s, "Empty C++ virtual call:");
338 s = mach_absolute_time();
339 for (i = cnt2; i; i--) {
342 print_result2(s, "\"description\" ObjC call:");

/* --- x86-only instruction and fence costs --------------------------- */
348 #if defined(__i386__) || defined(__x86_64__)
349 s = mach_absolute_time();
350 for (i = cnt; i; i--) {
353 print_result(s, "raw 'nop':");
355 s = mach_absolute_time();
356 for (i = cnt; i; i--) {
359 print_result(s, "raw 'pause':");
361 s = mach_absolute_time();
362 for (i = cnt; i; i--) {
365 print_result(s, "Atomic mfence:");
367 s = mach_absolute_time();
368 for (i = cnt; i; i--) {
371 print_result(s, "Atomic lfence:");
373 s = mach_absolute_time();
374 for (i = cnt; i; i--) {
377 print_result(s, "Atomic sfence:");
379 s = mach_absolute_time();
380 for (i = cnt; i; i--) {
382 asm("sidt %0" : "=m" (sidt_rval));
384 print_result(s, "'sidt' instruction:");
386 s = mach_absolute_time();
387 for (i = cnt; i; i--) {
389 asm volatile("cmpxchg %1,%2" : "=a" (prev) : "r" (0l), "m" (global), "0" (1l));
391 print_result(s, "'cmpxchg' without the 'lock' prefix:");

/* --- Atomic operations (GCC __sync builtins and OSAtomic) ----------- */
394 s = mach_absolute_time();
395 for (i = cnt; i; i--) {
396 __sync_lock_test_and_set(&global, 0);
398 print_result(s, "Atomic xchg:");
400 s = mach_absolute_time();
401 for (i = cnt; i; i--) {
402 __sync_val_compare_and_swap(&global, 1, 0);
404 print_result(s, "Atomic cmpxchg:");
406 s = mach_absolute_time();
407 for (i = cnt; i; i--) {
408 __sync_fetch_and_add(&global, 1);
410 print_result(s, "Atomic increment:");
414 s = mach_absolute_time();
415 for (i = cnt; i; i--) {
416 OSAtomicIncrement32Barrier(&global);
418 print_result(s, "OSAtomic increment:");

/* --- Lock costs: hand-rolled CAS spin, OSSpinLock, pthread mutex ---- */
422 s = mach_absolute_time();
423 for (i = cnt; i; i--) {
424 while (!__sync_bool_compare_and_swap(&global, 0, 1)) {
426 #if defined(__i386__) || defined(__x86_64__)
433 print_result(s, "Inlined spin lock/unlock:");
435 s = mach_absolute_time();
436 for (i = cnt; i; i--) {
437 OSSpinLockLock(&slock);
438 OSSpinLockUnlock(&slock);
440 print_result(s, "OS spin lock/unlock:");
442 s = mach_absolute_time();
443 for (i = cnt; i; i--) {
444 r = pthread_mutex_lock(&plock);
446 r = pthread_mutex_unlock(&plock);
449 print_result(s, "pthread lock/unlock:");

/* --- GCD costs: sync, barrier sync, apply, async -------------------- */
452 s = mach_absolute_time();
453 for (i = cnt; i; i--) {
454 dispatch_sync(q, ^{ });
456 print_result(s, "dispatch_sync:");
459 s = mach_absolute_time();
460 for (i = cnt; i; i--) {
461 dispatch_sync_f(q, NULL, (void (*)(void *))func);
463 print_result(s, "dispatch_sync_f:");
466 s = mach_absolute_time();
467 for (i = cnt; i; i--) {
468 dispatch_barrier_sync(q, ^{ });
470 print_result(s, "dispatch_barrier_sync:");
473 s = mach_absolute_time();
474 for (i = cnt; i; i--) {
475 dispatch_barrier_sync_f(q, NULL, (void (*)(void *))func);
477 print_result(s, "dispatch_barrier_sync_f:");
/* dispatch_apply_f runs the cnt iterations itself, so there is no
 * explicit loop here whose cost print_result should subtract. */
479 s = mach_absolute_time();
480 dispatch_apply_f(cnt, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), NULL, (void (*)(void *, size_t))func);
481 s += loop_cost; /* cancel out the implicit subtraction done by the next line */
482 print_result(s, "dispatch_apply_f():");
484 // we do a "double backflip" to hit the fast-path of the enqueue/dequeue logic
485 bfs = mach_absolute_time();
486 dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);
487 dispatch_async_f(dispatch_get_main_queue(), &bf_cnt, backflip);
/* NOTE(review): fragments of backflip_done() and backflip().  The
 * visible lines show backflip_done() reporting the dispatch_async_f
 * benchmark started at `bfs` in main(), and backflip() re-enqueueing
 * itself on the main queue via its context pointer.  The termination
 * condition (presumably a countdown on *bf_cnt before calling
 * backflip_done) is missing from this chunk -- TODO confirm. */
492 __attribute__((noinline))
496 print_result(bfs, "dispatch_async_f():");
503 size_t *bf_cnt = (size_t *)ctxt;
505 return dispatch_async_f(dispatch_get_main_queue(), ctxt, backflip);