From d1d1b206b07977fe0e75b0254e5b50c215c97803 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 Dec 2014 12:05:44 +0100 Subject: [PATCH] coroutine-ucontext: use __thread ELF thread local storage is about 10% faster on tests/test-coroutine's perf/cost test. The timing on my machine is 190ns per iteration with pthread TLS, 170 with ELF TLS. Based on a patch by Kevin Wolf and Peter Lieven, but redone to follow the model of coroutine-win32.c (including the important "noinline" attribute!). Platforms without thread-local storage (OpenBSD probably?) will need a new-enough GCC for this to compile, in order to use the same emutls support that Windows already relies on. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 1417518350-6167-2-git-send-email-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- coroutine-ucontext.c | 69 +++++++++++++++------------------------------------- 1 file changed, 19 insertions(+), 50 deletions(-) diff --git a/coroutine-ucontext.c b/coroutine-ucontext.c index 4bf2cde..259fcb4 100644 --- a/coroutine-ucontext.c +++ b/coroutine-ucontext.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "qemu-common.h" #include "block/coroutine_int.h" @@ -48,15 +47,8 @@ typedef struct { /** * Per-thread coroutine bookkeeping */ -typedef struct { - /** Currently executing coroutine */ - Coroutine *current; - - /** The default coroutine */ - CoroutineUContext leader; -} CoroutineThreadState; - -static pthread_key_t thread_state_key; +static __thread CoroutineUContext leader; +static __thread Coroutine *current; /* * va_args to makecontext() must be type 'int', so passing @@ -68,36 +60,6 @@ union cc_arg { int i[2]; }; -static CoroutineThreadState *coroutine_get_thread_state(void) -{ - CoroutineThreadState *s = pthread_getspecific(thread_state_key); - - if (!s) { - s = g_malloc0(sizeof(*s)); - s->current = &s->leader.base; - pthread_setspecific(thread_state_key, s); - } - return s; -} - -static void qemu_coroutine_thread_cleanup(void *opaque) -{ - CoroutineThreadState *s = opaque; - - g_free(s); -} - -static void __attribute__((constructor)) coroutine_init(void) -{ - int ret; - - ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup); - if (ret != 0) { - fprintf(stderr, "unable to create leader key: %s\n", strerror(errno)); - abort(); - } -} - static void coroutine_trampoline(int i0, int i1) { union cc_arg arg; @@ -193,15 +155,23 @@ void qemu_coroutine_delete(Coroutine *co_) g_free(co); } -CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, - CoroutineAction action) +/* This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * the sigsetjmp() call may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. + */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) { CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_); CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_); - CoroutineThreadState *s = coroutine_get_thread_state(); int ret; - s->current = to_; + current = to_; ret = sigsetjmp(from->env, 0); if (ret == 0) { @@ -212,14 +182,13 @@ CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, Coroutine *qemu_coroutine_self(void) { - CoroutineThreadState *s = coroutine_get_thread_state(); - - return s->current; + if (!current) { + current = &leader.base; + } + return current; } bool qemu_in_coroutine(void) { - CoroutineThreadState *s = pthread_getspecific(thread_state_key); - - return s && s->current->caller; + return current && current->caller; } -- 2.7.4