llvmpipe/cs: rework thread pool to avoid mtx locking
author Dave Airlie <airlied@redhat.com>
Mon, 16 Aug 2021 01:18:15 +0000 (11:18 +1000)
committer Dave Airlie <airlied@redhat.com>
Thu, 16 Sep 2021 03:21:06 +0000 (13:21 +1000)
This helps reduce the mtx lock/unlock overhead for the thread pool
when the work distributes evenly across the number of threads.

The CL CTS conversion tests really hit this, and this change takes maybe
10-20s off a 5min test run.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12432>

src/gallium/drivers/llvmpipe/lp_cs_tpool.c
src/gallium/drivers/llvmpipe/lp_cs_tpool.h

index dd28dba..50d4212 100644 (file)
@@ -43,6 +43,7 @@ lp_cs_tpool_worker(void *data)
 
    while (!pool->shutdown) {
       struct lp_cs_tpool_task *task;
+      unsigned iter_per_thread;
 
       while (list_is_empty(&pool->workqueue) && !pool->shutdown)
          cnd_wait(&pool->new_work, &pool->m);
@@ -52,15 +53,26 @@ lp_cs_tpool_worker(void *data)
 
       task = list_first_entry(&pool->workqueue, struct lp_cs_tpool_task,
                               list);
-      unsigned this_iter = task->iter_start++;
+
+      unsigned this_iter = task->iter_start;
+
+      iter_per_thread = task->iter_per_thread;
+
+      if (task->iter_remainder &&
+          task->iter_start + task->iter_remainder == task->iter_total)
+         iter_per_thread = task->iter_remainder;
+
+      task->iter_start += iter_per_thread;
 
       if (task->iter_start == task->iter_total)
          list_del(&task->list);
 
       mtx_unlock(&pool->m);
-      task->work(task->data, this_iter, &lmem);
+      for (unsigned i = 0; i < iter_per_thread; i++)
+         task->work(task->data, this_iter + i, &lmem);
+
       mtx_lock(&pool->m);
-      task->iter_finished++;
+      task->iter_finished += iter_per_thread;
       if (task->iter_finished == task->iter_total)
          cnd_broadcast(&task->finish);
    }
@@ -132,6 +144,10 @@ lp_cs_tpool_queue_task(struct lp_cs_tpool *pool,
    task->work = work;
    task->data = data;
    task->iter_total = num_iters;
+
+   task->iter_per_thread = num_iters / pool->num_threads;
+   task->iter_remainder = num_iters % pool->num_threads;
+
    cnd_init(&task->finish);
 
    mtx_lock(&pool->m);
index d32a5e0..22c0d10 100644 (file)
@@ -66,6 +66,8 @@ struct lp_cs_tpool_task {
    unsigned iter_total;
    unsigned iter_start;
    unsigned iter_finished;
+   unsigned iter_per_thread;
+   unsigned iter_remainder;
 };
 
 struct lp_cs_tpool *lp_cs_tpool_create(unsigned num_threads);