drm/i915/breadcrumbs: Reduce signaler rbtree to a sorted list
author Chris Wilson <chris@chris-wilson.co.uk>
Thu, 22 Feb 2018 09:25:44 +0000 (09:25 +0000)
committer Chris Wilson <chris@chris-wilson.co.uk>
Tue, 6 Mar 2018 12:12:45 +0000 (12:12 +0000)
The goal here is to try and reduce the latency of signaling additional
requests following the wakeup from interrupt by reducing the list of
to-be-signaled requests from an rbtree to a sorted linked list. The
original choice of using an rbtree was to facilitate random insertions
of requests into the signaler while maintaining a sorted list. However,
if we assume that most new requests are added when they are submitted,
we see those new requests in execution order, making an insertion sort
fast, and the reduction in overhead of each signaler iteration
significant.

Since commit 56299fb7d904 ("drm/i915: Signal first fence from irq handler
if complete"), we signal most fences directly from notify_ring() in the
interrupt handler greatly reducing the amount of work that actually
needs to be done by the signaler kthread. All the thread is then
required to do is operate as the bottom-half, cleaning up after the
interrupt handler and preparing the next waiter. This includes signaling
all later completed fences in a saturated system, but on a mostly idle
system we only have to rebuild the wait rbtree in time for the next
interrupt. With this de-emphasis of the signaler's role, we want to
rejig its data structures to reduce the amount of work we require to
both set up the signal tree and maintain it on every interrupt.

References: 56299fb7d904 ("drm/i915: Signal first fence from irq handler if complete")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180222092545.17216-1-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_request.h
drivers/gpu/drm/i915/intel_breadcrumbs.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 74311fc53e2f9d373770a5aba344d622ad96ca32..7d6eb82eeb913636eb590e25121e752044f93fe2 100644 (file)
@@ -44,8 +44,8 @@ struct intel_wait {
 };
 
 struct intel_signal_node {
-       struct rb_node node;
        struct intel_wait wait;
+       struct list_head link;
 };
 
 struct i915_dependency {
index 094f010908b8b7e39eed52243a7b10b9ecff3e95..03bbc1dfbc51bd4a12e0c5dea4f8750c4f784c50 100644 (file)
@@ -340,7 +340,8 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
        lockdep_assert_held(&b->rb_lock);
        GEM_BUG_ON(b->irq_wait == wait);
 
-       /* This request is completed, so remove it from the tree, mark it as
+       /*
+        * This request is completed, so remove it from the tree, mark it as
         * complete, and *then* wake up the associated task. N.B. when the
         * task wakes up, it will find the empty rb_node, discern that it
         * has already been removed from the tree and skip the serialisation
@@ -351,7 +352,8 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
        rb_erase(&wait->node, &b->waiters);
        RB_CLEAR_NODE(&wait->node);
 
-       wake_up_process(wait->tsk); /* implicit smp_wmb() */
+       if (wait->tsk->state != TASK_RUNNING)
+               wake_up_process(wait->tsk); /* implicit smp_wmb() */
 }
 
 static inline void __intel_breadcrumbs_next(struct intel_engine_cs *engine,
@@ -592,23 +594,6 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine,
        spin_unlock_irq(&b->rb_lock);
 }
 
-static bool signal_complete(const struct i915_request *request)
-{
-       if (!request)
-               return false;
-
-       /*
-        * Carefully check if the request is complete, giving time for the
-        * seqno to be visible or if the GPU hung.
-        */
-       return __i915_request_irq_complete(request);
-}
-
-static struct i915_request *to_signaler(struct rb_node *rb)
-{
-       return rb_entry(rb, struct i915_request, signaling.node);
-}
-
 static void signaler_set_rtpriority(void)
 {
         struct sched_param param = { .sched_priority = 1 };
@@ -616,78 +601,26 @@ static void signaler_set_rtpriority(void)
         sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
 }
 
-static void __intel_engine_remove_signal(struct intel_engine_cs *engine,
-                                        struct i915_request *request)
-{
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       lockdep_assert_held(&b->rb_lock);
-
-       /*
-        * Wake up all other completed waiters and select the
-        * next bottom-half for the next user interrupt.
-        */
-       __intel_engine_remove_wait(engine, &request->signaling.wait);
-
-       /*
-        * Find the next oldest signal. Note that as we have
-        * not been holding the lock, another client may
-        * have installed an even older signal than the one
-        * we just completed - so double check we are still
-        * the oldest before picking the next one.
-        */
-       if (request->signaling.wait.seqno) {
-               if (request == rcu_access_pointer(b->first_signal)) {
-                       struct rb_node *rb = rb_next(&request->signaling.node);
-                       rcu_assign_pointer(b->first_signal,
-                                          rb ? to_signaler(rb) : NULL);
-               }
-
-               rb_erase(&request->signaling.node, &b->signals);
-               request->signaling.wait.seqno = 0;
-       }
-}
-
-static struct i915_request *
-get_first_signal_rcu(struct intel_breadcrumbs *b)
-{
-       /*
-        * See the big warnings for i915_gem_active_get_rcu() and similarly
-        * for dma_fence_get_rcu_safe() that explain the intricacies involved
-        * here with defeating CPU/compiler speculation and enforcing
-        * the required memory barriers.
-        */
-       do {
-               struct i915_request *request;
-
-               request = rcu_dereference(b->first_signal);
-               if (request)
-                       request = i915_request_get_rcu(request);
-
-               barrier();
-
-               if (!request || request == rcu_access_pointer(b->first_signal))
-                       return rcu_pointer_handoff(request);
-
-               i915_request_put(request);
-       } while (1);
-}
-
 static int intel_breadcrumbs_signaler(void *arg)
 {
        struct intel_engine_cs *engine = arg;
        struct intel_breadcrumbs *b = &engine->breadcrumbs;
-       struct i915_request *request;
+       struct i915_request *rq, *n;
 
        /* Install ourselves with high priority to reduce signalling latency */
        signaler_set_rtpriority();
 
        do {
                bool do_schedule = true;
+               LIST_HEAD(list);
+               u32 seqno;
 
                set_current_state(TASK_INTERRUPTIBLE);
+               if (list_empty(&b->signals))
+                       goto sleep;
 
-               /* We are either woken up by the interrupt bottom-half,
+               /*
+                * We are either woken up by the interrupt bottom-half,
                 * or by a client adding a new signaller. In both cases,
                 * the GPU seqno may have advanced beyond our oldest signal.
                 * If it has, propagate the signal, remove the waiter and
@@ -695,25 +628,45 @@ static int intel_breadcrumbs_signaler(void *arg)
                 * need to wait for a new interrupt from the GPU or for
                 * a new client.
                 */
-               rcu_read_lock();
-               request = get_first_signal_rcu(b);
-               rcu_read_unlock();
-               if (signal_complete(request)) {
-                       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-                                     &request->fence.flags)) {
-                               local_bh_disable();
-                               dma_fence_signal(&request->fence);
-                               GEM_BUG_ON(!i915_request_completed(request));
-                               local_bh_enable(); /* kick start the tasklets */
-                       }
+               seqno = intel_engine_get_seqno(engine);
+
+               spin_lock_irq(&b->rb_lock);
+               list_for_each_entry_safe(rq, n, &b->signals, signaling.link) {
+                       u32 this = rq->signaling.wait.seqno;
+
+                       GEM_BUG_ON(!rq->signaling.wait.seqno);
 
-                       if (READ_ONCE(request->signaling.wait.seqno)) {
-                               spin_lock_irq(&b->rb_lock);
-                               __intel_engine_remove_signal(engine, request);
-                               spin_unlock_irq(&b->rb_lock);
+                       if (!i915_seqno_passed(seqno, this))
+                               break;
+
+                       if (likely(this == i915_request_global_seqno(rq))) {
+                               __intel_engine_remove_wait(engine,
+                                                          &rq->signaling.wait);
+
+                               rq->signaling.wait.seqno = 0;
+                               __list_del_entry(&rq->signaling.link);
+
+                               if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+                                             &rq->fence.flags)) {
+                                       list_add_tail(&rq->signaling.link,
+                                                     &list);
+                                       i915_request_get(rq);
+                               }
+                       }
+               }
+               spin_unlock_irq(&b->rb_lock);
+
+               if (!list_empty(&list)) {
+                       local_bh_disable();
+                       list_for_each_entry_safe(rq, n, &list, signaling.link) {
+                               dma_fence_signal(&rq->fence);
+                               GEM_BUG_ON(!i915_request_completed(rq));
+                               i915_request_put(rq);
                        }
+                       local_bh_enable(); /* kick start the tasklets */
 
-                       /* If the engine is saturated we may be continually
+                       /*
+                        * If the engine is saturated we may be continually
                         * processing completed requests. This angers the
                         * NMI watchdog if we never let anything else
                         * have access to the CPU. Let's pretend to be nice
@@ -722,9 +675,19 @@ static int intel_breadcrumbs_signaler(void *arg)
                         */
                        do_schedule = need_resched();
                }
-               i915_request_put(request);
 
                if (unlikely(do_schedule)) {
+                       /* Before we sleep, check for a missed seqno */
+                       if (current->state & TASK_NORMAL &&
+                           !list_empty(&b->signals) &&
+                           engine->irq_seqno_barrier &&
+                           test_and_clear_bit(ENGINE_IRQ_BREADCRUMB,
+                                              &engine->irq_posted)) {
+                               engine->irq_seqno_barrier(engine);
+                               intel_engine_wakeup(engine);
+                       }
+
+sleep:
                        if (kthread_should_park())
                                kthread_parkme();
 
@@ -739,13 +702,40 @@ static int intel_breadcrumbs_signaler(void *arg)
        return 0;
 }
 
+static void insert_signal(struct intel_breadcrumbs *b,
+                         struct i915_request *request,
+                         const u32 seqno)
+{
+       struct i915_request *iter;
+
+       lockdep_assert_held(&b->rb_lock);
+
+       /*
+        * A reasonable assumption is that we are called to add signals
+        * in sequence, as the requests are submitted for execution and
+        * assigned a global_seqno. This will be the case for the majority
+        * of internally generated signals (inter-engine signaling).
+        *
+        * Out of order waiters triggering random signaling enabling will
+        * be more problematic, but hopefully rare enough and the list
+        * small enough that the O(N) insertion sort is not an issue.
+        */
+
+       list_for_each_entry_reverse(iter, &b->signals, signaling.link)
+               if (i915_seqno_passed(seqno, iter->signaling.wait.seqno))
+                       break;
+
+       list_add(&request->signaling.link, &iter->signaling.link);
+}
+
 void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
 {
        struct intel_engine_cs *engine = request->engine;
        struct intel_breadcrumbs *b = &engine->breadcrumbs;
        u32 seqno;
 
-       /* Note that we may be called from an interrupt handler on another
+       /*
+        * Note that we may be called from an interrupt handler on another
         * device (e.g. nouveau signaling a fence completion causing us
         * to submit a request, and so enable signaling). As such,
         * we need to make sure that all other users of b->rb_lock protect
@@ -757,17 +747,16 @@ void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
        lockdep_assert_held(&request->lock);
 
        seqno = i915_request_global_seqno(request);
-       if (!seqno)
+       if (!seqno) /* will be enabled later upon execution */
                return;
 
-       spin_lock(&b->rb_lock);
-
        GEM_BUG_ON(request->signaling.wait.seqno);
        request->signaling.wait.tsk = b->signaler;
        request->signaling.wait.request = request;
        request->signaling.wait.seqno = seqno;
 
-       /* First add ourselves into the list of waiters, but register our
+       /*
+        * Add ourselves into the list of waiters, but registering our
         * bottom-half as the signaller thread. As per usual, only the oldest
         * waiter (not just signaller) is tasked as the bottom-half waking
         * up all completed waiters after the user interrupt.
@@ -775,39 +764,9 @@ void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
         * If we are the oldest waiter, enable the irq (after which we
         * must double check that the seqno did not complete).
         */
+       spin_lock(&b->rb_lock);
+       insert_signal(b, request, seqno);
        wakeup &= __intel_engine_add_wait(engine, &request->signaling.wait);
-
-       if (!__i915_request_completed(request, seqno)) {
-               struct rb_node *parent, **p;
-               bool first;
-
-               /* Now insert ourselves into the retirement ordered list of
-                * signals on this engine. We track the oldest seqno as that
-                * will be the first signal to complete.
-                */
-               parent = NULL;
-               first = true;
-               p = &b->signals.rb_node;
-               while (*p) {
-                       parent = *p;
-                       if (i915_seqno_passed(seqno,
-                                             to_signaler(parent)->signaling.wait.seqno)) {
-                               p = &parent->rb_right;
-                               first = false;
-                       } else {
-                               p = &parent->rb_left;
-                       }
-               }
-               rb_link_node(&request->signaling.node, parent, p);
-               rb_insert_color(&request->signaling.node, &b->signals);
-               if (first)
-                       rcu_assign_pointer(b->first_signal, request);
-       } else {
-               __intel_engine_remove_wait(engine, &request->signaling.wait);
-               request->signaling.wait.seqno = 0;
-               wakeup = false;
-       }
-
        spin_unlock(&b->rb_lock);
 
        if (wakeup)
@@ -816,17 +775,20 @@ void intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
 
 void intel_engine_cancel_signaling(struct i915_request *request)
 {
+       struct intel_engine_cs *engine = request->engine;
+       struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
        GEM_BUG_ON(!irqs_disabled());
        lockdep_assert_held(&request->lock);
 
-       if (READ_ONCE(request->signaling.wait.seqno)) {
-               struct intel_engine_cs *engine = request->engine;
-               struct intel_breadcrumbs *b = &engine->breadcrumbs;
+       if (!READ_ONCE(request->signaling.wait.seqno))
+               return;
 
-               spin_lock(&b->rb_lock);
-               __intel_engine_remove_signal(engine, request);
-               spin_unlock(&b->rb_lock);
-       }
+       spin_lock(&b->rb_lock);
+       __intel_engine_remove_wait(engine, &request->signaling.wait);
+       if (fetch_and_zero(&request->signaling.wait.seqno))
+               __list_del_entry(&request->signaling.link);
+       spin_unlock(&b->rb_lock);
 }
 
 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -840,6 +802,8 @@ int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
        timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
        timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
 
+       INIT_LIST_HEAD(&b->signals);
+
        /* Spawn a thread to provide a common bottom-half for all signals.
         * As this is an asynchronous interface we cannot steal the current
         * task for handling the bottom-half to the user interrupt, therefore
@@ -899,8 +863,7 @@ void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
        /* The engines should be idle and all requests accounted for! */
        WARN_ON(READ_ONCE(b->irq_wait));
        WARN_ON(!RB_EMPTY_ROOT(&b->waiters));
-       WARN_ON(rcu_access_pointer(b->first_signal));
-       WARN_ON(!RB_EMPTY_ROOT(&b->signals));
+       WARN_ON(!list_empty(&b->signals));
 
        if (!IS_ERR_OR_NULL(b->signaler))
                kthread_stop(b->signaler);
@@ -913,20 +876,22 @@ bool intel_breadcrumbs_busy(struct intel_engine_cs *engine)
        struct intel_breadcrumbs *b = &engine->breadcrumbs;
        bool busy = false;
 
-       spin_lock_irq(&b->rb_lock);
-
        if (b->irq_wait) {
-               wake_up_process(b->irq_wait->tsk);
-               busy = true;
+               spin_lock_irq(&b->irq_lock);
+
+               if (b->irq_wait) {
+                       wake_up_process(b->irq_wait->tsk);
+                       busy = true;
+               }
+
+               spin_unlock_irq(&b->irq_lock);
        }
 
-       if (rcu_access_pointer(b->first_signal)) {
+       if (!busy && !list_empty(&b->signals)) {
                wake_up_process(b->signaler);
                busy = true;
        }
 
-       spin_unlock_irq(&b->rb_lock);
-
        return busy;
 }
 
index 90e4380cbdd5e30fa18df20455cc5b0d97694320..e7526a4f05e5b953134f5b9f95263b8c4d8d571c 100644 (file)
@@ -356,9 +356,9 @@ struct intel_engine_cs {
 
                spinlock_t rb_lock; /* protects the rb and wraps irq_lock */
                struct rb_root waiters; /* sorted by retirement, priority */
-               struct rb_root signals; /* sorted by retirement */
+               struct list_head signals; /* sorted by retirement */
                struct task_struct *signaler; /* used for fence signalling */
-               struct i915_request __rcu *first_signal;
+
                struct timer_list fake_irq; /* used after a missed interrupt */
                struct timer_list hangcheck; /* detect missed interrupts */