drm/i915: Remove i915_request.lock requirement for execution callbacks
authorChris Wilson <chris@chris-wilson.co.uk>
Thu, 16 Jul 2020 14:22:07 +0000 (15:22 +0100)
committerJoonas Lahtinen <joonas.lahtinen@linux.intel.com>
Mon, 7 Sep 2020 12:08:06 +0000 (15:08 +0300)
To implement preempt-to-busy (and so efficient timeslicing and best utilization
of the hardware submission ports) we let the GPU run asynchronously in respect
to the ELSP submission queue. This created challenges in keeping and accessing
the driver state mirroring the asynchronous GPU execution.

Previous fix 1d9221e9d395 ("drm/i915: Skip signaling a signaled request")
however did not correctly serialize request retirement with the execution
callbacks.

We were using the i915_request.lock to serialise adding an execution callback
with __i915_request_submit. However, if we use an atomic llist_add to serialise
multiple waiters and then check to see if the request is already executing, we
can remove the irq-spinlock and fix serialization between retirement and
execution callbacks in one go.

v2: Avoid using the irq_work when outside of the irq-spinlocks, where we
can execute the callbacks immediately.
v3: Pay close attention to the order of setting ACTIVE on retirement, we
need to ensure the request is signaled and breadcrumbs detached before
we finish removing the request from the engine.
v4: Expanded commit message.

Fixes: 1d9221e9d395 ("drm/i915: Skip signaling a signaled request")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200716142207.13003-2-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
[Joonas: Rebased and reordered into drm-intel-gt-next branch]
[Joonas: Added expanded commit message from Tvrtko and Chris]
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
drivers/gpu/drm/i915/i915_request.c

index 4f7f67a..11e2724 100644 (file)
@@ -187,30 +187,34 @@ static void irq_execute_cb_hook(struct irq_work *wrk)
        irq_execute_cb(wrk);
 }
 
-static void __notify_execute_cb(struct i915_request *rq)
+static __always_inline void
+__notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))
 {
        struct execute_cb *cb, *cn;
 
-       lockdep_assert_held(&rq->lock);
-
-       GEM_BUG_ON(!i915_request_is_active(rq));
        if (llist_empty(&rq->execute_cb))
                return;
 
-       llist_for_each_entry_safe(cb, cn, rq->execute_cb.first, work.llnode)
-               irq_work_queue(&cb->work);
+       llist_for_each_entry_safe(cb, cn,
+                                 llist_del_all(&rq->execute_cb),
+                                 work.llnode)
+               fn(&cb->work);
+}
 
-       /*
-        * XXX Rollback on __i915_request_unsubmit()
-        *
-        * In the future, perhaps when we have an active time-slicing scheduler,
-        * it will be interesting to unsubmit parallel execution and remove
-        * busywaits from the GPU until their master is restarted. This is
-        * quite hairy, we have to carefully rollback the fence and do a
-        * preempt-to-idle cycle on the target engine, all the while the
-        * master execute_cb may refire.
-        */
-       init_llist_head(&rq->execute_cb);
+static void __notify_execute_cb_irq(struct i915_request *rq)
+{
+       __notify_execute_cb(rq, irq_work_queue);
+}
+
+static bool irq_work_imm(struct irq_work *wrk)
+{
+       wrk->func(wrk);
+       return false;
+}
+
+static void __notify_execute_cb_imm(struct i915_request *rq)
+{
+       __notify_execute_cb(rq, irq_work_imm);
 }
 
 static void free_capture_list(struct i915_request *request)
@@ -257,9 +261,16 @@ static void remove_from_engine(struct i915_request *rq)
                locked = engine;
        }
        list_del_init(&rq->sched.link);
+
        clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
        clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
+
+       /* Prevent further __await_execution() registering a cb, then flush */
+       set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
+
        spin_unlock_irq(&locked->active.lock);
+
+       __notify_execute_cb_imm(rq);
 }
 
 bool i915_request_retire(struct i915_request *rq)
@@ -271,6 +282,7 @@ bool i915_request_retire(struct i915_request *rq)
 
        GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
        trace_i915_request_retire(rq);
+       i915_request_mark_complete(rq);
 
        /*
         * We know the GPU must have read the request to have
@@ -288,15 +300,6 @@ bool i915_request_retire(struct i915_request *rq)
                __i915_request_fill(rq, POISON_FREE);
        rq->ring->head = rq->postfix;
 
-       /*
-        * We only loosely track inflight requests across preemption,
-        * and so we may find ourselves attempting to retire a _completed_
-        * request that we have removed from the HW and put back on a run
-        * queue.
-        */
-       remove_from_engine(rq);
-
-       i915_request_mark_complete(rq);
        if (!i915_request_signaled(rq)) {
                spin_lock_irq(&rq->lock);
                dma_fence_signal_locked(&rq->fence);
@@ -320,7 +323,6 @@ bool i915_request_retire(struct i915_request *rq)
         */
        remove_from_engine(rq);
        GEM_BUG_ON(!llist_empty(&rq->execute_cb));
-       spin_unlock_irq(&rq->lock);
 
        __list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */
 
@@ -348,12 +350,6 @@ void i915_request_retire_upto(struct i915_request *rq)
        } while (i915_request_retire(tmp) && tmp != rq);
 }
 
-static void __llist_add(struct llist_node *node, struct llist_head *head)
-{
-       node->next = head->first;
-       head->first = node;
-}
-
 static struct i915_request * const *
 __engine_active(struct intel_engine_cs *engine)
 {
@@ -451,18 +447,24 @@ __await_execution(struct i915_request *rq,
                cb->work.func = irq_execute_cb_hook;
        }
 
-       spin_lock_irq(&signal->lock);
-       if (i915_request_is_active(signal) || __request_in_flight(signal)) {
-               if (hook) {
-                       hook(rq, &signal->fence);
-                       i915_request_put(signal);
-               }
-               i915_sw_fence_complete(cb->fence);
-               kmem_cache_free(global.slab_execute_cbs, cb);
-       } else {
-               __llist_add(&cb->work.llnode, &signal->execute_cb);
+       /*
+        * Register the callback first, then see if the signaler is already
+        * active. This ensures that if we race with the
+        * __notify_execute_cb from i915_request_submit() and we are not
+        * included in that list, we get a second bite of the cherry and
+        * execute it ourselves. After this point, a future
+        * i915_request_submit() will notify us.
+        *
+        * In i915_request_retire() we set the ACTIVE bit on a completed
+        * request (then flush the execute_cb). So by registering the
+        * callback first, then checking the ACTIVE bit, we serialise with
+        * the completed/retired request.
+        */
+       if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
+               if (i915_request_is_active(signal) ||
+                   __request_in_flight(signal))
+                       __notify_execute_cb_imm(signal);
        }
-       spin_unlock_irq(&signal->lock);
 
        return 0;
 }
@@ -588,10 +590,19 @@ xfer:
         * preempt-to-idle cycle on the target engine, all the while the
         * master execute_cb may refire.
         */
-       __notify_execute_cb(request);
+       __notify_execute_cb_irq(request);
 
-       if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-               i915_request_enable_breadcrumb(request);
+       /* We may be recursing from the signal callback of another i915 fence */
+       if (!i915_request_signaled(request)) {
+               spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
+               if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+                            &request->fence.flags) &&
+                   !i915_request_enable_breadcrumb(request))
+                       intel_engine_signal_breadcrumbs(engine);
+
+               spin_unlock(&request->lock);
+       }
 
        return result;
 }