drm/i915/gt: Don't declare hangs if engine is stalled
author: Chris Wilson <chris@chris-wilson.co.uk>
Thu, 28 May 2020 07:41:00 +0000 (08:41 +0100)
committer: Chris Wilson <chris@chris-wilson.co.uk>
Thu, 28 May 2020 16:53:52 +0000 (17:53 +0100)
If the ring submission is stalled on an external request, nothing can be
submitted, not even the heartbeat in the kernel context. Since nothing
is running, resetting the engine/device does not unblock the system and
is pointless. We can see if the heartbeat is supposed to be running
before declaring foul.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200528074109.28235-2-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c

index 5136c8b..f67ad93 100644 (file)
@@ -48,8 +48,10 @@ static void show_heartbeat(const struct i915_request *rq,
        struct drm_printer p = drm_debug_printer("heartbeat");
 
        intel_engine_dump(engine, &p,
-                         "%s heartbeat {prio:%d} not ticking\n",
+                         "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
                          engine->name,
+                         rq->fence.context,
+                         rq->fence.seqno,
                          rq->sched.attr.priority);
 }
 
@@ -76,8 +78,19 @@ static void heartbeat(struct work_struct *wrk)
                goto out;
 
        if (engine->heartbeat.systole) {
-               if (engine->schedule &&
-                   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
+               if (!i915_sw_fence_signaled(&rq->submit)) {
+                       /*
+                        * Not yet submitted, system is stalled.
+                        *
+                        * This more often happens for ring submission,
+                        * where all contexts are funnelled into a common
+                        * ringbuffer. If one context is blocked on an
+                        * external fence, not only is it not submitted,
+                        * but all other contexts, including the kernel
+                        * context are stuck waiting for the signal.
+                        */
+               } else if (engine->schedule &&
+                          rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
                        /*
                         * Gradually raise the priority of the heartbeat to
                         * give high priority work [which presumably desires