drm/i915/gt: Drop the timeline->mutex as we wait for retirement
author: Chris Wilson <chris@chris-wilson.co.uk>
Tue, 3 Mar 2020 14:00:09 +0000 (14:00 +0000)
committer: Jani Nikula <jani.nikula@intel.com>
Wed, 4 Mar 2020 11:49:26 +0000 (13:49 +0200)
As we have pinned the timeline (using tl->active_count), we can safely
drop the tl->mutex as we wait for what we believe to be the final
request on that timeline. This is useful for ensuring that we do not
block the engine heartbeat by hogging the kernel_context's timeline on a
dead GPU.

References: https://gitlab.freedesktop.org/drm/intel/issues/1364
Fixes: 058179e72e09 ("drm/i915/gt: Replace hangcheck by heartbeats")
Fixes: f33a8a51602c ("drm/i915: Merge wait_for_timelines with retire_request")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200303140009.1494819-1-chris@chris-wilson.co.uk
(cherry picked from commit 82126e596d8519baac416aee83cad938f1d23cf8)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
drivers/gpu/drm/i915/gt/intel_gt_requests.c

index 8a5054f21bf880644a0bc3002971bf63491406ff..24c99d0838af6e23e7eb6295139b852d02ec538c 100644 (file)
@@ -147,24 +147,32 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
                        fence = i915_active_fence_get(&tl->last_request);
                        if (fence) {
+                               mutex_unlock(&tl->mutex);
+
                                timeout = dma_fence_wait_timeout(fence,
                                                                 interruptible,
                                                                 timeout);
                                dma_fence_put(fence);
+
+                               /* Retirement is best effort */
+                               if (!mutex_trylock(&tl->mutex)) {
+                                       active_count++;
+                                       goto out_active;
+                               }
                        }
                }
 
                if (!retire_requests(tl) || flush_submission(gt))
                        active_count++;
+               mutex_unlock(&tl->mutex);
 
-               spin_lock(&timelines->lock);
+out_active:    spin_lock(&timelines->lock);
 
-               /* Resume iteration after dropping lock */
+               /* Resume list iteration after reacquiring spinlock */
                list_safe_reset_next(tl, tn, link);
                if (atomic_dec_and_test(&tl->active_count))
                        list_del(&tl->link);
 
-               mutex_unlock(&tl->mutex);
 
                /* Defer the final release to after the spinlock */
                if (refcount_dec_and_test(&tl->kref.refcount)) {