From 178b8a3668bd63b40303d9dcb17ad58cf4b44007 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 2 Nov 2022 12:21:09 -0700 Subject: [PATCH] drm/i915/guc: Don't deadlock busyness stats vs reset The engine busyness stats has a worker function to do things like 64bit extend the 32bit hardware counters. The GuC's reset prepare function flushes out this worker function to ensure no corruption happens during the reset. Unforunately, the worker function has an infinite wait for active resets to finish before doing its work. Thus a deadlock would occur if the worker function had actually started just as the reset starts. The function being used to lock the reset-in-progress mutex is called intel_gt_reset_trylock(). However, as noted it does not follow standard 'trylock' conventions and exit if already locked. So rename the current _trylock function to intel_gt_reset_lock_interruptible(), which is the behaviour it actually provides. In addition, add a new implementation of _trylock and call that from the busyness stats worker instead. v2: Rename existing trylock to interruptible rather than trying to preserve the existing (confusing) naming scheme (review comments from Tvrtko). Signed-off-by: John Harrison Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20221102192109.2492625-3-John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 18 ++++++++++++++++-- drivers/gpu/drm/i915/gt/intel_reset.h | 1 + drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 4 +++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index e63329b..c29efde 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -330,7 +330,7 @@ retry: if (ret) goto err_rpm; - ret = intel_gt_reset_trylock(ggtt->vm.gt, &srcu); + ret = intel_gt_reset_lock_interruptible(ggtt->vm.gt, &srcu); if (ret) goto err_pages; diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 3159df6..24736eb 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1407,15 +1407,19 @@ out: intel_runtime_pm_put(gt->uncore->rpm, wakeref); } -int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) +static int _intel_gt_reset_lock(struct intel_gt *gt, int *srcu, bool retry) { might_lock(>->reset.backoff_srcu); - might_sleep(); + if (retry) + might_sleep(); rcu_read_lock(); while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { rcu_read_unlock(); + if (!retry) + return -EBUSY; + if (wait_event_interruptible(gt->reset.queue, !test_bit(I915_RESET_BACKOFF, >->reset.flags))) @@ -1429,6 +1433,16 @@ int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) return 0; } +int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) +{ + return _intel_gt_reset_lock(gt, srcu, false); +} + +int intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu) +{ + return _intel_gt_reset_lock(gt, srcu, true); +} + void intel_gt_reset_unlock(struct intel_gt *gt, int tag) __releases(>->reset.backoff_srcu) { diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h index adc734e..25c975b 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.h +++ b/drivers/gpu/drm/i915/gt/intel_reset.h @@ -39,6 +39,7 @@ int __intel_engine_reset_bh(struct intel_engine_cs *engine, void __i915_request_reset(struct i915_request *rq, bool guilty); int __must_check intel_gt_reset_trylock(struct intel_gt *gt, int *srcu); +int __must_check intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu); void intel_gt_reset_unlock(struct intel_gt *gt, int tag); void intel_gt_set_wedged(struct intel_gt *gt); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 8fbc363..412c262 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1401,7 +1401,9 @@ static void guc_timestamp_ping(struct work_struct *wrk) /* * Synchronize with gt reset to make sure the worker does not - * corrupt the engine/guc stats. + * corrupt the engine/guc stats. NB: can't actually block waiting + * for a reset to complete as the reset requires flushing out + * this worker thread if started. So waiting would deadlock. */ ret = intel_gt_reset_trylock(gt, &srcu); if (ret) -- 2.7.4