From 3700e353781e27f1bc7222f51f2cc36cbeb9b4ec Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 26 Jan 2023 16:28:36 -0800 Subject: [PATCH] drm/i915: Fix request ref counting during error capture & debugfs dump When GuC support was added to error capture, the reference counting around the request object was broken. Fix it up. The context based search manages the spinlocking around the search internally. So it needs to grab the reference count internally as well. The execlist only request based search relies on external locking, so it needs an external reference count but within the spinlock not outside it. The only other caller of the context based search is the code for dumping engine state to debugfs. That code wasn't previously getting an explicit reference at all as it does everything while holding the execlist specific spinlock. So, that needs updaing as well as that spinlock doesn't help when using GuC submission. Rather than trying to conditionally get/put depending on submission model, just change it to always do the get/put. v2: Explicitly document adding an extra blank line in some dense code (Andy Shevchenko). Fix multiple potential null pointer derefs in case of no request found (some spotted by Tvrtko, but there was more!). Also fix a leaked request in case of !started and another in __guc_reset_context now that intel_context_find_active_request is actually reference counting the returned request. v3: Add a _get suffix to intel_context_find_active_request now that it grabs a reference (Daniele). v4: Split the intel_guc_find_hung_context change to a separate patch and rename intel_context_find_active_request_get to intel_context_get_active_request (Tvrtko). v5: s/locking/reference counting/ in commit message (Tvrtko) Fixes: dc0dad365c5e ("drm/i915/guc: Fix for error capture after full GPU reset with GuC") Fixes: 573ba126aef3 ("drm/i915/guc: Capture error state on context reset") Signed-off-by: John Harrison Reviewed-by: Daniele Ceraolo Spurio Acked-by: Tvrtko Ursulin Cc: Matthew Brost Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Andrzej Hajda Cc: Matthew Auld Cc: Matt Roper Cc: Umesh Nerlige Ramappa Cc: Michael Cheng Cc: Lucas De Marchi Cc: Tejas Upadhyay Cc: Andy Shevchenko Cc: Aravind Iddamsetty Cc: Alan Previn Cc: Bruce Chang Link: https://patchwork.freedesktop.org/patch/msgid/20230127002842.3169194-3-John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/intel_context.c | 4 +++- drivers/gpu/drm/i915/gt/intel_context.h | 3 +-- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 6 +++++- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 3 ++- drivers/gpu/drm/i915/i915_gpu_error.c | 13 ++++++------- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e94365b08f1e..2aa63ec521b8 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -528,7 +528,7 @@ retry: return rq; } -struct i915_request *intel_context_find_active_request(struct intel_context *ce) +struct i915_request *intel_context_get_active_request(struct intel_context *ce) { struct intel_context *parent = intel_context_to_parent(ce); struct i915_request *rq, *active = NULL; @@ -552,6 +552,8 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce) active = rq; } + if (active) + active = i915_request_get_rcu(active); spin_unlock_irqrestore(&parent->guc_state.lock, flags); return active; diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h index fb62b7b8cbcd..0a8d553da3f4 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.h +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -268,8 +268,7 @@ int intel_context_prepare_remote_request(struct intel_context *ce, struct i915_request *intel_context_create_request(struct intel_context *ce); -struct i915_request * -intel_context_find_active_request(struct intel_context *ce); +struct i915_request *intel_context_get_active_request(struct intel_context *ce); static inline bool intel_context_is_barrier(const struct intel_context *ce) { diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 922f1bb22dc6..a86bdbee7a6b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -2237,9 +2237,11 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d if (guc) { ce = intel_engine_get_hung_context(engine); if (ce) - hung_rq = intel_context_find_active_request(ce); + hung_rq = intel_context_get_active_request(ce); } else { hung_rq = intel_engine_execlist_find_hung_request(engine); + if (hung_rq) + hung_rq = i915_request_get_rcu(hung_rq); } if (hung_rq) @@ -2250,6 +2252,8 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d else intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m); + if (hung_rq) + i915_request_put(hung_rq); } void intel_engine_dump(struct intel_engine_cs *engine, diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 3b34a82d692b..a2b263e5fd66 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1702,7 +1702,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st goto next_context; guilty = false; - rq = intel_context_find_active_request(ce); + rq = intel_context_get_active_request(ce); if (!rq) { head = ce->ring->tail; goto out_replay; @@ -1715,6 +1715,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st head = intel_ring_wrap(ce->ring, rq->head); __i915_request_reset(rq, guilty); + i915_request_put(rq); out_replay: guc_reset_state(ce, head, guilty); next_context: diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9d5d5a397b64..9e2d17785a9a 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1607,7 +1607,7 @@ capture_engine(struct intel_engine_cs *engine, ce = intel_engine_get_hung_context(engine); if (ce) { intel_engine_clear_hung_context(engine); - rq = intel_context_find_active_request(ce); + rq = intel_context_get_active_request(ce); if (!rq || !i915_request_started(rq)) goto no_request_capture; } else { @@ -1618,21 +1618,18 @@ capture_engine(struct intel_engine_cs *engine, if (!intel_uc_uses_guc_submission(&engine->gt->uc)) { spin_lock_irqsave(&engine->sched_engine->lock, flags); rq = intel_engine_execlist_find_hung_request(engine); + if (rq) + rq = i915_request_get_rcu(rq); spin_unlock_irqrestore(&engine->sched_engine->lock, flags); } } - if (rq) - rq = i915_request_get_rcu(rq); - if (!rq) goto no_request_capture; capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); - if (!capture) { - i915_request_put(rq); + if (!capture) goto no_request_capture; - } if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) intel_guc_capture_get_matching_node(engine->gt, ee, ce); @@ -1642,6 +1639,8 @@ capture_engine(struct intel_engine_cs *engine, return ee; no_request_capture: + if (rq) + i915_request_put(rq); kfree(ee); return NULL; } -- 2.34.1