drm/i915: Improve record of hung engines in error state
author Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Wed, 4 Nov 2020 13:47:42 +0000 (13:47 +0000)
committer Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Mon, 9 Nov 2020 11:59:43 +0000 (11:59 +0000)
Between the events which trigger engine and GPU resets and the capture of
the error state, we lose the information on which engine triggered the
reset. Improve this by passing the hung engine mask down to error capture.

The result is that the list of engines in the user-visible "GPU HANG: ecode
<gen>:<engines>:<ecode>, <process>" message is now a list of hung engines
and not just active ones. Most importantly, the displayed process is now
the one which was actually hung.

v2:
 * Stub prototype. (Chris)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20201104134743.916027-1-tvrtko.ursulin@linux.intel.com
drivers/gpu/drm/i915/gt/intel_lrc.c
drivers/gpu/drm/i915/gt/intel_reset.c
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_gpu_error.h

index f3eb68a76a258eeef06be56e37fcbaa3c01d00a2..8a51c1c3a09117d389b4076dcaa7c7cfce9a2c6a 100644 (file)
@@ -3037,6 +3037,8 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
        if (!cap->error->gt->engine)
                goto err_gt;
 
+       cap->error->gt->engine->hung = true;
+
        return cap;
 
 err_gt:
index 4e5e13dc95da2baff6885256cf100aa3f85969e7..9fb4306b2900e41c667a072ff6fea13e678cbfb5 100644 (file)
@@ -1251,7 +1251,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
        engine_mask &= gt->info.engine_mask;
 
        if (flags & I915_ERROR_CAPTURE) {
-               i915_capture_error_state(gt->i915);
+               i915_capture_error_state(gt, engine_mask);
                intel_gt_clear_error_registers(gt, engine_mask);
        }
 
index ea469168cd44326b02198c2b1b087018b39d9371..a727552d2bc6aad13d29eb3fbe48d90d2f2081a6 100644 (file)
@@ -725,7 +725,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)
 
        gpu = NULL;
        with_intel_runtime_pm(&i915->runtime_pm, wakeref)
-               gpu = i915_gpu_coredump(i915);
+               gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES);
        if (IS_ERR(gpu))
                return PTR_ERR(gpu);
 
index ae370909e8ddd037949154bd49a03d67a0093927..994738d974cce2b4d82ea3594f178ac3bb16eae3 100644 (file)
@@ -570,6 +570,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
                                   ee->vm_info.pp_dir_base);
                }
        }
+       err_printf(m, "  hung: %u\n", ee->hung);
        err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
        for (n = 0; n < ee->num_ports; n++) {
@@ -1456,6 +1457,7 @@ capture_engine(struct intel_engine_cs *engine,
 
 static void
 gt_record_engines(struct intel_gt_coredump *gt,
+                 intel_engine_mask_t engine_mask,
                  struct i915_vma_compress *compress)
 {
        struct intel_engine_cs *engine;
@@ -1471,6 +1473,8 @@ gt_record_engines(struct intel_gt_coredump *gt,
                if (!ee)
                        continue;
 
+               ee->hung = engine->mask & engine_mask;
+
                gt->simulated |= ee->simulated;
                if (ee->simulated) {
                        kfree(ee);
@@ -1663,11 +1667,13 @@ static const char *error_msg(struct i915_gpu_coredump *error)
        for (gt = error->gt; gt; gt = gt->next) {
                struct intel_engine_coredump *cs;
 
-               if (gt->engine && !first)
-                       first = gt->engine;
-
-               for (cs = gt->engine; cs; cs = cs->next)
-                       engines |= cs->engine->mask;
+               for (cs = gt->engine; cs; cs = cs->next) {
+                       if (cs->hung) {
+                               engines |= cs->engine->mask;
+                               if (!first)
+                                       first = cs;
+                       }
+               }
        }
 
        len = scnprintf(error->error_msg, sizeof(error->error_msg),
@@ -1781,8 +1787,10 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt,
        kfree(compress);
 }
 
-struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
+struct i915_gpu_coredump *
+i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask)
 {
+       struct drm_i915_private *i915 = gt->i915;
        struct i915_gpu_coredump *error;
 
        /* Check if GPU capture has been disabled */
@@ -1794,7 +1802,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
        if (!error)
                return ERR_PTR(-ENOMEM);
 
-       error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
+       error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL);
        if (error->gt) {
                struct i915_vma_compress *compress;
 
@@ -1806,7 +1814,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
                }
 
                gt_record_info(error->gt);
-               gt_record_engines(error->gt, compress);
+               gt_record_engines(error->gt, engine_mask, compress);
 
                if (INTEL_INFO(i915)->has_gt_uc)
                        error->gt->uc = gt_record_uc(error->gt, compress);
@@ -1853,20 +1861,23 @@ void i915_error_state_store(struct i915_gpu_coredump *error)
 
 /**
  * i915_capture_error_state - capture an error record for later analysis
- * @i915: i915 device
+ * @gt: intel_gt which originated the hang
+ * @engine_mask: hung engines
+ *
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
-void i915_capture_error_state(struct drm_i915_private *i915)
+void i915_capture_error_state(struct intel_gt *gt,
+                             intel_engine_mask_t engine_mask)
 {
        struct i915_gpu_coredump *error;
 
-       error = i915_gpu_coredump(i915);
+       error = i915_gpu_coredump(gt, engine_mask);
        if (IS_ERR(error)) {
-               cmpxchg(&i915->gpu_error.first_error, NULL, error);
+               cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
                return;
        }
 
index 0220b09928080cc0b0345d5adc882c46a5b40484..16bc42de4b84091e2244528a75a743eb309bb3ec 100644 (file)
@@ -59,6 +59,7 @@ struct i915_request_coredump {
 struct intel_engine_coredump {
        const struct intel_engine_cs *engine;
 
+       bool hung;
        bool simulated;
        u32 reset_count;
 
@@ -218,8 +219,10 @@ struct drm_i915_error_state_buf {
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 
-struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
-void i915_capture_error_state(struct drm_i915_private *i915);
+struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
+                                           intel_engine_mask_t engine_mask);
+void i915_capture_error_state(struct intel_gt *gt,
+                             intel_engine_mask_t engine_mask);
 
 struct i915_gpu_coredump *
 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
@@ -271,7 +274,8 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err);
 
 #else
 
-static inline void i915_capture_error_state(struct drm_i915_private *i915)
+static inline void
+i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask)
 {
 }