drm/i915: Record more information about the hanging contexts
author Chris Wilson <chris@chris-wilson.co.uk>
Sun, 29 Jan 2017 09:24:33 +0000 (09:24 +0000)
committer Chris Wilson <chris@chris-wilson.co.uk>
Tue, 31 Jan 2017 09:50:03 +0000 (09:50 +0000)
Include extra information such as the user_handle and hw_id so that
userspace can identify which of their contexts hung, useful if they are
performing self-diagnostics.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170129092433.10483-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gpu_error.c

index 457129b..448e5d8 100644 (file)
@@ -969,6 +969,16 @@ struct drm_i915_error_state {
                u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
                struct intel_instdone instdone;
 
+               struct drm_i915_error_context {
+                       char comm[TASK_COMM_LEN];
+                       pid_t pid;
+                       u32 handle;
+                       u32 hw_id;
+                       int ban_score;
+                       int active;
+                       int guilty;
+               } context;
+
                struct drm_i915_error_object {
                        u64 gtt_offset;
                        u64 gtt_size;
@@ -1002,10 +1012,6 @@ struct drm_i915_error_state {
                                u32 pp_dir_base;
                        };
                } vm_info;
-
-               pid_t pid;
-               char comm[TASK_COMM_LEN];
-               int context_bans;
        } engine[I915_NUM_ENGINES];
 
        struct drm_i915_error_buffer {
index e537532..5283fe8 100644 (file)
@@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
                   erq->head, erq->tail);
 }
 
+static void error_print_context(struct drm_i915_error_state_buf *m,
+                               const char *header,
+                               struct drm_i915_error_context *ctx)
+{
+       err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
+                  header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
+                  ctx->ban_score, ctx->guilty, ctx->active);
+}
+
 static void error_print_engine(struct drm_i915_error_state_buf *m,
                               struct drm_i915_error_engine *ee)
 {
@@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 
        error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
        error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
+       error_print_context(m, "  Active context: ", &ee->context);
 }
 
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
@@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 
        for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
                if (error->engine[i].hangcheck_stalled &&
-                   error->engine[i].pid != -1) {
-                       err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n",
+                   error->engine[i].context.pid) {
+                       err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
                                   engine_str(i),
-                                  error->engine[i].comm,
-                                  error->engine[i].pid,
-                                  error->engine[i].context_bans);
+                                  error->engine[i].context.comm,
+                                  error->engine[i].context.pid,
+                                  error->engine[i].context.ban_score);
                }
        }
        err_printf(m, "Reset count: %u\n", error->reset_count);
@@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
                obj = ee->batchbuffer;
                if (obj) {
                        err_puts(m, dev_priv->engine[i]->name);
-                       if (ee->pid != -1)
-                               err_printf(m, " (submitted by %s [%d], bans %d)",
-                                          ee->comm,
-                                          ee->pid,
-                                          ee->context_bans);
+                       if (ee->context.pid)
+                               err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
+                                          ee->context.comm,
+                                          ee->context.pid,
+                                          ee->context.handle,
+                                          ee->context.hw_id,
+                                          ee->context.ban_score);
                        err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                   upper_32_bits(obj->gtt_offset),
                                   lower_32_bits(obj->gtt_offset));
@@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
                                       &ee->execlist[n]);
 }
 
+static void record_context(struct drm_i915_error_context *e,
+                          struct i915_gem_context *ctx)
+{
+       if (ctx->pid) {
+               struct task_struct *task;
+
+               rcu_read_lock();
+               task = pid_task(ctx->pid, PIDTYPE_PID);
+               if (task) {
+                       strcpy(e->comm, task->comm);
+                       e->pid = task->pid;
+               }
+               rcu_read_unlock();
+       }
+
+       e->handle = ctx->user_handle;
+       e->hw_id = ctx->hw_id;
+       e->ban_score = ctx->ban_score;
+       e->guilty = ctx->guilty_count;
+       e->active = ctx->active_count;
+}
+
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
                                  struct drm_i915_error_state *error)
 {
@@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
                struct drm_i915_error_engine *ee = &error->engine[i];
                struct drm_i915_gem_request *request;
 
-               ee->pid = -1;
                ee->engine_id = -1;
 
                if (!engine)
@@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
                request = i915_gem_find_active_request(engine);
                if (request) {
                        struct intel_ring *ring;
-                       struct pid *pid;
 
                        ee->vm = request->ctx->ppgtt ?
                                &request->ctx->ppgtt->base : &ggtt->base;
 
+                       record_context(&ee->context, request->ctx);
+
                        /* We need to copy these to an anonymous buffer
                         * as the simplest method to avoid being overwritten
                         * by userspace.
@@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
                                i915_error_object_create(dev_priv,
                                                         request->ctx->engine[i].state);
 
-                       pid = request->ctx->pid;
-                       if (pid) {
-                               struct task_struct *task;
-
-                               rcu_read_lock();
-                               task = pid_task(pid, PIDTYPE_PID);
-                               if (task) {
-                                       strcpy(ee->comm, task->comm);
-                                       ee->pid = task->pid;
-                               }
-                               rcu_read_unlock();
-                       }
-
                        error->simulated |=
                                i915_gem_context_no_error_capture(request->ctx);
 
@@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
                        "GPU HANG: ecode %d:%d:0x%08x",
                        INTEL_GEN(dev_priv), engine_id, ecode);
 
-       if (engine_id != -1 && error->engine[engine_id].pid != -1)
+       if (engine_id != -1 && error->engine[engine_id].context.pid)
                len += scnprintf(error->error_msg + len,
                                 sizeof(error->error_msg) - len,
                                 ", in %s [%d]",
-                                error->engine[engine_id].comm,
-                                error->engine[engine_id].pid);
+                                error->engine[engine_id].context.comm,
+                                error->engine[engine_id].context.pid);
 
        scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
                  ", reason: %s, action: %s",