drm/i915: Record pid/comm of hanging task
authorChris Wilson <chris@chris-wilson.co.uk>
Tue, 25 Feb 2014 15:11:24 +0000 (17:11 +0200)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Wed, 5 Mar 2014 20:30:24 +0000 (21:30 +0100)
After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.

This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.

v2: Also capture the workaround buffer for gen2, so that we can compare
    its contents against the intended batch for the active request.

v3: Rebase (Mika)
v4: Check for null context (Chris)
    checkpatch warnings fixed

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v2)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> (v4)
Acked-by: Ben Widawsky <ben@bwidawsk.net>
Cc: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gpu_error.c

index fe4427b..826fcae 100644 (file)
@@ -360,7 +360,7 @@ struct drm_i915_error_state {
                        int page_count;
                        u32 gtt_offset;
                        u32 *pages[0];
-               } *ringbuffer, *batchbuffer, *ctx, *hws_page;
+               } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
                struct drm_i915_error_request {
                        long jiffies;
@@ -375,6 +375,9 @@ struct drm_i915_error_state {
                                u32 pp_dir_base;
                        };
                } vm_info;
+
+               pid_t pid;
+               char comm[TASK_COMM_LEN];
        } ring[I915_NUM_RINGS];
        struct drm_i915_error_buffer {
                u32 size;
@@ -1797,6 +1800,7 @@ struct drm_i915_gem_request {
 
 struct drm_i915_file_private {
        struct drm_i915_private *dev_priv;
+       struct drm_file *file;
 
        struct {
                spinlock_t lock;
index c5a182b..6e17b45 100644 (file)
@@ -4857,6 +4857,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
 
        file->driver_priv = file_priv;
        file_priv->dev_priv = dev->dev_private;
+       file_priv->file = file;
 
        spin_lock_init(&file_priv->mm.lock);
        INIT_LIST_HEAD(&file_priv->mm.request_list);
index eed1b34..8b02498 100644 (file)
@@ -301,13 +301,28 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
        va_end(args);
 }
 
+static void print_error_obj(struct drm_i915_error_state_buf *m,
+                           struct drm_i915_error_object *obj)
+{
+       int page, offset, elt;
+
+       for (page = offset = 0; page < obj->page_count; page++) {
+               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+                       err_printf(m, "%08x :  %08x\n", offset,
+                                  obj->pages[page][elt]);
+                       offset += 4;
+               }
+       }
+}
+
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
                            const struct i915_error_state_file_priv *error_priv)
 {
        struct drm_device *dev = error_priv->dev;
        drm_i915_private_t *dev_priv = dev->dev_private;
        struct drm_i915_error_state *error = error_priv->error;
-       int i, j, page, offset, elt;
+       int i, j, offset, elt;
+       int max_hangcheck_score;
 
        if (!error) {
                err_printf(m, "no error state collected\n");
@@ -317,6 +332,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
        err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
                   error->time.tv_usec);
        err_printf(m, "Kernel: " UTS_RELEASE "\n");
+       max_hangcheck_score = 0;
+       for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+               if (error->ring[i].hangcheck_score > max_hangcheck_score)
+                       max_hangcheck_score = error->ring[i].hangcheck_score;
+       }
+       for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+               if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+                   error->ring[i].pid != -1) {
+                       err_printf(m, "Active process (on ring %s): %s [%d]\n",
+                                  ring_str(i),
+                                  error->ring[i].comm,
+                                  error->ring[i].pid);
+               }
+       }
        err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -359,18 +388,23 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
        for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
                struct drm_i915_error_object *obj;
 
-               if ((obj = error->ring[i].batchbuffer)) {
-                       err_printf(m, "%s --- gtt_offset = 0x%08x\n",
-                                  dev_priv->ring[i].name,
+               obj = error->ring[i].batchbuffer;
+               if (obj) {
+                       err_puts(m, dev_priv->ring[i].name);
+                       if (error->ring[i].pid != -1)
+                               err_printf(m, " (submitted by %s [%d])",
+                                          error->ring[i].comm,
+                                          error->ring[i].pid);
+                       err_printf(m, " --- gtt_offset = 0x%08x\n",
                                   obj->gtt_offset);
-                       offset = 0;
-                       for (page = 0; page < obj->page_count; page++) {
-                               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-                                       err_printf(m, "%08x :  %08x\n", offset,
-                                                  obj->pages[page][elt]);
-                                       offset += 4;
-                               }
-                       }
+                       print_error_obj(m, obj);
+               }
+
+               obj = error->ring[i].wa_batchbuffer;
+               if (obj) {
+                       err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
+                                  dev_priv->ring[i].name, obj->gtt_offset);
+                       print_error_obj(m, obj);
                }
 
                if (error->ring[i].num_requests) {
@@ -389,15 +423,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
                        err_printf(m, "%s --- ringbuffer = 0x%08x\n",
                                   dev_priv->ring[i].name,
                                   obj->gtt_offset);
-                       offset = 0;
-                       for (page = 0; page < obj->page_count; page++) {
-                               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-                                       err_printf(m, "%08x :  %08x\n",
-                                                  offset,
-                                                  obj->pages[page][elt]);
-                                       offset += 4;
-                               }
-                       }
+                       print_error_obj(m, obj);
                }
 
                if ((obj = error->ring[i].hws_page)) {
@@ -713,39 +739,6 @@ static void i915_gem_record_fences(struct drm_device *dev,
        }
 }
 
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
-                            struct intel_ring_buffer *ring)
-{
-       struct drm_i915_gem_request *request;
-
-       if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
-               struct drm_i915_gem_object *obj;
-               u32 acthd = I915_READ(ACTHD);
-
-               if (WARN_ON(ring->id != RCS))
-                       return NULL;
-
-               obj = ring->scratch.obj;
-               if (obj != NULL &&
-                   acthd >= i915_gem_obj_ggtt_offset(obj) &&
-                   acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
-                       return i915_error_ggtt_object_create(dev_priv, obj);
-       }
-
-       request = i915_gem_find_active_request(ring);
-       if (request == NULL)
-               return NULL;
-
-       /* We need to copy these to an anonymous buffer as the simplest
-        * method to avoid being overwritten by userspace.
-        */
-       return i915_error_object_create(dev_priv, request->batch_obj,
-                                       request->ctx ?
-                                       request->ctx->vm :
-                                       &dev_priv->gtt.base);
-}
-
 static void i915_record_ring_state(struct drm_device *dev,
                                   struct intel_ring_buffer *ring,
                                   struct drm_i915_error_ring *ering)
@@ -894,8 +887,39 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
                i915_record_ring_state(dev, ring, &error->ring[i]);
 
-               error->ring[i].batchbuffer =
-                       i915_error_first_batchbuffer(dev_priv, ring);
+               error->ring[i].pid = -1;
+               request = i915_gem_find_active_request(ring);
+               if (request) {
+                       /* We need to copy these to an anonymous buffer
+                        * as the simplest method to avoid being overwritten
+                        * by userspace.
+                        */
+                       error->ring[i].batchbuffer =
+                               i915_error_object_create(dev_priv,
+                                                        request->batch_obj,
+                                                        request->ctx ?
+                                                        request->ctx->vm :
+                                                        &dev_priv->gtt.base);
+
+                       if (HAS_BROKEN_CS_TLB(dev_priv->dev) &&
+                           ring->scratch.obj)
+                               error->ring[i].wa_batchbuffer =
+                                       i915_error_ggtt_object_create(dev_priv,
+                                                            ring->scratch.obj);
+
+                       if (request->file_priv) {
+                               struct task_struct *task;
+
+                               rcu_read_lock();
+                               task = pid_task(request->file_priv->file->pid,
+                                               PIDTYPE_PID);
+                               if (task) {
+                                       strcpy(error->ring[i].comm, task->comm);
+                                       error->ring[i].pid = task->pid;
+                               }
+                               rcu_read_unlock();
+                       }
+               }
 
                error->ring[i].ringbuffer =
                        i915_error_ggtt_object_create(dev_priv, ring->obj);