drm/i915: Add reason for capture in error state
authorMika Kuoppala <mika.kuoppala@linux.intel.com>
Tue, 25 Feb 2014 15:11:26 +0000 (17:11 +0200)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Wed, 5 Mar 2014 20:30:26 +0000 (21:30 +0100)
We capture error state not only when the GPU hangs but also on
other situations as in interrupt errors and in situations where
we can kick things forward without GPU reset. There will be log
entry on most of these cases. But as error state capture might be
only thing we have, if dmesg was not captured. Or as in GEN4 case,
interrupt error can trigger error state capture without log entry,
the exact reason why capture was made is hard to decipher.

v2: Split out the the error code stuff to separate patch (Ben)

References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_irq.c

index aef779b..43f3074 100644 (file)
@@ -3190,9 +3190,8 @@ i915_wedged_set(void *data, u64 val)
 {
        struct drm_device *dev = data;
 
-       DRM_INFO("Manually setting wedged to %llu\n", val);
-       i915_handle_error(dev, val);
-
+       i915_handle_error(dev, val,
+                         "Manually setting wedged to %llu", val);
        return 0;
 }
 
index 7db735c..2b0da95 100644 (file)
@@ -2008,7 +2008,9 @@ extern void intel_console_resume(struct work_struct *work);
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-void i915_handle_error(struct drm_device *dev, bool wedged);
+__printf(3, 4)
+void i915_handle_error(struct drm_device *dev, bool wedged,
+                      const char *fmt, ...);
 
 void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
                                                        int new_delay);
@@ -2449,7 +2451,8 @@ static inline void i915_error_state_buf_release(
 {
        kfree(eb->buf);
 }
-void i915_capture_error_state(struct drm_device *dev);
+void i915_capture_error_state(struct drm_device *dev, bool wedge,
+                             const char *error_msg);
 void i915_error_state_get(struct drm_device *dev,
                          struct i915_error_state_file_priv *error_priv);
 void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
index 75dd056..7b2afa0 100644 (file)
@@ -1094,16 +1094,30 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
 }
 
 static void i915_error_capture_msg(struct drm_device *dev,
-                                  struct drm_i915_error_state *error)
+                                  struct drm_i915_error_state *error,
+                                  bool wedged,
+                                  const char *error_msg)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
        u32 ecode;
-       int ring_id = -1;
+       int ring_id = -1, len;
 
        ecode = i915_error_generate_code(dev_priv, error, &ring_id);
 
-       scnprintf(error->error_msg, sizeof(error->error_msg),
-                 "GPU HANG: ecode %d:0x%08x", ring_id, ecode);
+       len = scnprintf(error->error_msg, sizeof(error->error_msg),
+                       "GPU HANG: ecode %d:0x%08x", ring_id, ecode);
+
+       if (ring_id != -1 && error->ring[ring_id].pid != -1)
+               len += scnprintf(error->error_msg + len,
+                                sizeof(error->error_msg) - len,
+                                ", in %s [%d]",
+                                error->ring[ring_id].comm,
+                                error->ring[ring_id].pid);
+
+       scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
+                 ", reason: %s, action: %s",
+                 error_msg,
+                 wedged ? "reset" : "continue");
 }
 
 /**
@@ -1115,7 +1129,8 @@ static void i915_error_capture_msg(struct drm_device *dev,
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
-void i915_capture_error_state(struct drm_device *dev)
+void i915_capture_error_state(struct drm_device *dev, bool wedged,
+                             const char *error_msg)
 {
        static bool warned;
        struct drm_i915_private *dev_priv = dev->dev_private;
@@ -1141,7 +1156,7 @@ void i915_capture_error_state(struct drm_device *dev)
        error->overlay = intel_overlay_capture_error_state(dev);
        error->display = intel_display_capture_error_state(dev);
 
-       i915_error_capture_msg(dev, error);
+       i915_error_capture_msg(dev, error, wedged, error_msg);
        DRM_INFO("%s\n", error->error_msg);
 
        spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
index 484415b..3e8359e 100644 (file)
@@ -1297,8 +1297,8 @@ static void snb_gt_irq_handler(struct drm_device *dev,
        if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
                      GT_BSD_CS_ERROR_INTERRUPT |
                      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
-               DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
-               i915_handle_error(dev, false);
+               i915_handle_error(dev, false, "GT error interrupt 0x%08x",
+                                 gt_iir);
        }
 
        if (gt_iir & GT_PARITY_ERROR(dev))
@@ -1545,8 +1545,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
                        notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
 
                if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
-                       DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
-                       i915_handle_error(dev_priv->dev, false);
+                       i915_handle_error(dev_priv->dev, false,
+                                         "VEBOX CS error interrupt 0x%08x",
+                                         pm_iir);
                }
        }
 }
@@ -2278,11 +2279,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  * so userspace knows something bad happened (should trigger collection
  * of a ring dump etc.).
  */
-void i915_handle_error(struct drm_device *dev, bool wedged)
+void i915_handle_error(struct drm_device *dev, bool wedged,
+                      const char *fmt, ...)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
+       va_list args;
+       char error_msg[80];
 
-       i915_capture_error_state(dev);
+       va_start(args, fmt);
+       vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+       va_end(args);
+
+       i915_capture_error_state(dev, wedged, error_msg);
        i915_report_and_clear_eir(dev);
 
        if (wedged) {
@@ -2585,9 +2593,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
         */
        tmp = I915_READ_CTL(ring);
        if (tmp & RING_WAIT) {
-               DRM_ERROR("Kicking stuck wait on %s\n",
-                         ring->name);
-               i915_handle_error(dev, false);
+               i915_handle_error(dev, false,
+                                 "Kicking stuck wait on %s",
+                                 ring->name);
                I915_WRITE_CTL(ring, tmp);
                return HANGCHECK_KICK;
        }
@@ -2597,9 +2605,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
                default:
                        return HANGCHECK_HUNG;
                case 1:
-                       DRM_ERROR("Kicking stuck semaphore on %s\n",
-                                 ring->name);
-                       i915_handle_error(dev, false);
+                       i915_handle_error(dev, false,
+                                         "Kicking stuck semaphore on %s",
+                                         ring->name);
                        I915_WRITE_CTL(ring, tmp);
                        return HANGCHECK_KICK;
                case 0:
@@ -2721,7 +2729,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
        }
 
        if (rings_hung)
-               return i915_handle_error(dev, true);
+               return i915_handle_error(dev, true, "Ring hung");
 
        if (busy_count)
                /* Reset timer case chip hangs without another request
@@ -3338,7 +3346,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
                 */
                spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
                if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-                       i915_handle_error(dev, false);
+                       i915_handle_error(dev, false,
+                                         "Command parser error, iir 0x%08x",
+                                         iir);
 
                for_each_pipe(pipe) {
                        int reg = PIPESTAT(pipe);
@@ -3520,7 +3530,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
                 */
                spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
                if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-                       i915_handle_error(dev, false);
+                       i915_handle_error(dev, false,
+                                         "Command parser error, iir 0x%08x",
+                                         iir);
 
                for_each_pipe(pipe) {
                        int reg = PIPESTAT(pipe);
@@ -3757,7 +3769,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
                 */
                spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
                if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-                       i915_handle_error(dev, false);
+                       i915_handle_error(dev, false,
+                                         "Command parser error, iir 0x%08x",
+                                         iir);
 
                for_each_pipe(pipe) {
                        int reg = PIPESTAT(pipe);