drm/i915: fix wait_for_pending_flips vs gpu hang deadlock

author Daniel Vetter <daniel.vetter@ffwll.ch>

Sun, 8 Sep 2013 19:57:13 +0000 (21:57 +0200)

committer Daniel Vetter <daniel.vetter@ffwll.ch>

Mon, 9 Sep 2013 09:26:03 +0000 (11:26 +0200)
author Daniel Vetter <daniel.vetter@ffwll.ch>
Sun, 8 Sep 2013 19:57:13 +0000 (21:57 +0200)
committer Daniel Vetter <daniel.vetter@ffwll.ch>
Mon, 9 Sep 2013 09:26:03 +0000 (11:26 +0200)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c

index 83cce0c..4b91228 100644 (file)
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1469,6 +1469,34 @@ static irqreturn_t ironlake_irq_handler(int irq, void *arg)
         return ret;
  }
  
+static void i915_error_wake_up(struct drm_i915_private *dev_priv,
+                              bool reset_completed)
+{
+       struct intel_ring_buffer *ring;
+       int i;
+
+       /*
+        * Notify all waiters for GPU completion events that the reset state
+        * has changed, and that they need to restart their wait after
+        * checking for potential errors (and bail out to drop locks if there is
+        * a GPU reset pending so that i915_error_work_func can acquire them).
+        */
+
+       /* Wake up __wait_seqno waiters, which may hold dev->struct_mutex. */
+       for_each_ring(ring, dev_priv, i)
+               wake_up_all(&ring->irq_queue);
+
+       /* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
+       wake_up_all(&dev_priv->pending_flip_queue);
+
+       /*
+        * Only when the caller passes reset_completed: signal tasks blocked
+        * in i915_gem_wait_for_error that the pending reset state is cleared.
+        */
+       if (reset_completed)
+               wake_up_all(&dev_priv->gpu_error.reset_queue);
+}
+
  /**
   * i915_error_work_func - do process context error handling work
   * @work: work struct
@@ -1483,11 +1511,10 @@ static void i915_error_work_func(struct work_struct *work)
         drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
                                                     gpu_error);
         struct drm_device *dev = dev_priv->dev;
-       struct intel_ring_buffer *ring;
         char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
         char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
         char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
-       int i, ret;
+       int ret;
  
         kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
  
@@ -1506,8 +1533,16 @@ static void i915_error_work_func(struct work_struct *work)
                 kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE,
                                    reset_event);
  
+               /*
+                * All state reset _must_ be completed before we update the
+                * reset counter, for otherwise waiters might miss the reset
+                * pending state and not properly drop locks, resulting in
+                * deadlocks with the reset work.
+                */
                 ret = i915_reset(dev);
  
+               intel_display_handle_reset(dev);
+
                 if (ret == 0) {
                         /*
                          * After all the gem state is reset, increment the reset
@@ -1528,12 +1563,11 @@ static void i915_error_work_func(struct work_struct *work)
                         atomic_set(&error->reset_counter, I915_WEDGED);
                 }
  
-               for_each_ring(ring, dev_priv, i)
-                       wake_up_all(&ring->irq_queue);
-
-               intel_display_handle_reset(dev);
-
-               wake_up_all(&dev_priv->gpu_error.reset_queue);
+               /*
+                * Note: The wake_up also serves as a memory barrier so that
+                * waiters see the updated value of the reset counter atomic_t.
+                */
+               i915_error_wake_up(dev_priv, true);
         }
  }
  
@@ -1642,8 +1676,6 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  void i915_handle_error(struct drm_device *dev, bool wedged)
  {
         struct drm_i915_private *dev_priv = dev->dev_private;
-       struct intel_ring_buffer *ring;
-       int i;
  
         i915_capture_error_state(dev);
         i915_report_and_clear_eir(dev);
@@ -1653,11 +1685,19 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
                                 &dev_priv->gpu_error.reset_counter);
  
                 /*
-                * Wakeup waiting processes so that the reset work item
-                * doesn't deadlock trying to grab various locks.
+                * Wakeup waiting processes so that the reset work function
+                * i915_error_work_func doesn't deadlock trying to grab various
+                * locks. By bumping the reset counter first, the woken
+                * processes will see a reset in progress and back off,
+                * releasing their locks and then wait for the reset completion.
+                * We must do this for _all_ gpu waiters that might hold locks
+                * that the reset work needs to acquire.
+                *
+                * Note: The wake_up serves as the required memory barrier to
+                * ensure that the waiters see the updated value of the reset
+                * counter atomic_t.
                  */
-               for_each_ring(ring, dev_priv, i)
-                       wake_up_all(&ring->irq_queue);
+               i915_error_wake_up(dev_priv, false);
         }
  
         /*
author	Daniel Vetter <daniel.vetter@ffwll.ch>
	Sun, 8 Sep 2013 19:57:13 +0000 (21:57 +0200)
committer	Daniel Vetter <daniel.vetter@ffwll.ch>
	Mon, 9 Sep 2013 09:26:03 +0000 (11:26 +0200)