drm/i915: Split up hangcheck phases
author Mika Kuoppala <mika.kuoppala@linux.intel.com>
Wed, 16 Nov 2016 15:20:29 +0000 (17:20 +0200)
committer Mika Kuoppala <mika.kuoppala@intel.com>
Mon, 21 Nov 2016 12:36:40 +0000 (14:36 +0200)
In order to simplify hangcheck state keeping, split the per-engine
hangcheck loop into three phases: state load, action, state save.

Add a few more hangcheck actions to distinguish between seqno, head
and subunit movements. This helps to gather all the hangcheck
actions under a single switch umbrella.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/intel_hangcheck.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index ae84aa4..4bcf1a0 100644 (file)
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
                return "idle";
        case HANGCHECK_WAIT:
                return "wait";
-       case HANGCHECK_ACTIVE:
-               return "active";
+       case HANGCHECK_ACTIVE_SEQNO:
+               return "active seqno";
+       case HANGCHECK_ACTIVE_HEAD:
+               return "active head";
+       case HANGCHECK_ACTIVE_SUBUNITS:
+               return "active subunits";
        case HANGCHECK_KICK:
                return "kick";
        case HANGCHECK_HUNG:
index 53df5b1..3d2e81c 100644 (file)
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
                memset(&engine->hangcheck.instdone, 0,
                       sizeof(engine->hangcheck.instdone));
 
-               return HANGCHECK_ACTIVE;
+               return HANGCHECK_ACTIVE_HEAD;
        }
 
        if (!subunits_stuck(engine))
-               return HANGCHECK_ACTIVE;
+               return HANGCHECK_ACTIVE_SUBUNITS;
 
        return HANGCHECK_HUNG;
 }
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
        return HANGCHECK_HUNG;
 }
 
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+                                 struct intel_engine_hangcheck *hc)
+{
+       /* We don't strictly need an irq-barrier here, as we are not
+        * serving an interrupt request, but be paranoid in case the
+        * barrier has side-effects (such as preventing a broken
+        * cacheline snoop) and so be sure that we can see the seqno
+        * advance. If the seqno should stick, due to a stale
+        * cacheline, we would erroneously declare the GPU hung.
+        */
+       if (engine->irq_seqno_barrier)
+               engine->irq_seqno_barrier(engine);
+
+       hc->acthd = intel_engine_get_active_head(engine);
+       hc->seqno = intel_engine_get_seqno(engine);
+       hc->score = engine->hangcheck.score; /* carry over running score */
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+                                  const struct intel_engine_hangcheck *hc)
+{
+       engine->hangcheck.acthd = hc->acthd; /* persist sample for next pass */
+       engine->hangcheck.seqno = hc->seqno;
+       engine->hangcheck.score = hc->score;
+       engine->hangcheck.action = hc->action; /* verdict for this sample */
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+                    const struct intel_engine_hangcheck *hc)
+{
+       if (engine->hangcheck.seqno != hc->seqno) /* moved since last sample */
+               return HANGCHECK_ACTIVE_SEQNO;
+
+       if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+               return HANGCHECK_IDLE; /* seqno has passed the last submit */
+
+       return engine_stuck(engine, hc->acthd); /* same seqno, work pending */
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+                                       struct intel_engine_hangcheck *hc)
+{
+       hc->action = hangcheck_get_action(engine, hc);
+
+       switch (hc->action) {
+       case HANGCHECK_IDLE:
+       case HANGCHECK_WAIT:
+               break;
+
+       case HANGCHECK_ACTIVE_HEAD:
+       case HANGCHECK_ACTIVE_SUBUNITS:
+               /* We always increment the hangcheck score
+                * if the engine is busy and still processing
+                * the same request, so that no single request
+                * can run indefinitely (such as a chain of
+                * batches). The only time we do not increment
+                * the hangcheck score on this ring is if this
+                * engine is in a legitimate wait for another
+                * engine. In that case the waiting engine is a
+                * victim and we want to be sure we catch the
+                * right culprit. Then every time we do kick
+                * the ring, add a small increment to the
+                * score so that we can catch a batch that is
+                * being repeatedly kicked and so responsible
+                * for stalling the machine.
+                */
+               hc->score += 1; /* busy on the same seqno */
+               break;
+
+       case HANGCHECK_KICK:
+               hc->score += 5; /* engine had to be kicked */
+               break;
+
+       case HANGCHECK_HUNG:
+               hc->score += 20; /* engine reported as hung */
+               break;
+
+       case HANGCHECK_ACTIVE_SEQNO:
+               /* Gradually reduce the count so that we catch DoS
+                * attempts across multiple batches.
+                */
+               if (hc->score > 0)
+                       hc->score -= 15; /* decay on seqno progress */
+               if (hc->score < 0)
+                       hc->score = 0; /* clamp, never go negative */
+
+               /* Clear head and subunit states on seqno movement */
+               hc->acthd = 0;
+
+               memset(&engine->hangcheck.instdone, 0,
+                      sizeof(engine->hangcheck.instdone));
+               break;
+
+       default:
+               MISSING_CASE(hc->action);
+       }
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+                                  unsigned int hung,
+                                  unsigned int stuck)
+{
+       struct intel_engine_cs *engine;
+       char msg[80];
+       unsigned int tmp;
+       int len;
+
+       /* If some rings hung but others were still busy, only
+        * blame the hanging rings in the synopsis.
+        */
+       if (stuck != hung)
+               hung &= ~stuck;
+       len = scnprintf(msg, sizeof(msg),
+                       "%s on ", stuck == hung ? "No progress" : "Hang");
+       for_each_engine_masked(engine, i915, hung, tmp)
+               len += scnprintf(msg + len, sizeof(msg) - len,
+                                "%s, ", engine->name);
+       msg[len-2] = '\0'; /* drop the trailing ", " separator */
+
+       return i915_handle_error(i915, hung, msg);
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
        enum intel_engine_id id;
        unsigned int hung = 0, stuck = 0;
        int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
 
        if (!i915.enable_hangcheck)
                return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
        intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
        for_each_engine(engine, dev_priv, id) {
-               bool busy = intel_engine_has_waiter(engine);
-               u64 acthd;
-               u32 seqno;
-               u32 submit;
+               struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+               const bool busy = intel_engine_has_waiter(engine);
 
                semaphore_clear_deadlocks(dev_priv);
 
-               /* We don't strictly need an irq-barrier here, as we are not
-                * serving an interrupt request, be paranoid in case the
-                * barrier has side-effects (such as preventing a broken
-                * cacheline snoop) and so be sure that we can see the seqno
-                * advance. If the seqno should stick, due to a stale
-                * cacheline, we would erroneously declare the GPU hung.
-                */
-               if (engine->irq_seqno_barrier)
-                       engine->irq_seqno_barrier(engine);
-
-               acthd = intel_engine_get_active_head(engine);
-               seqno = intel_engine_get_seqno(engine);
-               submit = intel_engine_last_submit(engine);
-
-               if (engine->hangcheck.seqno == seqno) {
-                       if (i915_seqno_passed(seqno, submit)) {
-                               engine->hangcheck.action = HANGCHECK_IDLE;
-                       } else {
-                               /* We always increment the hangcheck score
-                                * if the engine is busy and still processing
-                                * the same request, so that no single request
-                                * can run indefinitely (such as a chain of
-                                * batches). The only time we do not increment
-                                * the hangcheck score on this ring, if this
-                                * engine is in a legitimate wait for another
-                                * engine. In that case the waiting engine is a
-                                * victim and we want to be sure we catch the
-                                * right culprit. Then every time we do kick
-                                * the ring, add a small increment to the
-                                * score so that we can catch a batch that is
-                                * being repeatedly kicked and so responsible
-                                * for stalling the machine.
-                                */
-                               engine->hangcheck.action =
-                                       engine_stuck(engine, acthd);
-
-                               switch (engine->hangcheck.action) {
-                               case HANGCHECK_IDLE:
-                               case HANGCHECK_WAIT:
-                                       break;
-                               case HANGCHECK_ACTIVE:
-                                       engine->hangcheck.score += BUSY;
-                                       break;
-                               case HANGCHECK_KICK:
-                                       engine->hangcheck.score += KICK;
-                                       break;
-                               case HANGCHECK_HUNG:
-                                       engine->hangcheck.score += HUNG;
-                                       break;
-                               }
-                       }
-
-                       if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-                               hung |= intel_engine_flag(engine);
-                               if (engine->hangcheck.action != HANGCHECK_HUNG)
-                                       stuck |= intel_engine_flag(engine);
-                       }
-               } else {
-                       engine->hangcheck.action = HANGCHECK_ACTIVE;
-
-                       /* Gradually reduce the count so that we catch DoS
-                        * attempts across multiple batches.
-                        */
-                       if (engine->hangcheck.score > 0)
-                               engine->hangcheck.score -= ACTIVE_DECAY;
-                       if (engine->hangcheck.score < 0)
-                               engine->hangcheck.score = 0;
-
-                       /* Clear head and subunit states on seqno movement */
-                       acthd = 0;
-
-                       memset(&engine->hangcheck.instdone, 0,
-                              sizeof(engine->hangcheck.instdone));
+               hangcheck_load_sample(engine, hc);
+               hangcheck_accumulate_sample(engine, hc);
+               hangcheck_store_sample(engine, hc);
+
+               if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+                       hung |= intel_engine_flag(engine);
+                       if (hc->action != HANGCHECK_HUNG)
+                               stuck |= intel_engine_flag(engine);
                }
 
-               engine->hangcheck.seqno = seqno;
-               engine->hangcheck.acthd = acthd;
                busy_count += busy;
        }
 
-       if (hung) {
-               char msg[80];
-               unsigned int tmp;
-               int len;
-
-               /* If some rings hung but others were still busy, only
-                * blame the hanging rings in the synopsis.
-                */
-               if (stuck != hung)
-                       hung &= ~stuck;
-               len = scnprintf(msg, sizeof(msg),
-                               "%s on ", stuck == hung ? "No progress" : "Hang");
-               for_each_engine_masked(engine, dev_priv, hung, tmp)
-                       len += scnprintf(msg + len, sizeof(msg) - len,
-                                        "%s, ", engine->name);
-               msg[len-2] = '\0';
-
-               return i915_handle_error(dev_priv, hung, msg);
-       }
+       if (hung)
+               hangcheck_declare_hang(dev_priv, hung, stuck);
 
        /* Reset timer in case GPU hangs without another request being added */
        if (busy_count)
index 3466b4e..3152b2b 100644 (file)
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
        HANGCHECK_IDLE = 0,
        HANGCHECK_WAIT,
-       HANGCHECK_ACTIVE,
+       HANGCHECK_ACTIVE_SEQNO,
+       HANGCHECK_ACTIVE_HEAD,
+       HANGCHECK_ACTIVE_SUBUNITS,
        HANGCHECK_KICK,
        HANGCHECK_HUNG,
 };