drm/msm: Hangcheck progress detection

author Rob Clark <robdclark@chromium.org>

Mon, 14 Nov 2022 19:30:41 +0000 (11:30 -0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 17 May 2023 09:53:53 +0000 (11:53 +0200)
author Rob Clark <robdclark@chromium.org>
Mon, 14 Nov 2022 19:30:41 +0000 (11:30 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 17 May 2023 09:53:53 +0000 (11:53 +0200)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c

index dc53466..95e73ed 100644 (file)
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1850,6 +1850,39 @@ static uint32_t a6xx_get_rptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
         return ring->memptrs->rptr = gpu_read(gpu, REG_A6XX_CP_RB_RPTR);
  }
  
+static bool a6xx_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+       struct msm_cp_state cp_state = {
+               .ib1_base = gpu_read64(gpu, REG_A6XX_CP_IB1_BASE),
+               .ib2_base = gpu_read64(gpu, REG_A6XX_CP_IB2_BASE),
+               .ib1_rem  = gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
+               .ib2_rem  = gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE),
+       };
+       bool progress;
+
+       /*
+        * Adjust the remaining data to account for what has already been
+        * fetched from memory, but not yet consumed by the SQE.
+        *
+        * This is not *technically* correct, the amount buffered could
+        * exceed the IB size due to hw prefetching ahead, but:
+        *
+        * (1) We aren't trying to find the exact position, just whether
+        *     progress has been made
+        * (2) The CP_REG_TO_MEM at the end of a submit should be enough
+        *     to prevent prefetching into an unrelated submit.  (And
+        *     either way, at some point the ROQ will be full.)
+        */
+       cp_state.ib1_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB1_STAT) >> 16;
+       cp_state.ib2_rem += gpu_read(gpu, REG_A6XX_CP_CSQ_IB2_STAT) >> 16;
+
+       progress = !!memcmp(&cp_state, &ring->last_cp_state, sizeof(cp_state));
+
+       ring->last_cp_state = cp_state;
+
+       return progress;
+}
+
  static u32 a618_get_speed_bin(u32 fuse)
  {
         if (fuse == 0)
@@ -1966,6 +1999,7 @@ static const struct adreno_gpu_funcs funcs = {
                 .create_address_space = a6xx_create_address_space,
                 .create_private_address_space = a6xx_create_private_address_space,
                 .get_rptr = a6xx_get_rptr,
+               .progress = a6xx_progress,
         },
         .get_timestamp = a6xx_get_timestamp,
  };
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c

index 69bc2e5..5eeb465 100644 (file)
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -433,7 +433,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
         priv->dev = ddev;
  
         priv->wq = alloc_ordered_workqueue("msm", 0);
-       priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
  
         INIT_LIST_HEAD(&priv->objects);
         mutex_init(&priv->obj_lock);
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h

index b2ea262..d4e0ef6 100644 (file)
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -224,7 +224,13 @@ struct msm_drm_private {
  
         struct drm_atomic_state *pm_state;
  
-       /* For hang detection, in ms */
+       /**
+        * hangcheck_period: For hang detection, in ms
+        *
+        * Note that in practice, a submit/job will get at least two hangcheck
+        * periods, due to checking for progress being implemented as simply
+        * "have the CP position registers changed since last time?"
+        */
         unsigned int hangcheck_period;
  
         /**
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c

index 4f495ee..3802495 100644 (file)
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -494,6 +494,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
                         round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
  }
  
+static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+       if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
+               return false;
+
+       if (!gpu->funcs->progress)
+               return false;
+
+       if (!gpu->funcs->progress(gpu, ring))
+               return false;
+
+       ring->hangcheck_progress_retries++;
+       return true;
+}
+
  static void hangcheck_handler(struct timer_list *t)
  {
         struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
@@ -504,9 +519,12 @@ static void hangcheck_handler(struct timer_list *t)
         if (fence != ring->hangcheck_fence) {
                 /* some progress has been made.. ya! */
                 ring->hangcheck_fence = fence;
-       } else if (fence_before(fence, ring->fctx->last_fence)) {
+               ring->hangcheck_progress_retries = 0;
+       } else if (fence_before(fence, ring->fctx->last_fence) &&
+                       !made_progress(gpu, ring)) {
                 /* no progress and not done.. hung! */
                 ring->hangcheck_fence = fence;
+               ring->hangcheck_progress_retries = 0;
                 DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
                                 gpu->name, ring->id);
                 DRM_DEV_ERROR(dev->dev, "%s:     completed fence: %u\n",
@@ -832,6 +850,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
                 struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
                 const char *name, struct msm_gpu_config *config)
  {
+       struct msm_drm_private *priv = drm->dev_private;
         int i, ret, nr_rings = config->nr_rings;
         void *memptrs;
         uint64_t memptrs_iova;
@@ -859,6 +878,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
         kthread_init_work(&gpu->recover_work, recover_worker);
         kthread_init_work(&gpu->fault_work, fault_worker);
  
+       priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
+
+       /*
+        * If progress detection is supported, halve the hangcheck timer
+        * duration, as it takes two iterations of the hangcheck handler
+        * to detect a hang.
+        */
+       if (funcs->progress)
+               priv->hangcheck_period /= 2;
+
         timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
  
         spin_lock_init(&gpu->perf_lock);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h

index 7a36e07..732295e 100644 (file)
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -78,6 +78,15 @@ struct msm_gpu_funcs {
         struct msm_gem_address_space *(*create_private_address_space)
                 (struct msm_gpu *gpu);
         uint32_t (*get_rptr)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
+
+       /**
+        * progress: Has the GPU made progress?
+        *
+        * Return true if GPU position in cmdstream has advanced (or changed)
+        * since the last call.  To avoid false negatives, this should account
+        * for cmdstream that is buffered in this FIFO upstream of the CP fw.
+        */
+       bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
  };
  
  /* Additional state for iommu faults: */
@@ -237,6 +246,7 @@ struct msm_gpu {
  #define DRM_MSM_INACTIVE_PERIOD   66 /* in ms (roughly four frames) */
  
  #define DRM_MSM_HANGCHECK_DEFAULT_PERIOD 500 /* in ms */
+#define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3
         struct timer_list hangcheck_timer;
  
         /* Fault info for most recent iova fault: */
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h

index 2a5045a..698b333 100644 (file)
--- a/drivers/gpu/drm/msm/msm_ringbuffer.h
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
@@ -35,6 +35,11 @@ struct msm_rbmemptrs {
         volatile u64 ttbr0;
  };
  
+struct msm_cp_state {
+       uint64_t ib1_base, ib2_base;
+       uint32_t ib1_rem, ib2_rem;
+};
+
  struct msm_ringbuffer {
         struct msm_gpu *gpu;
         int id;
@@ -64,6 +69,29 @@ struct msm_ringbuffer {
         uint64_t memptrs_iova;
         struct msm_fence_context *fctx;
  
+       /**
+        * hangcheck_progress_retries:
+        *
+        * The number of extra hangcheck duration cycles that we have given
+        * due to it appearing that the GPU is making forward progress.
+        *
+        * For GPU generations which support progress detection (see.
+        * msm_gpu_funcs::progress()), if the GPU appears to be making progress
+        * (ie. the CP has advanced in the command stream, we'll allow up to
+        * DRM_MSM_HANGCHECK_PROGRESS_RETRIES expirations of the hangcheck timer
+        * before killing the job.  But to detect progress we need two sample
+        * points, so the duration of the hangcheck timer is halved.  In other
+        * words we'll let the submit run for up to:
+        *
+        * (DRM_MSM_HANGCHECK_DEFAULT_PERIOD / 2) * (DRM_MSM_HANGCHECK_PROGRESS_RETRIES + 1)
+        */
+       int hangcheck_progress_retries;
+
+       /**
+        * last_cp_state: The state of the CP at the last call to gpu->progress()
+        */
+       struct msm_cp_state last_cp_state;
+
         /*
          * preempt_lock protects preemption and serializes wptr updates against
          * preemption.  Can be aquired from irq context.
author	Rob Clark <robdclark@chromium.org>
	Mon, 14 Nov 2022 19:30:41 +0000 (11:30 -0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 17 May 2023 09:53:53 +0000 (11:53 +0200)
drivers/gpu/drm/msm/adreno/a6xx_gpu.c		patch \| blob \| history
drivers/gpu/drm/msm/msm_drv.c		patch \| blob \| history
drivers/gpu/drm/msm/msm_drv.h		patch \| blob \| history
drivers/gpu/drm/msm/msm_gpu.c		patch \| blob \| history
drivers/gpu/drm/msm/msm_gpu.h		patch \| blob \| history
drivers/gpu/drm/msm/msm_ringbuffer.h		patch \| blob \| history