From 36abacaed34bc1f5bcb11ca611dd3a06c5c0ef39 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Christian=20K=C3=B6nig?= Date: Wed, 2 May 2012 15:11:13 +0200 Subject: [PATCH] drm/radeon: rework gpu lockup detection and processing MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Previusly multiple rings could trigger multiple GPU resets at the same time. Signed-off-by: Christian König Reviewed-by: Jerome Glisse Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon.h | 3 +- drivers/gpu/drm/radeon/radeon_fence.c | 150 +++++++++++++++++----------------- 2 files changed, 77 insertions(+), 76 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 65855af..35db5bd 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -255,8 +255,7 @@ struct radeon_fence_driver { volatile uint32_t *cpu_addr; atomic_t seq; uint32_t last_seq; - unsigned long last_jiffies; - unsigned long last_timeout; + unsigned long last_activity; wait_queue_head_t queue; struct list_head created; struct list_head emitted; diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 36c411f..1a9765a 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -74,6 +74,10 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence) radeon_fence_ring_emit(rdev, fence->ring, fence); trace_radeon_fence_emit(rdev->ddev, fence->seq); fence->emitted = true; + /* are we the first fence on a previusly idle ring? */ + if (list_empty(&rdev->fence_drv[fence->ring].emitted)) { + rdev->fence_drv[fence->ring].last_activity = jiffies; + } list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted); write_unlock_irqrestore(&rdev->fence_lock, irq_flags); return 0; @@ -85,34 +89,14 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring) struct list_head *i, *n; uint32_t seq; bool wake = false; - unsigned long cjiffies; seq = radeon_fence_read(rdev, ring); - if (seq != rdev->fence_drv[ring].last_seq) { - rdev->fence_drv[ring].last_seq = seq; - rdev->fence_drv[ring].last_jiffies = jiffies; - rdev->fence_drv[ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT; - } else { - cjiffies = jiffies; - if (time_after(cjiffies, rdev->fence_drv[ring].last_jiffies)) { - cjiffies -= rdev->fence_drv[ring].last_jiffies; - if (time_after(rdev->fence_drv[ring].last_timeout, cjiffies)) { - /* update the timeout */ - rdev->fence_drv[ring].last_timeout -= cjiffies; - } else { - /* the 500ms timeout is elapsed we should test - * for GPU lockup - */ - rdev->fence_drv[ring].last_timeout = 1; - } - } else { - /* wrap around update last jiffies, we will just wait - * a little longer - */ - rdev->fence_drv[ring].last_jiffies = cjiffies; - } + if (seq == rdev->fence_drv[ring].last_seq) return false; - } + + rdev->fence_drv[ring].last_seq = seq; + rdev->fence_drv[ring].last_activity = jiffies; + n = NULL; list_for_each(i, &rdev->fence_drv[ring].emitted) { fence = list_entry(i, struct radeon_fence, list); @@ -207,66 +191,84 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr) struct radeon_device *rdev; unsigned long irq_flags, timeout; u32 seq; - int r; + int i, r; + bool signaled; if (fence == NULL) { WARN(1, "Querying an invalid fence : %p !\n", fence); - return 0; + return -EINVAL; } + rdev = fence->rdev; - if (radeon_fence_signaled(fence)) { - return 0; - } - timeout = rdev->fence_drv[fence->ring].last_timeout; -retry: - /* save current sequence used to check for GPU lockup */ - seq = rdev->fence_drv[fence->ring].last_seq; - trace_radeon_fence_wait_begin(rdev->ddev, seq); - if (intr) { + signaled = radeon_fence_signaled(fence); + while (!signaled) { + read_lock_irqsave(&rdev->fence_lock, irq_flags); + timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT; + if (time_after(rdev->fence_drv[fence->ring].last_activity, timeout)) { + /* the normal case, timeout is somewhere before last_activity */ + timeout = rdev->fence_drv[fence->ring].last_activity - timeout; + } else { + /* either jiffies wrapped around, or no fence was signaled in the last 500ms + * anyway we will just wait for the minimum amount and then check for a lockup */ + timeout = 1; + } + /* save current sequence value used to check for GPU lockups */ + seq = rdev->fence_drv[fence->ring].last_seq; + read_unlock_irqrestore(&rdev->fence_lock, irq_flags); + + trace_radeon_fence_wait_begin(rdev->ddev, seq); radeon_irq_kms_sw_irq_get(rdev, fence->ring); - r = wait_event_interruptible_timeout(rdev->fence_drv[fence->ring].queue, - radeon_fence_signaled(fence), timeout); + if (intr) { + r = wait_event_interruptible_timeout( + rdev->fence_drv[fence->ring].queue, + (signaled = radeon_fence_signaled(fence)), timeout); + } else { + r = wait_event_timeout( + rdev->fence_drv[fence->ring].queue, + (signaled = radeon_fence_signaled(fence)), timeout); + } radeon_irq_kms_sw_irq_put(rdev, fence->ring); if (unlikely(r < 0)) { return r; } - } else { - radeon_irq_kms_sw_irq_get(rdev, fence->ring); - r = wait_event_timeout(rdev->fence_drv[fence->ring].queue, - radeon_fence_signaled(fence), timeout); - radeon_irq_kms_sw_irq_put(rdev, fence->ring); - } - trace_radeon_fence_wait_end(rdev->ddev, seq); - if (unlikely(!radeon_fence_signaled(fence))) { - /* we were interrupted for some reason and fence isn't - * isn't signaled yet, resume wait - */ - if (r) { - timeout = r; - goto retry; - } - /* don't protect read access to rdev->fence_drv[t].last_seq - * if we experiencing a lockup the value doesn't change - */ - if (seq == rdev->fence_drv[fence->ring].last_seq && - radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) { - - /* good news we believe it's a lockup */ - printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n", - fence->seq, seq); - - /* mark the ring as not ready any more */ - rdev->ring[fence->ring].ready = false; - r = radeon_gpu_reset(rdev); - if (r) - return r; + trace_radeon_fence_wait_end(rdev->ddev, seq); + + if (unlikely(!signaled)) { + /* we were interrupted for some reason and fence + * isn't signaled yet, resume waiting */ + if (r) { + continue; + } + + write_lock_irqsave(&rdev->fence_lock, irq_flags); + /* check if sequence value has changed since last_activity */ + if (seq != rdev->fence_drv[fence->ring].last_seq) { + write_unlock_irqrestore(&rdev->fence_lock, irq_flags); + continue; + } + + /* change sequence value on all rings, so nobody else things there is a lockup */ + for (i = 0; i < RADEON_NUM_RINGS; ++i) + rdev->fence_drv[i].last_seq -= 0x10000; + write_unlock_irqrestore(&rdev->fence_lock, irq_flags); + + if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) { + + /* good news we believe it's a lockup */ + printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n", + fence->seq, seq); + + /* mark the ring as not ready any more */ + rdev->ring[fence->ring].ready = false; + r = radeon_gpu_reset(rdev); + if (r) + return r; + + write_lock_irqsave(&rdev->fence_lock, irq_flags); + rdev->fence_drv[fence->ring].last_activity = jiffies; + write_unlock_irqrestore(&rdev->fence_lock, irq_flags); + } } - timeout = RADEON_FENCE_JIFFIES_TIMEOUT; - write_lock_irqsave(&rdev->fence_lock, irq_flags); - rdev->fence_drv[fence->ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT; - rdev->fence_drv[fence->ring].last_jiffies = jiffies; - write_unlock_irqrestore(&rdev->fence_lock, irq_flags); - goto retry; } return 0; } -- 2.7.4