drm/amdgpu: add and implement the GPU reset status query
author Marek Olšák <marek.olsak@amd.com>
Tue, 5 May 2015 19:13:49 +0000 (21:13 +0200)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 4 Jun 2015 01:03:39 +0000 (21:03 -0400)
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Jammy Zhou <Jammy.Zhou@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
include/uapi/drm/amdgpu_drm.h

index 66b5bd0..ebff89e 100644 (file)
@@ -1040,7 +1040,7 @@ struct amdgpu_vm_manager {
 
 struct amdgpu_ctx_state {
        uint64_t flags;
-       uint64_t hangs;
+       uint32_t hangs;
 };
 
 struct amdgpu_ctx {
@@ -1049,6 +1049,7 @@ struct amdgpu_ctx {
        struct amdgpu_fpriv *fpriv;
        struct amdgpu_ctx_state state;
        uint32_t id;
+       unsigned reset_counter;
 };
 
 struct amdgpu_ctx_mgr {
@@ -1897,8 +1898,6 @@ int amdgpu_ctx_alloc(struct amdgpu_device *adev,struct amdgpu_fpriv *fpriv,
                                                        uint32_t *id,uint32_t flags);
 int amdgpu_ctx_free(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv,
                                                  uint32_t id);
-int amdgpu_ctx_query(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv,
-                                                       uint32_t id,struct amdgpu_ctx_state *state);
 
 void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv);
 struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id);
@@ -2006,6 +2005,7 @@ struct amdgpu_device {
        atomic64_t                      vram_vis_usage;
        atomic64_t                      gtt_usage;
        atomic64_t                      num_bytes_moved;
+       atomic_t                        gpu_reset_counter;
 
        /* display */
        struct amdgpu_mode_info         mode_info;
index bcd332e..6c66ac8 100644 (file)
@@ -81,21 +81,36 @@ int amdgpu_ctx_free(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint
        return -EINVAL;
 }
 
-int amdgpu_ctx_query(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint32_t id, struct amdgpu_ctx_state *state)
+static int amdgpu_ctx_query(struct amdgpu_device *adev,
+                           struct amdgpu_fpriv *fpriv, uint32_t id,
+                           union drm_amdgpu_ctx_out *out)
 {
        struct amdgpu_ctx *ctx;
        struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr;
+       unsigned reset_counter;
 
        mutex_lock(&mgr->lock);
        ctx = idr_find(&mgr->ctx_handles, id);
-       if (ctx) {
-               /* state should alter with CS activity */
-               *state = ctx->state;
+       if (!ctx) {
                mutex_unlock(&mgr->lock);
-               return 0;
+               return -EINVAL;
        }
+
+       /* TODO: these two are always zero */
+       out->state.flags = ctx->state.flags;
+       out->state.hangs = ctx->state.hangs;
+
+       /* determine if a GPU reset has occurred since the last call */
+       reset_counter = atomic_read(&adev->gpu_reset_counter);
+       /* TODO: this should ideally return NO, GUILTY, or INNOCENT. */
+       if (ctx->reset_counter == reset_counter)
+               out->state.reset_status = AMDGPU_CTX_NO_RESET;
+       else
+               out->state.reset_status = AMDGPU_CTX_UNKNOWN_RESET;
+       ctx->reset_counter = reset_counter;
+
        mutex_unlock(&mgr->lock);
-       return -EINVAL;
+       return 0;
 }
 
 void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv)
@@ -115,12 +130,11 @@ void amdgpu_ctx_fini(struct amdgpu_fpriv *fpriv)
 }
 
 int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
-                                                       struct drm_file *filp)
+                    struct drm_file *filp)
 {
        int r;
        uint32_t id;
        uint32_t flags;
-       struct amdgpu_ctx_state state;
 
        union drm_amdgpu_ctx *args = data;
        struct amdgpu_device *adev = dev->dev_private;
@@ -139,11 +153,7 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
                        r = amdgpu_ctx_free(adev, fpriv, id);
                        break;
                case AMDGPU_CTX_OP_QUERY_STATE:
-                       r = amdgpu_ctx_query(adev, fpriv, id, &state);
-                       if (r == 0) {
-                               args->out.state.flags = state.flags;
-                               args->out.state.hangs = state.hangs;
-                       }
+                       r = amdgpu_ctx_query(adev, fpriv, id, &args->out);
                        break;
                default:
                        return -EINVAL;
index 61cf5ad..3448d9f 100644 (file)
@@ -1781,6 +1781,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
        }
 
        adev->needs_reset = false;
+       atomic_inc(&adev->gpu_reset_counter);
 
        /* block TTM */
        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
index 65da7cd..46580e9 100644 (file)
@@ -149,6 +149,12 @@ union drm_amdgpu_bo_list {
 
 #define AMDGPU_CTX_OP_STATE_RUNNING    1
 
+/* GPU reset status */
+#define AMDGPU_CTX_NO_RESET            0
+#define AMDGPU_CTX_GUILTY_RESET                1 /* this context caused it */
+#define AMDGPU_CTX_INNOCENT_RESET      2 /* some other context caused it */
+#define AMDGPU_CTX_UNKNOWN_RESET       3 /* unknown cause */
+
 struct drm_amdgpu_ctx_in {
        uint32_t        op;
        uint32_t        flags;
@@ -164,7 +170,10 @@ union drm_amdgpu_ctx_out {
 
                struct {
                        uint64_t        flags;
-                       uint64_t        hangs;
+                       /** Number of resets caused by this context so far. */
+                       uint32_t        hangs;
+                       /** Reset status since the last call of the ioctl. */
+                       uint32_t        reset_status;
                } state;
 };