From ad4a72c11e571927b68a73a03a66ffd786b1e23d Mon Sep 17 00:00:00 2001
From: =?utf8?q?Andr=C3=A9=20Almeida?=
Date: Sat, 1 Apr 2023 19:57:52 -0300
Subject: [PATCH] radv: Implement vk.check_status
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Implement check_status function so the driver can check if the GPU has
been reset by the application, and thus if it's still available.

AMDGPU_CTX_QUERY ioctls work by asking amdgpu if this context was the
cause of a previous GPU reset.

Reviewed-by: Samuel Pitoiset
Signed-off-by: André Almeida
Part-of:
---
 src/amd/vulkan/radv_device.c                  | 23 +++++++++++++
 src/amd/vulkan/radv_radeon_winsys.h           |  9 +++++
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 47 +++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index feaa5ac..51540c8 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -602,6 +602,28 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *ph
    add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
 }
 
+static VkResult
+radv_check_status(struct vk_device *vk_device)
+{
+   struct radv_device *device = container_of(vk_device, struct radv_device, vk);
+   enum radv_reset_status status;
+
+   for (int i = 0; i < RADV_NUM_HW_CTX; i++) {
+      if (device->hw_ctx[i]) {
+         status = device->ws->ctx_query_reset_status(device->hw_ctx[i]);
+
+         if (status == RADV_GUILTY_CONTEXT_RESET)
+            return vk_device_set_lost(&device->vk, "GPU hung detected in this process");
+         if (status == RADV_INNOCENT_CONTEXT_RESET)
+            return vk_device_set_lost(&device->vk, "GPU hung triggered by other process");
+         if (status == RADV_UNKNOWN_CONTEXT_RESET)
+            return vk_device_set_lost(&device->vk, "GPU hung triggered by unknown source");
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
@@ -750,6 +772,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
    init_dispatch_tables(device, physical_device);
 
    device->vk.command_buffer_ops = &radv_cmd_buffer_ops;
+   device->vk.check_status = radv_check_status;
 
    device->instance = physical_device->instance;
    device->physical_device = physical_device;
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index 32d8795..28eb6e7 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -102,6 +102,13 @@ enum radeon_value_id {
    RADEON_CURRENT_MCLK,
 };
 
+enum radv_reset_status {
+   RADV_NO_RESET,
+   RADV_GUILTY_CONTEXT_RESET,
+   RADV_INNOCENT_CONTEXT_RESET,
+   RADV_UNKNOWN_CONTEXT_RESET,
+};
+
 struct radeon_cmdbuf {
    /* These are uint64_t to tell the compiler that buf can't alias them.
     * If they're uint32_t the generated code needs to redundantly
@@ -275,6 +282,8 @@ struct radeon_winsys {
 
    int (*ctx_set_pstate)(struct radeon_winsys_ctx *ctx, uint32_t pstate);
 
+   enum radv_reset_status (*ctx_query_reset_status)(struct radeon_winsys_ctx *rwctx);
+
    enum radeon_bo_domain (*cs_domain)(const struct radeon_winsys *ws);
 
    struct radeon_cmdbuf *(*cs_create)(struct radeon_winsys *ws, enum amd_ip_type amd_ip_type);
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index b668113..9e85190 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -1560,6 +1560,52 @@ radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_t
    return true;
 }
 
+static enum radv_reset_status
+radv_amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
+{
+   int ret;
+   struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
+
+   if (ctx->ws->info.drm_minor >= 24) {
+      uint64_t flags;
+      ret = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
+
+      if (ret) {
+         fprintf(stderr, "radv/amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", ret);
+         return RADV_NO_RESET;
+      }
+
+      if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
+         if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY) {
+            /* Some job from this context once caused a GPU hang */
+            return RADV_GUILTY_CONTEXT_RESET;
+         } else {
+            /* Some job from other context caused a GPU hang */
+            return RADV_INNOCENT_CONTEXT_RESET;
+         }
+      }
+   } else {
+      uint32_t result, hangs;
+
+      ret = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
+      if (ret) {
+         fprintf(stderr, "radv/amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", ret);
+         return RADV_NO_RESET;
+      }
+
+      switch (result) {
+      case AMDGPU_CTX_GUILTY_RESET:
+         return RADV_GUILTY_CONTEXT_RESET;
+      case AMDGPU_CTX_INNOCENT_RESET:
+         return RADV_INNOCENT_CONTEXT_RESET;
+      case AMDGPU_CTX_UNKNOWN_RESET:
+         return RADV_UNKNOWN_CONTEXT_RESET;
+      }
+   }
+
+   return RADV_NO_RESET;
+}
+
 static uint32_t
 radv_to_amdgpu_pstate(enum radeon_ctx_pstate radv_pstate)
 {
@@ -1871,6 +1917,7 @@ radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
    ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
    ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
    ws->base.ctx_set_pstate = radv_amdgpu_ctx_set_pstate;
+   ws->base.ctx_query_reset_status = radv_amdgpu_ctx_query_reset_status;
    ws->base.cs_domain = radv_amdgpu_cs_domain;
    ws->base.cs_create = radv_amdgpu_cs_create;
    ws->base.cs_destroy = radv_amdgpu_cs_destroy;
-- 
2.7.4