drm/amdgpu: clear ras controller status registers when interrupt occurs
authorLe Ma <le.ma@amd.com>
Fri, 22 Nov 2019 09:56:47 +0000 (17:56 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Thu, 5 Dec 2019 21:25:27 +0000 (16:25 -0500)
To fix issue that ras controller interrupt cannot be triggered anymore after
one time nbif uncorrectable error. And error count is stored in nbif ras object
for query.

Signed-off-by: Le Ma <le.ma@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

index 25231d6..9a3a65a 100644 (file)
@@ -52,6 +52,9 @@
 #define BIF_MMSCH1_DOORBELL_RANGE__OFFSET_MASK          0x00000FFCL
 #define BIF_MMSCH1_DOORBELL_RANGE__SIZE_MASK            0x001F0000L
 
+static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
+                                       void *ras_error_status);
+
 static void nbio_v7_4_remap_hdp_registers(struct amdgpu_device *adev)
 {
        WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
@@ -314,6 +317,7 @@ static void nbio_v7_4_init_registers(struct amdgpu_device *adev)
 static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device *adev)
 {
        uint32_t bif_doorbell_intr_cntl;
+       struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
 
        bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
        if (REG_GET_FIELD(bif_doorbell_intr_cntl,
@@ -324,6 +328,12 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
                                                RAS_CNTLR_INTERRUPT_CLEAR, 1);
                WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
+               /*
+                * clear error status after ras_controller_intr according to
+                * hw team and count ue number for query
+                */
+               nbio_v7_4_query_ras_error_count(adev, &obj->err_data);
+
                DRM_WARN("RAS controller interrupt triggered by NBIF error\n");
 
                /* ras_controller_int is dedicated for nbif ras error,