drm/amdgpu: Resolve nBIF RAS error harvesting bug
authorJohn Clements <john.clements@amd.com>
Wed, 15 Sep 2021 09:14:12 +0000 (17:14 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Thu, 23 Sep 2021 19:17:28 +0000 (15:17 -0400)
Set correct RAS nBIF error query register offsets on aldebaran

Signed-off-by: John Clements <john.clements@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

index f50045c..0a9fc19 100644 (file)
@@ -566,7 +566,9 @@ static int nbio_v7_4_init_ras_err_event_athub_interrupt (struct amdgpu_device *a
        return r;
 }
 
-#define smnPARITY_ERROR_STATUS_UNCORR_GRP2     0x13a20030
+#define smnPARITY_ERROR_STATUS_UNCORR_GRP2         0x13a20030
+#define smnPARITY_ERROR_STATUS_UNCORR_GRP2_ALDE        0x13b20030
+#define smnRAS_GLOBAL_STATUS_LO_ALDE            0x13b20020
 
 static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
                                        void *ras_error_status)
@@ -575,12 +577,20 @@ static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
        uint32_t corr, fatal, non_fatal;
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 
-       global_sts = RREG32_PCIE(smnRAS_GLOBAL_STATUS_LO);
+       if (adev->asic_type == CHIP_ALDEBARAN)
+               global_sts = RREG32_PCIE(smnRAS_GLOBAL_STATUS_LO_ALDE);
+       else
+               global_sts = RREG32_PCIE(smnRAS_GLOBAL_STATUS_LO);
+
        corr = REG_GET_FIELD(global_sts, RAS_GLOBAL_STATUS_LO, ParityErrCorr);
        fatal = REG_GET_FIELD(global_sts, RAS_GLOBAL_STATUS_LO, ParityErrFatal);
        non_fatal = REG_GET_FIELD(global_sts, RAS_GLOBAL_STATUS_LO,
                                ParityErrNonFatal);
-       parity_sts = RREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2);
+
+       if (adev->asic_type == CHIP_ALDEBARAN)
+               parity_sts = RREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2_ALDE);
+       else
+               parity_sts = RREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2);
 
        if (corr)
                err_data->ce_count++;
@@ -589,13 +599,21 @@ static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
 
        if (corr || fatal || non_fatal) {
                central_sts = RREG32_PCIE(smnBIFL_RAS_CENTRAL_STATUS);
+
                /* clear error status register */
-               WREG32_PCIE(smnRAS_GLOBAL_STATUS_LO, global_sts);
+               if (adev->asic_type == CHIP_ALDEBARAN)
+                       WREG32_PCIE(smnRAS_GLOBAL_STATUS_LO_ALDE, global_sts);
+               else
+                       WREG32_PCIE(smnRAS_GLOBAL_STATUS_LO, global_sts);
 
                if (fatal)
+               {
                        /* clear parity fatal error indication field */
-                       WREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2,
-                                   parity_sts);
+                       if (adev->asic_type == CHIP_ALDEBARAN)
+                               WREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2_ALDE, parity_sts);
+                       else
+                               WREG32_PCIE(smnPARITY_ERROR_STATUS_UNCORR_GRP2, parity_sts);
+               }
 
                if (REG_GET_FIELD(central_sts, BIFL_RAS_CENTRAL_STATUS,
                                BIFL_RasContller_Intr_Recv)) {