drm/amdgpu: split nbio callbacks into ras and non-ras ones
author Hawking Zhang <Hawking.Zhang@amd.com>
Fri, 2 Apr 2021 06:39:36 +0000 (14:39 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Apr 2021 20:51:04 +0000 (16:51 -0400)
NBIO RAS is not managed by the GPU driver when the GPU is
connected to the CPU through XGMI. Split the NBIO callbacks
into RAS and non-RAS ones so that the GPU driver only
initializes the NBIO RAS callbacks when it manages
NBIO RAS.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.h
drivers/gpu/drm/amd/amdgpu/soc15.c

index 0341254..90f5056 100644 (file)
@@ -199,13 +199,13 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg)
         * ack the interrupt if it is there
         */
        if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) {
-               if (adev->nbio.funcs &&
-                   adev->nbio.funcs->handle_ras_controller_intr_no_bifring)
-                       adev->nbio.funcs->handle_ras_controller_intr_no_bifring(adev);
+               if (adev->nbio.ras_funcs &&
+                   adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring)
+                       adev->nbio.ras_funcs->handle_ras_controller_intr_no_bifring(adev);
 
-               if (adev->nbio.funcs &&
-                   adev->nbio.funcs->handle_ras_err_event_athub_intr_no_bifring)
-                       adev->nbio.funcs->handle_ras_err_event_athub_intr_no_bifring(adev);
+               if (adev->nbio.ras_funcs &&
+                   adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring)
+                       adev->nbio.ras_funcs->handle_ras_err_event_athub_intr_no_bifring(adev);
        }
 
        return ret;
index 7c11bce..25ee535 100644 (file)
@@ -47,6 +47,17 @@ struct nbio_hdp_flush_reg {
        u32 ref_and_mask_sdma7;
 };
 
+struct amdgpu_nbio_ras_funcs {
+       void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev);
+       void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev);
+       int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
+       int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
+       void (*query_ras_error_count)(struct amdgpu_device *adev,
+                                     void *ras_error_status);
+       int (*ras_late_init)(struct amdgpu_device *adev);
+       void (*ras_fini)(struct amdgpu_device *adev);
+};
+
 struct amdgpu_nbio_funcs {
        const struct nbio_hdp_flush_reg *hdp_flush_reg;
        u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev);
@@ -79,13 +90,6 @@ struct amdgpu_nbio_funcs {
        void (*ih_control)(struct amdgpu_device *adev);
        void (*init_registers)(struct amdgpu_device *adev);
        void (*remap_hdp_registers)(struct amdgpu_device *adev);
-       void (*handle_ras_controller_intr_no_bifring)(struct amdgpu_device *adev);
-       void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device *adev);
-       int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
-       int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
-       void (*query_ras_error_count)(struct amdgpu_device *adev,
-                                       void *ras_error_status);
-       int (*ras_late_init)(struct amdgpu_device *adev);
        void (*enable_aspm)(struct amdgpu_device *adev,
                            bool enable);
        void (*program_aspm)(struct amdgpu_device *adev);
@@ -97,6 +101,7 @@ struct amdgpu_nbio {
        struct amdgpu_irq_src ras_err_event_athub_irq;
        struct ras_common_if *ras_if;
        const struct amdgpu_nbio_funcs *funcs;
+       const struct amdgpu_nbio_ras_funcs *ras_funcs;
 };
 
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev);
index 1708045..ac3f4c3 100644 (file)
@@ -804,8 +804,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                        adev->mmhub.funcs->query_ras_error_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__PCIE_BIF:
-               if (adev->nbio.funcs->query_ras_error_count)
-                       adev->nbio.funcs->query_ras_error_count(adev, &err_data);
+               if (adev->nbio.ras_funcs &&
+                   adev->nbio.ras_funcs->query_ras_error_count)
+                       adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
                break;
        case AMDGPU_RAS_BLOCK__XGMI_WAFL:
                amdgpu_xgmi_query_ras_error_count(adev, &err_data);
@@ -2030,14 +2031,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
        /* Might need get this flag from vbios. */
        con->flags = RAS_DEFAULT_FLAGS;
 
-       if (adev->nbio.funcs->init_ras_controller_interrupt) {
-               r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
+       /* initialize nbio ras function ahead of any other
+        * ras functions so hardware fatal error interrupt
+        * can be enabled as early as possible */
+       switch (adev->asic_type) {
+       case CHIP_VEGA20:
+       case CHIP_ARCTURUS:
+       case CHIP_ALDEBARAN:
+               if (!adev->gmc.xgmi.connected_to_cpu)
+                       adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+               break;
+       default:
+               /* nbio ras is not available */
+               break;
+       }
+
+       if (adev->nbio.ras_funcs &&
+           adev->nbio.ras_funcs->init_ras_controller_interrupt) {
+               r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
                if (r)
                        goto release_con;
        }
 
-       if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
-               r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
+       if (adev->nbio.ras_funcs &&
+           adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
+               r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
                if (r)
                        goto release_con;
        }
index c477f89..af44aad 100644 (file)
@@ -557,6 +557,16 @@ static void nbio_v7_4_enable_doorbell_interrupt(struct amdgpu_device *adev,
                       DOORBELL_INTERRUPT_DISABLE, enable ? 0 : 1);
 }
 
+const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs = {
+       .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring,
+       .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
+       .init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt,
+       .init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt,
+       .query_ras_error_count = nbio_v7_4_query_ras_error_count,
+       .ras_late_init = amdgpu_nbio_ras_late_init,
+       .ras_fini = amdgpu_nbio_ras_fini,
+};
+
 const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
        .get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset,
        .get_hdp_flush_done_offset = nbio_v7_4_get_hdp_flush_done_offset,
@@ -577,10 +587,4 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
        .ih_control = nbio_v7_4_ih_control,
        .init_registers = nbio_v7_4_init_registers,
        .remap_hdp_registers = nbio_v7_4_remap_hdp_registers,
-       .handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring,
-       .handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
-       .init_ras_controller_interrupt = nbio_v7_4_init_ras_controller_interrupt,
-       .init_ras_err_event_athub_interrupt = nbio_v7_4_init_ras_err_event_athub_interrupt,
-       .query_ras_error_count = nbio_v7_4_query_ras_error_count,
-       .ras_late_init = amdgpu_nbio_ras_late_init,
 };
index b1ac828..b821658 100644 (file)
@@ -28,5 +28,6 @@
 
 extern const struct nbio_hdp_flush_reg nbio_v7_4_hdp_flush_reg;
 extern const struct amdgpu_nbio_funcs nbio_v7_4_funcs;
+extern const struct amdgpu_nbio_ras_funcs nbio_v7_4_ras_funcs;
 
 #endif
index 64c98b9..5c5eb3a 100644 (file)
@@ -1523,8 +1523,9 @@ static int soc15_common_late_init(void *handle)
        if (adev->hdp.funcs->reset_ras_error_count)
                adev->hdp.funcs->reset_ras_error_count(adev);
 
-       if (adev->nbio.funcs->ras_late_init)
-               r = adev->nbio.funcs->ras_late_init(adev);
+       if (adev->nbio.ras_funcs &&
+           adev->nbio.ras_funcs->ras_late_init)
+               r = adev->nbio.ras_funcs->ras_late_init(adev);
 
        return r;
 }
@@ -1545,7 +1546,9 @@ static int soc15_common_sw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-       amdgpu_nbio_ras_fini(adev);
+       if (adev->nbio.ras_funcs &&
+           adev->nbio.ras_funcs->ras_fini)
+               adev->nbio.ras_funcs->ras_fini(adev);
        adev->df.funcs->sw_fini(adev);
        return 0;
 }
@@ -1609,9 +1612,11 @@ static int soc15_common_hw_fini(void *handle)
 
        if (adev->nbio.ras_if &&
            amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
-               if (adev->nbio.funcs->init_ras_controller_interrupt)
+               if (adev->nbio.ras_funcs &&
+                   adev->nbio.ras_funcs->init_ras_controller_interrupt)
                        amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
-               if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
+               if (adev->nbio.ras_funcs &&
+                   adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt)
                        amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
        }