drm/amdgpu: Implement DPC recovery
authorAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Wed, 29 Jul 2020 16:59:45 +0000 (12:59 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 15 Sep 2020 21:24:32 +0000 (17:24 -0400)
Add PCI Downstream Port Containment (DPC) support with
basic recovery functionality.

v2: remove pci_save_state to avoid breaking suspend/resume
v3: Fix style comments
v4: Improve description.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index 4009d2e..29c0b4c 100644 (file)
@@ -49,6 +49,8 @@
 #include <linux/rbtree.h>
 #include <linux/hashtable.h>
 #include <linux/dma-fence.h>
+#include <linux/pci.h>
+#include <linux/aer.h>
 
 #include <drm/ttm/ttm_bo_api.h>
 #include <drm/ttm/ttm_bo_driver.h>
@@ -1260,6 +1262,12 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return
 void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
 void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
 
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
+                                          pci_channel_state_t state);
+pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
+pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
+void amdgpu_pci_resume(struct pci_dev *pdev);
+
 #include "amdgpu_object.h"
 
 /* used by df_v3_6.c and amdgpu_pmu.c */
index f7307af..99c0e6e 100644 (file)
@@ -2999,6 +2999,7 @@ static const struct attribute *amdgpu_dev_attributes[] = {
        NULL
 };
 
+
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -3217,6 +3218,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                }
        }
 
+       pci_enable_pcie_error_reporting(adev->ddev.pdev);
+
        /* Post card if necessary */
        if (amdgpu_device_need_post(adev)) {
                if (!adev->bios) {
@@ -4705,3 +4708,161 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 
        return 0;
 }
+
+/**
+ * amdgpu_pci_error_detected - Called when a PCI error is detected.
+ * @pdev: PCI device struct
+ * @state: PCI channel state
+ *
+ * Description: Locks the device only when the channel is frozen.
+ * NOTE(review): amdgpu_pci_resume() unlocks unconditionally - confirm the CAN_RECOVER path.
+ * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
+ */
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct amdgpu_device *adev = drm_to_adev(dev);
+
+       DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
+
+       switch (state) {
+       case pci_channel_io_normal:
+               return PCI_ERS_RESULT_CAN_RECOVER;
+       case pci_channel_io_frozen:
+               /* Fatal error, lock the device and prepare for slot reset */
+               amdgpu_device_lock_adev(adev);
+               return PCI_ERS_RESULT_NEED_RESET;
+       case pci_channel_io_perm_failure:
+               /* Permanent error, prepare for device removal */
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+
+       return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * amdgpu_pci_mmio_enabled - Report recovery once MMIO is enabled (debug dump is still TODO)
+ * @pdev: pointer to PCI device
+ */
+pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
+{
+
+       DRM_INFO("PCI error: mmio enabled callback!!\n");
+
+       /* TODO - dump whatever for debugging purposes */
+
+       /* This is called only if amdgpu_pci_error_detected returns
+        * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
+        * works, no need to reset slot.
+        */
+
+       return PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
+ * @pdev: PCI device struct
+ *
+ * Description: Called by the PCI error recovery code after the slot has
+ * been reset, just before normal operations resume.
+ *
+ * Return: PCI_ERS_RESULT_RECOVERED, or PCI_ERS_RESULT_DISCONNECT on failure.
+ */
+pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
+{
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct amdgpu_device *adev = drm_to_adev(dev);
+       int r;
+       bool vram_lost;
+
+       DRM_INFO("PCI error: slot reset callback!!\n");
+
+       pci_restore_state(pdev);
+
+       r = amdgpu_device_ip_suspend(adev);
+       if (r)
+               goto out;
+
+       /* post card */
+       r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
+       if (r)
+               goto out;
+
+       r = amdgpu_device_ip_resume_phase1(adev);
+       if (r)
+               goto out;
+
+       vram_lost = amdgpu_device_check_vram_lost(adev);
+       if (vram_lost) {
+               DRM_INFO("VRAM is lost due to GPU reset!\n");
+               amdgpu_inc_vram_lost(adev);
+       }
+
+       r = amdgpu_gtt_mgr_recover(
+               &adev->mman.bdev.man[TTM_PL_TT]);
+       if (r)
+               goto out;
+
+       r = amdgpu_device_fw_loading(adev);
+       if (r)
+               goto out;
+
+       r = amdgpu_device_ip_resume_phase2(adev);
+       if (r)
+               goto out;
+
+       if (vram_lost)
+               amdgpu_device_fill_reset_magic(adev);
+
+       /*
+        * Track this ASIC again now that the reset has
+        * completed successfully.
+        */
+       amdgpu_register_gpu_instance(adev);
+
+       r = amdgpu_device_ip_late_init(adev);
+       if (r)
+               goto out;
+
+       amdgpu_fbdev_set_suspend(adev, 0);
+
+       /* must succeed. */
+       amdgpu_ras_resume(adev);
+
+       amdgpu_irq_gpu_reset_resume_helper(adev);
+       r = amdgpu_ib_ring_tests(adev);
+       if (r)
+               goto out;
+
+       r = amdgpu_device_recover_vram(adev);
+
+out:
+       /* Every exit funnels through here so r is mapped to a pci_ers_result_t. */
+       if (!r) {
+               DRM_INFO("PCIe error recovery succeeded\n");
+       } else {
+               DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
+               /* drop the lock taken in amdgpu_pci_error_detected() */
+               amdgpu_device_unlock_adev(adev);
+       }
+
+       return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+/**
+ * amdgpu_pci_resume() - resume normal ops after PCI reset
+ * @pdev: pointer to PCI device
+ *
+ * Called when the error recovery driver tells us that it's
+ * OK to resume normal operation. Unlocks the device that
+ * amdgpu_pci_error_detected() locked for a frozen channel.
+ */
+void amdgpu_pci_resume(struct pci_dev *pdev)
+{
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct amdgpu_device *adev = drm_to_adev(dev);
+
+       amdgpu_device_unlock_adev(adev);
+
+       DRM_INFO("PCI error: resume callback!!\n");
+}
index 6edde2b..e9daab1 100644 (file)
@@ -32,7 +32,6 @@
 #include <drm/drm_pciids.h>
 #include <linux/console.h>
 #include <linux/module.h>
-#include <linux/pci.h>
 #include <linux/pm_runtime.h>
 #include <linux/vga_switcheroo.h>
 #include <drm/drm_probe_helper.h>
@@ -1528,6 +1527,13 @@ static struct drm_driver kms_driver = {
        .patchlevel = KMS_DRIVER_PATCHLEVEL,
 };
 
+static const struct pci_error_handlers amdgpu_pci_err_handler = {
+       .error_detected = amdgpu_pci_error_detected,
+       .mmio_enabled   = amdgpu_pci_mmio_enabled,
+       .slot_reset     = amdgpu_pci_slot_reset,
+       .resume         = amdgpu_pci_resume,
+};
+
 static struct pci_driver amdgpu_kms_pci_driver = {
        .name = DRIVER_NAME,
        .id_table = pciidlist,
@@ -1535,6 +1541,7 @@ static struct pci_driver amdgpu_kms_pci_driver = {
        .remove = amdgpu_pci_remove,
        .shutdown = amdgpu_pci_shutdown,
        .driver.pm = &amdgpu_pm_ops,
+       .err_handler = &amdgpu_pci_err_handler,
 };
 
 static int __init amdgpu_init(void)