drm/amdgpu: add smu mode1/2 support for aldebaran
authorFeifei Xu <Feifei.Xu@amd.com>
Thu, 19 Nov 2020 10:12:26 +0000 (18:12 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 24 Mar 2021 02:54:49 +0000 (22:54 -0400)
Use MSG_GfxDriverReset for mode reset and retire MSG_Mode1Reset.
Centralize the soc15_asic_mode1_reset() and nv_asic_mode1_reset() functions
into amdgpu_device_mode1_reset().
Add mode2_reset_is_support() for smu->ppt_funcs.

Signed-off-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/nv.c
drivers/gpu/drm/amd/amdgpu/soc15.c
drivers/gpu/drm/amd/pm/inc/aldebaran_ppsmc.h
drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
drivers/gpu/drm/amd/pm/inc/smu_types.h
drivers/gpu/drm/amd/pm/inc/smu_v13_0.h
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c

index b5310b35721c6f54cda384cbde381ede8a183d9d..e26c5799d6a5d1d7eb9374e5ee61e868ca490ba4 100644 (file)
@@ -1261,6 +1261,7 @@ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 array_size);
 
 bool amdgpu_device_supports_atpx(struct drm_device *dev);
+int amdgpu_device_mode1_reset(struct amdgpu_device *adev);
 bool amdgpu_device_supports_boco(struct drm_device *dev);
 bool amdgpu_device_supports_baco(struct drm_device *dev);
 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
index 1f39af03153e446c6aa5564164f3620554532cb8..c4e10e97125e17e8d489c519e0e2117448ace6f9 100644 (file)
@@ -4248,6 +4248,45 @@ disabled:
                return false;
 }
 
+/**
+ * amdgpu_device_mode1_reset - perform a mode1 reset of the GPU
+ * @adev: amdgpu device to reset
+ *
+ * Marks the engine as hung in the atombios scratch registers, disables PCI
+ * bus mastering and caches the PCI state, then issues the reset through the
+ * SMU when amdgpu_dpm_is_mode1_reset_supported() says so and through the PSP
+ * otherwise.  Afterwards the PCI state is reloaded and the nbio memsize
+ * value is polled, for at most adev->usec_timeout iterations of 1us, until
+ * it stops reading 0xffffffff.
+ *
+ * Returns 0 on success, the error from the SMU/PSP reset call, or -ETIMEDOUT
+ * when the reset call succeeded but the ASIC never came out of reset.
+ */
+int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
+{
+        u32 i;
+        int ret = 0;
+
+        amdgpu_atombios_scratch_regs_engine_hung(adev, true);
+
+        dev_info(adev->dev, "GPU mode1 reset\n");
+
+        /* disable BM */
+        pci_clear_master(adev->pdev);
+
+        amdgpu_device_cache_pci_state(adev->pdev);
+
+        if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
+                dev_info(adev->dev, "GPU smu mode1 reset\n");
+                ret = amdgpu_dpm_mode1_reset(adev);
+        } else {
+                dev_info(adev->dev, "GPU psp mode1 reset\n");
+                ret = psp_gpu_reset(adev);
+        }
+
+        if (ret)
+                dev_err(adev->dev, "GPU mode1 reset failed\n");
+
+        amdgpu_device_load_pci_state(adev->pdev);
+
+        /* wait for asic to come out of reset */
+        for (i = 0; i < adev->usec_timeout; i++) {
+                u32 memsize = adev->nbio.funcs->get_memsize(adev);
+
+                if (memsize != 0xffffffff)
+                        break;
+                udelay(1);
+        }
+
+        /*
+         * The loop ran to completion without seeing a valid memsize.
+         * Keep an earlier reset error if there is one, otherwise tell
+         * the caller that the ASIC did not come back.
+         */
+        if (i >= adev->usec_timeout && !ret) {
+                dev_err(adev->dev, "GPU mode1 reset timed out waiting for ASIC\n");
+                ret = -ETIMEDOUT;
+        }
+
+        amdgpu_atombios_scratch_regs_engine_hung(adev, false);
+        return ret;
+}
 
 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
                                        struct amdgpu_job *job,
index a7a2975f15f42abe3475d2c86bb6332bc999d019..5846eac292c3f515ac748ec8be4c74f40a6397e8 100644 (file)
@@ -484,44 +484,6 @@ static int nv_read_register(struct amdgpu_device *adev, u32 se_num,
        return -EINVAL;
 }
 
-static int nv_asic_mode1_reset(struct amdgpu_device *adev)
-{
-       u32 i;
-       int ret = 0;
-
-       amdgpu_atombios_scratch_regs_engine_hung(adev, true);
-
-       /* disable BM */
-       pci_clear_master(adev->pdev);
-
-       amdgpu_device_cache_pci_state(adev->pdev);
-
-       if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
-               dev_info(adev->dev, "GPU smu mode1 reset\n");
-               ret = amdgpu_dpm_mode1_reset(adev);
-       } else {
-               dev_info(adev->dev, "GPU psp mode1 reset\n");
-               ret = psp_gpu_reset(adev);
-       }
-
-       if (ret)
-               dev_err(adev->dev, "GPU mode1 reset failed\n");
-       amdgpu_device_load_pci_state(adev->pdev);
-
-       /* wait for asic to come out of reset */
-       for (i = 0; i < adev->usec_timeout; i++) {
-               u32 memsize = adev->nbio.funcs->get_memsize(adev);
-
-               if (memsize != 0xffffffff)
-                       break;
-               udelay(1);
-       }
-
-       amdgpu_atombios_scratch_regs_engine_hung(adev, false);
-
-       return ret;
-}
-
 static int nv_asic_mode2_reset(struct amdgpu_device *adev)
 {
        u32 i;
@@ -624,7 +586,7 @@ static int nv_asic_reset(struct amdgpu_device *adev)
                break;
        default:
                dev_info(adev->dev, "MODE1 reset\n");
-               ret = nv_asic_mode1_reset(adev);
+               ret = amdgpu_device_mode1_reset(adev);
                break;
        }
 
index f8fbf6aa1a448fa23e87caf715c43473bd027f33..8298c608f181eea0bb2a0249acb582155e2c1690 100644 (file)
@@ -650,40 +650,6 @@ void soc15_program_register_sequence(struct amdgpu_device *adev,
 
 }
 
-static int soc15_asic_mode1_reset(struct amdgpu_device *adev)
-{
-       u32 i;
-       int ret = 0;
-
-       amdgpu_atombios_scratch_regs_engine_hung(adev, true);
-
-       dev_info(adev->dev, "GPU mode1 reset\n");
-
-       /* disable BM */
-       pci_clear_master(adev->pdev);
-
-       amdgpu_device_cache_pci_state(adev->pdev);
-
-       ret = psp_gpu_reset(adev);
-       if (ret)
-               dev_err(adev->dev, "GPU mode1 reset failed\n");
-
-       amdgpu_device_load_pci_state(adev->pdev);
-
-       /* wait for asic to come out of reset */
-       for (i = 0; i < adev->usec_timeout; i++) {
-               u32 memsize = adev->nbio.funcs->get_memsize(adev);
-
-               if (memsize != 0xffffffff)
-                       break;
-               udelay(1);
-       }
-
-       amdgpu_atombios_scratch_regs_engine_hung(adev, false);
-
-       return ret;
-}
-
 static int soc15_asic_baco_reset(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
@@ -708,13 +674,21 @@ static enum amd_reset_method
 soc15_asic_reset_method(struct amdgpu_device *adev)
 {
        bool baco_reset = false;
+       bool connected_to_cpu = false;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+        if (adev->gmc.xgmi.supported && adev->gmc.xgmi.connected_to_cpu)
+                connected_to_cpu = true;
+
        if (amdgpu_reset_method == AMD_RESET_METHOD_MODE1 ||
            amdgpu_reset_method == AMD_RESET_METHOD_MODE2 ||
            amdgpu_reset_method == AMD_RESET_METHOD_BACO ||
-           amdgpu_reset_method == AMD_RESET_METHOD_PCI)
-               return amdgpu_reset_method;
+           amdgpu_reset_method == AMD_RESET_METHOD_PCI) {
+               /* If connected to cpu, the driver only supports mode2 */
+                if (connected_to_cpu)
+                        return AMD_RESET_METHOD_MODE2;
+                return amdgpu_reset_method;
+        }
 
        if (amdgpu_reset_method != -1)
                dev_warn(adev->dev, "Specified reset method:%d isn't supported, using AUTO instead.\n",
@@ -740,6 +714,14 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
                if ((ras && ras->supported) && adev->pm.fw_version <= 0x283400)
                        baco_reset = false;
                break;
+       case CHIP_ALDEBARAN:
+                /*
+                * 1. connected to cpu: driver issues mode2 reset
+                * 2. discrete gpu: driver issues mode1 reset
+                */
+               if (connected_to_cpu)
+                       return AMD_RESET_METHOD_MODE2;
+               break;
        default:
                break;
        }
@@ -769,7 +751,7 @@ static int soc15_asic_reset(struct amdgpu_device *adev)
                return amdgpu_dpm_mode2_reset(adev);
        default:
                dev_info(adev->dev, "MODE1 reset\n");
-               return soc15_asic_mode1_reset(adev);
+               return amdgpu_device_mode1_reset(adev);
        }
 }
 
index 302888376c7c3a78a1812f2b202233dfcf994978..433dd1e9ec4f819a7da73fac7f4d7c609ab7ff0d 100644 (file)
@@ -36,7 +36,7 @@
 // Message Definitions:
 #define PPSMC_MSG_TestMessage                    0x1
 #define PPSMC_MSG_GetSmuVersion                  0x2
-#define PPSMC_MSG_Mode1Reset                     0x3
+#define PPSMC_MSG_GfxDriverReset                 0x3
 #define PPSMC_MSG_GetDriverIfVersion             0x4
 #define PPSMC_MSG_spare1                         0x5
 #define PPSMC_MSG_spare2                         0x6
@@ -70,8 +70,8 @@
 #define PPSMC_MSG_SetPptLimit                    0x22
 #define PPSMC_MSG_GetPptLimit                    0x23
 #define PPSMC_MSG_PrepareMp1ForUnload            0x24
-#define PPSMC_MSG_PrepareMp1ForReset             0x25
-#define PPSMC_MSG_SoftReset                      0x26
+#define PPSMC_MSG_PrepareMp1ForReset             0x25 //retired in 68.07
+#define PPSMC_MSG_SoftReset                      0x26 //retired in 68.07
 #define PPSMC_MSG_RunDcBtc                       0x27
 #define PPSMC_MSG_DramLogSetDramAddrHigh         0x28
 #define PPSMC_MSG_DramLogSetDramAddrLow          0x29
 #define PPSMC_MSG_DisableDeterminism             0x3A
 #define PPSMC_MSG_SetUclkDpmMode                 0x3B
 
-#define PPSMC_Message_Count                      0x3C
+//STB to dram log
+#define PPSMC_MSG_DumpSTBtoDram                     0x3C
+#define PPSMC_MSG_STBtoDramLogSetDramAddrHigh       0x3D
+#define PPSMC_MSG_STBtoDramLogSetDramAddrLow        0x3E
+#define PPSMC_MSG_STBtoDramLogSetDramSize           0x3F
+#define PPSMC_MSG_SetSystemVirtualSTBtoDramAddrHigh 0x40
+#define PPSMC_MSG_SetSystemVirtualSTBtoDramAddrLow  0x41
+
+#define PPSMC_Message_Count                      0x42
+
+//PPSMC Reset Types
+#define PPSMC_RESET_TYPE_WARM_RESET              0x00
+#define PPSMC_RESET_TYPE_DRIVER_MODE_1_RESET     0x01 //driver msg argument should be 1 for mode-1
+#define PPSMC_RESET_TYPE_DRIVER_MODE_2_RESET     0x02 //and 2 for mode-2
+#define PPSMC_RESET_TYPE_PCIE_LINK_RESET         0x03
+#define PPSMC_RESET_TYPE_BIF_LINK_RESET          0x04
+#define PPSMC_RESET_TYPE_PF0_FLR_RESET           0x05
+
 
 typedef enum {
   GFXOFF_ERROR_NO_ERROR,
index 2b4308c025dde9a96d5cd30ad961f33281844cc2..c02ffbd1df763780e11e8397aa33f6b8076738ae 100644 (file)
@@ -1044,6 +1044,10 @@ struct pptable_funcs {
         * @mode1_reset_is_support: Check if GPU supports mode1 reset.
         */
        bool (*mode1_reset_is_support)(struct smu_context *smu);
+       /**
+        * @mode2_reset_is_support: Check if GPU supports mode2 reset.
+        */
+       bool (*mode2_reset_is_support)(struct smu_context *smu);
 
        /**
         * @mode1_reset: Perform mode1 reset.
@@ -1279,6 +1283,7 @@ int smu_baco_set_state(void *handle, int state);
 
 
 bool smu_mode1_reset_is_support(struct smu_context *smu);
+bool smu_mode2_reset_is_support(struct smu_context *smu);
 int smu_mode1_reset(struct smu_context *smu);
 int smu_mode2_reset(void *handle);
 
index e9a0bda98fd75d43c0addf753916bffab736b6ba..207d5d923c9efb81fd792d96156d584ca4c25824 100644 (file)
        __SMU_DUMMY_MAP(GET_UMC_FW_WA), \
        __SMU_DUMMY_MAP(Mode1Reset), \
        __SMU_DUMMY_MAP(RlcPowerNotify),                 \
+       __SMU_DUMMY_MAP(GfxDriverReset), \
        __SMU_DUMMY_MAP(SetHardMinIspiclkByFreq),        \
        __SMU_DUMMY_MAP(SetHardMinIspxclkByFreq),        \
        __SMU_DUMMY_MAP(SetSoftMinSocclkByFreq),         \
index 699b656bbd71a8bcf628df8d82320c29629468f7..ef9dad9a51ffae0651aec259a43a12a79845afd8 100644 (file)
@@ -220,6 +220,7 @@ int smu_v13_0_baco_enter(struct smu_context *smu);
 int smu_v13_0_baco_exit(struct smu_context *smu);
 
 int smu_v13_0_mode1_reset(struct smu_context *smu);
+int smu_v13_0_mode2_reset(struct smu_context *smu);
 
 int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type,
                                    uint32_t *min, uint32_t *max);
index ffddee897d055df314b5847d54dd4b2d7a60ad79..4ffe74cdd28789f24d5de32a3d341861ff18305e 100644 (file)
@@ -1917,6 +1917,9 @@ int smu_set_mp1_state(void *handle,
                msg = SMU_MSG_PrepareMp1ForUnload;
                break;
        case PP_MP1_STATE_RESET:
+       /* TODO: SMU_MSG_PrepareMp1ForReset is retired on Aldebaran;
+       * add handling here for Aldebaran.
+       */
                msg = SMU_MSG_PrepareMp1ForReset;
                break;
        case PP_MP1_STATE_NONE:
@@ -2788,6 +2791,23 @@ bool smu_mode1_reset_is_support(struct smu_context *smu)
        return ret;
 }
 
+/**
+ * smu_mode2_reset_is_support - query whether mode2 reset is available
+ * @smu: smu context
+ *
+ * Returns false when smu->pm_enabled is not set or when the ASIC provides no
+ * mode2_reset_is_support callback; otherwise returns the callback's answer,
+ * which is obtained while holding smu->mutex.
+ */
+bool smu_mode2_reset_is_support(struct smu_context *smu)
+{
+       bool supported = false;
+
+       if (smu->pm_enabled) {
+               mutex_lock(&smu->mutex);
+               if (smu->ppt_funcs && smu->ppt_funcs->mode2_reset_is_support)
+                       supported = smu->ppt_funcs->mode2_reset_is_support(smu);
+               mutex_unlock(&smu->mutex);
+       }
+
+       return supported;
+}
+
 int smu_mode1_reset(struct smu_context *smu)
 {
        int ret = 0;
index bdefb9078847d7f2edeafb908ce5c6b3882bf93a..475bd5aff6c9b3bfb7b26fcb7882a732c7b548ea 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/pci.h>
 #include "amdgpu_ras.h"
 #include "smu_cmn.h"
+#include "mp/mp_13_0_2_offset.h"
 
 /*
  * DO NOT use these for err/warn/info/debug messages.
@@ -108,7 +109,7 @@ static const struct cmn2asic_msg_mapping aldebaran_message_map[SMU_MSG_MAX_COUNT
        MSG_MAP(GetPptLimit,                         PPSMC_MSG_GetPptLimit,                     1),
        MSG_MAP(PrepareMp1ForUnload,                 PPSMC_MSG_PrepareMp1ForUnload,             0),
        MSG_MAP(PrepareMp1ForReset,                  PPSMC_MSG_PrepareMp1ForReset,              0),
-       MSG_MAP(Mode1Reset,                          PPSMC_MSG_Mode1Reset,                      0),
+       MSG_MAP(GfxDriverReset,                      PPSMC_MSG_GfxDriverReset,                  0),
        MSG_MAP(SoftReset,                           PPSMC_MSG_SoftReset,                       0),
        MSG_MAP(RunDcBtc,                            PPSMC_MSG_RunDcBtc,                        0),
        MSG_MAP(DramLogSetDramAddrHigh,              PPSMC_MSG_DramLogSetDramAddrHigh,          0),
@@ -1250,6 +1251,31 @@ static ssize_t aldebaran_get_gpu_metrics(struct smu_context *smu,
        return sizeof(struct gpu_metrics_v1_0);
 }
 
+/*
+ * Report whether the SMU can be asked for a mode1 reset: the PM firmware
+ * version must be readable and at least 68.07 (0x00440700), and
+ * MP0_SMN_C2PMSG_81 must read non-zero.
+ */
+static bool aldebaran_is_mode1_reset_supported(struct smu_context *smu)
+{
+       struct amdgpu_device *adev = smu->adev;
+       u32 smu_version;
+       uint32_t val;
+
+       /*
+        * If the version query fails, smu_version is not guaranteed to have
+        * been written; treat that as "not supported" instead of comparing
+        * an indeterminate value.
+        */
+       if (smu_cmn_get_smc_version(smu, NULL, &smu_version))
+               return false;
+
+       /* PM FW version support mode1 reset from 68.07 */
+       if (smu_version < 0x00440700)
+               return false;
+
+       /*
+        * mode1 reset relies on PSP, so we should check if
+        * PSP is alive.
+        */
+       val = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81);
+
+       return val != 0x0;
+}
+
+/* Mode2 reset is reported as supported unconditionally; @smu is not examined. */
+static bool aldebaran_is_mode2_reset_supported(struct smu_context *smu)
+{
+       return true;
+}
+
 static const struct pptable_funcs aldebaran_ppt_funcs = {
        /* init dpm */
        .get_allowed_feature_mask = aldebaran_get_allowed_feature_mask,
@@ -1305,6 +1331,10 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
        .get_pp_feature_mask = smu_cmn_get_pp_feature_mask,
        .set_pp_feature_mask = smu_cmn_set_pp_feature_mask,
        .get_gpu_metrics = aldebaran_get_gpu_metrics,
+       .mode1_reset_is_support = aldebaran_is_mode1_reset_supported,
+       .mode2_reset_is_support = aldebaran_is_mode2_reset_supported,
+       .mode1_reset = smu_v13_0_mode1_reset,
+       .mode2_reset = smu_v13_0_mode2_reset,
 };
 
 void aldebaran_set_ppt_funcs(struct smu_context *smu)
index 20fff2fda13f91bbba0d215a95d5a882f045dc67..15033caeacb73922b6015d8e94506b1ec72ee51f 100644 (file)
@@ -1349,15 +1349,39 @@ int smu_v13_0_set_azalia_d3_pme(struct smu_context *smu)
 
 int smu_v13_0_mode1_reset(struct smu_context *smu)
 {
+       u32 smu_version;
        int ret = 0;
+       /*
+       * PM FW support SMU_MSG_GfxDeviceDriverReset from 68.07
+       */
+       smu_cmn_get_smc_version(smu, NULL, &smu_version);
+       if (smu_version < 0x00440700)
+               ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
+       else
+               ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_1, NULL);
 
-       ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
        if (!ret)
                msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
 
        return ret;
 }
 
+/**
+ * smu_v13_0_mode2_reset - ask the SMU to perform a mode2 reset
+ * @smu: smu context
+ *
+ * Sends SMU_MSG_GfxDeviceDriverReset with SMU_RESET_MODE_2 when the PM
+ * firmware version is at least 0x00440700 and, if the message was accepted,
+ * sleeps for SMU13_MODE1_RESET_WAIT_TIME_IN_MS.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the firmware is too old to accept
+ * the message, or the error from the version query or the message send.
+ */
+int smu_v13_0_mode2_reset(struct smu_context *smu)
+{
+       struct amdgpu_device *adev = smu->adev;
+       u32 smu_version;
+       int ret;
+
+       /* smu_version is only meaningful if the query worked */
+       ret = smu_cmn_get_smc_version(smu, NULL, &smu_version);
+       if (ret)
+               return ret;
+
+       if (smu_version < 0x00440700) {
+               dev_err(adev->dev, "smu fw 0x%x does not support MSG_GfxDeviceDriverReset MSG\n", smu_version);
+               /* no reset was issued: neither sleep nor report success */
+               return -EOPNOTSUPP;
+       }
+
+       ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GfxDeviceDriverReset, SMU_RESET_MODE_2, NULL);
+       /*TODO: mode2 reset wait time should be shorter, will modify it later*/
+       if (!ret)
+               msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
+
+       return ret;
+}
+
 int smu_v13_0_get_dpm_ultimate_freq(struct smu_context *smu, enum smu_clk_type clk_type,
                                    uint32_t *min, uint32_t *max)
 {