Merge tag 'drm-misc-next-2020-02-10' of git://anongit.freedesktop.org/drm/drm-misc...
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e37fa52..53fdafd 100644
@@ -66,6 +66,7 @@
 #include "amdgpu_pmu.h"
 
 #include <linux/suspend.h>
+#include <drm/task_barrier.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -215,8 +216,8 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 {
        uint32_t ret;
 
-       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-               return amdgpu_virt_kiq_rreg(adev, reg);
+       if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+               return amdgpu_kiq_rreg(adev, reg);
 
        if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
                ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -293,8 +294,8 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
                adev->last_mm_index = v;
        }
 
-       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-               return amdgpu_virt_kiq_wreg(adev, reg, v);
+       if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+               return amdgpu_kiq_wreg(adev, reg, v);
 
        if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
                writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
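
The two hunks above route MMIO reads and writes through the KIQ ring when the caller explicitly passes AMDGPU_REGS_KIQ, in addition to the old implicit path for SR-IOV runtime (which AMDGPU_REGS_NO_KIQ still opts out of); the helpers are also renamed from amdgpu_virt_kiq_rreg/wreg to amdgpu_kiq_rreg/wreg since KIQ access is no longer SR-IOV-only. A minimal standalone sketch of the new predicate, with illustrative flag values (the real AMDGPU_REGS_* definitions live in amdgpu.h):

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative bit values only; see AMDGPU_REGS_* in amdgpu.h. */
    #define AMDGPU_REGS_NO_KIQ (1U << 1)
    #define AMDGPU_REGS_KIQ    (1U << 2)

    static bool use_kiq(uint32_t acc_flags, bool sriov_runtime)
    {
            /* An explicit KIQ request always wins... */
            if (acc_flags & AMDGPU_REGS_KIQ)
                    return true;
            /* ...otherwise keep the old implicit SR-IOV routing. */
            return !(acc_flags & AMDGPU_REGS_NO_KIQ) && sriov_runtime;
    }
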
@@ -984,7 +985,7 @@ static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
 {
        struct sysinfo si;
-       bool is_os_64 = (sizeof(void *) == 8) ? true : false;
+       bool is_os_64 = (sizeof(void *) == 8);
        uint64_t total_memory;
        uint64_t dram_size_seven_GB = 0x1B8000000;
        uint64_t dram_size_three_GB = 0xB8000000;
@@ -1031,8 +1032,6 @@ def_value:
  */
 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 {
-       int ret = 0;
-
        if (amdgpu_sched_jobs < 4) {
                dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
                         amdgpu_sched_jobs);
@@ -1072,7 +1071,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 
        adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
 
-       return ret;
+       return 0;
 }
 
 /**
@@ -1810,7 +1809,8 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
                }
        }
 
-       r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
+       if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
+               r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
 
        return r;
 }
@@ -2345,14 +2345,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
                adev->ip_blocks[i].status.hw = false;
                /* handle putting the SMC in the appropriate state */
                if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
-                       if (is_support_sw_smu(adev)) {
-                               r = smu_set_mp1_state(&adev->smu, adev->mp1_state);
-                       } else if (adev->powerplay.pp_funcs &&
-                                          adev->powerplay.pp_funcs->set_mp1_state) {
-                               r = adev->powerplay.pp_funcs->set_mp1_state(
-                                       adev->powerplay.pp_handle,
-                                       adev->mp1_state);
-                       }
+                       r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
                        if (r) {
                                DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
                                          adev->mp1_state, r);
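
The open-coded swSMU vs. powerplay branching for MP1 state collapses into a single amdgpu_dpm_set_mp1_state() call. A sketch of the wrapper's likely body, reconstructed from the branches deleted here (the real definition lives in amdgpu_dpm.c and may differ in detail):

    int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev,
                                 enum pp_mp1_state mp1_state)
    {
            int ret = 0;

            if (is_support_sw_smu(adev)) {
                    ret = smu_set_mp1_state(&adev->smu, mp1_state);
            } else if (adev->powerplay.pp_funcs &&
                       adev->powerplay.pp_funcs->set_mp1_state) {
                    ret = adev->powerplay.pp_funcs->set_mp1_state(
                            adev->powerplay.pp_handle, mp1_state);
            }

            return ret;
    }
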
@@ -2439,7 +2432,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
                AMD_IP_BLOCK_TYPE_GFX,
                AMD_IP_BLOCK_TYPE_SDMA,
                AMD_IP_BLOCK_TYPE_UVD,
-               AMD_IP_BLOCK_TYPE_VCE
+               AMD_IP_BLOCK_TYPE_VCE,
+               AMD_IP_BLOCK_TYPE_VCN
        };
 
        for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
@@ -2454,7 +2448,11 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
                                block->status.hw)
                                continue;
 
-                       r = block->version->funcs->hw_init(adev);
+                       if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
+                               r = block->version->funcs->resume(adev);
+                       else
+                               r = block->version->funcs->hw_init(adev);
+
                        DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
                        if (r)
                                return r;
@@ -2663,14 +2661,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
        struct amdgpu_device *adev =
                container_of(__work, struct amdgpu_device, xgmi_reset_work);
+       struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 
-       if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
-               adev->asic_reset_res = (adev->in_baco == false) ?
-                               amdgpu_device_baco_enter(adev->ddev) :
-                               amdgpu_device_baco_exit(adev->ddev);
-       else
-               adev->asic_reset_res = amdgpu_asic_reset(adev);
+       /* It's a bug to not have a hive within this function */
+       if (WARN_ON(!hive))
+               return;
+
+       /*
+        * Use task barrier to synchronize all xgmi reset works across the
+        * hive. task_barrier_enter and task_barrier_exit will block
+        * until all the threads running the xgmi reset works reach
+        * those points. task_barrier_full will do both blocks.
+        */
+       if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+               task_barrier_enter(&hive->tb);
+               adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+               if (adev->asic_reset_res)
+                       goto fail;
+
+               task_barrier_exit(&hive->tb);
+               adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+               if (adev->asic_reset_res)
+                       goto fail;
+       } else {
 
+               task_barrier_full(&hive->tb);
+               adev->asic_reset_res = amdgpu_asic_reset(adev);
+       }
+
+fail:
        if (adev->asic_reset_res)
                DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
                         adev->asic_reset_res, adev->ddev->unique);
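
task_barrier_enter()/task_barrier_exit() form a two-phase reusable barrier: no node may begin BACO exit until every node in the hive has completed BACO enter, which is exactly the proximity requirement the old per-cpu work scheduling tried to approximate. A runnable userspace model of the same semantics, with pthread_barrier_t standing in for the kernel's task barrier (illustrative only; the kernel primitive lives in include/drm/task_barrier.h):

    /* Build with: cc -pthread barrier_model.c */
    #define _POSIX_C_SOURCE 200809L
    #include <pthread.h>
    #include <stdio.h>

    #define NODES 4

    static pthread_barrier_t tb;    /* reusable: one wait per phase */

    static void *xgmi_reset_model(void *arg)
    {
            long node = (long)arg;

            pthread_barrier_wait(&tb);      /* ~ task_barrier_enter() */
            printf("node %ld: enter BACO\n", node);

            /* Releases only once every node has entered BACO. */
            pthread_barrier_wait(&tb);      /* ~ task_barrier_exit() */
            printf("node %ld: exit BACO\n", node);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[NODES];
            long i;

            pthread_barrier_init(&tb, NULL, NODES);
            for (i = 0; i < NODES; i++)
                    pthread_create(&t[i], NULL, xgmi_reset_model, (void *)i);
            for (i = 0; i < NODES; i++)
                    pthread_join(t[i], NULL);
            pthread_barrier_destroy(&tb);
            return 0;
    }
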
@@ -2785,7 +2807,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        adev->mman.buffer_funcs = NULL;
        adev->mman.buffer_funcs_ring = NULL;
        adev->vm_manager.vm_pte_funcs = NULL;
-       adev->vm_manager.vm_pte_num_rqs = 0;
+       adev->vm_manager.vm_pte_num_scheds = 0;
        adev->gmc.gmc_funcs = NULL;
        adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
        bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
@@ -2826,6 +2848,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        hash_init(adev->mn_hash);
        mutex_init(&adev->lock_reset);
        mutex_init(&adev->psp.mutex);
+       mutex_init(&adev->notifier_lock);
 
        r = amdgpu_device_check_arguments(adev);
        if (r)
@@ -3029,6 +3052,14 @@ fence_driver_init:
                goto failed;
        }
 
+       DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
+                       adev->gfx.config.max_shader_engines,
+                       adev->gfx.config.max_sh_per_se,
+                       adev->gfx.config.max_cu_per_sh,
+                       adev->gfx.cu_info.number);
+
+       amdgpu_ctx_init_sched(adev);
+
        adev->accel_working = true;
 
        amdgpu_vm_check_compute_bug(adev);
@@ -3660,8 +3691,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
        if (r)
                return r;
 
-       amdgpu_amdkfd_pre_reset(adev);
-
        /* Resume IP prior to SMC */
        r = amdgpu_device_ip_reinit_early_sriov(adev);
        if (r)
@@ -3730,6 +3759,11 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
                case CHIP_VEGA10:
                case CHIP_VEGA12:
                case CHIP_RAVEN:
+               case CHIP_ARCTURUS:
+               case CHIP_RENOIR:
+               case CHIP_NAVI10:
+               case CHIP_NAVI14:
+               case CHIP_NAVI12:
                        break;
                default:
                        goto disabled;
@@ -3790,18 +3824,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
        return r;
 }
 
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
-                              struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                               struct list_head *device_list_handle,
                               bool *need_full_reset_arg)
 {
        struct amdgpu_device *tmp_adev = NULL;
        bool need_full_reset = *need_full_reset_arg, vram_lost = false;
        int r = 0;
-       int cpu = smp_processor_id();
-       bool use_baco =
-               (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
-               true : false;
 
        /*
         * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3809,62 +3838,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
         */
        if (need_full_reset) {
                list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-                       /*
-                        * For XGMI run all resets in parallel to speed up the
-                        * process by scheduling the highpri wq on different
-                        * cpus. For XGMI with baco reset, all nodes must enter
-                        * baco within close proximity before anyone exit.
-                        */
+                       /* For XGMI run all resets in parallel to speed up the process */
                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-                               if (!queue_work_on(cpu, system_highpri_wq,
-                                                  &tmp_adev->xgmi_reset_work))
+                               if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
                                        r = -EALREADY;
-                               cpu = cpumask_next(cpu, cpu_online_mask);
                        } else
                                r = amdgpu_asic_reset(tmp_adev);
-                       if (r)
-                               break;
-               }
-
-               /* For XGMI wait for all work to complete before proceed */
-               if (!r) {
-                       list_for_each_entry(tmp_adev, device_list_handle,
-                                           gmc.xgmi.head) {
-                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-                                       flush_work(&tmp_adev->xgmi_reset_work);
-                                       r = tmp_adev->asic_reset_res;
-                                       if (r)
-                                               break;
-                                       if (use_baco)
-                                               tmp_adev->in_baco = true;
-                               }
-                       }
-               }
-
-               /*
-                * For XGMI with baco reset, need exit baco phase by scheduling
-                * xgmi_reset_work one more time. PSP reset and sGPU skips this
-                * phase. Not assume the situation that PSP reset and baco reset
-                * coexist within an XGMI hive.
-                */
 
-               if (!r && use_baco) {
-                       cpu = smp_processor_id();
-                       list_for_each_entry(tmp_adev, device_list_handle,
-                                           gmc.xgmi.head) {
-                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-                                       if (!queue_work_on(cpu,
-                                               system_highpri_wq,
-                                               &tmp_adev->xgmi_reset_work))
-                                               r = -EALREADY;
-                                       if (r)
-                                               break;
-                                       cpu = cpumask_next(cpu, cpu_online_mask);
-                               }
+                       if (r) {
+                               DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+                                        r, tmp_adev->ddev->unique);
+                               break;
                        }
                }
 
-               if (!r && use_baco) {
+               /* For XGMI wait for all resets to complete before proceeding */
+               if (!r) {
                        list_for_each_entry(tmp_adev, device_list_handle,
                                            gmc.xgmi.head) {
                                if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3872,16 +3861,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
                                        r = tmp_adev->asic_reset_res;
                                        if (r)
                                                break;
-                                       tmp_adev->in_baco = false;
                                }
                        }
                }
-
-               if (r) {
-                       DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-                                r, tmp_adev->ddev->unique);
-                       goto end;
-               }
        }
 
        if (!r && amdgpu_ras_intr_triggered())
@@ -3974,7 +3956,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
                mutex_lock(&adev->lock_reset);
 
        atomic_inc(&adev->gpu_reset_counter);
-       adev->in_gpu_reset = 1;
+       adev->in_gpu_reset = true;
        switch (amdgpu_asic_reset_method(adev)) {
        case AMD_RESET_METHOD_MODE1:
                adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -3994,7 +3976,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 {
        amdgpu_vf_error_trans_all(adev);
        adev->mp1_state = PP_MP1_STATE_NONE;
-       adev->in_gpu_reset = 0;
+       adev->in_gpu_reset = false;
        mutex_unlock(&adev->lock_reset);
 }
 
@@ -4175,8 +4157,7 @@ retry:    /* Rest of adevs pre asic reset from XGMI hive. */
                if (r)
                        adev->asic_reset_res = r;
        } else {
-               r  = amdgpu_do_asic_reset(adev, hive, device_list_handle,
-                                         &need_full_reset);
+               r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
                if (r && r == -EAGAIN)
                        goto retry;
        }
@@ -4377,55 +4358,21 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
        if (ras && ras->supported)
                adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
 
-       if (is_support_sw_smu(adev)) {
-               struct smu_context *smu = &adev->smu;
-               int ret;
-
-               ret = smu_baco_enter(smu);
-               if (ret)
-                       return ret;
-       } else {
-               void *pp_handle = adev->powerplay.pp_handle;
-               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
-
-               if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state)
-                       return -ENOENT;
-
-               /* enter BACO state */
-               if (pp_funcs->set_asic_baco_state(pp_handle, 1))
-                       return -EIO;
-       }
-
-       return 0;
+       return amdgpu_dpm_baco_enter(adev);
 }
 
 int amdgpu_device_baco_exit(struct drm_device *dev)
 {
        struct amdgpu_device *adev = dev->dev_private;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       int ret = 0;
 
        if (!amdgpu_device_supports_baco(adev->ddev))
                return -ENOTSUPP;
 
-       if (is_support_sw_smu(adev)) {
-               struct smu_context *smu = &adev->smu;
-               int ret;
-
-               ret = smu_baco_exit(smu);
-               if (ret)
-                       return ret;
-
-       } else {
-               void *pp_handle = adev->powerplay.pp_handle;
-               const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
-
-               if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state)
-                       return -ENOENT;
-
-               /* exit BACO state */
-               if (pp_funcs->set_asic_baco_state(pp_handle, 0))
-                       return -EIO;
-       }
+       ret = amdgpu_dpm_baco_exit(adev);
+       if (ret)
+               return ret;
 
        if (ras && ras->supported)
                adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
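
As with MP1 state above, BACO entry and exit now go through amdgpu_dpm_baco_enter()/amdgpu_dpm_baco_exit(), and the exit path re-enables doorbell interrupts only after the BACO exit actually succeeded. A sketch of the enter side, reconstructed from the branches removed here (the real helpers live in amdgpu_dpm.c; the exit side mirrors this with smu_baco_exit() and set_asic_baco_state(pp_handle, 0)):

    int amdgpu_dpm_baco_enter(struct amdgpu_device *adev)
    {
            const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
            void *pp_handle = adev->powerplay.pp_handle;

            if (is_support_sw_smu(adev))
                    return smu_baco_enter(&adev->smu);

            if (!pp_funcs || !pp_funcs->get_asic_baco_state ||
                !pp_funcs->set_asic_baco_state)
                    return -ENOENT;

            /* enter BACO state */
            if (pp_funcs->set_asic_baco_state(pp_handle, 1))
                    return -EIO;

            return 0;
    }
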