drm/amdgpu: fix system hang issue during GPU reset
author Dennis Li <Dennis.Li@amd.com>
Wed, 8 Jul 2020 07:07:13 +0000 (15:07 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 27 Jul 2020 20:21:37 +0000 (16:21 -0400)
When the GPU hangs, the driver has multiple paths into
amdgpu_device_gpu_recover; the atomics adev->in_gpu_reset and
hive->in_reset are used to avoid re-entering GPU recovery.

During GPU reset and resume, it is unsafe for other threads to access the
GPU, as this may cause the GPU reset to fail. Therefore a new rw_semaphore,
adev->reset_sem, is introduced, which protects the GPU from being accessed
by external threads during recovery.

v2:
1. add the rwlock to some ioctls, debugfs and the file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in the kfd
driver.
3. remove try_lock and make adev->in_gpu_reset atomic, to avoid
re-entering GPU recovery for the same GPU hang.

v3:
1. change back to using adev->reset_sem to protect the kfd callback
functions, because dqm_lock cannot protect all of the code; for example,
free_mqd must be called outside of dqm_lock:

[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249]  dump_stack+0x98/0xd5
[ 1230.179443]  amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673]  gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882]  amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098]  amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239]  ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394]  ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558]  ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707]  ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832]  ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979]  ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522]  amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833]  free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143]  destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475]  pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819]  kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154]  kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458]  ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656]  ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831]  ksys_ioctl+0x98/0xb0
[ 1230.204004]  __x64_sys_ioctl+0x1a/0x20
[ 1230.205174]  do_syscall_64+0x5f/0x250
[ 1230.206339]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

2. remove try_lock and introduce the atomic hive->in_reset, to avoid
re-entering GPU recovery.

v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove commented-out code in amdgpu_device.c
3. add a more detailed description to the commit message
4. define a wrapper function, amdgpu_in_reset

v5:
1. Fix some style issues.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Suggested-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Suggested-by: Lijo Lazar <Lijo.Lazar@amd.com>
Suggested-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
39 files changed:
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
drivers/gpu/drm/amd/amdgpu/atom.c
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_process.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c

index 327a0da..58e3942 100644 (file)
@@ -961,9 +961,9 @@ struct amdgpu_device {
        bool                            in_suspend;
        bool                            in_hibernate;
 
-       bool                            in_gpu_reset;
+       atomic_t                        in_gpu_reset;
        enum pp_mp1_state               mp1_state;
-       struct mutex  lock_reset;
+       struct rw_semaphore     reset_sem;
        struct amdgpu_doorbell_index doorbell_index;
 
        struct mutex                    notifier_lock;
@@ -1278,4 +1278,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
        return adev->gmc.tmz_enabled;
 }
 
+static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
+{
+       return atomic_read(&adev->in_gpu_reset) ? true : false;
+}
+
 #endif
index 1b865fe..a0ea663 100644 (file)
@@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
        if (cp_mqd_gfx9)
                bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
 
+       if (!down_read_trylock(&adev->reset_sem))
+               return -EIO;
+
        r = amdgpu_bo_create(adev, &bp, &bo);
        if (r) {
                dev_err(adev->dev,
                        "failed to allocate BO for amdkfd (%d)\n", r);
-               return r;
+               goto err;
        }
 
        /* map the buffer */
@@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
 
        amdgpu_bo_unreserve(bo);
 
+       up_read(&adev->reset_sem);
        return 0;
 
 allocate_mem_kmap_bo_failed:
@@ -291,19 +295,25 @@ allocate_mem_pin_bo_failed:
        amdgpu_bo_unreserve(bo);
 allocate_mem_reserve_bo_failed:
        amdgpu_bo_unref(&bo);
-
+err:
+       up_read(&adev->reset_sem);
        return r;
 }
 
 void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
        struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
 
+       down_read(&adev->reset_sem);
+
        amdgpu_bo_reserve(bo, true);
        amdgpu_bo_kunmap(bo);
        amdgpu_bo_unpin(bo);
        amdgpu_bo_unreserve(bo);
        amdgpu_bo_unref(&(bo));
+
+       up_read(&adev->reset_sem);
 }
 
 int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@@ -335,9 +345,14 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
 
 void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
        struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;
 
+       down_read(&adev->reset_sem);
+
        amdgpu_bo_unref(&bo);
+
+       up_read(&adev->reset_sem);
 }
 
 uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@@ -611,12 +626,19 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
        /* This works for NO_HWS. TODO: need to handle without knowing VMID */
        job->vmid = vmid;
 
+       if (!down_read_trylock(&adev->reset_sem)) {
+               ret = -EIO;
+               goto err_ib_sched;
+       }
+
        ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
        if (ret) {
                DRM_ERROR("amdgpu: failed to schedule IB.\n");
                goto err_ib_sched;
        }
 
+       up_read(&adev->reset_sem);
+
        ret = dma_fence_wait(f, false);
 
 err_ib_sched:
@@ -647,6 +669,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
 
+       if (!down_read_trylock(&adev->reset_sem))
+               return -EIO;
+
        if (adev->family == AMDGPU_FAMILY_AI) {
                int i;
 
@@ -656,6 +681,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
                amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
        }
 
+       up_read(&adev->reset_sem);
+
        return 0;
 }
 
@@ -664,11 +691,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
        struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
        const uint32_t flush_type = 0;
        bool all_hub = false;
+       int ret = -EIO;
 
        if (adev->family == AMDGPU_FAMILY_AI)
                all_hub = true;
 
-       return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+       if (down_read_trylock(&adev->reset_sem)) {
+               ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
+                                       pasid, flush_type, all_hub);
+               up_read(&adev->reset_sem);
+       }
+
+       return ret;
 }
 
 bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
index bf927f4..b0dcc80 100644 (file)
@@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
        uint32_t temp;
        struct v10_compute_mqd *m = get_mqd(mqd);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
 #if 0
index 744366c..275f203 100644 (file)
@@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
        unsigned long flags, end_jiffies;
        int retry;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
        acquire_queue(kgd, pipe_id, queue_id);
index feab4cc..4997189 100644 (file)
@@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
        int retry;
        struct vi_mqd *m = get_mqd(mqd);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
        acquire_queue(kgd, pipe_id, queue_id);
index c7fd0c4..7372818 100644 (file)
@@ -539,7 +539,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
        uint32_t temp;
        struct v9_mqd *m = get_mqd(mqd);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
        acquire_queue(kgd, pipe_id, queue_id);
index e5a5ba8..a21cf84 100644 (file)
@@ -1190,6 +1190,9 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
                return -EINVAL;
        }
 
+       if (!down_read_trylock(&adev->reset_sem))
+               return -EIO;
+
        *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
        if (!*mem) {
                ret = -ENOMEM;
@@ -1256,6 +1259,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
        if (offset)
                *offset = amdgpu_bo_mmap_offset(bo);
 
+       up_read(&adev->reset_sem);
        return 0;
 
 allocate_init_user_pages_failed:
@@ -1273,6 +1277,9 @@ err:
                sg_free_table(sg);
                kfree(sg);
        }
+
+       up_read(&adev->reset_sem);
+
        return ret;
 }
 
index a512ccb..a3b1503 100644 (file)
@@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
        parser.adev = adev;
        parser.filp = filp;
 
+       down_read(&adev->reset_sem);
+
        r = amdgpu_cs_parser_init(&parser, data);
        if (r) {
                DRM_ERROR("Failed to initialize parser %d!\n", r);
@@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 out:
        amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
 
+       up_read(&adev->reset_sem);
+
        return r;
 }
 
index 8842c55..d85d13f 100644 (file)
@@ -358,6 +358,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
        if (atomic_read(&ctx->guilty))
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
+       down_read(&adev->reset_sem);
+
        /*query ue count*/
        ras_counter = amdgpu_ras_query_error_count(adev, false);
        /*ras counter is monotonic increasing*/
@@ -373,6 +375,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
                ctx->ras_counter_ce = ras_counter;
        }
 
+       up_read(&adev->reset_sem);
+
        mutex_unlock(&mgr->lock);
        return 0;
 }
index 193ffdb..3a4b31b 100644 (file)
@@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
 
        file->private_data = adev;
 
-       mutex_lock(&adev->lock_reset);
+       down_read(&adev->reset_sem);
        if (adev->autodump.dumping.done) {
                reinit_completion(&adev->autodump.dumping);
                ret = 0;
        } else {
                ret = -EBUSY;
        }
-       mutex_unlock(&adev->lock_reset);
+       up_read(&adev->reset_sem);
 
        return ret;
 }
@@ -126,7 +126,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_
 
        poll_wait(file, &adev->autodump.gpu_hang, poll_table);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return POLLIN | POLLRDNORM | POLLWRNORM;
 
        return 0;
@@ -1241,7 +1241,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
        }
 
        /* Avoid accidently unparking the sched thread during GPU reset */
-       mutex_lock(&adev->lock_reset);
+       down_read(&adev->reset_sem);
 
        /* hold on the scheduler */
        for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1268,7 +1268,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
                kthread_unpark(ring->sched.thread);
        }
 
-       mutex_unlock(&adev->lock_reset);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(dev->dev);
        pm_runtime_put_autosuspend(dev->dev);
@@ -1458,7 +1458,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
                return -ENOMEM;
 
        /* Avoid accidently unparking the sched thread during GPU reset */
-       mutex_lock(&adev->lock_reset);
+       down_read(&adev->reset_sem);
 
        /* stop the scheduler */
        kthread_park(ring->sched.thread);
@@ -1499,7 +1499,7 @@ failure:
        /* restart the scheduler */
        kthread_unpark(ring->sched.thread);
 
-       mutex_unlock(&adev->lock_reset);
+       up_read(&adev->reset_sem);
 
        ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
 
index aa5b54e..62ecac9 100644 (file)
@@ -1935,7 +1935,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
                        if (adev->ip_blocks[i].status.hw == true)
                                break;
 
-                       if (adev->in_gpu_reset || adev->in_suspend) {
+                       if (amdgpu_in_reset(adev) || adev->in_suspend) {
                                r = adev->ip_blocks[i].version->funcs->resume(adev);
                                if (r) {
                                        DRM_ERROR("resume of IP block <%s> failed %d\n",
@@ -2106,7 +2106,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
                        AMDGPU_RESET_MAGIC_NUM))
                return true;
 
-       if (!adev->in_gpu_reset)
+       if (!amdgpu_in_reset(adev))
                return false;
 
        /*
@@ -3036,7 +3036,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        mutex_init(&adev->mn_lock);
        mutex_init(&adev->virt.vf_errors.lock);
        hash_init(adev->mn_hash);
-       mutex_init(&adev->lock_reset);
+       init_rwsem(&adev->reset_sem);
+       atomic_set(&adev->in_gpu_reset, 0);
        mutex_init(&adev->psp.mutex);
        mutex_init(&adev->notifier_lock);
 
@@ -4064,8 +4065,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                if (need_full_reset) {
                        /* post card */
-                       if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
-                               DRM_WARN("asic atom init failed!");
+                       if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) {
+                               dev_warn(tmp_adev->dev, "asic atom init failed!");
+                               r = -EAGAIN;
+                               goto out;
+                       }
 
                        if (!r) {
                                dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
@@ -4141,16 +4145,14 @@ end:
        return r;
 }
 
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
+static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
 {
-       if (trylock) {
-               if (!mutex_trylock(&adev->lock_reset))
-                       return false;
-       } else
-               mutex_lock(&adev->lock_reset);
+       if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
+               return false;
+
+       down_write(&adev->reset_sem);
 
        atomic_inc(&adev->gpu_reset_counter);
-       adev->in_gpu_reset = true;
        switch (amdgpu_asic_reset_method(adev)) {
        case AMD_RESET_METHOD_MODE1:
                adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4170,8 +4172,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 {
        amdgpu_vf_error_trans_all(adev);
        adev->mp1_state = PP_MP1_STATE_NONE;
-       adev->in_gpu_reset = false;
-       mutex_unlock(&adev->lock_reset);
+       atomic_set(&adev->in_gpu_reset, 0);
+       up_write(&adev->reset_sem);
 }
 
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -4281,12 +4283,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         * We always reset all schedulers for device and all devices for XGMI
         * hive so that should take care of them too.
         */
-       hive = amdgpu_get_xgmi_hive(adev, true);
-       if (hive && !mutex_trylock(&hive->reset_lock)) {
-               DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
-                         job ? job->base.id : -1, hive->hive_id);
-               mutex_unlock(&hive->hive_lock);
-               return 0;
+       hive = amdgpu_get_xgmi_hive(adev, false);
+       if (hive) {
+               if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
+                       DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
+                               job ? job->base.id : -1, hive->hive_id);
+                       return 0;
+               }
+               mutex_lock(&hive->hive_lock);
        }
 
        /*
@@ -4308,11 +4312,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
+               if (!amdgpu_device_lock_adev(tmp_adev)) {
                        DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
                                  job ? job->base.id : -1);
-                       mutex_unlock(&hive->hive_lock);
-                       return 0;
+                       r = 0;
+                       goto skip_recovery;
                }
 
                /*
@@ -4445,8 +4449,9 @@ skip_sched_resume:
                amdgpu_device_unlock_adev(tmp_adev);
        }
 
+skip_recovery:
        if (hive) {
-               mutex_unlock(&hive->reset_lock);
+               atomic_set(&hive->in_reset, 0);
                mutex_unlock(&hive->hive_lock);
        }
 
index 7f9e502..73cc68a 100644 (file)
@@ -671,6 +671,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                bo_va = NULL;
        }
 
+       down_read(&adev->reset_sem);
+
        switch (args->operation) {
        case AMDGPU_VA_OP_MAP:
                va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@@ -700,6 +702,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
                                        args->operation);
 
+       up_read(&adev->reset_sem);
+
 error_backoff:
        ttm_eu_backoff_reservation(&ticket, &list);
 
index 78d37f9..8eff017 100644 (file)
@@ -724,7 +724,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
         *
         * also don't wait anymore for IRQ context
         * */
-       if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
+       if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
                goto failed_kiq_read;
 
        might_sleep();
@@ -782,7 +782,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
         *
         * also don't wait anymore for IRQ context
         * */
-       if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
+       if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
                goto failed_kiq_write;
 
        might_sleep();
@@ -801,5 +801,5 @@ failed_undo:
        amdgpu_ring_undo(ring);
        spin_unlock_irqrestore(&kiq->ring_lock, flags);
 failed_kiq_write:
-       pr_err("failed to write reg:%x\n", reg);
+       dev_warn(adev->dev, "failed to write reg:%x\n", reg);
 }
index 937029a..75d37df 100644 (file)
@@ -220,17 +220,17 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 
        trace_amdgpu_sched_run_job(job);
 
-       if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter))
-               dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
-
-       if (finished->error < 0) {
-               DRM_INFO("Skip scheduling IBs!\n");
-       } else {
+       if (down_read_trylock(&ring->adev->reset_sem)) {
                r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
-                                      &fence);
+                                       &fence);
+               up_read(&ring->adev->reset_sem);
                if (r)
                        DRM_ERROR("Error scheduling IBs (%d)\n", r);
+       } else {
+               dma_fence_set_error(finished, -ECANCELED);
+               DRM_INFO("Skip scheduling IBs!\n");
        }
+
        /* if gpu reset, hw fence will be replaced here */
        dma_fence_put(job->fence);
        job->fence = dma_fence_get(fence);
index a8c47ae..b9c1fce 100644 (file)
@@ -1084,6 +1084,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
        if (!fpriv)
                return;
 
+       down_read(&adev->reset_sem);
+
        pm_runtime_get_sync(dev->dev);
 
        if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL)
@@ -1122,6 +1124,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
 
        pm_runtime_mark_last_busy(dev->dev);
        pm_runtime_put_autosuspend(dev->dev);
+
+       up_read(&adev->reset_sem);
 }
 
 /*
index 0a05db9..2f7b055 100644 (file)
@@ -163,7 +163,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
        enum amd_pm_state_type pm;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -172,6 +172,8 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                if (adev->smu.ppt_funcs->get_current_power_state)
                        pm = smu_get_current_power_state(&adev->smu);
@@ -183,6 +185,8 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
                pm = adev->pm.dpm.user_state;
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -201,7 +205,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
        enum amd_pm_state_type  state;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (strncmp("battery", buf, strlen("battery")) == 0)
@@ -219,6 +223,8 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                mutex_lock(&adev->pm.mutex);
                adev->pm.dpm.user_state = state;
@@ -232,6 +238,9 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
 
                amdgpu_pm_compute_clocks(adev);
        }
+
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -307,7 +316,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
        enum amd_dpm_forced_level level = 0xff;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -316,6 +325,8 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                level = smu_get_performance_level(&adev->smu);
        else if (adev->powerplay.pp_funcs->get_performance_level)
@@ -323,6 +334,8 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
        else
                level = adev->pm.dpm.forced_level;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -349,7 +362,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
        enum amd_dpm_forced_level current_level = 0xff;
        int ret = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (strncmp("low", buf, strlen("low")) == 0) {
@@ -380,6 +393,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                current_level = smu_get_performance_level(&adev->smu);
        else if (adev->powerplay.pp_funcs->get_performance_level)
@@ -388,7 +403,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
        if (current_level == level) {
                pm_runtime_mark_last_busy(ddev->dev);
                pm_runtime_put_autosuspend(ddev->dev);
-               return count;
+               ret = count;
+               goto pro_end;
        }
 
        if (adev->asic_type == CHIP_RAVEN) {
@@ -409,7 +425,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
                pr_err("Currently not in any profile mode!\n");
                pm_runtime_mark_last_busy(ddev->dev);
                pm_runtime_put_autosuspend(ddev->dev);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto pro_end;
        }
 
        if (is_support_sw_smu(adev)) {
@@ -417,7 +434,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
                if (ret) {
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto pro_end;
                }
        } else if (adev->powerplay.pp_funcs->force_performance_level) {
                mutex_lock(&adev->pm.mutex);
@@ -425,14 +443,16 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
                        mutex_unlock(&adev->pm.mutex);
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto pro_end;
                }
                ret = amdgpu_dpm_force_performance_level(adev, level);
                if (ret) {
                        mutex_unlock(&adev->pm.mutex);
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto pro_end;
                } else {
                        adev->pm.dpm.forced_level = level;
                }
@@ -441,7 +461,9 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
-       return count;
+pro_end:
+       up_read(&adev->reset_sem);
+       return ret;
 }
 
 static ssize_t amdgpu_get_pp_num_states(struct device *dev,
@@ -453,7 +475,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev,
        struct pp_states_info data;
        int i, buf_len, ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -497,7 +519,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev,
        enum amd_pm_state_type pm = 0;
        int i = 0, ret = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -538,7 +560,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev,
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = ddev->dev_private;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (adev->pp_force_state_enabled)
@@ -558,7 +580,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
        unsigned long idx;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (strlen(buf) == 1)
@@ -584,6 +606,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
                        return ret;
                }
 
+               down_read(&adev->reset_sem);
                /* only set user selected power states */
                if (state != POWER_STATE_TYPE_INTERNAL_BOOT &&
                    state != POWER_STATE_TYPE_DEFAULT) {
@@ -591,6 +614,8 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
                                        AMD_PP_TASK_ENABLE_USER_STATE, &state);
                        adev->pp_force_state_enabled = true;
                }
+               up_read(&adev->reset_sem);
+
                pm_runtime_mark_last_busy(ddev->dev);
                pm_runtime_put_autosuspend(ddev->dev);
        }
@@ -618,7 +643,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
        char *table = NULL;
        int size, ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -662,7 +687,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
        struct amdgpu_device *adev = ddev->dev_private;
        int ret = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -671,16 +696,21 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                ret = smu_sys_set_pp_table(&adev->smu, (void *)buf, count);
                if (ret) {
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
+                       up_read(&adev->reset_sem);
                        return ret;
                }
        } else if (adev->powerplay.pp_funcs->set_pp_table)
                amdgpu_dpm_set_pp_table(adev, buf, count);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -815,7 +845,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
        const char delimiter[3] = {' ', '\n', '\0'};
        uint32_t type;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (count > 127)
@@ -858,6 +888,10 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
                return ret;
        }
 
+       ret = count;
+
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                ret = smu_od_edit_dpm_table(&adev->smu, type,
                                            parameter, parameter_size);
@@ -865,7 +899,8 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
                if (ret) {
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto pro_end;
                }
        } else {
                if (adev->powerplay.pp_funcs->odn_edit_dpm_table) {
@@ -874,7 +909,8 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
                        if (ret) {
                                pm_runtime_mark_last_busy(ddev->dev);
                                pm_runtime_put_autosuspend(ddev->dev);
-                               return -EINVAL;
+                               ret = -EINVAL;
+                               goto pro_end;
                        }
                }
 
@@ -885,18 +921,22 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
                                                NULL);
                                pm_runtime_mark_last_busy(ddev->dev);
                                pm_runtime_put_autosuspend(ddev->dev);
-                               return count;
+                               ret = count;
+                               goto pro_end;
                        } else {
                                pm_runtime_mark_last_busy(ddev->dev);
                                pm_runtime_put_autosuspend(ddev->dev);
-                               return -EINVAL;
+                               ret = -EINVAL;
+                               goto pro_end;
                        }
                }
        }
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
-       return count;
+pro_end:
+       up_read(&adev->reset_sem);
+       return ret;
 }
 
 static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
@@ -908,7 +948,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -962,7 +1002,7 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
        uint64_t featuremask;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = kstrtou64(buf, 0, &featuremask);
@@ -977,11 +1017,13 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
        if (is_support_sw_smu(adev)) {
                ret = smu_sys_set_pp_feature_mask(&adev->smu, featuremask);
                if (ret) {
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
+                       up_read(&adev->reset_sem);
                        return -EINVAL;
                }
        } else if (adev->powerplay.pp_funcs->set_ppfeature_status) {
@@ -989,9 +1031,12 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
                if (ret) {
                        pm_runtime_mark_last_busy(ddev->dev);
                        pm_runtime_put_autosuspend(ddev->dev);
+                       up_read(&adev->reset_sem);
                        return -EINVAL;
                }
        }
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1007,7 +1052,7 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1016,6 +1061,8 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_sys_get_pp_feature_mask(&adev->smu, buf);
        else if (adev->powerplay.pp_funcs->get_ppfeature_status)
@@ -1023,6 +1070,8 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1068,7 +1117,7 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1077,6 +1126,8 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_print_clk_levels(&adev->smu, SMU_SCLK, buf);
        else if (adev->powerplay.pp_funcs->print_clock_levels)
@@ -1084,6 +1135,8 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1135,7 +1188,7 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev,
        int ret;
        uint32_t mask = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = amdgpu_read_mask(buf, count, &mask);
@@ -1148,11 +1201,15 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_force_clk_levels(&adev->smu, SMU_SCLK, mask);
        else if (adev->powerplay.pp_funcs->force_clock_level)
                ret = amdgpu_dpm_force_clock_level(adev, PP_SCLK, mask);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1171,7 +1228,7 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1180,6 +1237,8 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_print_clk_levels(&adev->smu, SMU_MCLK, buf);
        else if (adev->powerplay.pp_funcs->print_clock_levels)
@@ -1187,6 +1246,8 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1203,7 +1264,7 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev,
        uint32_t mask = 0;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = amdgpu_read_mask(buf, count, &mask);
@@ -1216,11 +1277,15 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_force_clk_levels(&adev->smu, SMU_MCLK, mask);
        else if (adev->powerplay.pp_funcs->force_clock_level)
                ret = amdgpu_dpm_force_clock_level(adev, PP_MCLK, mask);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1239,7 +1304,7 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1248,6 +1313,8 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_print_clk_levels(&adev->smu, SMU_SOCCLK, buf);
        else if (adev->powerplay.pp_funcs->print_clock_levels)
@@ -1255,6 +1322,8 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1271,7 +1340,7 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev,
        int ret;
        uint32_t mask = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = amdgpu_read_mask(buf, count, &mask);
@@ -1284,6 +1353,8 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_force_clk_levels(&adev->smu, SMU_SOCCLK, mask);
        else if (adev->powerplay.pp_funcs->force_clock_level)
@@ -1291,6 +1362,8 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev,
        else
                ret = 0;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1309,7 +1382,7 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1318,6 +1391,8 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_print_clk_levels(&adev->smu, SMU_FCLK, buf);
        else if (adev->powerplay.pp_funcs->print_clock_levels)
@@ -1325,6 +1400,8 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1341,7 +1418,7 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev,
        int ret;
        uint32_t mask = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = amdgpu_read_mask(buf, count, &mask);
@@ -1354,6 +1431,8 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_force_clk_levels(&adev->smu, SMU_FCLK, mask);
        else if (adev->powerplay.pp_funcs->force_clock_level)
@@ -1361,6 +1440,8 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev,
        else
                ret = 0;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1379,7 +1460,7 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1388,6 +1469,8 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_print_clk_levels(&adev->smu, SMU_DCEFCLK, buf);
        else if (adev->powerplay.pp_funcs->print_clock_levels)
@@ -1395,6 +1478,8 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1411,7 +1496,7 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev,
        int ret;
        uint32_t mask = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = amdgpu_read_mask(buf, count, &mask);
@@ -1424,6 +1509,8 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_force_clk_levels(&adev->smu, SMU_DCEFCLK, mask);
        else if (adev->powerplay.pp_funcs->force_clock_level)
@@ -1431,6 +1518,8 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev,
        else
                ret = 0;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1449,7 +1538,7 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1458,6 +1547,8 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_print_clk_levels(&adev->smu, SMU_PCIE, buf);
        else if (adev->powerplay.pp_funcs->print_clock_levels)
@@ -1465,6 +1556,8 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1481,7 +1574,7 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev,
        int ret;
        uint32_t mask = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = amdgpu_read_mask(buf, count, &mask);
@@ -1494,6 +1587,8 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_force_clk_levels(&adev->smu, SMU_PCIE, mask);
        else if (adev->powerplay.pp_funcs->force_clock_level)
@@ -1501,6 +1596,8 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev,
        else
                ret = 0;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1519,7 +1616,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,
        uint32_t value = 0;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1528,11 +1625,15 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                value = smu_get_od_percentage(&(adev->smu), SMU_OD_SCLK);
        else if (adev->powerplay.pp_funcs->get_sclk_od)
                value = amdgpu_dpm_get_sclk_od(adev);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1549,7 +1650,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
        int ret;
        long int value;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = kstrtol(buf, 0, &value);
@@ -1563,6 +1664,8 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                value = smu_set_od_percentage(&(adev->smu), SMU_OD_SCLK, (uint32_t)value);
        } else {
@@ -1577,6 +1680,8 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
                }
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1592,7 +1697,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,
        uint32_t value = 0;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1601,11 +1706,15 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                value = smu_get_od_percentage(&(adev->smu), SMU_OD_MCLK);
        else if (adev->powerplay.pp_funcs->get_mclk_od)
                value = amdgpu_dpm_get_mclk_od(adev);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1622,7 +1731,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
        int ret;
        long int value;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = kstrtol(buf, 0, &value);
@@ -1636,6 +1745,8 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                value = smu_set_od_percentage(&(adev->smu), SMU_OD_MCLK, (uint32_t)value);
        } else {
@@ -1650,6 +1761,8 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
                }
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1685,7 +1798,7 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
        ssize_t size;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(ddev->dev);
@@ -1694,6 +1807,8 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                size = smu_get_power_profile_mode(&adev->smu, buf);
        else if (adev->powerplay.pp_funcs->get_power_profile_mode)
@@ -1701,6 +1816,8 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
        else
                size = snprintf(buf, PAGE_SIZE, "\n");
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1725,7 +1842,7 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,
        long int profile_mode = 0;
        const char delimiter[3] = {' ', '\n', '\0'};
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        tmp[0] = *(buf);
@@ -1758,11 +1875,15 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                ret = smu_set_power_profile_mode(&adev->smu, parameter, parameter_size, true);
        else if (adev->powerplay.pp_funcs->set_power_profile_mode)
                ret = amdgpu_dpm_set_power_profile_mode(adev, parameter, parameter_size);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1788,7 +1909,7 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev,
        struct amdgpu_device *adev = ddev->dev_private;
        int r, value, size = sizeof(value);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(ddev->dev);
@@ -1797,9 +1918,11 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
        /* read the IP busy sensor */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_LOAD,
                                   (void *)&value, &size);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
@@ -1826,7 +1949,7 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev,
        struct amdgpu_device *adev = ddev->dev_private;
        int r, value, size = sizeof(value);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(ddev->dev);
@@ -1835,10 +1958,14 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
+
        /* read the IP busy sensor */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MEM_LOAD,
                                   (void *)&value, &size);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1869,7 +1996,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
        uint64_t count0 = 0, count1 = 0;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (adev->flags & AMD_IS_APU)
@@ -1884,8 +2011,12 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        amdgpu_asic_get_pcie_usage(adev, &count0, &count1);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(ddev->dev);
        pm_runtime_put_autosuspend(ddev->dev);
 
@@ -1910,7 +2041,7 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = ddev->dev_private;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (adev->unique_id)
@@ -2177,7 +2308,7 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
        int channel = to_sensor_dev_attr(attr)->index;
        int r, temp = 0, size = sizeof(temp);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (channel >= PP_TEMP_MAX)
@@ -2189,6 +2320,8 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
+
        switch (channel) {
        case PP_TEMP_JUNCTION:
                /* get current junction temperature */
@@ -2210,6 +2343,8 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
                break;
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2313,7 +2448,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,
        u32 pwm_mode = 0;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(adev->ddev->dev);
@@ -2322,18 +2457,23 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                pwm_mode = smu_get_fan_control_mode(&adev->smu);
        } else {
                if (!adev->powerplay.pp_funcs->get_fan_control_mode) {
                        pm_runtime_mark_last_busy(adev->ddev->dev);
                        pm_runtime_put_autosuspend(adev->ddev->dev);
+                       up_read(&adev->reset_sem);
                        return -EINVAL;
                }
 
                pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2349,7 +2489,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,
        int err, ret;
        int value;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = kstrtoint(buf, 10, &value);
@@ -2362,18 +2502,23 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                smu_set_fan_control_mode(&adev->smu, value);
        } else {
                if (!adev->powerplay.pp_funcs->set_fan_control_mode) {
                        pm_runtime_mark_last_busy(adev->ddev->dev);
                        pm_runtime_put_autosuspend(adev->ddev->dev);
+                       up_read(&adev->reset_sem);
                        return -EINVAL;
                }
 
                amdgpu_dpm_set_fan_control_mode(adev, value);
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2403,7 +2548,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,
        u32 value;
        u32 pwm_mode;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = pm_runtime_get_sync(adev->ddev->dev);
@@ -2412,11 +2557,15 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                pwm_mode = smu_get_fan_control_mode(&adev->smu);
        else
                pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
 
+       up_read(&adev->reset_sem);
+
        if (pwm_mode != AMD_FAN_CTRL_MANUAL) {
                pr_info("manual fan speed control should be enabled first\n");
                pm_runtime_mark_last_busy(adev->ddev->dev);
@@ -2457,7 +2606,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
        int err;
        u32 speed = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = pm_runtime_get_sync(adev->ddev->dev);
@@ -2466,6 +2615,8 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                err = smu_get_fan_speed_percent(&adev->smu, &speed);
        else if (adev->powerplay.pp_funcs->get_fan_speed_percent)
@@ -2473,6 +2624,8 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
        else
                err = -EINVAL;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2492,7 +2645,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
        int err;
        u32 speed = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = pm_runtime_get_sync(adev->ddev->dev);
@@ -2501,6 +2654,8 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                err = smu_get_fan_speed_rpm(&adev->smu, &speed);
        else if (adev->powerplay.pp_funcs->get_fan_speed_rpm)
@@ -2508,6 +2663,8 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
        else
                err = -EINVAL;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2526,7 +2683,7 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev,
        u32 size = sizeof(min_rpm);
        int r;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2535,9 +2692,13 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
+
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MIN_FAN_RPM,
                                   (void *)&min_rpm, &size);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2556,7 +2717,7 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev,
        u32 size = sizeof(max_rpm);
        int r;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2565,9 +2726,13 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
+
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MAX_FAN_RPM,
                                   (void *)&max_rpm, &size);
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2585,7 +2750,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
        int err;
        u32 rpm = 0;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = pm_runtime_get_sync(adev->ddev->dev);
@@ -2594,6 +2759,8 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                err = smu_get_fan_speed_rpm(&adev->smu, &rpm);
        else if (adev->powerplay.pp_funcs->get_fan_speed_rpm)
@@ -2601,6 +2768,8 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
        else
                err = -EINVAL;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2619,7 +2788,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
        u32 value;
        u32 pwm_mode;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = pm_runtime_get_sync(adev->ddev->dev);
@@ -2628,11 +2797,15 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                pwm_mode = smu_get_fan_control_mode(&adev->smu);
        else
                pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
 
+       up_read(&adev->reset_sem);
+
        if (pwm_mode != AMD_FAN_CTRL_MANUAL) {
                pm_runtime_mark_last_busy(adev->ddev->dev);
                pm_runtime_put_autosuspend(adev->ddev->dev);
@@ -2646,6 +2819,8 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                err = smu_set_fan_speed_rpm(&adev->smu, value);
        else if (adev->powerplay.pp_funcs->set_fan_speed_rpm)
@@ -2653,6 +2828,8 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
        else
                err = -EINVAL;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2670,7 +2847,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,
        u32 pwm_mode = 0;
        int ret;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        ret = pm_runtime_get_sync(adev->ddev->dev);
@@ -2679,18 +2856,23 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,
                return ret;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                pwm_mode = smu_get_fan_control_mode(&adev->smu);
        } else {
                if (!adev->powerplay.pp_funcs->get_fan_control_mode) {
                        pm_runtime_mark_last_busy(adev->ddev->dev);
                        pm_runtime_put_autosuspend(adev->ddev->dev);
+                       up_read(&adev->reset_sem);
                        return -EINVAL;
                }
 
                pwm_mode = amdgpu_dpm_get_fan_control_mode(adev);
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2707,7 +2889,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,
        int value;
        u32 pwm_mode;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        err = kstrtoint(buf, 10, &value);
@@ -2727,17 +2909,22 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                smu_set_fan_control_mode(&adev->smu, pwm_mode);
        } else {
                if (!adev->powerplay.pp_funcs->set_fan_control_mode) {
                        pm_runtime_mark_last_busy(adev->ddev->dev);
                        pm_runtime_put_autosuspend(adev->ddev->dev);
+                       up_read(&adev->reset_sem);
                        return -EINVAL;
                }
                amdgpu_dpm_set_fan_control_mode(adev, pwm_mode);
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2752,7 +2939,7 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev,
        u32 vddgfx;
        int r, size = sizeof(vddgfx);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2761,9 +2948,11 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
        /* get the voltage */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDGFX,
                                   (void *)&vddgfx, &size);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
@@ -2789,7 +2978,7 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev,
        u32 vddnb;
        int r, size = sizeof(vddnb);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        /* only APUs have vddnb */
@@ -2802,9 +2991,11 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
        /* get the voltage */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDNB,
                                   (void *)&vddnb, &size);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
@@ -2831,7 +3022,7 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev,
        int r, size = sizeof(u32);
        unsigned uw;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2840,9 +3031,11 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
        /* get the voltage */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_POWER,
                                   (void *)&query, &size);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
@@ -2872,7 +3065,7 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev,
        ssize_t size;
        int r;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2881,6 +3074,8 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                smu_get_power_limit(&adev->smu, &limit, true);
                size = snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000);
@@ -2891,6 +3086,8 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev,
                size = snprintf(buf, PAGE_SIZE, "\n");
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2906,7 +3103,7 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev,
        ssize_t size;
        int r;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2915,6 +3112,8 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev)) {
                smu_get_power_limit(&adev->smu, &limit, false);
                size = snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000);
@@ -2925,6 +3124,8 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev,
                size = snprintf(buf, PAGE_SIZE, "\n");
        }
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2941,7 +3142,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
        int err;
        u32 value;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        if (amdgpu_sriov_vf(adev))
@@ -2960,6 +3161,8 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
                return err;
        }
 
+       down_read(&adev->reset_sem);
+
        if (is_support_sw_smu(adev))
                err = smu_set_power_limit(&adev->smu, value);
        else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_power_limit)
@@ -2967,6 +3170,8 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
        else
                err = -EINVAL;
 
+       up_read(&adev->reset_sem);
+
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
 
@@ -2984,7 +3189,7 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev,
        uint32_t sclk;
        int r, size = sizeof(sclk);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -2993,9 +3198,11 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
        /* get the sclk */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GFX_SCLK,
                                   (void *)&sclk, &size);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
@@ -3021,7 +3228,7 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev,
        uint32_t mclk;
        int r, size = sizeof(mclk);
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(adev->ddev->dev);
@@ -3030,9 +3237,11 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev,
                return r;
        }
 
+       down_read(&adev->reset_sem);
        /* get the sclk */
        r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GFX_MCLK,
                                   (void *)&mclk, &size);
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(adev->ddev->dev);
        pm_runtime_put_autosuspend(adev->ddev->dev);
@@ -3913,7 +4122,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
        u32 flags = 0;
        int r;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EPERM;
 
        r = pm_runtime_get_sync(dev->dev);
@@ -3922,7 +4131,10 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
                return r;
        }
 
+       down_read(&adev->reset_sem);
        amdgpu_device_ip_get_clockgating_state(adev, &flags);
+       up_read(&adev->reset_sem);
+
        seq_printf(m, "Clock Gating Flags Mask: 0x%x\n", flags);
        amdgpu_parse_cg_state(m, flags);
        seq_printf(m, "\n");
@@ -3934,6 +4146,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
                return 0;
        }
 
+       down_read(&adev->reset_sem);
        if (!is_support_sw_smu(adev) &&
            adev->powerplay.pp_funcs->debugfs_print_current_performance_level) {
                mutex_lock(&adev->pm.mutex);
@@ -3946,6 +4159,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data)
        } else {
                r = amdgpu_debugfs_pm_info_pp(m, adev);
        }
+       up_read(&adev->reset_sem);
 
        pm_runtime_mark_last_busy(dev->dev);
        pm_runtime_put_autosuspend(dev->dev);
index 8034111..a053b7a 100644 (file)
@@ -1684,7 +1684,7 @@ static int psp_load_smu_fw(struct psp_context *psp)
                return 0;
 
 
-       if (adev->in_gpu_reset && ras && ras->supported) {
+       if (amdgpu_in_reset(adev) && ras && ras->supported) {
                ret = amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_UNLOAD);
                if (ret) {
                        DRM_WARN("Failed to set MP1 state prepare for reload\n");
@@ -1799,7 +1799,7 @@ static int psp_load_fw(struct amdgpu_device *adev)
        int ret;
        struct psp_context *psp = &adev->psp;
 
-       if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) {
+       if (amdgpu_sriov_vf(adev) && amdgpu_in_reset(adev)) {
                psp_ring_stop(psp, PSP_RING_TYPE__KM); /* should not destroy ring, only stop */
                goto skip_memalloc;
        }
index bcce4c0..5680f7e 100644 (file)
@@ -1978,7 +1978,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
                        amdgpu_ras_request_reset_on_boot(adev,
                                        ras_block->block);
                        return 0;
-               } else if (adev->in_suspend || adev->in_gpu_reset) {
+               } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
                        /* in resume phase, if fail to enable ras,
                         * clean up all ras fs nodes, and disable ras */
                        goto cleanup;
@@ -1987,7 +1987,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
        }
 
        /* in resume phase, no need to create ras fs node */
-       if (adev->in_suspend || adev->in_gpu_reset)
+       if (adev->in_suspend || amdgpu_in_reset(adev))
                return 0;
 
        if (ih_info->cb) {
index 5f2f3fa..605d266 100644 (file)
@@ -2088,7 +2088,7 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
        uint64_t size;
        int r;
 
-       if (!adev->mman.initialized || adev->in_gpu_reset ||
+       if (!adev->mman.initialized || amdgpu_in_reset(adev) ||
            adev->mman.buffer_funcs_enabled == enable)
                return;
 
index 183743c..039245c 100644 (file)
@@ -628,7 +628,8 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
        struct amdgpu_firmware_info *ucode = NULL;
 
  /* for baremetal, the ucode is allocated in gtt, so don't need to fill the bo when reset/suspend */
-       if (!amdgpu_sriov_vf(adev) && (adev->in_gpu_reset || adev->in_suspend))
+       if (!amdgpu_sriov_vf(adev) &&
+               (amdgpu_in_reset(adev) || adev->in_suspend))
                return 0;
        /*
         * if SMU loaded firmware, it needn't add SMC, UVD, and VCE
index 1203c20..5cae39d 100644 (file)
@@ -93,7 +93,7 @@ failed_undo:
        amdgpu_ring_undo(ring);
        spin_unlock_irqrestore(&kiq->ring_lock, flags);
 failed_kiq:
-       pr_err("failed to write reg %x wait reg %x\n", reg0, reg1);
+       dev_warn(adev->dev, "failed to write reg %x wait reg %x\n", reg0, reg1);
 }
 
 /**
index f826945..b2046c3 100644 (file)
@@ -325,9 +325,9 @@ static inline bool is_virtual_machine(void)
 #define amdgpu_sriov_is_pp_one_vf(adev) \
        ((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF)
 #define amdgpu_sriov_is_debug(adev) \
-       ((!adev->in_gpu_reset) && adev->virt.tdr_debug)
+       ((!amdgpu_in_reset(adev)) && adev->virt.tdr_debug)
 #define amdgpu_sriov_is_normal(adev) \
-       ((!adev->in_gpu_reset) && (!adev->virt.tdr_debug))
+       ((!amdgpu_in_reset(adev)) && (!adev->virt.tdr_debug))
 
 bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
 void amdgpu_virt_init_setting(struct amdgpu_device *adev);
index e3a3755..4e017f3 100644 (file)
@@ -372,7 +372,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
        tmp->hive_id = adev->gmc.xgmi.hive_id;
        INIT_LIST_HEAD(&tmp->device_list);
        mutex_init(&tmp->hive_lock);
-       mutex_init(&tmp->reset_lock);
+       atomic_set(&tmp->in_reset, 0);
        task_barrier_init(&tmp->tb);
 
        if (lock)
@@ -397,6 +397,7 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
                                                hive->hi_req_gpu : adev;
        bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
        bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
+       bool locked;
 
        /* fw bug so temporarily disable pstate switching */
        return 0;
@@ -404,7 +405,9 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
        if (!hive || adev->asic_type != CHIP_VEGA20)
                return 0;
 
-       mutex_lock(&hive->hive_lock);
+       locked = atomic_read(&hive->in_reset) ? false : true;
+       if (locked)
+               mutex_lock(&hive->hive_lock);
 
        if (is_hi_req)
                hive->hi_req_count++;
@@ -439,7 +442,8 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
                                                        adev : NULL;
        }
 out:
-       mutex_unlock(&hive->hive_lock);
+       if (locked)
+               mutex_unlock(&hive->hive_lock);
        return ret;
 }
 
@@ -594,7 +598,6 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
        if(!(--hive->number_devices)){
                amdgpu_xgmi_sysfs_destroy(adev, hive);
                mutex_destroy(&hive->hive_lock);
-               mutex_destroy(&hive->reset_lock);
        }
 
        return psp_xgmi_terminate(&adev->psp);
index 6999eab..61720cd 100644 (file)
@@ -30,7 +30,8 @@ struct amdgpu_hive_info {
        uint64_t                hive_id;
        struct list_head        device_list;
        int number_devices;
-       struct mutex hive_lock, reset_lock;
+       struct mutex hive_lock;
+       atomic_t in_reset;
        struct kobject *kobj;
        struct device_attribute dev_attr;
        struct amdgpu_device *adev;
index 4cfc786..8341bd9 100644 (file)
@@ -755,6 +755,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
                                /* jiffies wrap around we will just wait a little longer */
                                ctx->last_jump_jiffies = jiffies;
                        }
+                       schedule();
                } else {
                        ctx->last_jump = ctx->start + target;
                        ctx->last_jump_jiffies = jiffies;
index 8344c3b..db9f1e8 100644 (file)
@@ -6180,7 +6180,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
        struct v10_gfx_mqd *mqd = ring->mqd_ptr;
        int mqd_idx = ring - &adev->gfx.gfx_ring[0];
 
-       if (!adev->in_gpu_reset && !adev->in_suspend) {
+       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
                memset((void *)mqd, 0, sizeof(*mqd));
                mutex_lock(&adev->srbm_mutex);
                nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -6192,7 +6192,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
                mutex_unlock(&adev->srbm_mutex);
                if (adev->gfx.me.mqd_backup[mqd_idx])
                        memcpy(adev->gfx.me.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
-       } else if (adev->in_gpu_reset) {
+       } else if (amdgpu_in_reset(adev)) {
                /* reset mqd with the backup copy */
                if (adev->gfx.me.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.me.mqd_backup[mqd_idx], sizeof(*mqd));
@@ -6541,7 +6541,7 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring)
 
        gfx_v10_0_kiq_setting(ring);
 
-       if (adev->in_gpu_reset) { /* for GPU_RESET case */
+       if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
@@ -6577,7 +6577,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
        struct v10_compute_mqd *mqd = ring->mqd_ptr;
        int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-       if (!adev->in_gpu_reset && !adev->in_suspend) {
+       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
                memset((void *)mqd, 0, sizeof(*mqd));
                mutex_lock(&adev->srbm_mutex);
                nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -6587,7 +6587,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
 
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
-       } else if (adev->in_gpu_reset) { /* for GPU_RESET case */
+       } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
index 33f1c4a..8d72089 100644 (file)
@@ -4632,7 +4632,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
 
        gfx_v8_0_kiq_setting(ring);
 
-       if (adev->in_gpu_reset) { /* for GPU_RESET case */
+       if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
@@ -4669,7 +4669,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
        struct vi_mqd *mqd = ring->mqd_ptr;
        int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-       if (!adev->in_gpu_reset && !adev->in_suspend) {
+       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
                memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation));
                ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
                ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@@ -4681,7 +4681,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
 
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation));
-       } else if (adev->in_gpu_reset) { /* for GPU_RESET case */
+       } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation));
index cb9d60a..e4e751f 100644 (file)
@@ -3684,7 +3684,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring)
 
        gfx_v9_0_kiq_setting(ring);
 
-       if (adev->in_gpu_reset) { /* for GPU_RESET case */
+       if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@@ -3722,7 +3722,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
        struct v9_mqd *mqd = ring->mqd_ptr;
        int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-       if (!adev->in_gpu_reset && !adev->in_suspend) {
+       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
                memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation));
                ((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF;
                ((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF;
@@ -3734,7 +3734,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring)
 
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation));
-       } else if (adev->in_gpu_reset) { /* for GPU_RESET case */
+       } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation));
@@ -3928,7 +3928,7 @@ static int gfx_v9_0_hw_fini(void *handle)
        /* Use deinitialize sequence from CAIL when unbinding device from driver,
         * otherwise KIQ is hanging when binding back
         */
-       if (!adev->in_gpu_reset && !adev->in_suspend) {
+       if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
                mutex_lock(&adev->srbm_mutex);
                soc15_grbm_select(adev, adev->gfx.kiq.ring.me,
                                adev->gfx.kiq.ring.pipe,
@@ -4086,7 +4086,7 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
         *
         * also don't wait anymore for IRQ context
         * */
-       if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
+       if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt()))
                goto failed_kiq_read;
 
        might_sleep();
index 19051ce..1a78073 100644 (file)
@@ -268,7 +268,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
         */
        if (adev->gfx.kiq.ring.sched.ready &&
            (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-           !adev->in_gpu_reset) {
+           !amdgpu_in_reset(adev)) {
 
                struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
                const unsigned eng = 17;
@@ -293,7 +293,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 
        if (!adev->mman.buffer_funcs_enabled ||
            !adev->ib_pool_ready ||
-           adev->in_gpu_reset ||
+           amdgpu_in_reset(adev) ||
            ring->sched.ready == false) {
                gmc_v10_0_flush_vm_hub(adev, vmid, AMDGPU_GFXHUB_0, 0);
                mutex_unlock(&adev->mman.gtt_window_lock);
index e18296d..0f8e8af 100644 (file)
@@ -434,7 +434,7 @@ static int gmc_v7_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
        int vmid;
        unsigned int tmp;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
        for (vmid = 1; vmid < 16; vmid++) {
index a9e722b..abe6401 100644 (file)
@@ -635,7 +635,7 @@ static int gmc_v8_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
        int vmid;
        unsigned int tmp;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
        for (vmid = 1; vmid < 16; vmid++) {
index 6e4f3ff..c5f94ba 100644 (file)
@@ -501,7 +501,7 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
         */
        if (adev->gfx.kiq.ring.sched.ready &&
                        (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-                       !adev->in_gpu_reset) {
+                       !amdgpu_in_reset(adev)) {
                uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
                uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
@@ -596,7 +596,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
        struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
        struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 
-       if (adev->in_gpu_reset)
+       if (amdgpu_in_reset(adev))
                return -EIO;
 
        if (ring->sched.ready) {
@@ -633,7 +633,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
                spin_unlock(&adev->gfx.kiq.ring_lock);
                r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
                if (r < 1) {
-                       DRM_ERROR("wait for kiq fence error: %ld.\n", r);
+                       dev_info(adev->dev,
+                               "wait for kiq fence error: %ld\n", r);
                        return -ETIME;
                }
 
index 5fd67e1..fe31cbe 100644 (file)
@@ -238,20 +238,16 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
        struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
        struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
        int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-       int locked;
 
        /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
         * otherwise the mailbox msg will be ruined/reseted by
         * the VF FLR.
         *
-        * we can unlock the lock_reset to allow "amdgpu_job_timedout"
+        * we can unlock the reset_sem to allow "amdgpu_job_timedout"
         * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
         * which means host side had finished this VF's FLR.
         */
-       locked = mutex_trylock(&adev->lock_reset);
-       if (locked)
-               adev->in_gpu_reset = true;
-
+       down_read(&adev->reset_sem);
        do {
                if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
                        goto flr_done;
@@ -261,10 +257,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
        } while (timeout > 1);
 
 flr_done:
-       if (locked) {
-               adev->in_gpu_reset = false;
-               mutex_unlock(&adev->lock_reset);
-       }
+       up_read(&adev->reset_sem);
 
        /* Trigger recovery for world switch failure if no TDR */
        if (amdgpu_device_should_recover_gpu(adev)
index ce2bf1f..6f55172 100644 (file)
@@ -259,20 +259,16 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
        struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
        struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
        int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
-       int locked;
 
        /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
         * otherwise the mailbox msg will be ruined/reseted by
         * the VF FLR.
         *
-        * we can unlock the lock_reset to allow "amdgpu_job_timedout"
+        * we can unlock the reset_sem to allow "amdgpu_job_timedout"
         * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
         * which means host side had finished this VF's FLR.
         */
-       locked = mutex_trylock(&adev->lock_reset);
-       if (locked)
-               adev->in_gpu_reset = true;
-
+       down_read(&adev->reset_sem);
        do {
                if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
                        goto flr_done;
@@ -282,10 +278,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
        } while (timeout > 1);
 
 flr_done:
-       if (locked) {
-               adev->in_gpu_reset = false;
-               mutex_unlock(&adev->lock_reset);
-       }
+       up_read(&adev->reset_sem);
 
        /* Trigger recovery for world switch failure if no TDR */
        if (amdgpu_device_should_recover_gpu(adev)
index e0e60b0..7ad1537 100644 (file)
@@ -304,15 +304,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
                                struct qcm_process_device *qpd,
                                struct queue *q)
 {
-       /* On GFX v7, CP doesn't flush TC at dequeue */
-       if (q->device->device_info->asic_family == CHIP_HAWAII)
-               if (flush_texture_cache_nocpsch(q->device, qpd))
-                       pr_err("Failed to flush TC\n");
+       if (!dqm->is_resetting) {
+               /* On GFX v7, CP doesn't flush TC at dequeue */
+               if (q->device->device_info->asic_family == CHIP_HAWAII)
+                       if (flush_texture_cache_nocpsch(q->device, qpd))
+                               pr_err("Failed to flush TC\n");
 
-       kfd_flush_tlb(qpd_to_pdd(qpd));
+               kfd_flush_tlb(qpd_to_pdd(qpd));
 
-       /* Release the vmid mapping */
-       set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+               /* Release the vmid mapping */
+               set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+       }
        dqm->vmid_pasid[qpd->vmid] = 0;
 
        qpd->vmid = 0;
index 40695d5..ee22584 100644 (file)
@@ -1551,6 +1551,10 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
 void kfd_flush_tlb(struct kfd_process_device *pdd)
 {
        struct kfd_dev *dev = pdd->dev;
+       struct device_queue_manager *dqm = dev->dqm;
+
+       if (dqm->is_resetting)
+               return;
 
        if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
                /* Nothing to flush until a VMID is assigned, which
index 03a60c7..cf79857 100644 (file)
@@ -1647,7 +1647,7 @@ static int dm_suspend(void *handle)
        struct amdgpu_display_manager *dm = &adev->dm;
        int ret = 0;
 
-       if (adev->in_gpu_reset) {
+       if (amdgpu_in_reset(adev)) {
                mutex_lock(&dm->dc_lock);
                dm->cached_dc_state = dc_copy_state(dm->dc->current_state);
 
@@ -1833,7 +1833,7 @@ static int dm_resume(void *handle)
        struct dc_state *dc_state;
        int i, r, j;
 
-       if (adev->in_gpu_reset) {
+       if (amdgpu_in_reset(adev)) {
                dc_state = dm->cached_dc_state;
 
                r = dm_dmub_hw_init(adev);
index 727cb9f..0eeccf3 100644 (file)
@@ -992,7 +992,7 @@ static int smu_disable_dpms(struct smu_context *smu)
        struct amdgpu_device *adev = smu->adev;
        int ret = 0;
        bool use_baco = !smu->is_apu &&
-               ((adev->in_gpu_reset &&
+               ((amdgpu_in_reset(adev) &&
                  (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
                 ((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev)));
 
index 3b88396..bacbe2f 100644 (file)
@@ -484,7 +484,7 @@ static int vega20_setup_asic_task(struct pp_hwmgr *hwmgr)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev);
        int ret = 0;
-       bool use_baco = (adev->in_gpu_reset &&
+       bool use_baco = (amdgpu_in_reset(adev) &&
                         (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
                (adev->in_runpm && amdgpu_asic_supports_baco(adev));