drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3)
author Tom St Denis <tom.stdenis@amd.com>
Mon, 7 Jan 2019 22:39:10 +0000 (17:39 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 14 Jan 2019 20:04:53 +0000 (15:04 -0500)
v2: Move locks around in other functions so that this
function can stand on its own.  Also hold only the
hive-specific lock for add/remove device instead of the
driver-global lock, so that devices cannot be added to or
removed from the same hive in parallel.

v3: add reset_lock

Acked-by: Shaoyun.liu <Shaoyun.liu@amd.com>
Signed-off-by: Tom St Denis <tom.stdenis@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

index 39d5d05..1a558dc 100644 (file)
@@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         * by different nodes. No point also since the one node already executing
         * reset will also reset all the other nodes in the hive.
         */
-       hive = amdgpu_get_xgmi_hive(adev);
+       hive = amdgpu_get_xgmi_hive(adev, 0);
        if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
-           !mutex_trylock(&hive->hive_lock))
+           !mutex_trylock(&hive->reset_lock))
                return 0;
 
        /* Start with adev pre asic reset first for soft reset check.*/
@@ -3606,7 +3606,7 @@ retry:    /* Rest of adevs pre asic reset from XGMI hive. */
        }
 
        if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
-               mutex_unlock(&hive->hive_lock);
+               mutex_unlock(&hive->reset_lock);
 
        if (r)
                dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
index ac57a87..dac1874 100644 (file)
@@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
        return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 {
        int i;
        struct amdgpu_hive_info *tmp;
 
        if (!adev->gmc.xgmi.hive_id)
                return NULL;
+
+       mutex_lock(&xgmi_mutex);
+
        for (i = 0 ; i < hive_count; ++i) {
                tmp = &xgmi_hives[i];
-               if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+               if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+                       if (lock)
+                               mutex_lock(&tmp->hive_lock);
+                       mutex_unlock(&xgmi_mutex);
                        return tmp;
+               }
        }
-       if (i >= AMDGPU_MAX_XGMI_HIVE)
+       if (i >= AMDGPU_MAX_XGMI_HIVE) {
+               mutex_unlock(&xgmi_mutex);
                return NULL;
+       }
 
        /* initialize new hive if not exist */
        tmp = &xgmi_hives[hive_count++];
        tmp->hive_id = adev->gmc.xgmi.hive_id;
        INIT_LIST_HEAD(&tmp->device_list);
        mutex_init(&tmp->hive_lock);
+       mutex_init(&tmp->reset_lock);
+       if (lock)
+               mutex_lock(&tmp->hive_lock);
+
+       mutex_unlock(&xgmi_mutex);
 
        return tmp;
 }
@@ -111,8 +125,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
                return ret;
        }
 
-       mutex_lock(&xgmi_mutex);
-       hive = amdgpu_get_xgmi_hive(adev);
+       hive = amdgpu_get_xgmi_hive(adev, 1);
        if (!hive) {
                ret = -EINVAL;
                dev_err(adev->dev,
@@ -147,8 +160,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
                        break;
        }
 
+       mutex_unlock(&hive->hive_lock);
 exit:
-       mutex_unlock(&xgmi_mutex);
        return ret;
 }
 
@@ -159,15 +172,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
        if (!adev->gmc.xgmi.supported)
                return;
 
-       mutex_lock(&xgmi_mutex);
-
-       hive = amdgpu_get_xgmi_hive(adev);
+       hive = amdgpu_get_xgmi_hive(adev, 1);
        if (!hive)
-               goto exit;
+               return;
 
-       if (!(hive->number_devices--))
+       if (!(hive->number_devices--)) {
                mutex_destroy(&hive->hive_lock);
-
-exit:
-       mutex_unlock(&xgmi_mutex);
+               mutex_destroy(&hive->reset_lock);
+       } else {
+               mutex_unlock(&hive->hive_lock);
+       }
 }
index 6151eb9..14bc606 100644 (file)
@@ -29,10 +29,11 @@ struct amdgpu_hive_info {
        struct list_head        device_list;
        struct psp_xgmi_topology_info   topology_info;
        int number_devices;
-       struct mutex hive_lock;
+       struct mutex hive_lock,
+                    reset_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);