drm/amdgpu: skip xcp drm device allocation when out of drm resource
authorJames Zhu <James.Zhu@amd.com>
Wed, 9 Aug 2023 20:45:04 +0000 (16:45 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 16 Aug 2023 19:46:39 +0000 (15:46 -0400)
Return 0 when drm device alloc failed with -ENOSPC in
order to  allow amdgpu drive loading. But the xcp without
drm device node assigned won't be visiable in user space.
This helps amdgpu driver loading on system which has more
than 64 nodes, the current limitation.

The proposal to add more drm nodes is discussed in public,
which will support up to 2^20 nodes totally.
kernel drm:
https://lore.kernel.org/lkml/20230724211428.3831636-1-michal.winiarski@intel.com/T/
libdrm:
https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/305

Signed-off-by: James Zhu <James.Zhu@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.c

index 9c9cca1294989c25f7c3d251e34be478e9391bc5..565a1fa436d4b8032e83cf6d7dc3863480a1d56d 100644 (file)
@@ -239,8 +239,13 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev)
 
        for (i = 1; i < MAX_XCP; i++) {
                ret = amdgpu_xcp_drm_dev_alloc(&p_ddev);
-               if (ret)
+               if (ret == -ENOSPC) {
+                       dev_warn(adev->dev,
+                       "Skip xcp node #%d when out of drm node resource.", i);
+                       return 0;
+               } else if (ret) {
                        return ret;
+               }
 
                /* Redirect all IOCTLs to the primary device */
                adev->xcp_mgr->xcp[i].rdev = p_ddev->render->dev;
@@ -328,6 +333,9 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
                return 0;
 
        for (i = 1; i < MAX_XCP; i++) {
+               if (!adev->xcp_mgr->xcp[i].ddev)
+                       break;
+
                ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data);
                if (ret)
                        return ret;
@@ -345,6 +353,9 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
                return;
 
        for (i = 1; i < MAX_XCP; i++) {
+               if (!adev->xcp_mgr->xcp[i].ddev)
+                       break;
+
                p_ddev = adev->xcp_mgr->xcp[i].ddev;
                drm_dev_unplug(p_ddev);
                p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
index 61fc62f3e0034a1b129ddb4ad298cb2667fa556d..4a17bb7c7b27d7a81f6eb744ad49d59838a71f41 100644 (file)
@@ -1965,7 +1965,14 @@ int kfd_topology_add_device(struct kfd_node *gpu)
        const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
 
        gpu_id = kfd_generate_gpu_id(gpu);
-       pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
+       if (gpu->xcp && !gpu->xcp->ddev) {
+               dev_warn(gpu->adev->dev,
+               "Won't add GPU (ID: 0x%x) to topology since it has no drm node assigned.",
+               gpu_id);
+               return 0;
+       } else {
+               pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
+       }
 
        /* Check to see if this gpu device exists in the topology_device_list.
         * If so, assign the gpu to that device,