drm/amdkfd: Fix the warning of array-index-out-of-bounds
author Ma Jun <Jun.Ma2@amd.com>
Wed, 2 Nov 2022 07:53:26 +0000 (15:53 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 30 Mar 2023 10:49:31 +0000 (12:49 +0200)
commit c0cc999f3c32e65a7c88fb323893ddf897b24488 upstream.

For some GPUs with more CUs, the original sibling_map[32]
in struct crat_subtype_cache is not large enough
to hold the cache information when creating the VCRAT table,
so skip filling in struct crat_subtype_cache and instead
fill struct kfd_cache_properties directly to fix this problem.

Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: "Limonciello, Mario" <Mario.Limonciello@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/gpu/drm/amd/amdkfd/kfd_crat.c
drivers/gpu/drm/amd/amdkfd/kfd_crat.h
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
drivers/gpu/drm/amd/amdkfd/kfd_topology.h

index 5e0bad7..97700cb 100644 (file)
@@ -50,16 +50,6 @@ static inline unsigned int get_and_inc_gpu_processor_id(
        return current_id;
 }
 
-/* Static table to describe GPU Cache information */
-struct kfd_gpu_cache_info {
-       uint32_t        cache_size;
-       uint32_t        cache_level;
-       uint32_t        flags;
-       /* Indicates how many Compute Units share this cache
-        * within a SA. Value = 1 indicates the cache is not shared
-        */
-       uint32_t        num_cu_shared;
-};
 
 static struct kfd_gpu_cache_info kaveri_cache_info[] = {
        {
@@ -1119,9 +1109,13 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
                        props->cachelines_per_tag = cache->lines_per_tag;
                        props->cache_assoc = cache->associativity;
                        props->cache_latency = cache->cache_latency;
+
                        memcpy(props->sibling_map, cache->sibling_map,
                                        sizeof(props->sibling_map));
 
+                       /* set the sibling_map_size as 32 for CRAT from ACPI */
+                       props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;
+
                        if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
                                props->cache_type |= HSA_CACHE_TYPE_DATA;
                        if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
@@ -1339,125 +1333,6 @@ err:
        return ret;
 }
 
-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
-static int fill_in_l1_pcache(struct crat_subtype_cache *pcache,
-                               struct kfd_gpu_cache_info *pcache_info,
-                               struct kfd_cu_info *cu_info,
-                               int mem_available,
-                               int cu_bitmask,
-                               int cache_type, unsigned int cu_processor_id,
-                               int cu_block)
-{
-       unsigned int cu_sibling_map_mask;
-       int first_active_cu;
-
-       /* First check if enough memory is available */
-       if (sizeof(struct crat_subtype_cache) > mem_available)
-               return -ENOMEM;
-
-       cu_sibling_map_mask = cu_bitmask;
-       cu_sibling_map_mask >>= cu_block;
-       cu_sibling_map_mask &=
-               ((1 << pcache_info[cache_type].num_cu_shared) - 1);
-       first_active_cu = ffs(cu_sibling_map_mask);
-
-       /* CU could be inactive. In case of shared cache find the first active
-        * CU. and incase of non-shared cache check if the CU is inactive. If
-        * inactive active skip it
-        */
-       if (first_active_cu) {
-               memset(pcache, 0, sizeof(struct crat_subtype_cache));
-               pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
-               pcache->length = sizeof(struct crat_subtype_cache);
-               pcache->flags = pcache_info[cache_type].flags;
-               pcache->processor_id_low = cu_processor_id
-                                        + (first_active_cu - 1);
-               pcache->cache_level = pcache_info[cache_type].cache_level;
-               pcache->cache_size = pcache_info[cache_type].cache_size;
-
-               /* Sibling map is w.r.t processor_id_low, so shift out
-                * inactive CU
-                */
-               cu_sibling_map_mask =
-                       cu_sibling_map_mask >> (first_active_cu - 1);
-
-               pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
-               pcache->sibling_map[1] =
-                               (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
-               pcache->sibling_map[2] =
-                               (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
-               pcache->sibling_map[3] =
-                               (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
-               return 0;
-       }
-       return 1;
-}
-
-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
-static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
-                               struct kfd_gpu_cache_info *pcache_info,
-                               struct kfd_cu_info *cu_info,
-                               int mem_available,
-                               int cache_type, unsigned int cu_processor_id)
-{
-       unsigned int cu_sibling_map_mask;
-       int first_active_cu;
-       int i, j, k;
-
-       /* First check if enough memory is available */
-       if (sizeof(struct crat_subtype_cache) > mem_available)
-               return -ENOMEM;
-
-       cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
-       cu_sibling_map_mask &=
-               ((1 << pcache_info[cache_type].num_cu_shared) - 1);
-       first_active_cu = ffs(cu_sibling_map_mask);
-
-       /* CU could be inactive. In case of shared cache find the first active
-        * CU. and incase of non-shared cache check if the CU is inactive. If
-        * inactive active skip it
-        */
-       if (first_active_cu) {
-               memset(pcache, 0, sizeof(struct crat_subtype_cache));
-               pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
-               pcache->length = sizeof(struct crat_subtype_cache);
-               pcache->flags = pcache_info[cache_type].flags;
-               pcache->processor_id_low = cu_processor_id
-                                        + (first_active_cu - 1);
-               pcache->cache_level = pcache_info[cache_type].cache_level;
-               pcache->cache_size = pcache_info[cache_type].cache_size;
-
-               /* Sibling map is w.r.t processor_id_low, so shift out
-                * inactive CU
-                */
-               cu_sibling_map_mask =
-                       cu_sibling_map_mask >> (first_active_cu - 1);
-               k = 0;
-               for (i = 0; i < cu_info->num_shader_engines; i++) {
-                       for (j = 0; j < cu_info->num_shader_arrays_per_engine;
-                               j++) {
-                               pcache->sibling_map[k] =
-                                (uint8_t)(cu_sibling_map_mask & 0xFF);
-                               pcache->sibling_map[k+1] =
-                                (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
-                               pcache->sibling_map[k+2] =
-                                (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
-                               pcache->sibling_map[k+3] =
-                                (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
-                               k += 4;
-                               cu_sibling_map_mask =
-                                       cu_info->cu_bitmap[i % 4][j + i / 4];
-                               cu_sibling_map_mask &= (
-                                (1 << pcache_info[cache_type].num_cu_shared)
-                                - 1);
-                       }
-               }
-               return 0;
-       }
-       return 1;
-}
-
-#define KFD_MAX_CACHE_TYPES 6
 
 static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
                                                   struct kfd_gpu_cache_info *pcache_info)
@@ -1531,231 +1406,133 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
        return i;
 }
 
-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
- * tables
- *
- *     @kdev - [IN] GPU device
- *     @gpu_processor_id - [IN] GPU processor ID to which these caches
- *                         associate
- *     @available_size - [IN] Amount of memory available in pcache
- *     @cu_info - [IN] Compute Unit info obtained from KGD
- *     @pcache - [OUT] memory into which cache data is to be filled in.
- *     @size_filled - [OUT] amount of data used up in pcache.
- *     @num_of_entries - [OUT] number of caches added
- */
-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
-                       int gpu_processor_id,
-                       int available_size,
-                       struct kfd_cu_info *cu_info,
-                       struct crat_subtype_cache *pcache,
-                       int *size_filled,
-                       int *num_of_entries)
+int kfd_get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info)
 {
-       struct kfd_gpu_cache_info *pcache_info;
-       struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
        int num_of_cache_types = 0;
-       int i, j, k;
-       int ct = 0;
-       int mem_available = available_size;
-       unsigned int cu_processor_id;
-       int ret;
-       unsigned int num_cu_shared;
 
        switch (kdev->adev->asic_type) {
        case CHIP_KAVERI:
-               pcache_info = kaveri_cache_info;
+               *pcache_info = kaveri_cache_info;
                num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
                break;
        case CHIP_HAWAII:
-               pcache_info = hawaii_cache_info;
+               *pcache_info = hawaii_cache_info;
                num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
                break;
        case CHIP_CARRIZO:
-               pcache_info = carrizo_cache_info;
+               *pcache_info = carrizo_cache_info;
                num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
                break;
        case CHIP_TONGA:
-               pcache_info = tonga_cache_info;
+               *pcache_info = tonga_cache_info;
                num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
                break;
        case CHIP_FIJI:
-               pcache_info = fiji_cache_info;
+               *pcache_info = fiji_cache_info;
                num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
                break;
        case CHIP_POLARIS10:
-               pcache_info = polaris10_cache_info;
+               *pcache_info = polaris10_cache_info;
                num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
                break;
        case CHIP_POLARIS11:
-               pcache_info = polaris11_cache_info;
+               *pcache_info = polaris11_cache_info;
                num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
                break;
        case CHIP_POLARIS12:
-               pcache_info = polaris12_cache_info;
+               *pcache_info = polaris12_cache_info;
                num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
                break;
        case CHIP_VEGAM:
-               pcache_info = vegam_cache_info;
+               *pcache_info = vegam_cache_info;
                num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
                break;
        default:
                switch (KFD_GC_VERSION(kdev)) {
                case IP_VERSION(9, 0, 1):
-                       pcache_info = vega10_cache_info;
+                       *pcache_info = vega10_cache_info;
                        num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
                        break;
                case IP_VERSION(9, 2, 1):
-                       pcache_info = vega12_cache_info;
+                       *pcache_info = vega12_cache_info;
                        num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
                        break;
                case IP_VERSION(9, 4, 0):
                case IP_VERSION(9, 4, 1):
-                       pcache_info = vega20_cache_info;
+                       *pcache_info = vega20_cache_info;
                        num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
                        break;
                case IP_VERSION(9, 4, 2):
-                       pcache_info = aldebaran_cache_info;
+                       *pcache_info = aldebaran_cache_info;
                        num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
                        break;
                case IP_VERSION(9, 1, 0):
                case IP_VERSION(9, 2, 2):
-                       pcache_info = raven_cache_info;
+                       *pcache_info = raven_cache_info;
                        num_of_cache_types = ARRAY_SIZE(raven_cache_info);
                        break;
                case IP_VERSION(9, 3, 0):
-                       pcache_info = renoir_cache_info;
+                       *pcache_info = renoir_cache_info;
                        num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
                        break;
                case IP_VERSION(10, 1, 10):
                case IP_VERSION(10, 1, 2):
                case IP_VERSION(10, 1, 3):
                case IP_VERSION(10, 1, 4):
-                       pcache_info = navi10_cache_info;
+                       *pcache_info = navi10_cache_info;
                        num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
                        break;
                case IP_VERSION(10, 1, 1):
-                       pcache_info = navi14_cache_info;
+                       *pcache_info = navi14_cache_info;
                        num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
                        break;
                case IP_VERSION(10, 3, 0):
-                       pcache_info = sienna_cichlid_cache_info;
+                       *pcache_info = sienna_cichlid_cache_info;
                        num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
                        break;
                case IP_VERSION(10, 3, 2):
-                       pcache_info = navy_flounder_cache_info;
+                       *pcache_info = navy_flounder_cache_info;
                        num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
                        break;
                case IP_VERSION(10, 3, 4):
-                       pcache_info = dimgrey_cavefish_cache_info;
+                       *pcache_info = dimgrey_cavefish_cache_info;
                        num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
                        break;
                case IP_VERSION(10, 3, 1):
-                       pcache_info = vangogh_cache_info;
+                       *pcache_info = vangogh_cache_info;
                        num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
                        break;
                case IP_VERSION(10, 3, 5):
-                       pcache_info = beige_goby_cache_info;
+                       *pcache_info = beige_goby_cache_info;
                        num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
                        break;
                case IP_VERSION(10, 3, 3):
-                       pcache_info = yellow_carp_cache_info;
+                       *pcache_info = yellow_carp_cache_info;
                        num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
                        break;
                case IP_VERSION(10, 3, 6):
-                       pcache_info = gc_10_3_6_cache_info;
+                       *pcache_info = gc_10_3_6_cache_info;
                        num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
                        break;
                case IP_VERSION(10, 3, 7):
-                       pcache_info = gfx1037_cache_info;
+                       *pcache_info = gfx1037_cache_info;
                        num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
                        break;
                case IP_VERSION(11, 0, 0):
                case IP_VERSION(11, 0, 1):
                case IP_VERSION(11, 0, 2):
                case IP_VERSION(11, 0, 3):
-                       pcache_info = cache_info;
                        num_of_cache_types =
-                               kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
+                               kfd_fill_gpu_cache_info_from_gfx_config(kdev, *pcache_info);
                        break;
                default:
-                       pcache_info = dummy_cache_info;
+                       *pcache_info = dummy_cache_info;
                        num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
                        pr_warn("dummy cache info is used temporarily and real cache info need update later.\n");
                        break;
                }
        }
-
-       *size_filled = 0;
-       *num_of_entries = 0;
-
-       /* For each type of cache listed in the kfd_gpu_cache_info table,
-        * go through all available Compute Units.
-        * The [i,j,k] loop will
-        *              if kfd_gpu_cache_info.num_cu_shared = 1
-        *                      will parse through all available CU
-        *              If (kfd_gpu_cache_info.num_cu_shared != 1)
-        *                      then it will consider only one CU from
-        *                      the shared unit
-        */
-
-       for (ct = 0; ct < num_of_cache_types; ct++) {
-         cu_processor_id = gpu_processor_id;
-         if (pcache_info[ct].cache_level == 1) {
-           for (i = 0; i < cu_info->num_shader_engines; i++) {
-             for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
-               for (k = 0; k < cu_info->num_cu_per_sh;
-                 k += pcache_info[ct].num_cu_shared) {
-                 ret = fill_in_l1_pcache(pcache,
-                                       pcache_info,
-                                       cu_info,
-                                       mem_available,
-                                       cu_info->cu_bitmap[i % 4][j + i / 4],
-                                       ct,
-                                       cu_processor_id,
-                                       k);
-
-                 if (ret < 0)
-                       break;
-
-                 if (!ret) {
-                               pcache++;
-                               (*num_of_entries)++;
-                               mem_available -= sizeof(*pcache);
-                               (*size_filled) += sizeof(*pcache);
-                 }
-
-                 /* Move to next CU block */
-                 num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
-                                       cu_info->num_cu_per_sh) ?
-                                       pcache_info[ct].num_cu_shared :
-                                       (cu_info->num_cu_per_sh - k);
-                 cu_processor_id += num_cu_shared;
-               }
-             }
-           }
-         } else {
-                       ret = fill_in_l2_l3_pcache(pcache,
-                               pcache_info,
-                               cu_info,
-                               mem_available,
-                               ct,
-                               cu_processor_id);
-
-                       if (ret < 0)
-                               break;
-
-                       if (!ret) {
-                               pcache++;
-                               (*num_of_entries)++;
-                               mem_available -= sizeof(*pcache);
-                               (*size_filled) += sizeof(*pcache);
-                       }
-         }
-       }
-
-       pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
-
-       return 0;
+       return num_of_cache_types;
 }
 
 static bool kfd_ignore_crat(void)
@@ -2314,8 +2091,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
        struct kfd_cu_info cu_info;
        int avail_size = *size;
        uint32_t total_num_of_cu;
-       int num_of_cache_entries = 0;
-       int cache_mem_filled = 0;
        uint32_t nid = 0;
        int ret = 0;
 
@@ -2416,31 +2191,12 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
        crat_table->length += sizeof(struct crat_subtype_memory);
        crat_table->total_entries++;
 
-       /* TODO: Fill in cache information. This information is NOT readily
-        * available in KGD
-        */
-       sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-               sub_type_hdr->length);
-       ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
-                               avail_size,
-                               &cu_info,
-                               (struct crat_subtype_cache *)sub_type_hdr,
-                               &cache_mem_filled,
-                               &num_of_cache_entries);
-
-       if (ret < 0)
-               return ret;
-
-       crat_table->length += cache_mem_filled;
-       crat_table->total_entries += num_of_cache_entries;
-       avail_size -= cache_mem_filled;
-
        /* Fill in Subtype: IO_LINKS
         *  Only direct links are added here which is Link from GPU to
         *  its NUMA node. Indirect links are added by userspace.
         */
        sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
-               cache_mem_filled);
+               sub_type_hdr->length);
        ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
                (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
 
index 482ba84..a867106 100644 (file)
@@ -317,6 +317,18 @@ struct cdit_header {
 
 struct kfd_dev;
 
+/* Static table to describe GPU Cache information */
+struct kfd_gpu_cache_info {
+       uint32_t        cache_size;
+       uint32_t        cache_level;
+       uint32_t        flags;
+       /* Indicates how many Compute Units share this cache
+        * within a SA. Value = 1 indicates the cache is not shared
+        */
+       uint32_t        num_cu_shared;
+};
+int kfd_get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info);
+
 int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
 void kfd_destroy_crat_image(void *crat_image);
 int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
index 35a9b70..713f893 100644 (file)
@@ -364,7 +364,6 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
 
        /* Making sure that the buffer is an empty string */
        buffer[0] = 0;
-
        cache = container_of(attr, struct kfd_cache_properties, attr);
        if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu))
                return -EPERM;
@@ -379,12 +378,13 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
        sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc);
        sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency);
        sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type);
+
        offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map ");
-       for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
+       for (i = 0; i < cache->sibling_map_size; i++)
                for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++)
                        /* Check each bit */
                        offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,",
-                                        (cache->sibling_map[i] >> j) & 1);
+                                               (cache->sibling_map[i] >> j) & 1);
 
        /* Replace the last "," with end of line */
        buffer[offs-1] = '\n';
@@ -1198,7 +1198,6 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
        struct kfd_iolink_properties *iolink;
        struct kfd_iolink_properties *p2plink;
 
-       down_write(&topology_lock);
        list_for_each_entry(dev, &topology_device_list, list) {
                /* Discrete GPUs need their own topology device list
                 * entries. Don't assign them to CPU/APU nodes.
@@ -1222,7 +1221,6 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
                        break;
                }
        }
-       up_write(&topology_lock);
        return out_dev;
 }
 
@@ -1593,6 +1591,221 @@ out:
        return ret;
 }
 
+
+/* Helper for kfd_fill_cache_non_crat_info: returns 0 and sets *props_ext, 1 if no active CU, or -ENOMEM */
+static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
+                               struct kfd_gpu_cache_info *pcache_info,
+                               struct kfd_cu_info *cu_info,
+                               int cu_bitmask,
+                               int cache_type, unsigned int cu_processor_id,
+                               int cu_block)
+{
+       unsigned int cu_sibling_map_mask;
+       int first_active_cu;
+       struct kfd_cache_properties *pcache = NULL;
+
+       cu_sibling_map_mask = cu_bitmask;
+       cu_sibling_map_mask >>= cu_block;
+       cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+       first_active_cu = ffs(cu_sibling_map_mask);
+
+       /* CU could be inactive. In case of a shared cache, find the first
+        * active CU; in case of a non-shared cache, check whether the CU is
+        * inactive. If it is inactive, skip it.
+        */
+       if (first_active_cu) {
+               pcache = kfd_alloc_struct(pcache);
+               if (!pcache)
+                       return -ENOMEM;
+
+               memset(pcache, 0, sizeof(struct kfd_cache_properties));
+               pcache->processor_id_low = cu_processor_id + (first_active_cu - 1);
+               pcache->cache_level = pcache_info[cache_type].cache_level;
+               pcache->cache_size = pcache_info[cache_type].cache_size;
+
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_DATA;
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_CPU;
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+               /* Sibling map is w.r.t processor_id_low, so shift out
+                * inactive CU
+                */
+               cu_sibling_map_mask =
+                       cu_sibling_map_mask >> (first_active_cu - 1);
+
+               pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+               pcache->sibling_map[1] =
+                               (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+               pcache->sibling_map[2] =
+                               (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+               pcache->sibling_map[3] =
+                               (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+
+               pcache->sibling_map_size = 4;
+               *props_ext = pcache;
+
+               return 0;
+       }
+       return 1;
+}
+
+/* Helper for kfd_fill_cache_non_crat_info: returns 0 and sets *props_ext, 1 if no active CU, or -ENOMEM */
+static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
+                               struct kfd_gpu_cache_info *pcache_info,
+                               struct kfd_cu_info *cu_info,
+                               int cache_type, unsigned int cu_processor_id)
+{
+       unsigned int cu_sibling_map_mask;
+       int first_active_cu;
+       int i, j, k;
+       struct kfd_cache_properties *pcache = NULL;
+
+       cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
+       cu_sibling_map_mask &=
+               ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+       first_active_cu = ffs(cu_sibling_map_mask);
+
+       /* CU could be inactive. In case of a shared cache, find the first
+        * active CU; in case of a non-shared cache, check whether the CU is
+        * inactive. If it is inactive, skip it.
+        */
+       if (first_active_cu) {
+               pcache = kfd_alloc_struct(pcache);
+               if (!pcache)
+                       return -ENOMEM;
+
+               memset(pcache, 0, sizeof(struct kfd_cache_properties));
+               pcache->processor_id_low = cu_processor_id
+                                       + (first_active_cu - 1);
+               pcache->cache_level = pcache_info[cache_type].cache_level;
+               pcache->cache_size = pcache_info[cache_type].cache_size;
+
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_DATA;
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_CPU;
+               if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+                       pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+               /* Sibling map is w.r.t processor_id_low, so shift out
+                * inactive CU
+                */
+               cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
+               k = 0;
+
+               for (i = 0; i < cu_info->num_shader_engines; i++) {
+                       for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
+                               pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+                               pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+                               pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+                               pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+                               k += 4;
+
+                               cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
+                               cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+                       }
+               }
+               pcache->sibling_map_size = k;
+               *props_ext = pcache;
+               return 0;
+       }
+       return 1;
+}
+
+#define KFD_MAX_CACHE_TYPES 6
+
+/* kfd_fill_cache_non_crat_info - Fill GPU cache info using kfd_gpu_cache_info
+ * tables
+ */
+void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_dev *kdev)
+{
+       struct kfd_gpu_cache_info *pcache_info = NULL;
+       int i, j, k;
+       int ct = 0;
+       unsigned int cu_processor_id;
+       int ret;
+       unsigned int num_cu_shared;
+       struct kfd_cu_info cu_info;
+       struct kfd_cu_info *pcu_info;
+       int gpu_processor_id;
+       struct kfd_cache_properties *props_ext;
+       int num_of_entries = 0;
+       int num_of_cache_types = 0;
+       struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
+
+       amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
+       pcu_info = &cu_info;
+
+       gpu_processor_id = dev->node_props.simd_id_base;
+
+       pcache_info = cache_info;
+       num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info);
+       if (!num_of_cache_types) {
+               pr_warn("no cache info found\n");
+               return;
+       }
+
+       /* For each type of cache listed in the kfd_gpu_cache_info table,
+        * go through all available Compute Units.
+        * The [i,j,k] loop will
+        *              if kfd_gpu_cache_info.num_cu_shared = 1
+        *                      will parse through all available CU
+        *              If (kfd_gpu_cache_info.num_cu_shared != 1)
+        *                      then it will consider only one CU from
+        *                      the shared unit
+        */
+       for (ct = 0; ct < num_of_cache_types; ct++) {
+               cu_processor_id = gpu_processor_id;
+               if (pcache_info[ct].cache_level == 1) {
+                       for (i = 0; i < pcu_info->num_shader_engines; i++) {
+                               for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
+                                       for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
+
+                                               ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
+                                                                               pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
+                                                                               cu_processor_id, k);
+
+                                               if (ret < 0)
+                                                       break;
+
+                                               if (!ret) {
+                                                       num_of_entries++;
+                                                       list_add_tail(&props_ext->list, &dev->cache_props);
+                                               }
+
+                                               /* Move to next CU block */
+                                               num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
+                                                       pcu_info->num_cu_per_sh) ?
+                                                       pcache_info[ct].num_cu_shared :
+                                                       (pcu_info->num_cu_per_sh - k);
+                                               cu_processor_id += num_cu_shared;
+                                       }
+                               }
+                       }
+               } else {
+                       ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
+                                                               pcu_info, ct, cu_processor_id);
+
+                       if (ret < 0)
+                               break;
+
+                       if (!ret) {
+                               num_of_entries++;
+                               list_add_tail(&props_ext->list, &dev->cache_props);
+                       }
+               }
+       }
+       dev->node_props.caches_count += num_of_entries;
+       pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
+}
+
 int kfd_topology_add_device(struct kfd_dev *gpu)
 {
        uint32_t gpu_id;
@@ -1617,9 +1830,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
         * CRAT to create a new topology device. Once created assign the gpu to
         * that topology device
         */
+       down_write(&topology_lock);
        dev = kfd_assign_gpu(gpu);
        if (!dev) {
-               down_write(&topology_lock);
                proximity_domain = ++topology_crat_proximity_domain;
 
                res = kfd_create_crat_image_virtual(&crat_image, &image_size,
@@ -1631,6 +1844,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
                        topology_crat_proximity_domain--;
                        return res;
                }
+
                res = kfd_parse_crat_table(crat_image,
                                           &temp_topology_device_list,
                                           proximity_domain);
@@ -1644,23 +1858,28 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
                kfd_topology_update_device_list(&temp_topology_device_list,
                        &topology_device_list);
 
+               dev = kfd_assign_gpu(gpu);
+               if (WARN_ON(!dev)) {
+                       res = -ENODEV;
+                       goto err;
+               }
+
+               /* Fill the cache affinity information here for the GPUs
+                * using VCRAT
+                */
+               kfd_fill_cache_non_crat_info(dev, gpu);
+
                /* Update the SYSFS tree, since we added another topology
                 * device
                 */
                res = kfd_topology_update_sysfs();
-               up_write(&topology_lock);
-
                if (!res)
                        sys_props.generation_count++;
                else
                        pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
                                                gpu_id, res);
-               dev = kfd_assign_gpu(gpu);
-               if (WARN_ON(!dev)) {
-                       res = -ENODEV;
-                       goto err;
-               }
        }
+       up_write(&topology_lock);
 
        dev->gpu_id = gpu_id;
        gpu->id = gpu_id;
index 9f6c949..19283b8 100644 (file)
@@ -80,6 +80,8 @@ struct kfd_mem_properties {
        struct attribute        attr;
 };
 
+#define CACHE_SIBLINGMAP_SIZE 64
+
 struct kfd_cache_properties {
        struct list_head        list;
        uint32_t                processor_id_low;
@@ -90,10 +92,11 @@ struct kfd_cache_properties {
        uint32_t                cache_assoc;
        uint32_t                cache_latency;
        uint32_t                cache_type;
-       uint8_t                 sibling_map[CRAT_SIBLINGMAP_SIZE];
+       uint8_t                 sibling_map[CACHE_SIBLINGMAP_SIZE];
        struct kfd_dev          *gpu;
        struct kobject          *kobj;
        struct attribute        attr;
+       uint32_t                sibling_map_size;
 };
 
 struct kfd_iolink_properties {