panfrost: Separate core ID range from core count
author    Alyssa Rosenzweig <alyssa@collabora.com>
          Fri, 24 Jun 2022 21:43:09 +0000 (17:43 -0400)
committer Marge Bot <emma+marge@anholt.net>
          Fri, 8 Jul 2022 01:14:55 +0000 (01:14 +0000)
To query the core count, we read the SHADERS_PRESENT register, which contains
a mask of the shader cores connected. The core count equals the number of
1-bits in this mask, regardless of their placement. This value is useful for
public consumption (like in clinfo).

However, internally we are interested in the range of core IDs. We usually
query the core count to determine how large to allocate various per-core
buffers (performance counters, occlusion queries, and the stack). In each
case, a core writes at the index of its own core ID, so we have to allocate
enough for the entire range of core IDs. If the core mask is discontiguous,
this necessarily overallocates.

Rename the existing core_count to core_id_range, better reflecting its
definition and purpose, and repurpose core_count for the actual core count.
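
For illustration (not part of this patch), take a hypothetical discontiguous
mask 0xb: cores 0, 1, and 3 are present and core 2 is absent, so the core
count is 3 but the core ID range is 4. A minimal sketch of the two
derivations in plain C, using GCC/Clang builtins in place of Mesa's
util_bitcount() and util_last_bit() helpers:

    #include <stdio.h>

    /* Number of 1-bits: how many cores are physically present */
    static unsigned query_core_count(unsigned mask)
    {
            return __builtin_popcount(mask);
    }

    /* Greatest bit set + 1: how many per-core buffer slots are needed */
    static unsigned query_core_id_range(unsigned mask)
    {
            return mask ? 32 - __builtin_clz(mask) : 0;
    }

    int main(void)
    {
            unsigned mask = 0xb; /* cores 0, 1, 3 present; core 2 absent */
            printf("count = %u, range = %u\n",
                   query_core_count(mask), query_core_id_range(mask));
            /* prints: count = 3, range = 4 */
            return 0;
    }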

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17265>

src/gallium/drivers/panfrost/pan_cmdstream.c
src/gallium/drivers/panfrost/pan_context.c
src/gallium/drivers/panfrost/pan_job.c
src/gallium/drivers/panfrost/pan_job.h
src/panfrost/lib/pan_cs.h
src/panfrost/lib/pan_device.h
src/panfrost/lib/pan_encoder.h
src/panfrost/lib/pan_props.c
src/panfrost/lib/pan_scratch.c
src/panfrost/perf/pan_perf.c
src/panfrost/vulkan/panvk_vX_cmd_buffer.c

diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index c86ee28..478d141 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1585,7 +1585,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
                         panfrost_batch_get_scratchpad(batch,
                                                       ss->info.tls_size,
                                                       dev->thread_tls_alloc,
-                                                      dev->core_count);
+                                                      dev->core_id_range);
                 info.tls.ptr = bo->ptr.gpu;
         }
 
@@ -1593,7 +1593,7 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
                 unsigned size =
                         pan_wls_adjust_size(info.wls.size) *
                         pan_wls_instances(&info.wls.dim) *
-                        dev->core_count;
+                        dev->core_id_range;
 
                 struct panfrost_bo *bo =
                         panfrost_batch_get_shared_memory(batch, size, 1);
@@ -2732,7 +2732,7 @@ emit_tls(struct panfrost_batch *batch)
                 panfrost_batch_get_scratchpad(batch,
                                               batch->stack_size,
                                               dev->thread_tls_alloc,
-                                              dev->core_count):
+                                              dev->core_id_range):
                 NULL;
         struct pan_tls_info tls = {
                 .tls = {
@@ -2754,7 +2754,7 @@ emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
                 panfrost_batch_get_scratchpad(batch,
                                               batch->stack_size,
                                               dev->thread_tls_alloc,
-                                              dev->core_count):
+                                              dev->core_id_range):
                 NULL;
         struct pan_tls_info tls = {
                 .tls = {
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index c268e07..c051679 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -869,7 +869,7 @@ panfrost_begin_query(struct pipe_context *pipe, struct pipe_query *q)
         case PIPE_QUERY_OCCLUSION_COUNTER:
         case PIPE_QUERY_OCCLUSION_PREDICATE:
         case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
-                unsigned size = sizeof(uint64_t) * dev->core_count;
+                unsigned size = sizeof(uint64_t) * dev->core_id_range;
 
                 /* Allocate a resource for the query results to be stored */
                 if (!query->rsrc) {
@@ -953,7 +953,7 @@ panfrost_get_query_result(struct pipe_context *pipe,
 
                 if (query->type == PIPE_QUERY_OCCLUSION_COUNTER) {
                         uint64_t passed = 0;
-                        for (int i = 0; i < dev->core_count; ++i)
+                        for (int i = 0; i < dev->core_id_range; ++i)
                                 passed += result[i];
 
                         if (dev->arch <= 5 && !query->msaa)
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index 71703d6..81d69bb 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -388,11 +388,11 @@ struct panfrost_bo *
 panfrost_batch_get_scratchpad(struct panfrost_batch *batch,
                 unsigned size_per_thread,
                 unsigned thread_tls_alloc,
-                unsigned core_count)
+                unsigned core_id_range)
 {
         unsigned size = panfrost_get_total_stack_size(size_per_thread,
                         thread_tls_alloc,
-                        core_count);
+                        core_id_range);
 
         if (batch->scratchpad) {
                 assert(batch->scratchpad->size >= size);
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index f79197b..3f2d291 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -251,7 +251,7 @@ void
 panfrost_batch_adjust_stack_size(struct panfrost_batch *batch);
 
 struct panfrost_bo *
-panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count);
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_id_range);
 
 struct panfrost_bo *
 panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count);
diff --git a/src/panfrost/lib/pan_cs.h b/src/panfrost/lib/pan_cs.h
index db4dcbf..2ffa601 100644
--- a/src/panfrost/lib/pan_cs.h
+++ b/src/panfrost/lib/pan_cs.h
@@ -144,7 +144,7 @@ pan_wls_mem_size(const struct panfrost_device *dev,
 {
         unsigned instances = pan_wls_instances(dim);
 
-        return pan_wls_adjust_size(wls_size) * instances * dev->core_count;
+        return pan_wls_adjust_size(wls_size) * instances * dev->core_id_range;
 }
 
 #ifdef PAN_ARCH
diff --git a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h
index dbd9faf..0cbebe0 100644
--- a/src/panfrost/lib/pan_device.h
+++ b/src/panfrost/lib/pan_device.h
@@ -184,7 +184,15 @@ struct panfrost_device {
         unsigned arch;
         unsigned gpu_id;
         unsigned revision;
+
+        /* Number of shader cores */
         unsigned core_count;
+
+        /* Range of core IDs, equal to the maximum core ID + 1. Satisfies
+         * core_id_range >= core_count.
+         */
+        unsigned core_id_range;
+
         unsigned thread_tls_alloc;
         struct panfrost_tiler_features tiler_features;
         const struct panfrost_model *model;
diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h
index 585eaf7..6834999 100644
--- a/src/panfrost/lib/pan_encoder.h
+++ b/src/panfrost/lib/pan_encoder.h
@@ -75,7 +75,7 @@ unsigned
 panfrost_get_total_stack_size(
                 unsigned thread_size,
                 unsigned threads_per_core,
-                unsigned core_count);
+                unsigned core_id_range);
 
 /* Attributes / instancing */
 
diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c
index 3cc1616..1627e55 100644
--- a/src/panfrost/lib/pan_props.c
+++ b/src/panfrost/lib/pan_props.c
@@ -152,20 +152,21 @@ panfrost_query_tiler_features(int fd)
 }
 
 static unsigned
-panfrost_query_core_count(int fd)
+panfrost_query_core_count(int fd, unsigned *core_id_range)
 {
         /* On older kernels, worst-case to 16 cores */
 
         unsigned mask = panfrost_query_raw(fd,
                         DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff);
 
-        /* Some cores might be absent. For TLS computation purposes, we care
-         * about the greatest ID + 1, which equals the core count if all cores
-         * are present, but allocates space for absent cores if needed.
-         * util_last_bit is defined to return the greatest bit set + 1, which
-         * is exactly what we need. */
+        /* Some cores might be absent. In some cases, we care
+         * about the range of core IDs (that is, the greatest core ID + 1). If
+         * the core mask is contiguous, this equals the core count.
+         */
+        *core_id_range = util_last_bit(mask);
 
-        return util_last_bit(mask);
+        /* The actual core count skips over the gaps */
+        return util_bitcount(mask);
 }
 
 /* Architectural maximums, since this register may not be implemented
@@ -263,7 +264,7 @@ panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev)
         dev->memctx = memctx;
         dev->gpu_id = panfrost_query_gpu_version(fd);
         dev->arch = pan_arch(dev->gpu_id);
-        dev->core_count = panfrost_query_core_count(fd);
+        dev->core_count = panfrost_query_core_count(fd, &dev->core_id_range);
         dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd, dev->arch);
         dev->kernel_version = drmGetVersion(fd);
         dev->revision = panfrost_query_gpu_revision(fd);
diff --git a/src/panfrost/lib/pan_scratch.c b/src/panfrost/lib/pan_scratch.c
index 2742acb..91d8bd6 100644
--- a/src/panfrost/lib/pan_scratch.c
+++ b/src/panfrost/lib/pan_scratch.c
@@ -78,10 +78,10 @@ unsigned
 panfrost_get_total_stack_size(
                 unsigned thread_size,
                 unsigned threads_per_core,
-                unsigned core_count)
+                unsigned core_id_range)
 {
         unsigned size_per_thread = (thread_size == 0) ? 0 :
                 util_next_power_of_two(ALIGN_POT(thread_size, 16));
 
-        return size_per_thread * threads_per_core * core_count;
+        return size_per_thread * threads_per_core * core_id_range;
 }
diff --git a/src/panfrost/perf/pan_perf.c b/src/panfrost/perf/pan_perf.c
index 930eae7..c543d7f 100644
--- a/src/panfrost/perf/pan_perf.c
+++ b/src/panfrost/perf/pan_perf.c
@@ -42,7 +42,7 @@ panfrost_perf_counter_read(const struct panfrost_perf_counter *counter,
 
    // If counter belongs to shader core, accumulate values for all other cores
    if (counter->category_index == PAN_SHADER_CORE_INDEX) {
-      for (uint32_t core = 1; core < perf->dev->core_count; ++core) {
+      for (uint32_t core = 1; core < perf->dev->core_id_range; ++core) {
          ret += perf->counter_values[offset + PAN_COUNTERS_PER_CATEGORY * core];
       }
    }
@@ -77,7 +77,7 @@ panfrost_perf_init(struct panfrost_perf *perf, struct panfrost_device *dev)
    // Generally counter blocks are laid out in the following order:
    // Job manager, tiler, one or more L2 caches, and one or more shader cores.
    unsigned l2_slices = panfrost_query_l2_slices(dev);
-   uint32_t n_blocks = 2 + l2_slices + dev->core_count;
+   uint32_t n_blocks = 2 + l2_slices + dev->core_id_range;
    perf->n_counter_values = PAN_COUNTERS_PER_CATEGORY * n_blocks;
    perf->counter_values = ralloc_array(perf, uint32_t, perf->n_counter_values);
 
diff --git a/src/panfrost/vulkan/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/panvk_vX_cmd_buffer.c
index a4c2d4e..638954c 100644
--- a/src/panfrost/vulkan/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/panvk_vX_cmd_buffer.c
@@ -118,7 +118,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
    if (batch->tlsinfo.tls.size) {
       unsigned size = panfrost_get_total_stack_size(batch->tlsinfo.tls.size,
                                                     pdev->thread_tls_alloc,
-                                                    pdev->core_count);
+                                                    pdev->core_id_range);
       batch->tlsinfo.tls.ptr =
          pan_pool_alloc_aligned(&cmdbuf->tls_pool.base, size, 4096).gpu;
    }