intel/perf: support more than 64 queries
author Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Tue, 31 Aug 2021 08:54:41 +0000 (11:54 +0300)
committer Marge Bot <emma+marge@anholt.net>
Thu, 17 Nov 2022 12:57:06 +0000 (12:57 +0000)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18893>

src/intel/perf/intel_perf.c
src/intel/perf/intel_perf.h

index adbf845..53675a3 100644 (file)
@@ -627,8 +627,6 @@ compare_counter_categories_and_names(const void *_c1, const void *_c2)
 static void
 build_unique_counter_list(struct intel_perf_config *perf)
 {
-   assert(perf->n_queries < 64);
-
    size_t max_counters = 0;
 
    for (int q = 0; q < perf->n_queries; q++)
@@ -661,14 +659,14 @@ build_unique_counter_list(struct intel_perf_config *perf)
 
          if (entry) {
             counter_info = entry->data;
-            counter_info->query_mask |= BITFIELD64_BIT(q);
+            BITSET_SET(counter_info->query_mask, q);
             continue;
          }
          assert(perf->n_counters < max_counters);
 
          counter_info = &counter_infos[perf->n_counters++];
          counter_info->counter = counter;
-         counter_info->query_mask = BITFIELD64_BIT(q);
+         BITSET_SET(counter_info->query_mask, q);
 
          counter_info->location.group_idx = q;
          counter_info->location.counter_idx = c;
@@ -863,37 +861,61 @@ intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,
    return i915_add_config(perf_cfg, fd, config, generated_guid);
 }
 
-static uint64_t
+static void
 get_passes_mask(struct intel_perf_config *perf,
                 const uint32_t *counter_indices,
-                uint32_t counter_indices_count)
+                uint32_t counter_indices_count,
+                BITSET_WORD *queries_mask)
 {
-   uint64_t queries_mask = 0;
+   /* For each counter, check whether it is already computed by one of the
+    * selected metric sets, or find a metric set that can compute it.
+    */
+   for (uint32_t c = 0; c < counter_indices_count; c++) {
+      uint32_t counter_idx = counter_indices[c];
+      assert(counter_idx < perf->n_counters);
 
-   assert(perf->n_queries < 64);
+      const struct intel_perf_query_counter_info *counter_info =
+         &perf->counter_infos[counter_idx];
 
-   /* Compute the number of passes by going through all counters N times (with
-    * N the number of queries) to make sure we select the most constraining
-    * counters first and look at the more flexible ones (that could be
-    * obtained from multiple queries) later. That way we minimize the number
-    * of passes required.
-    */
-   for (uint32_t q = 0; q < perf->n_queries; q++) {
-      for (uint32_t i = 0; i < counter_indices_count; i++) {
-         assert(counter_indices[i] < perf->n_counters);
+      /* Check if the counter is already computed by one of the selected
+       * metric sets. If it is, there is nothing more to do with this counter.
+       */
+      uint32_t match = UINT32_MAX;
+      for (uint32_t w = 0; w < BITSET_WORDS(INTEL_PERF_MAX_METRIC_SETS); w++) {
+         if (queries_mask[w] & counter_info->query_mask[w]) {
+            match = w * BITSET_WORDBITS + ffsll(queries_mask[w] & counter_info->query_mask[w]) - 1;
+            break;
+         }
+      }
+      if (match != UINT32_MAX)
+         continue;
 
-         uint32_t idx = counter_indices[i];
-         if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1))
-            continue;
+      /* Now go through each metric set and find one that contains this
+       * counter.
+       */
+      for (uint32_t q = 0; q < perf->n_queries; q++) {
+         bool found = false;
 
-         if (queries_mask & perf->counter_infos[idx].query_mask)
-            continue;
+         for (uint32_t w = 0; w < BITSET_WORDS(INTEL_PERF_MAX_METRIC_SETS); w++) {
+            if (!counter_info->query_mask[w])
+               continue;
+
+            uint32_t query_idx = w * BITSET_WORDBITS + ffsll(counter_info->query_mask[w]) - 1;
+
+            /* Since the earlier scan of queries_mask found no overlap for
+             * this counter, this query cannot already be selected.
+             */
+            assert(!BITSET_TEST(queries_mask, query_idx));
 
-         queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1);
+            BITSET_SET(queries_mask, query_idx);
+            found = true;
+            break;
+         }
+
+         if (found)
+            break;
       }
    }
-
-   return queries_mask;
 }
 
 uint32_t
@@ -902,17 +924,20 @@ intel_perf_get_n_passes(struct intel_perf_config *perf,
                         uint32_t counter_indices_count,
                         struct intel_perf_query_info **pass_queries)
 {
-   uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);
+   BITSET_DECLARE(queries_mask, INTEL_PERF_MAX_METRIC_SETS);
+   BITSET_ZERO(queries_mask);
+
+   get_passes_mask(perf, counter_indices, counter_indices_count, queries_mask);
 
    if (pass_queries) {
       uint32_t pass = 0;
       for (uint32_t q = 0; q < perf->n_queries; q++) {
-         if ((1ULL << q) & queries_mask)
+         if (BITSET_TEST(queries_mask, q))
             pass_queries[pass++] = &perf->queries[q];
       }
    }
 
-   return util_bitcount64(queries_mask);
+   return BITSET_COUNT(queries_mask);
 }
 
 void
@@ -921,21 +946,52 @@ intel_perf_get_counters_passes(struct intel_perf_config *perf,
                                uint32_t counter_indices_count,
                                struct intel_perf_counter_pass *counter_pass)
 {
-   uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count);
-   ASSERTED uint32_t n_passes = util_bitcount64(queries_mask);
+   BITSET_DECLARE(queries_mask, INTEL_PERF_MAX_METRIC_SETS);
+   BITSET_ZERO(queries_mask);
+
+   get_passes_mask(perf, counter_indices, counter_indices_count, queries_mask);
+   ASSERTED uint32_t n_passes = BITSET_COUNT(queries_mask);
+
+   struct intel_perf_query_info **pass_array = calloc(perf->n_queries,
+                                                      sizeof(*pass_array));
+   uint32_t n_written_passes = 0;
 
    for (uint32_t i = 0; i < counter_indices_count; i++) {
       assert(counter_indices[i] < perf->n_counters);
 
-      uint32_t idx = counter_indices[i];
-      counter_pass[i].counter = perf->counter_infos[idx].counter;
+      uint32_t counter_idx = counter_indices[i];
+      counter_pass[i].counter = perf->counter_infos[counter_idx].counter;
+
+      const struct intel_perf_query_counter_info *counter_info =
+         &perf->counter_infos[counter_idx];
+
+      uint32_t query_idx = UINT32_MAX;
+      for (uint32_t w = 0; w < BITSET_WORDS(INTEL_PERF_MAX_METRIC_SETS); w++) {
+         if (counter_info->query_mask[w] & queries_mask[w]) {
+            query_idx = w * BITSET_WORDBITS +
+               ffsll(counter_info->query_mask[w] & queries_mask[w]) - 1;
+            break;
+         }
+      }
+      assert(query_idx != UINT32_MAX);
 
-      uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1;
       counter_pass[i].query = &perf->queries[query_idx];
 
-      uint32_t clear_bits = 63 - query_idx;
-      counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1;
-      assert(counter_pass[i].pass < n_passes);
+      uint32_t pass_idx = UINT32_MAX;
+      for (uint32_t p = 0; p < n_written_passes; p++) {
+         if (pass_array[p] == counter_pass[i].query) {
+            pass_idx = p;
+            break;
+         }
+      }
+
+      if (pass_idx == UINT32_MAX) {
+         pass_array[n_written_passes] = counter_pass[i].query;
+         pass_idx = n_written_passes++;
+      }
+
+      counter_pass[i].pass = pass_idx;
+      assert(n_written_passes <= n_passes);
    }
 }
 
index bb809a4..4855344 100644 (file)
 #include "compiler/glsl/list.h"
 #include "dev/intel_device_info.h"
 #include "util/bitscan.h"
+#include "util/bitset.h"
 #include "util/hash_table.h"
 #include "util/ralloc.h"
 
 #include "drm-uapi/i915_drm.h"
 
+#define INTEL_PERF_MAX_METRIC_SETS (1500)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -307,7 +310,7 @@ struct intel_perf_query_field_layout {
 struct intel_perf_query_counter_info {
    struct intel_perf_query_counter *counter;
 
-   uint64_t query_mask;
+   BITSET_DECLARE(query_mask, INTEL_PERF_MAX_METRIC_SETS);
 
    /**
     * Each counter can be a part of many groups, each time at different index.
@@ -453,7 +456,7 @@ uint64_t intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int
 static inline unsigned
 intel_perf_query_counter_info_first_query(const struct intel_perf_query_counter_info *counter_info)
 {
-   return ffsll(counter_info->query_mask);
+   return BITSET_FFS(counter_info->query_mask);
 }
 
 /** Read the slice/unslice frequency from 2 OA reports and store then into