From d4cbb6650671de661db8bfa6df03b17007e5d804 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Tue, 31 Aug 2021 11:54:41 +0300 Subject: [PATCH] intel/perf: support more than 64 queries Signed-off-by: Lionel Landwerlin Reviewed-by: Ivan Briano Part-of: --- src/intel/perf/intel_perf.c | 128 +++++++++++++++++++++++++++++++------------- src/intel/perf/intel_perf.h | 7 ++- 2 files changed, 97 insertions(+), 38 deletions(-) diff --git a/src/intel/perf/intel_perf.c b/src/intel/perf/intel_perf.c index adbf845..53675a3 100644 --- a/src/intel/perf/intel_perf.c +++ b/src/intel/perf/intel_perf.c @@ -627,8 +627,6 @@ compare_counter_categories_and_names(const void *_c1, const void *_c2) static void build_unique_counter_list(struct intel_perf_config *perf) { - assert(perf->n_queries < 64); - size_t max_counters = 0; for (int q = 0; q < perf->n_queries; q++) @@ -661,14 +659,14 @@ build_unique_counter_list(struct intel_perf_config *perf) if (entry) { counter_info = entry->data; - counter_info->query_mask |= BITFIELD64_BIT(q); + BITSET_SET(counter_info->query_mask, q); continue; } assert(perf->n_counters < max_counters); counter_info = &counter_infos[perf->n_counters++]; counter_info->counter = counter; - counter_info->query_mask = BITFIELD64_BIT(q); + BITSET_SET(counter_info->query_mask, q); counter_info->location.group_idx = q; counter_info->location.counter_idx = c; @@ -863,37 +861,61 @@ intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd, return i915_add_config(perf_cfg, fd, config, generated_guid); } -static uint64_t +static void get_passes_mask(struct intel_perf_config *perf, const uint32_t *counter_indices, - uint32_t counter_indices_count) + uint32_t counter_indices_count, + BITSET_WORD *queries_mask) { - uint64_t queries_mask = 0; + /* For each counter, look if it's already computed by a selected metric set + * or find one that can compute it. 
+ */ + for (uint32_t c = 0; c < counter_indices_count; c++) { + uint32_t counter_idx = counter_indices[c]; + assert(counter_idx < perf->n_counters); - assert(perf->n_queries < 64); + const struct intel_perf_query_counter_info *counter_info = + &perf->counter_infos[counter_idx]; - /* Compute the number of passes by going through all counters N times (with - * N the number of queries) to make sure we select the most constraining - * counters first and look at the more flexible ones (that could be - * obtained from multiple queries) later. That way we minimize the number - * of passes required. - */ - for (uint32_t q = 0; q < perf->n_queries; q++) { - for (uint32_t i = 0; i < counter_indices_count; i++) { - assert(counter_indices[i] < perf->n_counters); + /* Check if the counter is already computed by one of the selected + * metric sets. If it is, there is nothing more to do with this counter. + */ + uint32_t match = UINT32_MAX; + for (uint32_t w = 0; w < BITSET_WORDS(INTEL_PERF_MAX_METRIC_SETS); w++) { + if (queries_mask[w] & counter_info->query_mask[w]) { + match = w * BITSET_WORDBITS + ffsll(queries_mask[w] & counter_info->query_mask[w]) - 1; + break; + } + } + if (match != UINT32_MAX) + continue; - uint32_t idx = counter_indices[i]; - if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1)) - continue; + /* Now go through each metric set and find one that contains this + * counter. + */ + for (uint32_t q = 0; q < perf->n_queries; q++) { + bool found = false; - if (queries_mask & perf->counter_infos[idx].query_mask) - continue; + for (uint32_t w = 0; w < BITSET_WORDS(INTEL_PERF_MAX_METRIC_SETS); w++) { + if (!counter_info->query_mask[w]) + continue; + + uint32_t query_idx = w * BITSET_WORDBITS + ffsll(counter_info->query_mask[w]) - 1; + + /* Since we already looked for this in the query_mask, it should + * not be set. 
+ */ + assert(!BITSET_TEST(queries_mask, query_idx)); - queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1); + BITSET_SET(queries_mask, query_idx); + found = true; + break; + } + + if (found) + break; } } - - return queries_mask; } uint32_t @@ -902,17 +924,20 @@ intel_perf_get_n_passes(struct intel_perf_config *perf, uint32_t counter_indices_count, struct intel_perf_query_info **pass_queries) { - uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); + BITSET_DECLARE(queries_mask, INTEL_PERF_MAX_METRIC_SETS); + BITSET_ZERO(queries_mask); + + get_passes_mask(perf, counter_indices, counter_indices_count, queries_mask); if (pass_queries) { uint32_t pass = 0; for (uint32_t q = 0; q < perf->n_queries; q++) { - if ((1ULL << q) & queries_mask) + if (BITSET_TEST(queries_mask, q)) pass_queries[pass++] = &perf->queries[q]; } } - return util_bitcount64(queries_mask); + return BITSET_COUNT(queries_mask); } void @@ -921,21 +946,52 @@ intel_perf_get_counters_passes(struct intel_perf_config *perf, uint32_t counter_indices_count, struct intel_perf_counter_pass *counter_pass) { - uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); - ASSERTED uint32_t n_passes = util_bitcount64(queries_mask); + BITSET_DECLARE(queries_mask, INTEL_PERF_MAX_METRIC_SETS); + BITSET_ZERO(queries_mask); + + get_passes_mask(perf, counter_indices, counter_indices_count, queries_mask); + ASSERTED uint32_t n_passes = BITSET_COUNT(queries_mask); + + struct intel_perf_query_info **pass_array = calloc(perf->n_queries, + sizeof(*pass_array)); + uint32_t n_written_passes = 0; for (uint32_t i = 0; i < counter_indices_count; i++) { assert(counter_indices[i] < perf->n_counters); - uint32_t idx = counter_indices[i]; - counter_pass[i].counter = perf->counter_infos[idx].counter; + uint32_t counter_idx = counter_indices[i]; + counter_pass[i].counter = perf->counter_infos[counter_idx].counter; + + const struct 
intel_perf_query_counter_info *counter_info = + &perf->counter_infos[counter_idx]; + + uint32_t query_idx = UINT32_MAX; + for (uint32_t w = 0; w < BITSET_WORDS(INTEL_PERF_MAX_METRIC_SETS); w++) { + if (counter_info->query_mask[w] & queries_mask[w]) { + query_idx = w * BITSET_WORDBITS + + ffsll(counter_info->query_mask[w] & queries_mask[w]) - 1; + break; + } + } + assert(query_idx != UINT32_MAX); - uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1; counter_pass[i].query = &perf->queries[query_idx]; - uint32_t clear_bits = 63 - query_idx; - counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1; - assert(counter_pass[i].pass < n_passes); + uint32_t pass_idx = UINT32_MAX; + for (uint32_t p = 0; p < n_written_passes; p++) { + if (pass_array[p] == counter_pass[i].query) { + pass_idx = p; + break; + } + } + + if (pass_idx == UINT32_MAX) { + pass_array[n_written_passes] = counter_pass[i].query; + pass_idx = n_written_passes++; + } + + counter_pass[i].pass = pass_idx; + assert(n_written_passes <= n_passes); } } diff --git a/src/intel/perf/intel_perf.h b/src/intel/perf/intel_perf.h index bb809a4..4855344 100644 --- a/src/intel/perf/intel_perf.h +++ b/src/intel/perf/intel_perf.h @@ -38,11 +38,14 @@ #include "compiler/glsl/list.h" #include "dev/intel_device_info.h" #include "util/bitscan.h" +#include "util/bitset.h" #include "util/hash_table.h" #include "util/ralloc.h" #include "drm-uapi/i915_drm.h" +#define INTEL_PERF_MAX_METRIC_SETS (1500) + #ifdef __cplusplus extern "C" { #endif @@ -307,7 +310,7 @@ struct intel_perf_query_field_layout { struct intel_perf_query_counter_info { struct intel_perf_query_counter *counter; - uint64_t query_mask; + BITSET_DECLARE(query_mask, INTEL_PERF_MAX_METRIC_SETS); /** * Each counter can be a part of many groups, each time at different index. 
@@ -453,7 +456,7 @@ uint64_t intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int static inline unsigned intel_perf_query_counter_info_first_query(const struct intel_perf_query_counter_info *counter_info) { - return ffsll(counter_info->query_mask); + return BITSET_FFS(counter_info->query_mask); } /** Read the slice/unslice frequency from 2 OA reports and store then into -- 2.7.4