intel/perf: prep work to enable new perf counters
author Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Thu, 4 Jul 2019 17:34:28 +0000 (20:34 +0300)
committer Marge Bot <eric+marge@anholt.net>
Tue, 2 Feb 2021 13:25:54 +0000 (13:25 +0000)
The new counters (the PERF_CNT registers) are not part of the OA
reports, so they need some additional scaffolding. They are only
available when doing queries, because we need to emit MI_SRMs to
record them.

Equations making use of those counters are not there yet; they will
come in a follow-up commit updating a bunch of oa-*.xml files.

v2: Fix typo

v3: Use PERF_CNT_VALUE_MASK (Marcin)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6518>

src/intel/perf/gen_perf.c
src/intel/perf/gen_perf.h
src/intel/perf/gen_perf.py
src/intel/perf/gen_perf_mdapi.c
src/intel/perf/gen_perf_mdapi.h
src/intel/perf/gen_perf_query.c
src/intel/vulkan/anv_perf.c
src/intel/vulkan/genX_query.c

index 4530bb0..85cc4ec 100644 (file)
@@ -423,6 +423,7 @@ init_oa_sys_vars(struct gen_perf_config *perf, const struct gen_device_info *dev
    perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000;
    perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
    perf->sys_vars.revision = devinfo->revision;
+   perf->sys_vars.query_mode = true;
    compute_topology_builtins(perf, devinfo);
 
    return true;
@@ -1118,6 +1119,18 @@ gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *result,
 }
 
 void
+gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
+                                    const struct gen_perf_query_info *query,
+                                    const uint64_t *start,
+                                    const uint64_t *end)
+{
+   for (uint32_t i = 0; i < 2; i++) {
+      result->accumulator[query->perfcnt_offset + i] =
+         (end[i] & PERF_CNT_VALUE_MASK) - (start[i] & PERF_CNT_VALUE_MASK);
+   }
+}
+
+void
 gen_perf_query_result_clear(struct gen_perf_query_result *result)
 {
    memset(result, 0, sizeof(*result));
index 4348c73..bbc8749 100644 (file)
@@ -108,8 +108,10 @@ struct gen_pipeline_stat {
  *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
  * For Gen8+
  *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
+ *
+ * Plus 2 PERF_CNT registers.
  */
-#define MAX_OA_REPORT_COUNTERS 62
+#define MAX_OA_REPORT_COUNTERS (62 + 2)
 
 /*
  * When currently allocate only one page for pipeline statistics queries. Here
@@ -180,10 +182,10 @@ struct gen_perf_query_counter {
    union {
       uint64_t (*oa_counter_read_uint64)(struct gen_perf_config *perf,
                                          const struct gen_perf_query_info *query,
-                                         const uint64_t *accumulator);
+                                         const struct gen_perf_query_result *results);
       float (*oa_counter_read_float)(struct gen_perf_config *perf,
                                      const struct gen_perf_query_info *query,
-                                     const uint64_t *accumulator);
+                                     const struct gen_perf_query_result *results);
       struct gen_pipeline_stat pipeline_stat;
    };
 };
@@ -231,6 +233,7 @@ struct gen_perf_query_info {
    int a_offset;
    int b_offset;
    int c_offset;
+   int perfcnt_offset;
 
    struct gen_perf_registers config;
 };
@@ -282,6 +285,7 @@ struct gen_perf_config {
       uint64_t gt_min_freq;         /** $GpuMinFrequency */
       uint64_t gt_max_freq;         /** $GpuMaxFrequency */
       uint64_t revision;            /** $SkuRevisionId */
+      bool     query_mode;          /** $QueryMode */
    } sys_vars;
 
    /* OA metric sets, indexed by GUID, as know by Mesa at build time, to
@@ -370,6 +374,13 @@ void gen_perf_query_result_read_gt_frequency(struct gen_perf_query_result *resul
                                              const uint32_t start,
                                              const uint32_t end);
 
+/** Store PERFCNT registers values.
+ */
+void gen_perf_query_result_read_perfcnts(struct gen_perf_query_result *result,
+                                         const struct gen_perf_query_info *query,
+                                         const uint64_t *start,
+                                         const uint64_t *end);
+
 /** Accumulate the delta between 2 OA reports into result for a given query.
  */
 void gen_perf_query_result_accumulate(struct gen_perf_query_result *result,
index c1e2b2a..b35b6a0 100644 (file)
@@ -94,7 +94,15 @@ def emit_fsub(tmp_id, args):
 
 def emit_read(tmp_id, args):
     type = args[1].lower()
-    c("uint64_t tmp{0} = accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0]))
+    c("uint64_t tmp{0} = results->accumulator[query->{1}_offset + {2}];".format(tmp_id, type, args[0]))
+    return tmp_id + 1
+
+def emit_read_reg(tmp_id, args):
+    offsets = {
+        'PERFCNT1': 0,
+        'PERFCNT2': 1,
+    }
+    c("uint64_t tmp{0} = results->accumulator[query->perfcnt_offset + {1}];".format(tmp_id, offsets[args[0]]))
     return tmp_id + 1
 
 def emit_uadd(tmp_id, args):
@@ -144,6 +152,7 @@ ops["FMAX"] = (2, emit_fmax)
 ops["FMUL"] = (2, emit_fmul)
 ops["FSUB"] = (2, emit_fsub)
 ops["READ"] = (2, emit_read)
+ops["READ_REG"] = (1, emit_read_reg)
 ops["UADD"] = (2, emit_uadd)
 ops["UDIV"] = (2, emit_udiv)
 ops["UMUL"] = (2, emit_umul)
@@ -193,6 +202,7 @@ hw_vars["$GpuTimestampFrequency"] = "perf->sys_vars.timestamp_frequency"
 hw_vars["$GpuMinFrequency"] = "perf->sys_vars.gt_min_freq"
 hw_vars["$GpuMaxFrequency"] = "perf->sys_vars.gt_max_freq"
 hw_vars["$SkuRevisionId"] = "perf->sys_vars.revision"
+hw_vars["$QueryMode"] = "perf->sys_vars.query_mode"
 
 def output_rpn_equation_code(set, counter, equation):
     c("/* RPN equation: " + equation + " */")
@@ -214,7 +224,7 @@ def output_rpn_equation_code(set, counter, equation):
                         operand = hw_vars[operand]
                     elif operand in set.counter_vars:
                         reference = set.counter_vars[operand]
-                        operand = set.read_funcs[operand[1:]] + "(perf, query, accumulator)"
+                        operand = set.read_funcs[operand[1:]] + "(perf, query, results)"
                     else:
                         raise Exception("Failed to resolve variable " + operand + " in equation " + equation + " for " + set.name + " :: " + counter.get('name'));
                 args.append(operand)
@@ -234,7 +244,7 @@ def output_rpn_equation_code(set, counter, equation):
     if value in hw_vars:
         value = hw_vars[value]
     if value in set.counter_vars:
-        value = set.read_funcs[value[1:]] + "(perf, query, accumulator)"
+        value = set.read_funcs[value[1:]] + "(perf, query, results)"
 
     c("\nreturn " + value + ";")
 
@@ -288,7 +298,7 @@ def output_counter_read(gen, set, counter):
         c(counter.read_sym + "(UNUSED struct gen_perf_config *perf,\n")
         c_indent(len(counter.read_sym) + 1)
         c("const struct gen_perf_query_info *query,\n")
-        c("const uint64_t *accumulator)\n")
+        c("const struct gen_perf_query_result *results)\n")
         c_outdent(len(counter.read_sym) + 1)
 
         c("{")
@@ -729,19 +739,21 @@ def main():
                     query->oa_format = I915_OA_FORMAT_A45_B8_C8;
                     /* Accumulation buffer offsets... */
                     query->gpu_time_offset = 0;
-                    query->a_offset = 1;
-                    query->b_offset = 46;
-                    query->c_offset = 54;
+                    query->a_offset = query->gpu_time_offset + 1;
+                    query->b_offset = query->a_offset + 45;
+                    query->c_offset = query->b_offset + 8;
+                    query->perfcnt_offset = query->c_offset + 8;
                 """))
             else:
                 c(textwrap.dedent("""\
                     query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
                     /* Accumulation buffer offsets... */
                     query->gpu_time_offset = 0;
-                    query->gpu_clock_offset = 1;
-                    query->a_offset = 2;
-                    query->b_offset = 38;
-                    query->c_offset = 46;
+                    query->gpu_clock_offset = query->gpu_time_offset + 1;
+                    query->a_offset = query->gpu_clock_offset + 1;
+                    query->b_offset = query->a_offset + 36;
+                    query->c_offset = query->b_offset + 8;
+                    query->perfcnt_offset = query->c_offset + 8;
                 """))
 
 
index 2452b99..aad5e4e 100644 (file)
@@ -54,6 +54,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
             result->accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i];
       }
 
+      mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
+      mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
+
       mdapi_data->ReportsCount = result->reports_accumulated;
       mdapi_data->TotalTime =
          gen_device_info_timebase_scale(devinfo, result->accumulator[0]);
@@ -75,6 +78,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
             result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
       }
 
+      mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
+      mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
+
       mdapi_data->ReportId = result->hw_id;
       mdapi_data->ReportsCount = result->reports_accumulated;
       mdapi_data->TotalTime =
@@ -106,6 +112,9 @@ gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
             result->accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
       }
 
+      mdapi_data->PerfCounter1 = result->accumulator[query->perfcnt_offset + 0];
+      mdapi_data->PerfCounter2 = result->accumulator[query->perfcnt_offset + 1];
+
       mdapi_data->ReportId = result->hw_id;
       mdapi_data->ReportsCount = result->reports_accumulated;
       mdapi_data->TotalTime =
@@ -354,5 +363,6 @@ gen_perf_register_mdapi_oa_query(struct gen_perf_config *perf,
       query->a_offset = copy_query->a_offset;
       query->b_offset = copy_query->b_offset;
       query->c_offset = copy_query->c_offset;
+      query->perfcnt_offset = copy_query->perfcnt_offset;
    }
 }
index acf1edd..05717d1 100644 (file)
@@ -132,41 +132,6 @@ int gen_perf_query_result_write_mdapi(void *data, uint32_t data_size,
                                       const struct gen_perf_query_info *query,
                                       const struct gen_perf_query_result *result);
 
-static inline void gen_perf_query_mdapi_write_perfcntr(void *data, uint32_t data_size,
-                                                       const struct gen_device_info *devinfo,
-                                                       const uint64_t *begin_perf_cntrs,
-                                                       const uint64_t *end_perf_cntrs)
-{
-   /* Only bits 0:43 of the 64bit registers contains the value. */
-   const uint64_t mask = (1ull << 44) - 1;
-
-   switch (devinfo->gen) {
-   case 8: {
-      if (data_size < sizeof(struct gen8_mdapi_metrics))
-         return;
-      struct gen8_mdapi_metrics *mdapi_data = data;
-      mdapi_data->PerfCounter1 =
-         (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask);
-      mdapi_data->PerfCounter2 =
-         (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask);
-      break;
-   }
-   case 9:
-   case 11: {
-      if (data_size < sizeof(struct gen9_mdapi_metrics))
-         return;
-      struct gen9_mdapi_metrics *mdapi_data = data;
-      mdapi_data->PerfCounter1 =
-         (end_perf_cntrs[0] & mask) - (begin_perf_cntrs[0] & mask);
-      mdapi_data->PerfCounter2 =
-         (end_perf_cntrs[1] & mask) - (begin_perf_cntrs[1] & mask);
-      break;
-   }
-   default:
-      break;
-   }
-}
-
 static inline void gen_perf_query_mdapi_write_marker(void *data, uint32_t data_size,
                                                      const struct gen_device_info *devinfo,
                                                      uint64_t value)
index e6d38b6..a1204b8 100644 (file)
@@ -1423,13 +1423,13 @@ get_oa_counter_data(struct gen_perf_context *perf_ctx,
             out_uint64 = (uint64_t *)(data + counter->offset);
             *out_uint64 =
                counter->oa_counter_read_uint64(perf_cfg, queryinfo,
-                                               query->oa.result.accumulator);
+                                               &query->oa.result);
             break;
          case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
             out_float = (float *)(data + counter->offset);
             *out_float =
                counter->oa_counter_read_float(perf_cfg, queryinfo,
-                                              query->oa.result.accumulator);
+                                              &query->oa.result);
             break;
          default:
             /* So far we aren't using uint32, double or bool32... */
index 0a32399..35bd246 100644 (file)
@@ -421,13 +421,13 @@ anv_perf_write_pass_results(struct gen_perf_config *perf,
             results[c].uint64 =
                counter_pass->counter->oa_counter_read_uint64(perf,
                                                              counter_pass->query,
-                                                             accumulated_results->accumulator);
+                                                             accumulated_results);
             break;
          case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
             results[c].float32 =
                counter_pass->counter->oa_counter_read_float(perf,
                                                             counter_pass->query,
-                                                            accumulated_results->accumulator);
+                                                            accumulated_results);
             break;
          default:
             /* So far we aren't using uint32, double or bool32... */
index 5994488..ab3f6d0 100644 (file)
@@ -326,7 +326,7 @@ intel_perf_rpstart_offset(bool end)
    return 16 + (end ? sizeof(uint32_t) : 0);
 }
 
-#if GEN_GEN >= 8 && GEN_GEN <= 11
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
 static uint32_t
 intel_perf_counter(bool end)
 {
@@ -541,14 +541,14 @@ VkResult genX(GetQueryPoolResults)(
                                                 oa_begin, oa_end);
          gen_perf_query_result_read_gt_frequency(&result, &device->info,
                                                  *rpstat_begin, *rpstat_end);
-         gen_perf_query_result_write_mdapi(pData, stride,
-                                           &device->info,
-                                           query, &result);
-#if GEN_GEN >= 8 && GEN_GEN <= 11
-         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
+         gen_perf_query_result_read_perfcnts(&result, query,
                                              query_data + intel_perf_counter(false),
                                              query_data + intel_perf_counter(true));
 #endif
+         gen_perf_query_result_write_mdapi(pData, stride,
+                                           &device->info,
+                                           query, &result);
          const uint64_t *marker = query_data + intel_perf_marker_offset();
          gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
          break;
@@ -913,7 +913,7 @@ void genX(CmdBeginQueryIndexedEXT)(
                                                 intel_perf_rpstart_offset(false))),
                    gen_mi_reg32(GENX(RPSTAT0_num)));
 #endif
-#if GEN_GEN >= 8 && GEN_GEN <= 11
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                     intel_perf_counter(false))),
                    gen_mi_reg64(GENX(PERFCNT1_num)));
@@ -1047,7 +1047,7 @@ void genX(CmdEndQueryIndexedEXT)(
       uint32_t marker_offset = intel_perf_marker_offset();
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                    gen_mi_imm(cmd_buffer->intel_perf_marker));
-#if GEN_GEN >= 8 && GEN_GEN <= 11
+#if GEN_IS_HASWELL || (GEN_GEN >= 8 && GEN_GEN <= 11)
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                    gen_mi_reg64(GENX(PERFCNT1_num)));
       gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),