perf arm-spe: Use SPE data source for neoverse cores
author Ali Saidi <alisaidi@amazon.com>
Thu, 11 Aug 2022 06:24:39 +0000 (14:24 +0800)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Thu, 11 Aug 2022 22:12:01 +0000 (19:12 -0300)
When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores. The field is IMPLEMENTATION DEFINED (IMPLDEF), but
the Neoverse cores all use the same encoding. I can't find encoding
information for any other SPE implementations to unify their choices with
Arm's, so that is left for future work.

This change populates the mem_lvl_num for Neoverse cores as well as the
deprecated mem_lvl namespace.

Reviewed-by: German Gomez <german.gomez@arm.com>
Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Ali Saidi <alisaidi@amazon.com>
Tested-by: Leo Yan <leo.yan@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Gustavo A. R. Silva <gustavoars@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Like Xu <likexu@tencent.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Timothy Hayes <timothy.hayes@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20220811062451.435810-4-leo.yan@linaro.org
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
tools/perf/util/arm-spe.c

index 5e390a1a79abfd09516d6c18a926adea22fd761a..091987dd39668b8180df652e1b850f111bced55c 100644 (file)
@@ -220,6 +220,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
 
                        break;
                case ARM_SPE_DATA_SOURCE:
+                       decoder->record.source = payload;
                        break;
                case ARM_SPE_BAD:
                        break;
index 69b31084d6be58a53482521af7f646fa158a9ec4..46a61df1145b664465044124f04613382440a5b6 100644 (file)
@@ -29,6 +29,17 @@ enum arm_spe_op_type {
        ARM_SPE_ST              = 1 << 1,
 };
 
+enum arm_spe_neoverse_data_source {
+       ARM_SPE_NV_L1D           = 0x0,
+       ARM_SPE_NV_L2            = 0x8,
+       ARM_SPE_NV_PEER_CORE     = 0x9,
+       ARM_SPE_NV_LOCAL_CLUSTER = 0xa,
+       ARM_SPE_NV_SYS_CACHE     = 0xb,
+       ARM_SPE_NV_PEER_CLUSTER  = 0xc,
+       ARM_SPE_NV_REMOTE        = 0xd,
+       ARM_SPE_NV_DRAM          = 0xe,
+};
+
 struct arm_spe_record {
        enum arm_spe_sample_type type;
        int err;
@@ -40,6 +51,7 @@ struct arm_spe_record {
        u64 virt_addr;
        u64 phys_addr;
        u64 context_id;
+       u16 source;
 };
 
 struct arm_spe_insn;
index d040406f3314c567a8a95707aba2aa8628bd4edf..22dcfe07e886f905dd5eb4805923af3f4b08f210 100644 (file)
@@ -34,6 +34,7 @@
 #include "arm-spe-decoder/arm-spe-decoder.h"
 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
 
+#include "../../arch/arm64/include/asm/cputype.h"
 #define MAX_TIMESTAMP (~0ULL)
 
 struct arm_spe {
@@ -45,6 +46,7 @@ struct arm_spe {
        struct perf_session             *session;
        struct machine                  *machine;
        u32                             pmu_type;
+       u64                             midr;
 
        struct perf_tsc_conversion      tc;
 
@@ -387,35 +389,128 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
        return arm_spe_deliver_synth_event(spe, speq, event, &sample);
 }
 
-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+       MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+       MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+       MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
+       {},
+};
+
+static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
+                                               union perf_mem_data_src *data_src)
 {
-       union perf_mem_data_src data_src = { 0 };
+       /*
+        * Even though four levels of cache hierarchy are possible, no known
+        * production Neoverse systems currently include more than three levels
+        * so for the time being we assume three exist. If a production system
+        * is built with four, then this function would have to be changed to
+        * detect the number of levels for reporting.
+        */
 
-       if (record->op == ARM_SPE_LD)
-               data_src.mem_op = PERF_MEM_OP_LOAD;
-       else if (record->op == ARM_SPE_ST)
-               data_src.mem_op = PERF_MEM_OP_STORE;
-       else
-               return 0;
+       /*
+        * We have no data on the hit level or data source for stores in the
+        * Neoverse SPE records.
+        */
+       if (record->op & ARM_SPE_ST) {
+               data_src->mem_lvl = PERF_MEM_LVL_NA;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
+               data_src->mem_snoop = PERF_MEM_SNOOP_NA;
+               return;
+       }
+
+       switch (record->source) {
+       case ARM_SPE_NV_L1D:
+               data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+               data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+               break;
+       case ARM_SPE_NV_L2:
+               data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+               data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+               break;
+       case ARM_SPE_NV_PEER_CORE:
+               data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+               data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+               break;
+       /*
+        * We don't know if this is L1 or L2, but we do know it was a
+        * cache-to-cache transfer, so set SNOOPX_PEER
+        */
+       case ARM_SPE_NV_LOCAL_CLUSTER:
+       case ARM_SPE_NV_PEER_CLUSTER:
+               data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+               data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+               break;
+       /*
+        * System cache is assumed to be L3
+        */
+       case ARM_SPE_NV_SYS_CACHE:
+               data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+               data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
+               break;
+       /*
+        * We don't know what level it hit in, except it came from the other
+        * socket
+        */
+       case ARM_SPE_NV_REMOTE:
+               data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+               data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+               data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
+               break;
+       case ARM_SPE_NV_DRAM:
+               data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+               data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+               data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+               break;
+       default:
+               break;
+       }
+}
 
+static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
+                                              union perf_mem_data_src *data_src)
+{
        if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
-               data_src.mem_lvl = PERF_MEM_LVL_L3;
+               data_src->mem_lvl = PERF_MEM_LVL_L3;
 
                if (record->type & ARM_SPE_LLC_MISS)
-                       data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+                       data_src->mem_lvl |= PERF_MEM_LVL_MISS;
                else
-                       data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+                       data_src->mem_lvl |= PERF_MEM_LVL_HIT;
        } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
-               data_src.mem_lvl = PERF_MEM_LVL_L1;
+               data_src->mem_lvl = PERF_MEM_LVL_L1;
 
                if (record->type & ARM_SPE_L1D_MISS)
-                       data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+                       data_src->mem_lvl |= PERF_MEM_LVL_MISS;
                else
-                       data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+                       data_src->mem_lvl |= PERF_MEM_LVL_HIT;
        }
 
        if (record->type & ARM_SPE_REMOTE_ACCESS)
-               data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+               data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+}
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
+{
+       union perf_mem_data_src data_src = { 0 };
+       bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
+
+       if (record->op == ARM_SPE_LD)
+               data_src.mem_op = PERF_MEM_OP_LOAD;
+       else if (record->op == ARM_SPE_ST)
+               data_src.mem_op = PERF_MEM_OP_STORE;
+       else
+               return 0;
+
+       if (is_neoverse)
+               arm_spe__synth_data_source_neoverse(record, &data_src);
+       else
+               arm_spe__synth_data_source_generic(record, &data_src);
 
        if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
                data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -436,7 +531,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
        u64 data_src;
        int err;
 
-       data_src = arm_spe__synth_data_source(record);
+       data_src = arm_spe__synth_data_source(record, spe->midr);
 
        if (spe->sample_flc) {
                if (record->type & ARM_SPE_L1D_MISS) {
@@ -1178,6 +1273,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
        struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
        size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
        struct perf_record_time_conv *tc = &session->time_conv;
+       const char *cpuid = perf_env__cpuid(session->evlist->env);
+       u64 midr = strtol(cpuid, NULL, 16);
        struct arm_spe *spe;
        int err;
 
@@ -1197,6 +1294,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
        spe->machine = &session->machines.host; /* No kvm support */
        spe->auxtrace_type = auxtrace_info->type;
        spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
+       spe->midr = midr;
 
        spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);