Merge tag 'perf-tools-fixes-for-v6.4-1-2023-05-20' of git://git.kernel.org/pub/scm...

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 21 May 2023 20:24:59 +0000 (13:24 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 21 May 2023 20:24:59 +0000 (13:24 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 21 May 2023 20:24:59 +0000 (13:24 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 21 May 2023 20:24:59 +0000 (13:24 -0700)
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h

index f8129c6..f7ddd73 100644 (file)
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags {
         __u64 reserved[2];
  };
  
+/*
+ * Counter/Timer offset structure. Describe the virtual/physical offset.
+ * To be used with KVM_ARM_SET_COUNTER_OFFSET.
+ */
+struct kvm_arm_counter_offset {
+       __u64 counter_offset;
+       __u64 reserved;
+};
+
  #define KVM_ARM_TAGS_TO_GUEST          0
  #define KVM_ARM_TAGS_FROM_GUEST                1
  
@@ -372,6 +381,10 @@ enum {
  #endif
  };
  
+/* Device Control API on vm fd */
+#define KVM_ARM_VM_SMCCC_CTRL          0
+#define   KVM_ARM_VM_SMCCC_FILTER      0
+
  /* Device Control API: ARM VGIC */
  #define KVM_DEV_ARM_VGIC_GRP_ADDR      0
  #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1
@@ -411,6 +424,8 @@ enum {
  #define KVM_ARM_VCPU_TIMER_CTRL                1
  #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER                0
  #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER                1
+#define   KVM_ARM_VCPU_TIMER_IRQ_HVTIMER       2
+#define   KVM_ARM_VCPU_TIMER_IRQ_HPTIMER       3
  #define KVM_ARM_VCPU_PVTIME_CTRL       2
  #define   KVM_ARM_VCPU_PVTIME_IPA      0
  
@@ -469,6 +484,27 @@ enum {
  /* run->fail_entry.hardware_entry_failure_reason codes. */
  #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED    (1ULL << 0)
  
+enum kvm_smccc_filter_action {
+       KVM_SMCCC_FILTER_HANDLE = 0,
+       KVM_SMCCC_FILTER_DENY,
+       KVM_SMCCC_FILTER_FWD_TO_USER,
+
+#ifdef __KERNEL__
+       NR_SMCCC_FILTER_ACTIONS
+#endif
+};
+
+struct kvm_smccc_filter {
+       __u32 base;
+       __u32 nr_functions;
+       __u8 action;
+       __u8 pad[15];
+};
+
+/* arm64-specific KVM_EXIT_HYPERCALL flags */
+#define KVM_HYPERCALL_EXIT_SMC         (1U << 0)
+#define KVM_HYPERCALL_EXIT_16BIT       (1U << 1)
+
  #endif
  
  #endif /* __ARM_KVM_H__ */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h

index b890058..cb8ca46 100644 (file)
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -97,7 +97,7 @@
  #define X86_FEATURE_SYSENTER32         ( 3*32+15) /* "" sysenter in IA32 userspace */
  #define X86_FEATURE_REP_GOOD           ( 3*32+16) /* REP microcode works well */
  #define X86_FEATURE_AMD_LBR_V2         ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-#define X86_FEATURE_LFENCE_RDTSC       ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+/* FREE, was #define X86_FEATURE_LFENCE_RDTSC          ( 3*32+18) "" LFENCE synchronizes RDTSC */
  #define X86_FEATURE_ACC_POWER          ( 3*32+19) /* AMD Accumulated Power Mechanism */
  #define X86_FEATURE_NOPL               ( 3*32+20) /* The NOPL (0F 1F) instructions */
  #define X86_FEATURE_ALWAYS             ( 3*32+21) /* "" Always-present feature */
@@ -226,10 +226,9 @@
  
  /* Virtualization flags: Linux defined, word 8 */
  #define X86_FEATURE_TPR_SHADOW         ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI               ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY       ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT                        ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID               ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_FLEXPRIORITY       ( 8*32+ 1) /* Intel FlexPriority */
+#define X86_FEATURE_EPT                        ( 8*32+ 2) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID               ( 8*32+ 3) /* Intel Virtual Processor ID */
  
  #define X86_FEATURE_VMMCALL            ( 8*32+15) /* Prefer VMMCALL to VMCALL */
  #define X86_FEATURE_XENPV              ( 8*32+16) /* "" Xen paravirtual guest */
@@ -307,14 +306,21 @@
  #define X86_FEATURE_SGX_EDECCSSA       (11*32+18) /* "" SGX EDECCSSA user leaf function */
  #define X86_FEATURE_CALL_DEPTH         (11*32+19) /* "" Call depth tracking for RSB stuffing */
  #define X86_FEATURE_MSR_TSX_CTRL       (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA               (11*32+21) /* "" Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC               (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
  
  /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
  #define X86_FEATURE_AVX_VNNI           (12*32+ 4) /* AVX VNNI instructions */
  #define X86_FEATURE_AVX512_BF16                (12*32+ 5) /* AVX512 BFLOAT16 instructions */
  #define X86_FEATURE_CMPCCXADD           (12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT   (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM               (12*32+10) /* "" Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS               (12*32+11) /* "" Fast short REP STOSB */
+#define X86_FEATURE_FSRC               (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
  #define X86_FEATURE_LKGS               (12*32+18) /* "" Load "kernel" (userspace) GS */
  #define X86_FEATURE_AMX_FP16           (12*32+21) /* "" AMX fp16 Support */
  #define X86_FEATURE_AVX_IFMA            (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM                        (12*32+26) /* Linear Address Masking */
  
  /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
  #define X86_FEATURE_CLZERO             (13*32+ 0) /* CLZERO instruction */
@@ -331,6 +337,7 @@
  #define X86_FEATURE_VIRT_SSBD          (13*32+25) /* Virtualized Speculative Store Bypass Disable */
  #define X86_FEATURE_AMD_SSB_NO         (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
  #define X86_FEATURE_CPPC               (13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD            (13*32+28) /* "" Predictive Store Forwarding Disable */
  #define X86_FEATURE_BTC_NO             (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
  #define X86_FEATURE_BRS                        (13*32+31) /* Branch Sampling available */
  
@@ -363,6 +370,7 @@
  #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
  #define X86_FEATURE_X2AVIC             (15*32+18) /* Virtual x2apic */
  #define X86_FEATURE_V_SPEC_CTRL                (15*32+20) /* Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI               (15*32+25) /* Virtual NMI */
  #define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
  
  /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@@ -427,6 +435,13 @@
  #define X86_FEATURE_V_TSC_AUX          (19*32+ 9) /* "" Virtual TSC_AUX */
  #define X86_FEATURE_SME_COHERENT       (19*32+10) /* "" AMD hardware-enforced cache coherency */
  
+/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
+#define X86_FEATURE_NO_NESTED_DATA_BP  (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_LFENCE_RDTSC       (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_NULL_SEL_CLR_BASE  (20*32+ 6) /* "" Null Selector Clears Base */
+#define X86_FEATURE_AUTOIBRS           (20*32+ 8) /* "" Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR     (20*32+ 9) /* "" SMM_CTL MSR is not present */
+
  /*
   * BUG word(s)
   */
@@ -467,5 +482,6 @@
  #define X86_BUG_MMIO_UNKNOWN           X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
  #define X86_BUG_RETBLEED               X86_BUG(27) /* CPU is affected by RETBleed */
  #define X86_BUG_EIBRS_PBRSB            X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB                        X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
  
  #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h

index 5dfa4fb..fafe9be 100644 (file)
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -75,6 +75,12 @@
  # define DISABLE_CALL_DEPTH_TRACKING   (1 << (X86_FEATURE_CALL_DEPTH & 31))
  #endif
  
+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM           0
+#else
+# define DISABLE_LAM           (1 << (X86_FEATURE_LAM & 31))
+#endif
+
  #ifdef CONFIG_INTEL_IOMMU_SVM
  # define DISABLE_ENQCMD                0
  #else
@@ -115,7 +121,7 @@
  #define DISABLED_MASK10        0
  #define DISABLED_MASK11        (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
                          DISABLE_CALL_DEPTH_TRACKING)
-#define DISABLED_MASK12        0
+#define DISABLED_MASK12        (DISABLE_LAM)
  #define DISABLED_MASK13        0
  #define DISABLED_MASK14        0
  #define DISABLED_MASK15        0
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h

index ad35355..3aedae6 100644 (file)
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -206,6 +206,8 @@
  
  /* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
  #define MSR_INTEGRITY_CAPS                     0x000002d9
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT      2
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST          BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
  #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT   4
  #define MSR_INTEGRITY_CAPS_PERIODIC_BIST       BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
  
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h

index 7f467fe..1a6a1f9 100644 (file)
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -559,4 +559,7 @@ struct kvm_pmu_event_filter {
  #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
  #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
  
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE   BIT(0)
+
  #endif /* _ASM_X86_KVM_H */
diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h

index 500b96e..e8d7ebb 100644 (file)
--- a/tools/arch/x86/include/uapi/asm/prctl.h
+++ b/tools/arch/x86/include/uapi/asm/prctl.h
@@ -16,8 +16,16 @@
  #define ARCH_GET_XCOMP_GUEST_PERM      0x1024
  #define ARCH_REQ_XCOMP_GUEST_PERM      0x1025
  
+#define ARCH_XCOMP_TILECFG             17
+#define ARCH_XCOMP_TILEDATA            18
+
  #define ARCH_MAP_VDSO_X32              0x2001
  #define ARCH_MAP_VDSO_32               0x2002
  #define ARCH_MAP_VDSO_64               0x2003
  
+#define ARCH_GET_UNTAG_MASK            0x4001
+#define ARCH_ENABLE_TAGGED_ADDR                0x4002
+#define ARCH_GET_MAX_TAG_BITS          0x4003
+#define ARCH_FORCE_TAGGED_SVA          0x4004
+
  #endif /* _ASM_X86_PRCTL_H */
diff --git a/tools/arch/x86/include/uapi/asm/unistd_32.h b/tools/arch/x86/include/uapi/asm/unistd_32.h

index b8ddfc4..bc48a4d 100644 (file)
--- a/tools/arch/x86/include/uapi/asm/unistd_32.h
+++ b/tools/arch/x86/include/uapi/asm/unistd_32.h
@@ -2,6 +2,9 @@
  #ifndef __NR_fork
  #define __NR_fork 2
  #endif
+#ifndef __NR_execve
+#define __NR_execve 11
+#endif
  #ifndef __NR_getppid
  #define __NR_getppid 64
  #endif
diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S

index a91ac66..d055b82 100644 (file)
--- a/tools/arch/x86/lib/memcpy_64.S
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -10,13 +10,6 @@
  .section .noinstr.text, "ax"
  
  /*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
-/*
   * memcpy - Copy a memory block.
   *
   * Input:
@@ -26,17 +19,21 @@
   *
   * Output:
   * rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
   */
  SYM_TYPED_FUNC_START(__memcpy)
-       ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-                     "jmp memcpy_erms", X86_FEATURE_ERMS
+       ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
  
         movq %rdi, %rax
         movq %rdx, %rcx
-       shrq $3, %rcx
-       andl $7, %edx
-       rep movsq
-       movl %edx, %ecx
         rep movsb
         RET
  SYM_FUNC_END(__memcpy)
@@ -45,17 +42,6 @@ EXPORT_SYMBOL(__memcpy)
  SYM_FUNC_ALIAS(memcpy, __memcpy)
  EXPORT_SYMBOL(memcpy)
  
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
-       movq %rdi, %rax
-       movq %rdx, %rcx
-       rep movsb
-       RET
-SYM_FUNC_END(memcpy_erms)
-
  SYM_FUNC_START_LOCAL(memcpy_orig)
         movq %rdi, %rax
  
diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S

index 6143b1a..7c59a70 100644 (file)
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -18,27 +18,22 @@
   * rdx   count (bytes)
   *
   * rax   original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
   */
  SYM_FUNC_START(__memset)
-       /*
-        * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
-        * to use it when possible. If not available, use fast string instructions.
-        *
-        * Otherwise, use original memset function.
-        */
-       ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
-                     "jmp memset_erms", X86_FEATURE_ERMS
+       ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
  
         movq %rdi,%r9
+       movb %sil,%al
         movq %rdx,%rcx
-       andl $7,%edx
-       shrq $3,%rcx
-       /* expand byte value  */
-       movzbl %sil,%esi
-       movabs $0x0101010101010101,%rax
-       imulq %rsi,%rax
-       rep stosq
-       movl %edx,%ecx
         rep stosb
         movq %r9,%rax
         RET
@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
  SYM_FUNC_ALIAS(memset, __memset)
  EXPORT_SYMBOL(memset)
  
-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi   destination
- * rsi   value (char)
- * rdx   count (bytes)
- *
- * rax   original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
-       movq %rdi,%r9
-       movb %sil,%al
-       movq %rdx,%rcx
-       rep stosb
-       movq %r9,%rax
-       RET
-SYM_FUNC_END(memset_erms)
-
  SYM_FUNC_START_LOCAL(memset_orig)
         movq %rdi,%r10
  
diff --git a/tools/include/asm/alternative.h b/tools/include/asm/alternative.h

index b54bd86..7ce02a2 100644 (file)
--- a/tools/include/asm/alternative.h
+++ b/tools/include/asm/alternative.h
@@ -4,7 +4,6 @@
  
  /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
  
-#define altinstruction_entry #
-#define ALTERNATIVE_2 #
+#define ALTERNATIVE #
  
  #endif
diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h

index 6428085..a87bbbb 100644 (file)
--- a/tools/include/uapi/drm/drm.h
+++ b/tools/include/uapi/drm/drm.h
@@ -972,6 +972,19 @@ extern "C" {
  #define DRM_IOCTL_GET_STATS             DRM_IOR( 0x06, struct drm_stats)
  #define DRM_IOCTL_SET_VERSION          DRM_IOWR(0x07, struct drm_set_version)
  #define DRM_IOCTL_MODESET_CTL           DRM_IOW(0x08, struct drm_modeset_ctl)
+/**
+ * DRM_IOCTL_GEM_CLOSE - Close a GEM handle.
+ *
+ * GEM handles are not reference-counted by the kernel. User-space is
+ * responsible for managing their lifetime. For example, if user-space imports
+ * the same memory object twice on the same DRM file description, the same GEM
+ * handle is returned by both imports, and user-space needs to ensure
+ * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen
+ * when a memory object is allocated, then exported and imported again on the
+ * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception
+ * and always returns fresh new GEM handles even if an existing GEM handle
+ * already refers to the same memory object before the IOCTL is performed.
+ */
  #define DRM_IOCTL_GEM_CLOSE            DRM_IOW (0x09, struct drm_gem_close)
  #define DRM_IOCTL_GEM_FLINK            DRM_IOWR(0x0a, struct drm_gem_flink)
  #define DRM_IOCTL_GEM_OPEN             DRM_IOWR(0x0b, struct drm_gem_open)
@@ -1012,7 +1025,37 @@ extern "C" {
  #define DRM_IOCTL_UNLOCK               DRM_IOW( 0x2b, struct drm_lock)
  #define DRM_IOCTL_FINISH               DRM_IOW( 0x2c, struct drm_lock)
  
+/**
+ * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD.
+ *
+ * User-space sets &drm_prime_handle.handle with the GEM handle to export and
+ * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in
+ * &drm_prime_handle.fd.
+ *
+ * The export can fail for any driver-specific reason, e.g. because export is
+ * not supported for this specific GEM handle (but might be for others).
+ *
+ * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT.
+ */
  #define DRM_IOCTL_PRIME_HANDLE_TO_FD    DRM_IOWR(0x2d, struct drm_prime_handle)
+/**
+ * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle.
+ *
+ * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to
+ * import, and gets back a GEM handle in &drm_prime_handle.handle.
+ * &drm_prime_handle.flags is unused.
+ *
+ * If an existing GEM handle refers to the memory object backing the DMA-BUF,
+ * that GEM handle is returned. Therefore user-space which needs to handle
+ * arbitrary DMA-BUFs must have a user-space lookup data structure to manually
+ * reference-count duplicated GEM handles. For more information see
+ * &DRM_IOCTL_GEM_CLOSE.
+ *
+ * The import can fail for any driver-specific reason, e.g. because import is
+ * only supported for DMA-BUFs allocated on this DRM device.
+ *
+ * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT.
+ */
  #define DRM_IOCTL_PRIME_FD_TO_HANDLE    DRM_IOWR(0x2e, struct drm_prime_handle)
  
  #define DRM_IOCTL_AGP_ACQUIRE          DRM_IO(  0x30)
@@ -1104,8 +1147,13 @@ extern "C" {
   * struct as the output.
   *
   * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
- * will be filled with GEM buffer handles. Planes are valid until one has a
- * zero handle -- this can be used to compute the number of planes.
+ * will be filled with GEM buffer handles. Fresh new GEM handles are always
+ * returned, even if another GEM handle referring to the same memory object
+ * already exists on the DRM file description. The caller is responsible for
+ * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same
+ * new handle will be returned for multiple planes in case they use the same
+ * memory object. Planes are valid until one has a zero handle -- this can be
+ * used to compute the number of planes.
   *
   * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
   * until one has a zero &drm_mode_fb_cmd2.pitches.
@@ -1113,6 +1161,11 @@ extern "C" {
   * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set
   * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the
   * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier.
+ *
+ * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space
+ * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately
+ * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not
+ * double-close handles which are specified multiple times in the array.
   */
  #define DRM_IOCTL_MODE_GETFB2          DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
  
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h

index 8df261c..dba7c5a 100644 (file)
--- a/tools/include/uapi/drm/i915_drm.h
+++ b/tools/include/uapi/drm/i915_drm.h
@@ -2491,7 +2491,7 @@ struct i915_context_param_engines {
  #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
  #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
  #define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
-       struct i915_engine_class_instance engines[0];
+       struct i915_engine_class_instance engines[];
  } __attribute__((packed));
  
  #define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \
@@ -2676,6 +2676,10 @@ enum drm_i915_oa_format {
         I915_OAR_FORMAT_A32u40_A4u32_B8_C8,
         I915_OA_FORMAT_A24u40_A14u32_B8_C8,
  
+       /* MTL OAM */
+       I915_OAM_FORMAT_MPEC8u64_B8_C8,
+       I915_OAM_FORMAT_MPEC8u32_B8_C8,
+
         I915_OA_FORMAT_MAX          /* non-ABI */
  };
  
@@ -2758,6 +2762,25 @@ enum drm_i915_perf_property_id {
          */
         DRM_I915_PERF_PROP_POLL_OA_PERIOD,
  
+       /**
+        * Multiple engines may be mapped to the same OA unit. The OA unit is
+        * identified by class:instance of any engine mapped to it.
+        *
+        * This parameter specifies the engine class and must be passed along
+        * with DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE.
+        *
+        * This property is available in perf revision 6.
+        */
+       DRM_I915_PERF_PROP_OA_ENGINE_CLASS,
+
+       /**
+        * This parameter specifies the engine instance and must be passed along
+        * with DRM_I915_PERF_PROP_OA_ENGINE_CLASS.
+        *
+        * This property is available in perf revision 6.
+        */
+       DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE,
+
         DRM_I915_PERF_PROP_MAX /* non-ABI */
  };
  
diff --git a/tools/include/uapi/linux/const.h b/tools/include/uapi/linux/const.h

index af2a44c..a429381 100644 (file)
--- a/tools/include/uapi/linux/const.h
+++ b/tools/include/uapi/linux/const.h
@@ -28,7 +28,7 @@
  #define _BITUL(x)      (_UL(1) << (x))
  #define _BITULL(x)     (_ULL(1) << (x))
  
-#define __ALIGN_KERNEL(x, a)           __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#define __ALIGN_KERNEL(x, a)           __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
  #define __ALIGN_KERNEL_MASK(x, mask)   (((x) + (mask)) & ~(mask))
  
  #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h

index 07a4cb1..4b7f2df 100644 (file)
--- a/tools/include/uapi/linux/in.h
+++ b/tools/include/uapi/linux/in.h
@@ -162,6 +162,7 @@ struct in_addr {
  #define MCAST_MSFILTER                 48
  #define IP_MULTICAST_ALL               49
  #define IP_UNICAST_IF                  50
+#define IP_LOCAL_PORT_RANGE            51
  
  #define MCAST_EXCLUDE  0
  #define MCAST_INCLUDE  1
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h

index 4003a16..737318b 100644 (file)
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -341,8 +341,13 @@ struct kvm_run {
                         __u64 nr;
                         __u64 args[6];
                         __u64 ret;
-                       __u32 longmode;
-                       __u32 pad;
+
+                       union {
+#ifndef __KERNEL__
+                               __u32 longmode;
+#endif
+                               __u64 flags;
+                       };
                 } hypercall;
                 /* KVM_EXIT_TPR_ACCESS */
                 struct {
@@ -1184,6 +1189,7 @@ struct kvm_ppc_resize_hpt {
  #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
  #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
  #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
+#define KVM_CAP_COUNTER_OFFSET 227
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
@@ -1543,6 +1549,8 @@ struct kvm_s390_ucas_mapping {
  #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
  #define KVM_PPC_SVM_OFF                  _IO(KVMIO,  0xb3)
  #define KVM_ARM_MTE_COPY_TAGS    _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
+/* Available with KVM_CAP_COUNTER_OFFSET */
+#define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO,  0xb5, struct kvm_arm_counter_offset)
  
  /* ioctl for vm fd */
  #define KVM_CREATE_DEVICE        _IOWR(KVMIO,  0xe0, struct kvm_create_device)
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h

index 759b3f5..f23d9a1 100644 (file)
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -290,6 +290,8 @@ struct prctl_mm_map {
  #define PR_SET_VMA             0x53564d41
  # define PR_SET_VMA_ANON_NAME          0
  
+#define PR_GET_AUXV                    0x41555856
+
  #define PR_SET_MEMORY_MERGE            67
  #define PR_GET_MEMORY_MERGE            68
  #endif /* _LINUX_PRCTL_H */
diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h

index de6810e..0aa955a 100644 (file)
--- a/tools/include/uapi/sound/asound.h
+++ b/tools/include/uapi/sound/asound.h
@@ -429,9 +429,14 @@ struct snd_pcm_sw_params {
         snd_pcm_uframes_t avail_min;            /* min avail frames for wakeup */
         snd_pcm_uframes_t xfer_align;           /* obsolete: xfer size need to be a multiple */
         snd_pcm_uframes_t start_threshold;      /* min hw_avail frames for automatic start */
-       snd_pcm_uframes_t stop_threshold;       /* min avail frames for automatic stop */
-       snd_pcm_uframes_t silence_threshold;    /* min distance from noise for silence filling */
-       snd_pcm_uframes_t silence_size;         /* silence block size */
+       /*
+        * The following two thresholds alleviate playback buffer underruns; when
+        * hw_avail drops below the threshold, the respective action is triggered:
+        */
+       snd_pcm_uframes_t stop_threshold;       /* - stop playback */
+       snd_pcm_uframes_t silence_threshold;    /* - pre-fill buffer with silence */
+       snd_pcm_uframes_t silence_size;         /* max size of silence pre-fill; when >= boundary,
+                                                * fill played area with silence immediately */
         snd_pcm_uframes_t boundary;             /* pointers wrap point */
         unsigned int proto;                     /* protocol version */
         unsigned int tstamp_type;               /* timestamp type (req. proto >= 2.0.12) */
@@ -570,7 +575,8 @@ struct __snd_pcm_mmap_status64 {
  struct __snd_pcm_mmap_control64 {
         __pad_before_uframe __pad1;
         snd_pcm_uframes_t appl_ptr;      /* RW: appl ptr (0...boundary-1) */
-       __pad_before_uframe __pad2;
+       __pad_before_uframe __pad2;      // This should be __pad_after_uframe, but binary
+                                        // backwards compatibility constraints prevent a fix.
  
         __pad_before_uframe __pad3;
         snd_pcm_uframes_t  avail_min;    /* RW: min available frames for wakeup */
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config

index 4884520..7026844 100644 (file)
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -216,6 +216,12 @@ ifeq ($(call get-executable,$(BISON)),)
    dummy := $(error Error: $(BISON) is missing on this system, please install it)
  endif
  
+ifeq ($(BUILD_BPF_SKEL),1)
+  ifeq ($(call get-executable,$(CLANG)),)
+    dummy := $(error $(CLANG) is missing on this system, please install it to be able to build with BUILD_BPF_SKEL=1)
+  endif
+endif
+
  ifneq ($(OUTPUT),)
    ifeq ($(shell expr $(shell $(BISON) --version | grep bison | sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \>\= 371), 1)
      BISON_FILE_PREFIX_MAP := --file-prefix-map=$(OUTPUT)=
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf

index a42a6a9..1593c5d 100644 (file)
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1057,14 +1057,32 @@ $(SKEL_TMP_OUT) $(LIBAPI_OUTPUT) $(LIBBPF_OUTPUT) $(LIBPERF_OUTPUT) $(LIBSUBCMD_
  
  ifdef BUILD_BPF_SKEL
  BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
-BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(LIBBPF_INCLUDE)
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
+       | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+ifneq ($(CROSS_COMPILE),)
+CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
+BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES)
+TOOLS_UAPI_INCLUDE := -I$(srctree)/tools/include/uapi
  
  $(BPFTOOL): | $(SKEL_TMP_OUT)
         $(Q)CFLAGS= $(MAKE) -C ../bpf/bpftool \
                 OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
  
  $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
-       $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
+       $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
           -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
  
  $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c

index 77cb03e..9ca040b 100644 (file)
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -78,9 +78,9 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr,
         char path[PATH_MAX];
         int err;
         u32 val;
-       u64 contextid =
-               evsel->core.attr.config &
-               (perf_pmu__format_bits(&cs_etm_pmu->format, "contextid1") |
+       u64 contextid = evsel->core.attr.config &
+               (perf_pmu__format_bits(&cs_etm_pmu->format, "contextid") |
+                perf_pmu__format_bits(&cs_etm_pmu->format, "contextid1") |
                  perf_pmu__format_bits(&cs_etm_pmu->format, "contextid2"));
  
         if (!contextid)
@@ -114,8 +114,7 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr,
                  *  0b00100 Maximum of 32-bit Context ID size.
                  *  All other values are reserved.
                  */
-               val = BMVAL(val, 5, 9);
-               if (!val || val != 0x4) {
+               if (BMVAL(val, 5, 9) != 0x4) {
                         pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n",
                                CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
                         return -EINVAL;
diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c

index d730666..80b9f62 100644 (file)
--- a/tools/perf/arch/arm64/util/header.c
+++ b/tools/perf/arch/arm64/util/header.c
@@ -29,8 +29,8 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus)
                 char path[PATH_MAX];
                 FILE *file;
  
-               scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR,
-                               sysfs, cpus->map[cpu]);
+               scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d" MIDR,
+                         sysfs, RC_CHK_ACCESS(cpus)->map[cpu].cpu);
  
                 file = fopen(path, "r");
                 if (!file) {
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c

index fa143ac..ef1ed64 100644 (file)
--- a/tools/perf/arch/arm64/util/pmu.c
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -18,7 +18,7 @@ static struct perf_pmu *pmu__find_core_pmu(void)
                  * The cpumap should cover all CPUs. Otherwise, some CPUs may
                  * not support some events or have different event IDs.
                  */
-               if (pmu->cpus->nr != cpu__max_cpu().cpu)
+               if (RC_CHK_ACCESS(pmu->cpus)->nr != cpu__max_cpu().cpu)
                         return NULL;
  
                 return pmu;
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl

index 7991476..b68f475 100644 (file)
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -449,7 +449,7 @@
  444  common    landlock_create_ruleset sys_landlock_create_ruleset     sys_landlock_create_ruleset
  445  common    landlock_add_rule       sys_landlock_add_rule           sys_landlock_add_rule
  446  common    landlock_restrict_self  sys_landlock_restrict_self      sys_landlock_restrict_self
-# 447 reserved for memfd_secret
+447  common    memfd_secret            sys_memfd_secret                sys_memfd_secret
  448  common    process_mrelease        sys_process_mrelease            sys_process_mrelease
  449  common    futex_waitv             sys_futex_waitv                 sys_futex_waitv
  450  common    set_mempolicy_home_node sys_set_mempolicy_home_node     sys_set_mempolicy_home_node
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h

index 50ae8bd..6188e19 100644 (file)
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -7,7 +7,3 @@ MEMCPY_FN(memcpy_orig,
  MEMCPY_FN(__memcpy,
         "x86-64-movsq",
         "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
-
-MEMCPY_FN(memcpy_erms,
-       "x86-64-movsb",
-       "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S

index 6eb45a2..1b9fef7 100644 (file)
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -2,7 +2,7 @@
  
  /* Various wrappers to make the kernel .S file build in user-space: */
  
-// memcpy_orig and memcpy_erms are being defined as SYM_L_LOCAL but we need it
+// memcpy_orig is being defined as SYM_L_LOCAL but we need it
  #define SYM_FUNC_START_LOCAL(name)                      \
          SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
  #define memcpy MEMCPY /* don't hide glibc's memcpy() */
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h

index dac6d2b..247c72f 100644 (file)
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -7,7 +7,3 @@ MEMSET_FN(memset_orig,
  MEMSET_FN(__memset,
         "x86-64-stosq",
         "movsq-based memset() in arch/x86/lib/memset_64.S")
-
-MEMSET_FN(memset_erms,
-       "x86-64-stosb",
-       "movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S

index 6f093c4..abd26c9 100644 (file)
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,5 +1,5 @@
  /* SPDX-License-Identifier: GPL-2.0 */
-// memset_orig and memset_erms are being defined as SYM_L_LOCAL but we need it
+// memset_orig is being defined as SYM_L_LOCAL but we need it
  #define SYM_FUNC_START_LOCAL(name)                      \
          SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
  #define memset MEMSET /* don't hide glibc's memset() */
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c

index 006f522..c57be48 100644 (file)
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -3647,6 +3647,13 @@ static int process_stat_config_event(struct perf_session *session __maybe_unused
                                      union perf_event *event)
  {
         perf_event__read_stat_config(&stat_config, &event->stat_config);
+
+       /*
+        * Aggregation modes are not used since post-processing scripts are
+        * supposed to take care of such requirements
+        */
+       stat_config.aggr_mode = AGGR_NONE;
+
         return 0;
  }
  
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c

index cc9fa48..b9ad32f 100644 (file)
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -667,6 +667,13 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
                         evsel_list->core.threads->err_thread = -1;
                         return COUNTER_RETRY;
                 }
+       } else if (counter->skippable) {
+               if (verbose > 0)
+                       ui__warning("skipping event %s that kernel failed to open .\n",
+                                   evsel__name(counter));
+               counter->supported = false;
+               counter->errored = true;
+               return COUNTER_SKIP;
         }
  
         evsel__open_strerror(counter, &target, errno, msg, sizeof(msg));
@@ -1890,15 +1897,28 @@ static int add_default_attributes(void)
                  * caused by exposing latent bugs. This is fixed properly in:
                  * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/
                  */
-               if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid() &&
-                   metricgroup__parse_groups(evsel_list, "TopdownL1",
-                                           /*metric_no_group=*/false,
-                                           /*metric_no_merge=*/false,
-                                           /*metric_no_threshold=*/true,
-                                           stat_config.user_requested_cpu_list,
-                                           stat_config.system_wide,
-                                           &stat_config.metric_events) < 0)
-                       return -1;
+               if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid()) {
+                       struct evlist *metric_evlist = evlist__new();
+                       struct evsel *metric_evsel;
+
+                       if (!metric_evlist)
+                               return -1;
+
+                       if (metricgroup__parse_groups(metric_evlist, "TopdownL1",
+                                                       /*metric_no_group=*/false,
+                                                       /*metric_no_merge=*/false,
+                                                       /*metric_no_threshold=*/true,
+                                                       stat_config.user_requested_cpu_list,
+                                                       stat_config.system_wide,
+                                                       &stat_config.metric_events) < 0)
+                               return -1;
+
+                       evlist__for_each_entry(metric_evlist, metric_evsel) {
+                               metric_evsel->skippable = true;
+                       }
+                       evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
+                       evlist__delete(metric_evlist);
+               }
  
                 /* Platform specific attrs */
                 if (evlist__add_default_attrs(evsel_list, default_null_attrs) < 0)
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json

index 75d80e7..1f90475 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
@@ -133,6 +133,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that uops must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.   The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound.   The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
@@ -143,6 +144,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound_aux",
          "MetricThreshold": "tma_backend_bound_aux > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that UOPS must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.  All of these subevents count backend stalls, in slots, due to a resource limitation.   These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based.  These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
@@ -153,6 +155,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
@@ -163,6 +166,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_base",
          "MetricThreshold": "tma_base > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -182,6 +186,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.05",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -209,6 +214,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -255,6 +261,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -264,6 +271,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -291,6 +299,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -593,6 +602,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.05",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -611,6 +621,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -629,6 +640,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_ms_uops",
          "MetricThreshold": "tma_ms_uops > 0.05",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
@@ -729,6 +741,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group",
          "MetricName": "tma_resource_bound",
          "MetricThreshold": "tma_resource_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that uops must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
@@ -739,6 +752,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.75",
+        "MetricgroupNoGroup": "TopdownL1",
          "ScaleUnit": "100%",
          "Unit": "cpu_atom"
      },
@@ -848,6 +862,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -858,6 +873,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -868,6 +884,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -919,6 +936,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -1031,6 +1049,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -1041,6 +1060,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -1121,6 +1141,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -1141,6 +1162,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -2023,6 +2045,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -2082,6 +2105,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -2112,6 +2136,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
@@ -2310,6 +2335,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
          "ScaleUnit": "100%",
          "Unit": "cpu_core"
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json

index 1a85d93..0402adb 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json
@@ -98,6 +98,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that uops must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.   The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound.   The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
          "ScaleUnit": "100%"
      },
@@ -107,6 +108,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound_aux",
          "MetricThreshold": "tma_backend_bound_aux > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that UOPS must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.  All of these subevents count backend stalls, in slots, due to a resource limitation.   These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based.  These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.",
          "ScaleUnit": "100%"
      },
@@ -116,6 +118,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
          "ScaleUnit": "100%"
      },
@@ -125,6 +128,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_base",
          "MetricThreshold": "tma_base > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -142,6 +146,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.05",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -166,6 +171,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -207,6 +213,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -215,6 +222,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -239,6 +247,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "ScaleUnit": "100%"
      },
      {
@@ -499,6 +508,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.05",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -515,6 +525,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "ScaleUnit": "100%"
      },
      {
@@ -531,6 +542,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_ms_uops",
          "MetricThreshold": "tma_ms_uops > 0.05",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
          "ScaleUnit": "100%"
      },
@@ -620,6 +632,7 @@
          "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group",
          "MetricName": "tma_resource_bound",
          "MetricThreshold": "tma_resource_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that uops must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.",
          "ScaleUnit": "100%"
      },
@@ -629,6 +642,7 @@
          "MetricGroup": "TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.75",
+        "MetricgroupNoGroup": "TopdownL1",
          "ScaleUnit": "100%"
      },
      {
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json

index 51cf856..f9e2316 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
@@ -103,6 +103,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -112,6 +113,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -122,6 +124,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -170,6 +173,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -263,6 +267,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -272,6 +277,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -326,6 +332,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -335,6 +342,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -828,6 +836,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -858,6 +867,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -886,6 +896,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1048,6 +1059,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json

index fb57c73..e9c46d3 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
@@ -97,6 +97,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
          "ScaleUnit": "100%"
      },
@@ -106,6 +107,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -116,6 +118,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -164,6 +167,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -248,6 +252,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -257,6 +262,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -311,6 +317,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -320,6 +327,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -795,6 +803,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -825,6 +834,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -853,6 +863,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1013,6 +1024,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json

index 65ec0c9..437b986 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
@@ -103,6 +103,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -112,6 +113,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -122,6 +124,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -170,6 +173,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -263,6 +267,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -272,6 +277,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -326,6 +332,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -335,6 +342,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -829,6 +837,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -869,6 +878,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -897,6 +907,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1079,6 +1090,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json

index 8f7dc72..875c766 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
@@ -101,6 +101,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -110,6 +111,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -120,6 +122,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -167,6 +170,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -271,6 +275,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -280,6 +285,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -354,6 +360,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -372,6 +379,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -1142,6 +1150,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1196,6 +1205,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1224,6 +1234,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1458,6 +1469,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json

index 2528418..9570a88 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
@@ -103,6 +103,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -112,6 +113,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -122,6 +124,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -161,6 +164,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -254,6 +258,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -263,6 +268,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -272,6 +278,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -281,6 +288,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -663,6 +671,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -693,6 +702,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -721,6 +731,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -874,6 +885,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json

index 11f152c..a522202 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
@@ -103,6 +103,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -112,6 +113,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -122,6 +124,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -161,6 +164,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -254,6 +258,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -263,6 +268,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -272,6 +278,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -281,6 +288,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -664,6 +672,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -704,6 +713,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -732,6 +742,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -905,6 +916,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json

index f45ae34..1a2154f 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
@@ -115,6 +115,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
          "ScaleUnit": "100%"
      },
@@ -124,6 +125,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -141,6 +143,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -187,6 +190,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -288,6 +292,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -297,6 +302,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -369,6 +375,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -378,6 +385,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -1111,6 +1119,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1164,6 +1173,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1191,6 +1201,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1360,6 +1371,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json

index 0f9b174..1ef772b 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json
@@ -80,6 +80,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
          "ScaleUnit": "100%"
      },
@@ -89,6 +90,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -106,6 +108,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -152,6 +155,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -253,6 +257,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -262,6 +267,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -334,6 +340,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -343,6 +350,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -1134,6 +1142,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1187,6 +1196,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1214,6 +1224,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1410,6 +1421,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json

index 5247f69..11080cc 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json
@@ -103,6 +103,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -112,6 +113,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -122,6 +124,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -161,6 +164,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -254,6 +258,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -263,6 +268,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -299,6 +305,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -308,6 +315,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -724,6 +732,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -754,6 +763,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -782,6 +792,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -917,6 +928,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json

index 89469b1..65a46d6 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json
@@ -103,6 +103,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -112,6 +113,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -122,6 +124,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -161,6 +164,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -254,6 +258,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -263,6 +268,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -299,6 +305,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -308,6 +315,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -725,6 +733,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -765,6 +774,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -793,6 +803,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -948,6 +959,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json

index e8f4e5c..66a6f65 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json
@@ -76,6 +76,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -85,6 +86,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -95,6 +97,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -114,6 +117,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -160,6 +164,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -169,6 +174,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -205,6 +211,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -214,6 +221,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -412,6 +420,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -422,6 +431,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -450,6 +460,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -487,6 +498,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json

index 4a99fe5..4b8bc19 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json
@@ -76,6 +76,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -85,6 +86,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -95,6 +97,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -114,6 +117,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -160,6 +164,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -169,6 +174,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
          "ScaleUnit": "100%"
      },
@@ -205,6 +211,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
          "ScaleUnit": "100%"
      },
@@ -214,6 +221,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -411,6 +419,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -421,6 +430,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -449,6 +459,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -486,6 +497,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json

index 126300b..620fc5b 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json
@@ -87,6 +87,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
          "ScaleUnit": "100%"
      },
@@ -96,6 +97,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -105,6 +107,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -151,6 +154,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -252,6 +256,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -261,6 +266,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -351,6 +357,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -369,6 +376,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
          "ScaleUnit": "100%"
      },
@@ -1216,6 +1224,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1269,6 +1278,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1304,6 +1314,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1509,6 +1520,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json

index a6d212b..21ef6c9 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
@@ -101,6 +101,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -110,6 +111,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -120,6 +122,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -167,6 +170,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -271,6 +275,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -280,6 +285,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -345,6 +351,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -363,6 +370,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -1065,6 +1073,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1110,6 +1119,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1138,6 +1148,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1343,6 +1354,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json

index fa2f7f1..eb6f12c 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
@@ -101,6 +101,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
          "ScaleUnit": "100%"
      },
@@ -110,6 +111,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -120,6 +122,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -167,6 +170,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -271,6 +275,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -280,6 +285,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -354,6 +360,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -372,6 +379,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -1123,6 +1131,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1177,6 +1186,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1205,6 +1215,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1429,6 +1440,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json

index 4c80d6b..b442ed4 100644 (file)
--- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json
@@ -109,6 +109,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_backend_bound",
          "MetricThreshold": "tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
          "ScaleUnit": "100%"
      },
@@ -118,6 +119,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_bad_speculation",
          "MetricThreshold": "tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
          "ScaleUnit": "100%"
      },
@@ -135,6 +137,7 @@
          "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
          "MetricName": "tma_branch_mispredicts",
          "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction.  These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
          "ScaleUnit": "100%"
      },
@@ -181,6 +184,7 @@
          "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_core_bound",
          "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck.  Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
          "ScaleUnit": "100%"
      },
@@ -282,6 +286,7 @@
          "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
          "MetricName": "tma_fetch_bandwidth",
          "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues.  For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
          "ScaleUnit": "100%"
      },
@@ -291,6 +296,7 @@
          "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
          "MetricName": "tma_fetch_latency",
          "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues.  For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
          "ScaleUnit": "100%"
      },
@@ -363,6 +369,7 @@
          "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_frontend_bound",
          "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
          "ScaleUnit": "100%"
      },
@@ -372,6 +379,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_heavy_operations",
          "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
          "ScaleUnit": "100%"
      },
@@ -1125,6 +1133,7 @@
          "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
          "MetricName": "tma_light_operations",
          "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
          "ScaleUnit": "100%"
      },
@@ -1178,6 +1187,7 @@
          "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
          "MetricName": "tma_machine_clears",
          "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears.  These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
          "ScaleUnit": "100%"
      },
@@ -1205,6 +1215,7 @@
          "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
          "MetricName": "tma_memory_bound",
          "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
          "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck.  Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
          "ScaleUnit": "100%"
      },
@@ -1374,6 +1385,7 @@
          "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
          "MetricName": "tma_retiring",
          "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
          "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category.  Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved.  Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance.  For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
          "ScaleUnit": "100%"
      },
diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py

index ca99b9c..f57a8f2 100755 (executable)
--- a/tools/perf/pmu-events/jevents.py
+++ b/tools/perf/pmu-events/jevents.py
@@ -52,7 +52,8 @@ _json_event_attributes = [
  # Attributes that are in pmu_metric rather than pmu_event.
  _json_metric_attributes = [
      'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', 'desc',
-    'long_desc', 'unit', 'compat', 'aggr_mode', 'event_grouping'
+    'long_desc', 'unit', 'compat', 'metricgroup_no_group', 'aggr_mode',
+    'event_grouping'
  ]
  # Attributes that are bools or enum int values, encoded as '0', '1',...
  _json_enum_attributes = ['aggr_mode', 'deprecated', 'event_grouping', 'perpkg']
@@ -303,6 +304,7 @@ class JsonEvent:
      self.deprecated = jd.get('Deprecated')
      self.metric_name = jd.get('MetricName')
      self.metric_group = jd.get('MetricGroup')
+    self.metricgroup_no_group = jd.get('MetricgroupNoGroup')
      self.event_grouping = convert_metric_constraint(jd.get('MetricConstraint'))
      self.metric_expr = None
      if 'MetricExpr' in jd:
diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h

index b7dff8f..8034968 100644 (file)
--- a/tools/perf/pmu-events/pmu-events.h
+++ b/tools/perf/pmu-events/pmu-events.h
@@ -59,6 +59,7 @@ struct pmu_metric {
         const char *compat;
         const char *desc;
         const char *long_desc;
+       const char *metricgroup_no_group;
         enum aggr_mode_class aggr_mode;
         enum metric_event_groups event_grouping;
  };
diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py

index ccfef86..e890c26 100644 (file)
--- a/tools/perf/tests/attr.py
+++ b/tools/perf/tests/attr.py
@@ -152,7 +152,7 @@ def parse_version(version):
  #   - expected values assignments
  class Test(object):
      def __init__(self, path, options):
-        parser = configparser.SafeConfigParser()
+        parser = configparser.ConfigParser()
          parser.read(path)
  
          log.warning("running '%s'" % path)
@@ -247,7 +247,7 @@ class Test(object):
          return True
  
      def load_events(self, path, events):
-        parser_event = configparser.SafeConfigParser()
+        parser_event = configparser.ConfigParser()
          parser_event.read(path)
  
          # The event record section header contains 'event' word,
@@ -261,7 +261,7 @@ class Test(object):
              # Read parent event if there's any
              if (':' in section):
                  base = section[section.index(':') + 1:]
-                parser_base = configparser.SafeConfigParser()
+                parser_base = configparser.ConfigParser()
                  parser_base.read(self.test_dir + '/' + base)
                  base_items = parser_base.items('event')
  
diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat

index a21fb65..fccd8ec 100644 (file)
--- a/tools/perf/tests/attr/base-stat
+++ b/tools/perf/tests/attr/base-stat
@@ -16,7 +16,7 @@ pinned=0
  exclusive=0
  exclude_user=0
  exclude_kernel=0|1
-exclude_hv=0
+exclude_hv=0|1
  exclude_idle=0
  mmap=0
  comm=0
diff --git a/tools/perf/tests/attr/test-stat-default b/tools/perf/tests/attr/test-stat-default

index d8ea6a8..a1e2da0 100644 (file)
--- a/tools/perf/tests/attr/test-stat-default
+++ b/tools/perf/tests/attr/test-stat-default
@@ -40,7 +40,6 @@ fd=6
  type=0
  config=7
  optional=1
-
  # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND
  [event7:base-stat]
  fd=7
@@ -89,79 +88,98 @@ enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
+# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
  [event13:base-stat]
  fd=13
  group_fd=11
  type=4
-config=33024
+config=33280
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
+# PERF_TYPE_RAW / topdown-be-bound (0x8300)
  [event14:base-stat]
  fd=14
  group_fd=11
  type=4
-config=33280
+config=33536
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-be-bound (0x8300)
+# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
  [event15:base-stat]
  fd=15
  group_fd=11
  type=4
-config=33536
+config=33024
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-heavy-ops (0x8400)
+# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
  [event16:base-stat]
  fd=16
-group_fd=11
  type=4
-config=33792
-disabled=0
-enable_on_exec=0
-read_format=15
+config=4109
  optional=1
  
-# PERF_TYPE_RAW / topdown-br-mispredict (0x8500)
+# PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
  [event17:base-stat]
  fd=17
-group_fd=11
  type=4
-config=34048
-disabled=0
-enable_on_exec=0
-read_format=15
+config=17039629
  optional=1
  
-# PERF_TYPE_RAW / topdown-fetch-lat (0x8600)
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
  [event18:base-stat]
  fd=18
-group_fd=11
  type=4
-config=34304
-disabled=0
-enable_on_exec=0
-read_format=15
+config=60
  optional=1
  
-# PERF_TYPE_RAW / topdown-mem-bound (0x8700)
+# PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
  [event19:base-stat]
  fd=19
-group_fd=11
  type=4
-config=34560
-disabled=0
-enable_on_exec=0
-read_format=15
+config=2097421
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
+[event20:base-stat]
+fd=20
+type=4
+config=316
+optional=1
+
+# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
+[event21:base-stat]
+fd=21
+type=4
+config=412
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+[event22:base-stat]
+fd=22
+type=4
+config=572
+optional=1
+
+# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
+[event23:base-stat]
+fd=23
+type=4
+config=706
+optional=1
+
+# PERF_TYPE_RAW / UOPS_ISSUED.ANY
+[event24:base-stat]
+fd=24
+type=4
+config=270
  optional=1
diff --git a/tools/perf/tests/attr/test-stat-detailed-1 b/tools/perf/tests/attr/test-stat-detailed-1

index b656ab9..1c52cb0 100644 (file)
--- a/tools/perf/tests/attr/test-stat-detailed-1
+++ b/tools/perf/tests/attr/test-stat-detailed-1
@@ -90,89 +90,108 @@ enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
+# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
  [event13:base-stat]
  fd=13
  group_fd=11
  type=4
-config=33024
+config=33280
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
+# PERF_TYPE_RAW / topdown-be-bound (0x8300)
  [event14:base-stat]
  fd=14
  group_fd=11
  type=4
-config=33280
+config=33536
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-be-bound (0x8300)
+# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
  [event15:base-stat]
  fd=15
  group_fd=11
  type=4
-config=33536
+config=33024
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-heavy-ops (0x8400)
+# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
  [event16:base-stat]
  fd=16
-group_fd=11
  type=4
-config=33792
-disabled=0
-enable_on_exec=0
-read_format=15
+config=4109
  optional=1
  
-# PERF_TYPE_RAW / topdown-br-mispredict (0x8500)
+# PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
  [event17:base-stat]
  fd=17
-group_fd=11
  type=4
-config=34048
-disabled=0
-enable_on_exec=0
-read_format=15
+config=17039629
  optional=1
  
-# PERF_TYPE_RAW / topdown-fetch-lat (0x8600)
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
  [event18:base-stat]
  fd=18
-group_fd=11
  type=4
-config=34304
-disabled=0
-enable_on_exec=0
-read_format=15
+config=60
  optional=1
  
-# PERF_TYPE_RAW / topdown-mem-bound (0x8700)
+# PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
  [event19:base-stat]
  fd=19
-group_fd=11
  type=4
-config=34560
-disabled=0
-enable_on_exec=0
-read_format=15
+config=2097421
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
+[event20:base-stat]
+fd=20
+type=4
+config=316
+optional=1
+
+# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
+[event21:base-stat]
+fd=21
+type=4
+config=412
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+[event22:base-stat]
+fd=22
+type=4
+config=572
+optional=1
+
+# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
+[event23:base-stat]
+fd=23
+type=4
+config=706
+optional=1
+
+# PERF_TYPE_RAW / UOPS_ISSUED.ANY
+[event24:base-stat]
+fd=24
+type=4
+config=270
  optional=1
  
  # PERF_TYPE_HW_CACHE /
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event20:base-stat]
-fd=20
+[event25:base-stat]
+fd=25
  type=3
  config=0
  optional=1
@@ -181,8 +200,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event21:base-stat]
-fd=21
+[event26:base-stat]
+fd=26
  type=3
  config=65536
  optional=1
@@ -191,8 +210,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event22:base-stat]
-fd=22
+[event27:base-stat]
+fd=27
  type=3
  config=2
  optional=1
@@ -201,8 +220,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event23:base-stat]
-fd=23
+[event28:base-stat]
+fd=28
  type=3
  config=65538
  optional=1
diff --git a/tools/perf/tests/attr/test-stat-detailed-2 b/tools/perf/tests/attr/test-stat-detailed-2

index 9762509..7e961d2 100644 (file)
--- a/tools/perf/tests/attr/test-stat-detailed-2
+++ b/tools/perf/tests/attr/test-stat-detailed-2
@@ -90,89 +90,108 @@ enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
+# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
  [event13:base-stat]
  fd=13
  group_fd=11
  type=4
-config=33024
+config=33280
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
+# PERF_TYPE_RAW / topdown-be-bound (0x8300)
  [event14:base-stat]
  fd=14
  group_fd=11
  type=4
-config=33280
+config=33536
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-be-bound (0x8300)
+# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
  [event15:base-stat]
  fd=15
  group_fd=11
  type=4
-config=33536
+config=33024
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-heavy-ops (0x8400)
+# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
  [event16:base-stat]
  fd=16
-group_fd=11
  type=4
-config=33792
-disabled=0
-enable_on_exec=0
-read_format=15
+config=4109
  optional=1
  
-# PERF_TYPE_RAW / topdown-br-mispredict (0x8500)
+# PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
  [event17:base-stat]
  fd=17
-group_fd=11
  type=4
-config=34048
-disabled=0
-enable_on_exec=0
-read_format=15
+config=17039629
  optional=1
  
-# PERF_TYPE_RAW / topdown-fetch-lat (0x8600)
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
  [event18:base-stat]
  fd=18
-group_fd=11
  type=4
-config=34304
-disabled=0
-enable_on_exec=0
-read_format=15
+config=60
  optional=1
  
-# PERF_TYPE_RAW / topdown-mem-bound (0x8700)
+# PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
  [event19:base-stat]
  fd=19
-group_fd=11
  type=4
-config=34560
-disabled=0
-enable_on_exec=0
-read_format=15
+config=2097421
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
+[event20:base-stat]
+fd=20
+type=4
+config=316
+optional=1
+
+# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
+[event21:base-stat]
+fd=21
+type=4
+config=412
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+[event22:base-stat]
+fd=22
+type=4
+config=572
+optional=1
+
+# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
+[event23:base-stat]
+fd=23
+type=4
+config=706
+optional=1
+
+# PERF_TYPE_RAW / UOPS_ISSUED.ANY
+[event24:base-stat]
+fd=24
+type=4
+config=270
  optional=1
  
  # PERF_TYPE_HW_CACHE /
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event20:base-stat]
-fd=20
+[event25:base-stat]
+fd=25
  type=3
  config=0
  optional=1
@@ -181,8 +200,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event21:base-stat]
-fd=21
+[event26:base-stat]
+fd=26
  type=3
  config=65536
  optional=1
@@ -191,8 +210,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event22:base-stat]
-fd=22
+[event27:base-stat]
+fd=27
  type=3
  config=2
  optional=1
@@ -201,8 +220,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event23:base-stat]
-fd=23
+[event28:base-stat]
+fd=28
  type=3
  config=65538
  optional=1
@@ -211,8 +230,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event24:base-stat]
-fd=24
+[event29:base-stat]
+fd=29
  type=3
  config=1
  optional=1
@@ -221,8 +240,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event25:base-stat]
-fd=25
+[event30:base-stat]
+fd=30
  type=3
  config=65537
  optional=1
@@ -231,8 +250,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event26:base-stat]
-fd=26
+[event31:base-stat]
+fd=31
  type=3
  config=3
  optional=1
@@ -241,8 +260,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event27:base-stat]
-fd=27
+[event32:base-stat]
+fd=32
  type=3
  config=65539
  optional=1
@@ -251,8 +270,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event28:base-stat]
-fd=28
+[event33:base-stat]
+fd=33
  type=3
  config=4
  optional=1
@@ -261,8 +280,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event29:base-stat]
-fd=29
+[event34:base-stat]
+fd=34
  type=3
  config=65540
  optional=1
diff --git a/tools/perf/tests/attr/test-stat-detailed-3 b/tools/perf/tests/attr/test-stat-detailed-3

index d555042..e50535f 100644 (file)
--- a/tools/perf/tests/attr/test-stat-detailed-3
+++ b/tools/perf/tests/attr/test-stat-detailed-3
@@ -90,89 +90,108 @@ enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
+# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
  [event13:base-stat]
  fd=13
  group_fd=11
  type=4
-config=33024
+config=33280
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-fe-bound (0x8200)
+# PERF_TYPE_RAW / topdown-be-bound (0x8300)
  [event14:base-stat]
  fd=14
  group_fd=11
  type=4
-config=33280
+config=33536
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-be-bound (0x8300)
+# PERF_TYPE_RAW / topdown-bad-spec (0x8100)
  [event15:base-stat]
  fd=15
  group_fd=11
  type=4
-config=33536
+config=33024
  disabled=0
  enable_on_exec=0
  read_format=15
  optional=1
  
-# PERF_TYPE_RAW / topdown-heavy-ops (0x8400)
+# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
  [event16:base-stat]
  fd=16
-group_fd=11
  type=4
-config=33792
-disabled=0
-enable_on_exec=0
-read_format=15
+config=4109
  optional=1
  
-# PERF_TYPE_RAW / topdown-br-mispredict (0x8500)
+# PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
  [event17:base-stat]
  fd=17
-group_fd=11
  type=4
-config=34048
-disabled=0
-enable_on_exec=0
-read_format=15
+config=17039629
  optional=1
  
-# PERF_TYPE_RAW / topdown-fetch-lat (0x8600)
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
  [event18:base-stat]
  fd=18
-group_fd=11
  type=4
-config=34304
-disabled=0
-enable_on_exec=0
-read_format=15
+config=60
  optional=1
  
-# PERF_TYPE_RAW / topdown-mem-bound (0x8700)
+# PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
  [event19:base-stat]
  fd=19
-group_fd=11
  type=4
-config=34560
-disabled=0
-enable_on_exec=0
-read_format=15
+config=2097421
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
+[event20:base-stat]
+fd=20
+type=4
+config=316
+optional=1
+
+# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
+[event21:base-stat]
+fd=21
+type=4
+config=412
+optional=1
+
+# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+[event22:base-stat]
+fd=22
+type=4
+config=572
+optional=1
+
+# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
+[event23:base-stat]
+fd=23
+type=4
+config=706
+optional=1
+
+# PERF_TYPE_RAW / UOPS_ISSUED.ANY
+[event24:base-stat]
+fd=24
+type=4
+config=270
  optional=1
  
  # PERF_TYPE_HW_CACHE /
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event20:base-stat]
-fd=20
+[event25:base-stat]
+fd=25
  type=3
  config=0
  optional=1
@@ -181,8 +200,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event21:base-stat]
-fd=21
+[event26:base-stat]
+fd=26
  type=3
  config=65536
  optional=1
@@ -191,8 +210,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event22:base-stat]
-fd=22
+[event27:base-stat]
+fd=27
  type=3
  config=2
  optional=1
@@ -201,8 +220,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_LL                 <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event23:base-stat]
-fd=23
+[event28:base-stat]
+fd=28
  type=3
  config=65538
  optional=1
@@ -211,8 +230,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event24:base-stat]
-fd=24
+[event29:base-stat]
+fd=29
  type=3
  config=1
  optional=1
@@ -221,8 +240,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event25:base-stat]
-fd=25
+[event30:base-stat]
+fd=30
  type=3
  config=65537
  optional=1
@@ -231,8 +250,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event26:base-stat]
-fd=26
+[event31:base-stat]
+fd=31
  type=3
  config=3
  optional=1
@@ -241,8 +260,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_DTLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event27:base-stat]
-fd=27
+[event32:base-stat]
+fd=32
  type=3
  config=65539
  optional=1
@@ -251,8 +270,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event28:base-stat]
-fd=28
+[event33:base-stat]
+fd=33
  type=3
  config=4
  optional=1
@@ -261,8 +280,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_ITLB               <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event29:base-stat]
-fd=29
+[event34:base-stat]
+fd=34
  type=3
  config=65540
  optional=1
@@ -271,8 +290,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_PREFETCH        <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)
-[event30:base-stat]
-fd=30
+[event35:base-stat]
+fd=35
  type=3
  config=512
  optional=1
@@ -281,8 +300,8 @@ optional=1
  #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
  # (PERF_COUNT_HW_CACHE_OP_PREFETCH        <<  8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)
-[event31:base-stat]
-fd=31
+[event36:base-stat]
+fd=36
  type=3
  config=66048
  optional=1
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c

index cbf0e0c..733ead1 100644 (file)
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -120,7 +120,8 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
  
         p = "FOO/0";
         ret = expr__parse(&val, ctx, p);
-       TEST_ASSERT_VAL("division by zero", ret == -1);
+       TEST_ASSERT_VAL("division by zero", ret == 0);
+       TEST_ASSERT_VAL("division by zero", isnan(val));
  
         p = "BAR/";
         ret = expr__parse(&val, ctx, p);
diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c

index 1185b79..c05148e 100644 (file)
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -38,6 +38,7 @@ static void load_runtime_stat(struct evlist *evlist, struct value *vals)
         evlist__alloc_aggr_stats(evlist, 1);
         evlist__for_each_entry(evlist, evsel) {
                 count = find_value(evsel->name, vals);
+               evsel->supported = true;
                 evsel->stats->aggr->counts.val = count;
                 if (evsel__name_is(evsel, "duration_time"))
                         update_stats(&walltime_nsecs_stats, count);
diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh

index 2c1d3f7..b154fbb 100755 (executable)
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -28,6 +28,18 @@ test_stat_record_report() {
    echo "stat record and report test [Success]"
  }
  
+test_stat_record_script() {
+  echo "stat record and script test"
+  if ! perf stat record -o - true | perf script -i - 2>&1 | \
+    grep -E -q "CPU[[:space:]]+THREAD[[:space:]]+VAL[[:space:]]+ENA[[:space:]]+RUN[[:space:]]+TIME[[:space:]]+EVENT"
+  then
+    echo "stat record and script test [Failed]"
+    err=1
+    return
+  fi
+  echo "stat record and script test [Success]"
+}
+
  test_stat_repeat_weak_groups() {
    echo "stat repeat weak groups test"
    if ! perf stat -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}' \
@@ -93,6 +105,7 @@ test_topdown_weak_groups() {
  
  test_default_stat
  test_stat_record_report
+test_stat_record_script
  test_stat_repeat_weak_groups
  test_topdown_groups
  test_topdown_weak_groups
diff --git a/tools/perf/tests/shell/test_intel_pt.sh b/tools/perf/tests/shell/test_intel_pt.sh

index 4ddb17c..3a8b9bf 100755 (executable)
--- a/tools/perf/tests/shell/test_intel_pt.sh
+++ b/tools/perf/tests/shell/test_intel_pt.sh
@@ -506,6 +506,13 @@ test_sample()
                 echo "perf record failed with --aux-sample"
                 return 1
         fi
+       # Check with event with PMU name
+       if perf_record_no_decode -o "${perfdatafile}" -e br_misp_retired.all_branches:u uname ; then
+               if ! perf_record_no_decode -o "${perfdatafile}" -e '{intel_pt//,br_misp_retired.all_branches/aux-sample-size=8192/}:u' uname ; then
+                       echo "perf record failed with --aux-sample-size"
+                       return 1
+               fi
+       fi
         echo OK
         return 0
  }
diff --git a/tools/perf/tests/shell/test_java_symbol.sh b/tools/perf/tests/shell/test_java_symbol.sh

index 90cea88..499539d 100755 (executable)
--- a/tools/perf/tests/shell/test_java_symbol.sh
+++ b/tools/perf/tests/shell/test_java_symbol.sh
@@ -56,7 +56,7 @@ if [ $? -ne 0 ]; then
         exit 1
  fi
  
-if ! perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then
+if ! DEBUGINFOD_URLS='' perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then
         echo "Fail to inject samples"
         exit 1
  fi
diff --git a/tools/perf/trace/beauty/arch_prctl.c b/tools/perf/trace/beauty/arch_prctl.c

index fe022ca..a211348 100644 (file)
--- a/tools/perf/trace/beauty/arch_prctl.c
+++ b/tools/perf/trace/beauty/arch_prctl.c
@@ -12,10 +12,12 @@
  
  static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_1, "ARCH_", x86_arch_prctl_codes_1_offset);
  static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_2, "ARCH_", x86_arch_prctl_codes_2_offset);
+static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_3, "ARCH_", x86_arch_prctl_codes_3_offset);
  
  static struct strarray *x86_arch_prctl_codes[] = {
         &strarray__x86_arch_prctl_codes_1,
         &strarray__x86_arch_prctl_codes_2,
+       &strarray__x86_arch_prctl_codes_3,
  };
  
  static DEFINE_STRARRAYS(x86_arch_prctl_codes);
diff --git a/tools/perf/trace/beauty/x86_arch_prctl.sh b/tools/perf/trace/beauty/x86_arch_prctl.sh

index 57fa6aa..fd5c740 100755 (executable)
--- a/tools/perf/trace/beauty/x86_arch_prctl.sh
+++ b/tools/perf/trace/beauty/x86_arch_prctl.sh
@@ -24,3 +24,4 @@ print_range () {
  
  print_range 1 0x1 0x1001
  print_range 2 0x2 0x2001
+print_range 3 0x4 0x4001
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c

index 8d3cfbb..1d48226 100644 (file)
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -416,6 +416,8 @@ int contention_end(u64 *ctx)
         return 0;
  }
  
+struct rq {};
+
  extern struct rq runqueues __ksym;
  
  struct rq___old {
diff --git a/tools/perf/util/bpf_skel/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux.h

index 449b1ea..c7ed51b 100644 (file)
--- a/tools/perf/util/bpf_skel/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux.h
@@ -1,6 +1,7 @@
  #ifndef __VMLINUX_H
  #define __VMLINUX_H
  
+#include <linux/stddef.h> // for define __always_inline
  #include <linux/bpf.h>
  #include <linux/types.h>
  #include <linux/perf_event.h>
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c

index 356c07f..2f5910b 100644 (file)
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -290,6 +290,7 @@ void evsel__init(struct evsel *evsel,
         evsel->per_pkg_mask  = NULL;
         evsel->collect_stat  = false;
         evsel->pmu_name      = NULL;
+       evsel->skippable     = false;
  }
  
  struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx)
@@ -828,26 +829,26 @@ bool evsel__name_is(struct evsel *evsel, const char *name)
  
  const char *evsel__group_pmu_name(const struct evsel *evsel)
  {
-       const struct evsel *leader;
+       struct evsel *leader = evsel__leader(evsel);
+       struct evsel *pos;
  
-       /* If the pmu_name is set use it. pmu_name isn't set for CPU and software events. */
-       if (evsel->pmu_name)
-               return evsel->pmu_name;
         /*
          * Software events may be in a group with other uncore PMU events. Use
-        * the pmu_name of the group leader to avoid breaking the software event
-        * out of the group.
+        * the pmu_name of the first non-software event to avoid breaking the
+        * software event out of the group.
          *
          * Aux event leaders, like intel_pt, expect a group with events from
          * other PMUs, so substitute the AUX event's PMU in this case.
          */
-       leader  = evsel__leader(evsel);
-       if ((evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) &&
-           leader->pmu_name) {
-               return leader->pmu_name;
+       if (evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) {
+               /* Starting with the leader, find the first event with a named PMU. */
+               for_each_group_evsel(pos, leader) {
+                       if (pos->pmu_name)
+                               return pos->pmu_name;
+               }
         }
  
-       return "cpu";
+       return evsel->pmu_name ?: "cpu";
  }
  
  const char *evsel__metric_id(const struct evsel *evsel)
@@ -1725,9 +1726,13 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
                 return -1;
  
         fd = FD(leader, cpu_map_idx, thread);
-       BUG_ON(fd == -1);
+       BUG_ON(fd == -1 && !leader->skippable);
  
-       return fd;
+       /*
+        * When the leader has been skipped, return -2 to distinguish from no
+        * group leader case.
+        */
+       return fd == -1 ? -2 : fd;
  }
  
  static void evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int thread_idx)
@@ -2109,6 +2114,12 @@ retry_open:
  
                         group_fd = get_group_fd(evsel, idx, thread);
  
+                       if (group_fd == -2) {
+                               pr_debug("broken group leader for %s\n", evsel->name);
+                               err = -EINVAL;
+                               goto out_close;
+                       }
+
                         test_attr__ready();
  
                         /* Debug message used by test scripts */
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h

index d575390..df89287 100644 (file)
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -95,6 +95,7 @@ struct evsel {
                 bool                    weak_group;
                 bool                    bpf_counter;
                 bool                    use_config_name;
+               bool                    skippable;
                 int                     bpf_fd;
                 struct bpf_object       *bpf_obj;
                 struct list_head        config_terms;
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y

index 250e444..4ce931c 100644 (file)
--- a/tools/perf/util/expr.y
+++ b/tools/perf/util/expr.y
@@ -225,7 +225,11 @@ expr: NUMBER
  {
         if (fpclassify($3.val) == FP_ZERO) {
                 pr_debug("division by zero\n");
-               YYABORT;
+               assert($3.ids == NULL);
+               if (compute_ids)
+                       ids__free($1.ids);
+               $$.val = NAN;
+               $$.ids = NULL;
         } else if (!compute_ids || (is_const($1.val) && is_const($3.val))) {
                 assert($1.ids == NULL);
                 assert($3.ids == NULL);
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c

index c566c68..5e9c657 100644 (file)
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -1144,12 +1144,12 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm,
         struct metricgroup__add_metric_data *data = vdata;
         int ret = 0;
  
-       if (pm->metric_expr &&
-               (match_metric(pm->metric_group, data->metric_name) ||
-                match_metric(pm->metric_name, data->metric_name))) {
+       if (pm->metric_expr && match_pm_metric(pm, data->metric_name)) {
+               bool metric_no_group = data->metric_no_group ||
+                       match_metric(data->metric_name, pm->metricgroup_no_group);
  
                 data->has_match = true;
-               ret = add_metric(data->list, pm, data->modifier, data->metric_no_group,
+               ret = add_metric(data->list, pm, data->modifier, metric_no_group,
                                  data->metric_no_threshold, data->user_requested_cpu_list,
                                  data->system_wide, /*root_metric=*/NULL,
                                  /*visited_metrics=*/NULL, table);
@@ -1672,7 +1672,7 @@ static int metricgroup__topdown_max_level_callback(const struct pmu_metric *pm,
  {
         unsigned int *max_level = data;
         unsigned int level;
-       const char *p = strstr(pm->metric_group, "TopdownL");
+       const char *p = strstr(pm->metric_group ?: "", "TopdownL");
  
         if (!p || p[8] == '\0')
                 return 0;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

index d71019d..34ba840 100644 (file)
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -2140,25 +2140,32 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list
         int *leader_idx = state;
         int lhs_leader_idx = *leader_idx, rhs_leader_idx = *leader_idx, ret;
         const char *lhs_pmu_name, *rhs_pmu_name;
+       bool lhs_has_group = false, rhs_has_group = false;
  
         /*
          * First sort by grouping/leader. Read the leader idx only if the evsel
          * is part of a group, as -1 indicates no group.
          */
-       if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1)
+       if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1) {
+               lhs_has_group = true;
                 lhs_leader_idx = lhs_core->leader->idx;
-       if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1)
+       }
+       if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1) {
+               rhs_has_group = true;
                 rhs_leader_idx = rhs_core->leader->idx;
+       }
  
         if (lhs_leader_idx != rhs_leader_idx)
                 return lhs_leader_idx - rhs_leader_idx;
  
-       /* Group by PMU. Groups can't span PMUs. */
-       lhs_pmu_name = evsel__group_pmu_name(lhs);
-       rhs_pmu_name = evsel__group_pmu_name(rhs);
-       ret = strcmp(lhs_pmu_name, rhs_pmu_name);
-       if (ret)
-               return ret;
+       /* Group by PMU if there is a group. Groups can't span PMUs. */
+       if (lhs_has_group && rhs_has_group) {
+               lhs_pmu_name = evsel__group_pmu_name(lhs);
+               rhs_pmu_name = evsel__group_pmu_name(rhs);
+               ret = strcmp(lhs_pmu_name, rhs_pmu_name);
+               if (ret)
+                       return ret;
+       }
  
         /* Architecture specific sorting. */
         return arch_evlist__cmp(lhs, rhs);
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c

index 73b2ff2..bf5a6c1 100644 (file)
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -431,7 +431,7 @@ static void print_metric_json(struct perf_stat_config *config __maybe_unused,
         struct outstate *os = ctx;
         FILE *out = os->fh;
  
-       fprintf(out, "\"metric-value\" : %f, ", val);
+       fprintf(out, "\"metric-value\" : \"%f\", ", val);
         fprintf(out, "\"metric-unit\" : \"%s\"", unit);
         if (!config->metric_only)
                 fprintf(out, "}");
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c

index eeccab6..1566a20 100644 (file)
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -403,12 +403,25 @@ static int prepare_metric(struct evsel **metric_events,
                         if (!aggr)
                                 break;
  
-                       /*
-                        * If an event was scaled during stat gathering, reverse
-                        * the scale before computing the metric.
-                        */
-                       val = aggr->counts.val * (1.0 / metric_events[i]->scale);
-                       source_count = evsel__source_count(metric_events[i]);
+                        if (!metric_events[i]->supported) {
+                               /*
+                                * Not supported events will have a count of 0,
+                                * which can be confusing in a
+                                * metric. Explicitly set the value to NAN. Not
+                                * counted events (enable time of 0) are read as
+                                * 0.
+                                */
+                               val = NAN;
+                               source_count = 0;
+                       } else {
+                               /*
+                                * If an event was scaled during stat gathering,
+                                * reverse the scale before computing the
+                                * metric.
+                                */
+                               val = aggr->counts.val * (1.0 / metric_events[i]->scale);
+                               source_count = evsel__source_count(metric_events[i]);
+                       }
                 }
                 n = strdup(evsel__metric_id(metric_events[i]));
                 if (!n)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 21 May 2023 20:24:59 +0000 (13:24 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 21 May 2023 20:24:59 +0000 (13:24 -0700)
tools/arch/arm64/include/uapi/asm/kvm.h		patch \| blob \| history
tools/arch/x86/include/asm/cpufeatures.h		patch \| blob \| history
tools/arch/x86/include/asm/disabled-features.h		patch \| blob \| history
tools/arch/x86/include/asm/msr-index.h		patch \| blob \| history
tools/arch/x86/include/uapi/asm/kvm.h		patch \| blob \| history
tools/arch/x86/include/uapi/asm/prctl.h		patch \| blob \| history
tools/arch/x86/include/uapi/asm/unistd_32.h		patch \| blob \| history
tools/arch/x86/lib/memcpy_64.S		patch \| blob \| history
tools/arch/x86/lib/memset_64.S		patch \| blob \| history
tools/include/asm/alternative.h		patch \| blob \| history
tools/include/uapi/drm/drm.h		patch \| blob \| history
tools/include/uapi/drm/i915_drm.h		patch \| blob \| history
tools/include/uapi/linux/const.h		patch \| blob \| history
tools/include/uapi/linux/in.h		patch \| blob \| history
tools/include/uapi/linux/kvm.h		patch \| blob \| history
tools/include/uapi/linux/prctl.h		patch \| blob \| history
tools/include/uapi/sound/asound.h		patch \| blob \| history
tools/perf/Makefile.config		patch \| blob \| history
tools/perf/Makefile.perf		patch \| blob \| history
tools/perf/arch/arm/util/cs-etm.c		patch \| blob \| history
tools/perf/arch/arm64/util/header.c		patch \| blob \| history
tools/perf/arch/arm64/util/pmu.c		patch \| blob \| history
tools/perf/arch/s390/entry/syscalls/syscall.tbl		patch \| blob \| history
tools/perf/bench/mem-memcpy-x86-64-asm-def.h		patch \| blob \| history
tools/perf/bench/mem-memcpy-x86-64-asm.S		patch \| blob \| history
tools/perf/bench/mem-memset-x86-64-asm-def.h		patch \| blob \| history
tools/perf/bench/mem-memset-x86-64-asm.S		patch \| blob \| history
tools/perf/builtin-script.c		patch \| blob \| history
tools/perf/builtin-stat.c		patch \| blob \| history
tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json		patch \| blob \| history
tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json		patch \| blob \| history
tools/perf/pmu-events/jevents.py		patch \| blob \| history
tools/perf/pmu-events/pmu-events.h		patch \| blob \| history
tools/perf/tests/attr.py		patch \| blob \| history
tools/perf/tests/attr/base-stat		patch \| blob \| history
tools/perf/tests/attr/test-stat-default		patch \| blob \| history
tools/perf/tests/attr/test-stat-detailed-1		patch \| blob \| history
tools/perf/tests/attr/test-stat-detailed-2		patch \| blob \| history
tools/perf/tests/attr/test-stat-detailed-3		patch \| blob \| history
tools/perf/tests/expr.c		patch \| blob \| history
tools/perf/tests/parse-metric.c		patch \| blob \| history
tools/perf/tests/shell/stat.sh		patch \| blob \| history
tools/perf/tests/shell/test_intel_pt.sh		patch \| blob \| history
tools/perf/tests/shell/test_java_symbol.sh		patch \| blob \| history
tools/perf/trace/beauty/arch_prctl.c		patch \| blob \| history
tools/perf/trace/beauty/x86_arch_prctl.sh		patch \| blob \| history
tools/perf/util/bpf_skel/lock_contention.bpf.c		patch \| blob \| history
tools/perf/util/bpf_skel/vmlinux.h		patch \| blob \| history
tools/perf/util/evsel.c		patch \| blob \| history
tools/perf/util/evsel.h		patch \| blob \| history
tools/perf/util/expr.y		patch \| blob \| history
tools/perf/util/metricgroup.c		patch \| blob \| history
tools/perf/util/parse-events.c		patch \| blob \| history
tools/perf/util/stat-display.c		patch \| blob \| history
tools/perf/util/stat-shadow.c		patch \| blob \| history