Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
arch/x86/events/intel/core.c
index bd8b988..2db9349 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/nmi.h>
+#include <linux/kvm_host.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -2852,6 +2853,47 @@ static void intel_pmu_reset(void)
        local_irq_restore(flags);
 }
 
+/*
+ * We may be running with guest PEBS events created by KVM, and the
+ * PEBS records are logged into the guest's DS and invisible to the host.
+ *
+ * In the case of guest PEBS overflow, we only trigger a fake event
+ * to emulate the PEBS overflow PMI for guest PEBS counters in KVM.
+ * The guest will then VM-enter and check the guest DS area to read
+ * the guest PEBS records.
+ *
+ * The contents and other behavior of the guest event do not matter.
+ */
+static void x86_pmu_handle_guest_pebs(struct pt_regs *regs,
+                                     struct perf_sample_data *data)
+{
+       struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+       u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask;
+       struct perf_event *event = NULL;
+       int bit;
+
+       if (!unlikely(perf_guest_state()))
+               return;
+
+       if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active ||
+           !guest_pebs_idxs)
+               return;
+
+       for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs,
+                        INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) {
+               event = cpuc->events[bit];
+               if (!event->attr.precise_ip)
+                       continue;
+
+               perf_sample_data_init(data, 0, event->hw.last_period);
+               if (perf_event_overflow(event, data, regs))
+                       x86_pmu_stop(event, 0);
+
+               /* Injecting one fake event is enough. */
+               break;
+       }
+}
+
 static int handle_pmi_common(struct pt_regs *regs, u64 status)
 {
        struct perf_sample_data data;
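
The guest-owned PEBS counter bitmap computed in x86_pmu_handle_guest_pebs()
above is simply "counters with PEBS enabled, minus counters the host has
claimed". A minimal stand-alone sketch of that mask arithmetic and of the
for_each_set_bit() walk, with made-up counter assignments purely for
illustration:

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                /* Assume counters 0-3 have PEBS enabled ... */
                uint64_t pebs_enabled = 0xfULL;
                /* ... and the host owns counters 0 and 1 (intel_ctrl_host_mask). */
                uint64_t intel_ctrl_host_mask = 0x3ULL;

                /* Counters whose PEBS records belong to the guest: 2 and 3. */
                uint64_t guest_pebs_idxs = pebs_enabled & ~intel_ctrl_host_mask;

                /* Open-coded equivalent of the for_each_set_bit() loop. */
                for (int bit = 0; bit < 64; bit++) {
                        if (guest_pebs_idxs & (1ULL << bit))
                                printf("guest-owned PEBS counter %d\n", bit);
                }
                return 0;
        }
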
@@ -2891,10 +2933,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
         * counters from the GLOBAL_STATUS mask and we always process PEBS
         * events via drain_pebs().
         */
-       if (x86_pmu.flags & PMU_FL_PEBS_ALL)
-               status &= ~cpuc->pebs_enabled;
-       else
-               status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+       status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
 
        /*
         * PEBS overflow sets bit 62 in the global status register
@@ -2903,6 +2942,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
                u64 pebs_enabled = cpuc->pebs_enabled;
 
                handled++;
+               x86_pmu_handle_guest_pebs(regs, &data);
                x86_pmu.drain_pebs(regs, &data);
                status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
 
@@ -3930,40 +3970,98 @@ static int intel_pmu_hw_config(struct perf_event *event)
        return 0;
 }
 
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+/*
+ * Currently, the only caller of this function is atomic_switch_perf_msrs().
+ * The host perf context helps to prepare the values of the real hardware for
+ * a set of MSRs that need to be switched atomically in a VMX transition.
+ *
+ * For example, the pseudocode needed to add a new MSR should look like:
+ *
+ * arr[(*nr)++] = (struct perf_guest_switch_msr){
+ *     .msr = the hardware msr address,
+ *     .host = the value the hardware has when it doesn't run a guest,
+ *     .guest = the value the hardware has when it runs a guest,
+ * };
+ *
+ * These values have nothing to do with the emulated values the guest sees
+ * when it uses {RD,WR}MSR, which should be handled by the KVM context,
+ * specifically in intel_pmu_{get,set}_msr().
+ */
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+       struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data;
        u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
+       u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable;
+       int global_ctrl, pebs_enable;
+
+       *nr = 0;
+       global_ctrl = (*nr)++;
+       arr[global_ctrl] = (struct perf_guest_switch_msr){
+               .msr = MSR_CORE_PERF_GLOBAL_CTRL,
+               .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
+               .guest = intel_ctrl & ~(cpuc->intel_ctrl_host_mask | pebs_mask),
+       };
 
-       arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
-       arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
-       arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
-       if (x86_pmu.flags & PMU_FL_PEBS_ALL)
-               arr[0].guest &= ~cpuc->pebs_enabled;
-       else
-               arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
-       *nr = 1;
+       if (!x86_pmu.pebs)
+               return arr;
 
-       if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
-               /*
-                * If PMU counter has PEBS enabled it is not enough to
-                * disable counter on a guest entry since PEBS memory
-                * write can overshoot guest entry and corrupt guest
-                * memory. Disabling PEBS solves the problem.
-                *
-                * Don't do this if the CPU already enforces it.
-                */
-               arr[1].msr = MSR_IA32_PEBS_ENABLE;
-               arr[1].host = cpuc->pebs_enabled;
-               arr[1].guest = 0;
-               *nr = 2;
+       /*
+        * If PMU counter has PEBS enabled it is not enough to
+        * disable counter on a guest entry since PEBS memory
+        * write can overshoot guest entry and corrupt guest
+        * memory. Disabling PEBS solves the problem.
+        *
+        * Don't do this if the CPU already enforces it.
+        */
+       if (x86_pmu.pebs_no_isolation) {
+               arr[(*nr)++] = (struct perf_guest_switch_msr){
+                       .msr = MSR_IA32_PEBS_ENABLE,
+                       .host = cpuc->pebs_enabled,
+                       .guest = 0,
+               };
+               return arr;
+       }
+
+       if (!kvm_pmu || !x86_pmu.pebs_ept)
+               return arr;
+
+       arr[(*nr)++] = (struct perf_guest_switch_msr){
+               .msr = MSR_IA32_DS_AREA,
+               .host = (unsigned long)cpuc->ds,
+               .guest = kvm_pmu->ds_area,
+       };
+
+       if (x86_pmu.intel_cap.pebs_baseline) {
+               arr[(*nr)++] = (struct perf_guest_switch_msr){
+                       .msr = MSR_PEBS_DATA_CFG,
+                       .host = cpuc->pebs_data_cfg,
+                       .guest = kvm_pmu->pebs_data_cfg,
+               };
+       }
+
+       pebs_enable = (*nr)++;
+       arr[pebs_enable] = (struct perf_guest_switch_msr){
+               .msr = MSR_IA32_PEBS_ENABLE,
+               .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
+               .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
+       };
+
+       if (arr[pebs_enable].host) {
+               /* Disable guest PEBS if host PEBS is enabled. */
+               arr[pebs_enable].guest = 0;
+       } else {
+               /* Disable guest PEBS for cross-mapped PEBS counters. */
+               arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask;
+               /* Set hw GLOBAL_CTRL bits for guest-owned PEBS counters. */
+               arr[global_ctrl].guest |= arr[pebs_enable].guest;
        }
 
        return arr;
 }
 
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data)
 {
        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
        struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
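
As the comment above intel_guest_get_msrs() notes, the (*nr, arr) pair is
consumed on the KVM/VMX side around VM-entry: every entry whose host and
guest values differ is placed on the VMX MSR auto-switch lists, so the
hardware loads .guest at VM-entry and restores .host at VM-exit. A
simplified sketch of that consumer, modelled on atomic_switch_perf_msrs() in
KVM's vmx.c (kernel context only; error handling and the cross-mapped
counter bookkeeping are omitted):

        static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
        {
                struct perf_guest_switch_msr *msrs;
                struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
                int i, nr_msrs;

                /* Pass the vCPU's PMU so the guest PEBS MSR values can be filled in. */
                msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
                if (!msrs)
                        return;

                for (i = 0; i < nr_msrs; i++) {
                        if (msrs[i].host == msrs[i].guest)
                                clear_atomic_switch_msr(vmx, msrs[i].msr);
                        else
                                /* Guest value at VM-entry, host value at VM-exit. */
                                add_atomic_switch_msr(vmx, msrs[i].msr,
                                                      msrs[i].guest, msrs[i].host,
                                                      false);
                }
        }
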
@@ -5650,6 +5748,7 @@ __init int intel_pmu_init(void)
        x86_pmu.events_mask_len         = eax.split.mask_length;
 
        x86_pmu.max_pebs_events         = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+       x86_pmu.pebs_capable            = PEBS_COUNTER_MASK;
 
        /*
         * Quirk: v2 perfmon does not report fixed-purpose events, so
@@ -5834,6 +5933,7 @@ __init int intel_pmu_init(void)
                x86_pmu.pebs_aliases = NULL;
                x86_pmu.pebs_prec_dist = true;
                x86_pmu.lbr_pt_coexist = true;
+               x86_pmu.pebs_capable = ~0ULL;
                x86_pmu.flags |= PMU_FL_HAS_RSP_1;
                x86_pmu.flags |= PMU_FL_PEBS_ALL;
                x86_pmu.get_event_constraints = glp_get_event_constraints;
@@ -6138,6 +6238,7 @@ __init int intel_pmu_init(void)
 
        case INTEL_FAM6_ICELAKE_X:
        case INTEL_FAM6_ICELAKE_D:
+               x86_pmu.pebs_ept = 1;
                pmem = true;
                fallthrough;
        case INTEL_FAM6_ICELAKE_L:
@@ -6190,6 +6291,7 @@ __init int intel_pmu_init(void)
                x86_pmu.pebs_aliases = NULL;
                x86_pmu.pebs_prec_dist = true;
                x86_pmu.pebs_block = true;
+               x86_pmu.pebs_capable = ~0ULL;
                x86_pmu.flags |= PMU_FL_HAS_RSP_1;
                x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
                x86_pmu.flags |= PMU_FL_PEBS_ALL;
@@ -6235,6 +6337,7 @@ __init int intel_pmu_init(void)
                x86_pmu.pebs_aliases = NULL;
                x86_pmu.pebs_prec_dist = true;
                x86_pmu.pebs_block = true;
+               x86_pmu.pebs_capable = ~0ULL;
                x86_pmu.flags |= PMU_FL_HAS_RSP_1;
                x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
                x86_pmu.flags |= PMU_FL_PEBS_ALL;
@@ -6399,8 +6502,7 @@ __init int intel_pmu_init(void)
                                          x86_pmu.intel_ctrl);
        /*
         * Access LBR MSR may cause #GP under certain circumstances.
-        * E.g. KVM doesn't support LBR MSR
-        * Check all LBT MSR here.
+        * Check all LBR MSRs here.
         * Disable LBR access if any LBR MSRs can not be accessed.
         */
        if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))