Merge tag 'kvmarm-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmar...
author     Paolo Bonzini <pbonzini@redhat.com>
           Fri, 23 Apr 2021 11:41:17 +0000 (07:41 -0400)
committer  Paolo Bonzini <pbonzini@redhat.com>
           Fri, 23 Apr 2021 11:41:17 +0000 (07:41 -0400)
KVM/arm64 updates for Linux 5.13

New features:

- Stage-2 isolation for the host kernel when running in protected mode
- Guest SVE support when running in nVHE mode
- Force W^X hypervisor mappings in nVHE mode
- ITS save/restore for guests using direct injection with GICv4.1
- nVHE panics now produce readable backtraces
- Guest support for PTP using the ptp_kvm driver
- Performance improvements in the S2 fault handler
- Alexandru is now a reviewer (not really a new feature...)

Fixes:

- Proper emulation of the GICR_TYPER register
- Handle the complete set of relocations in the nVHE EL2 object
- Get rid of the oprofile dependency in the PMU code (and of the
  oprofile body parts at the same time)
- Debug and SPE fixes
- Fix vcpu reset

144 files changed:
Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe [new file with mode: 0644]
Documentation/admin-guide/kernel-parameters.txt
Documentation/devicetree/bindings/arm/ete.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/arm/trbe.yaml [new file with mode: 0644]
Documentation/trace/coresight/coresight-trbe.rst [new file with mode: 0644]
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/arm/index.rst
Documentation/virt/kvm/arm/ptp_kvm.rst [new file with mode: 0644]
Documentation/virt/kvm/devices/arm-vgic-its.rst
Documentation/virt/kvm/devices/arm-vgic-v3.rst
MAINTAINERS
arch/arm/include/asm/hypervisor.h
arch/arm64/Kconfig
arch/arm64/crypto/aes-modes.S
arch/arm64/crypto/sha1-ce-core.S
arch/arm64/crypto/sha2-ce-core.S
arch/arm64/crypto/sha3-ce-core.S
arch/arm64/crypto/sha512-ce-core.S
arch/arm64/include/asm/assembler.h
arch/arm64/include/asm/barrier.h
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/el2_setup.h
arch/arm64/include/asm/fpsimd.h
arch/arm64/include/asm/fpsimdmacros.h
arch/arm64/include/asm/hyp_image.h
arch/arm64/include/asm/hypervisor.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/kvm_pgtable.h
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/sections.h
arch/arm64/include/asm/sysreg.h
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/cpu-reset.S
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/fpsimd.c
arch/arm64/kernel/head.S
arch/arm64/kernel/hyp-stub.S
arch/arm64/kernel/idreg-override.c
arch/arm64/kernel/image-vars.h
arch/arm64/kernel/vmlinux.lds.S
arch/arm64/kvm/arm.c
arch/arm64/kvm/debug.c
arch/arm64/kvm/fpsimd.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/fpsimd.S
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/hyp/include/nvhe/early_alloc.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/gfp.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/memory.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/mm.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/spinlock.h [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/Makefile
arch/arm64/kvm/hyp/nvhe/cache.S [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/debug-sr.c
arch/arm64/kvm/hyp/nvhe/early_alloc.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
arch/arm64/kvm/hyp/nvhe/host.S
arch/arm64/kvm/hyp/nvhe/hyp-init.S
arch/arm64/kvm/hyp/nvhe/hyp-main.c
arch/arm64/kvm/hyp/nvhe/hyp-smp.c
arch/arm64/kvm/hyp/nvhe/hyp.lds.S
arch/arm64/kvm/hyp/nvhe/mem_protect.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/mm.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/page_alloc.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/psci-relay.c
arch/arm64/kvm/hyp/nvhe/setup.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/stub.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/switch.c
arch/arm64/kvm/hyp/nvhe/tlb.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/hyp/reserved_mem.c [new file with mode: 0644]
arch/arm64/kvm/hyp/vhe/switch.c
arch/arm64/kvm/hypercalls.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/perf.c
arch/arm64/kvm/pmu-emul.c
arch/arm64/kvm/pmu.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/va_layout.c
arch/arm64/kvm/vgic/vgic-init.c
arch/arm64/kvm/vgic/vgic-its.c
arch/arm64/kvm/vgic/vgic-kvm-device.c
arch/arm64/kvm/vgic/vgic-mmio-v3.c
arch/arm64/kvm/vgic/vgic-mmio.c
arch/arm64/kvm/vgic/vgic-v3.c
arch/arm64/kvm/vgic/vgic-v4.c
arch/arm64/kvm/vgic/vgic.h
arch/arm64/lib/clear_page.S
arch/arm64/lib/copy_page.S
arch/arm64/mm/init.c
arch/s390/kernel/perf_event.c
arch/sh/kernel/perf_event.c
drivers/clocksource/arm_arch_timer.c
drivers/firmware/psci/psci.c
drivers/firmware/smccc/Makefile
drivers/firmware/smccc/kvm_guest.c [new file with mode: 0644]
drivers/firmware/smccc/smccc.c
drivers/hwtracing/coresight/Kconfig
drivers/hwtracing/coresight/Makefile
drivers/hwtracing/coresight/coresight-core.c
drivers/hwtracing/coresight/coresight-etm-perf.c
drivers/hwtracing/coresight/coresight-etm4x-core.c
drivers/hwtracing/coresight/coresight-etm4x-sysfs.c
drivers/hwtracing/coresight/coresight-etm4x.h
drivers/hwtracing/coresight/coresight-platform.c
drivers/hwtracing/coresight/coresight-priv.h
drivers/hwtracing/coresight/coresight-trbe.c [new file with mode: 0644]
drivers/hwtracing/coresight/coresight-trbe.h [new file with mode: 0644]
drivers/irqchip/irq-gic-v3-its.c
drivers/perf/arm_pmu.c
drivers/ptp/Kconfig
drivers/ptp/Makefile
drivers/ptp/ptp_kvm_arm.c [new file with mode: 0644]
drivers/ptp/ptp_kvm_common.c [moved from drivers/ptp/ptp_kvm.c with 60% similarity]
drivers/ptp/ptp_kvm_x86.c [new file with mode: 0644]
include/kvm/arm_pmu.h
include/kvm/arm_vgic.h
include/linux/arm-smccc.h
include/linux/bug.h
include/linux/clocksource.h
include/linux/clocksource_ids.h [new file with mode: 0644]
include/linux/coresight.h
include/linux/perf_event.h
include/linux/ptp_kvm.h [new file with mode: 0644]
include/linux/timekeeping.h
include/uapi/linux/kvm.h
include/uapi/linux/perf_event.h
kernel/events/core.c
kernel/time/clocksource.c
kernel/time/timekeeping.c
lib/bug.c
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/aarch64/vgic_init.c [new file with mode: 0644]
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/lib/kvm_util.c

diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe b/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe
new file mode 100644 (file)
index 0000000..ad3bbc6
--- /dev/null
@@ -0,0 +1,14 @@
+What:          /sys/bus/coresight/devices/trbe<cpu>/align
+Date:          March 2021
+KernelVersion: 5.13
+Contact:       Anshuman Khandual <anshuman.khandual@arm.com>
+Description:   (Read) Shows the TRBE write pointer alignment. This value
+               is fetched from the TRBIDR register.
+
+What:          /sys/bus/coresight/devices/trbe<cpu>/flag
+Date:          March 2021
+KernelVersion: 5.13
+Contact:       Anshuman Khandual <anshuman.khandual@arm.com>
+Description:   (Read) Shows whether TRBE updates to memory are made with
+               access and dirty flag updates as well. This value is fetched
+               from the TRBIDR register.
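
For a quick sense of how these attributes are consumed, here is a minimal
userspace sketch that dumps the alignment value for one instance; the trbe0
device name and the plain read are assumptions for illustration, not part of
the patch::

    /* Minimal sketch: print the write pointer alignment reported by one
     * TRBE instance. Assumes a trbe0 device exists on this system. */
    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            FILE *f = fopen("/sys/bus/coresight/devices/trbe0/align", "r");

            if (!f) {
                    perror("trbe0/align");
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("trbe0 align: %s", buf);
            fclose(f);
            return 0;
    }
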
index 0454572..18f8bb3 100644 (file)
                                   state is kept private from the host.
                                   Not valid if the kernel is running in EL2.
 
-                       Defaults to VHE/nVHE based on hardware support and
-                       the value of CONFIG_ARM64_VHE.
+                       Defaults to VHE/nVHE based on hardware support.
 
        kvm-arm.vgic_v3_group0_trap=
                        [KVM,ARM] Trap guest accesses to GICv3 group-0
diff --git a/Documentation/devicetree/bindings/arm/ete.yaml b/Documentation/devicetree/bindings/arm/ete.yaml
new file mode 100644 (file)
index 0000000..7f9b2d1
--- /dev/null
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/ete.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Embedded Trace Extensions
+
+maintainers:
+  - Suzuki K Poulose <suzuki.poulose@arm.com>
+  - Mathieu Poirier <mathieu.poirier@linaro.org>
+
+description: |
+  Arm Embedded Trace Extension (ETE) is a per-CPU trace component that
+  allows tracing the CPU execution. It overlaps with the CoreSight ETMv4
+  architecture and has extended support for future architecture changes.
+  The trace generated by the ETE could be stored via legacy CoreSight
+  components (e.g., TMC-ETR) or other means (e.g., using a per-CPU buffer,
+  the Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
+  legacy CoreSight components, a node must be listed per instance, along
+  with any optional connection graph as per the coresight bindings.
+  See bindings/arm/coresight.txt.
+
+properties:
+  $nodename:
+    pattern: "^ete([0-9a-f]+)$"
+  compatible:
+    items:
+      - const: arm,embedded-trace-extension
+
+  cpu:
+    description: |
+      Handle to the cpu this ETE is bound to.
+    $ref: /schemas/types.yaml#/definitions/phandle
+
+  out-ports:
+    description: |
+      Output connections from the ETE to legacy CoreSight trace bus.
+    $ref: /schemas/graph.yaml#/properties/ports
+    properties:
+      port:
+        description: Output connection from the ETE to legacy CoreSight Trace bus.
+        $ref: /schemas/graph.yaml#/properties/port
+
+required:
+  - compatible
+  - cpu
+
+additionalProperties: false
+
+examples:
+
+# An ETE node without legacy CoreSight connections
+  - |
+    ete0 {
+      compatible = "arm,embedded-trace-extension";
+      cpu = <&cpu_0>;
+    };
+# An ETE node with legacy CoreSight connections
+  - |
+   ete1 {
+      compatible = "arm,embedded-trace-extension";
+      cpu = <&cpu_1>;
+
+      out-ports {        /* legacy coresight connection */
+         port {
+             ete1_out_port: endpoint {
+                remote-endpoint = <&funnel_in_port0>;
+             };
+         };
+      };
+   };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/trbe.yaml b/Documentation/devicetree/bindings/arm/trbe.yaml
new file mode 100644 (file)
index 0000000..4402d7b
--- /dev/null
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/trbe.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Trace Buffer Extensions
+
+maintainers:
+  - Anshuman Khandual <anshuman.khandual@arm.com>
+
+description: |
+  Arm Trace Buffer Extension (TRBE) is a per CPU component
+  for storing trace generated on the CPU to memory. It is
+  accessed via CPU system registers. The software can verify
+  if it is permitted to use the component by checking the
+  TRBIDR register.
+
+properties:
+  $nodename:
+    const: "trbe"
+  compatible:
+    items:
+      - const: arm,trace-buffer-extension
+
+  interrupts:
+    description: |
+       Exactly 1 PPI must be listed. For heterogeneous systems where
+       TRBE is only supported on a subset of the CPUs, please consult
+       the arm,gic-v3 binding for details on describing a PPI partition.
+    maxItems: 1
+
+required:
+  - compatible
+  - interrupts
+
+additionalProperties: false
+
+examples:
+
+  - |
+   #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+   trbe {
+     compatible = "arm,trace-buffer-extension";
+     interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
+   };
+...
diff --git a/Documentation/trace/coresight/coresight-trbe.rst b/Documentation/trace/coresight/coresight-trbe.rst
new file mode 100644 (file)
index 0000000..b9928ef
--- /dev/null
@@ -0,0 +1,38 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Trace Buffer Extension (TRBE).
+==============================
+
+    :Author:   Anshuman Khandual <anshuman.khandual@arm.com>
+    :Date:     November 2020
+
+Hardware Description
+--------------------
+
+Trace Buffer Extension (TRBE) is a per-CPU hardware block which captures, in
+system memory, CPU traces generated by a corresponding per-CPU tracing unit.
+It is plugged in as a CoreSight sink device because the corresponding trace
+generators (ETE) are plugged in as source devices.
+
+The TRBE is not compliant with the CoreSight architecture specification, but
+is driven via the CoreSight driver framework to support integration with the
+ETE (which is CoreSight compliant).
+
+Sysfs files and directories
+---------------------------
+
+The TRBE devices appear on the existing coresight bus alongside the other
+coresight devices::
+
+       >$ ls /sys/bus/coresight/devices
+       trbe0  trbe1  trbe2 trbe3
+
+The ``trbe<N>`` named TRBEs are associated with a CPU::
+
+       >$ ls /sys/bus/coresight/devices/trbe0/
+        align flag
+
+*Key file items are:-*
+   * ``align``: TRBE write pointer alignment
+   * ``flag``: TRBE updates memory with access and dirty flags
index 56c6fca..94804c2 100644 (file)
@@ -3116,6 +3116,18 @@ optional features it should have.  This will cause a reset of the cpu
 registers to their initial values.  If this is not called, KVM_RUN will
 return ENOEXEC for that vcpu.
 
+The initial values are defined as:
+       - Processor state:
+               * AArch64: EL1h, D, A, I and F bits set. All other bits
+                 are cleared.
+               * AArch32: SVC, A, I and F bits set. All other bits are
+                 cleared.
+       - General Purpose registers, including PC and SP: set to 0
+       - FPSIMD/NEON registers: set to 0
+       - SVE registers: set to 0
+       - System registers: Reset to their architecturally defined
+         values as for a warm reset to EL1 (resp. SVC)
+
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.
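
As a point of reference, a minimal VMM-side sketch of the call being
documented here; the vm_fd and vcpu_fd handles are assumed to have been
obtained via KVM_CREATE_VM and KVM_CREATE_VCPU, and error handling is
trimmed::

    /* Sketch: (re)initialise a vcpu to the initial state described above.
     * vm_fd and vcpu_fd are assumed to already exist. */
    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int vcpu_reset(int vm_fd, int vcpu_fd)
    {
            struct kvm_vcpu_init init;

            memset(&init, 0, sizeof(init));
            /* Ask KVM for the preferred target for this host... */
            if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) < 0)
                    return -1;
            /* ...then reset the vcpu: GPRs/PC/SP, FPSIMD/NEON and SVE come
             * back zeroed, system registers as for a warm reset to EL1. */
            return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
    }
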
 
@@ -3335,7 +3347,8 @@ The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
   - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_HW:        using hardware debug events [arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
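
A hedged sketch of how a VMM might use the new arm64 flag; the vcpu_fd handle
is assumed, and a real debugger would also populate the architecture-specific
registers in dbg.arch before this call::

    /* Sketch: enable hardware debug events for an arm64 vcpu. */
    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int enable_hw_debug(int vcpu_fd)
    {
            struct kvm_guest_debug dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;
            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }
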
@@ -6869,3 +6882,12 @@ they will get passed on to user space. So user space still has to have
 an implementation for these despite the in kernel acceleration.
 
 This capability is always enabled.
+
+8.32 KVM_CAP_PTP_KVM
+--------------------
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is
+supported in the host. A VMM can check whether the service is
+available to the guest on migration.
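
A VMM probe for this capability could look like the following sketch, where
kvm_fd is assumed to be an open descriptor on /dev/kvm::

    /* Sketch: check whether the host exposes the virtual PTP service. */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int host_has_ptp_kvm(int kvm_fd)
    {
            /* KVM_CHECK_EXTENSION returns a positive value if present. */
            return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PTP_KVM) > 0;
    }
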
index 3e2b2ab..78a9b67 100644 (file)
@@ -10,3 +10,4 @@ ARM
    hyp-abi
    psci
    pvtime
+   ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644 (file)
index 0000000..aecdc80
--- /dev/null
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=============================
+
+PTP_KVM is used for high precision time sync between host and guests.
+It relies on transferring the wall clock and counter value from the
+host to the guest using a KVM-specific hypercall.
+
+* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+This hypercall uses the SMC32/HVC32 calling convention:
+
+ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
+    ==============    ========    =====================================
+    Function ID:      (uint32)    0x86000001
+    Arguments:        (uint32)    KVM_PTP_VIRT_COUNTER(0)
+                                  KVM_PTP_PHYS_COUNTER(1)
+    Return Values:    (int32)     NOT_SUPPORTED(-1) on error, or
+                      (uint32)    Upper 32 bits of wall clock time (r0)
+                      (uint32)    Lower 32 bits of wall clock time (r1)
+                      (uint32)    Upper 32 bits of counter (r2)
+                      (uint32)    Lower 32 bits of counter (r3)
+    Endianness:                   No Restrictions.
+    ==============    ========    =====================================
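
In the guest, the ptp_kvm driver issues this call through the SMCCC helpers;
the following condensed sketch shows the register packing implied by the
table above. The counter_id argument corresponds to KVM_PTP_VIRT_COUNTER(0)
or KVM_PTP_PHYS_COUNTER(1); the actual code in drivers/ptp/ptp_kvm_arm.c may
be structured differently::

    /* Sketch: issue the PTP hypercall and reassemble the 64-bit wall
     * clock and counter values from the four 32-bit return registers. */
    #include <linux/arm-smccc.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    static int kvm_ptp_read(u32 counter_id, u64 *wallclock, u64 *counter)
    {
            struct arm_smccc_res res;

            arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
                                 counter_id, &res);
            if ((s32)res.a0 < 0)            /* NOT_SUPPORTED(-1) */
                    return -EOPNOTSUPP;

            *wallclock = (u64)res.a0 << 32 | (u32)res.a1;
            *counter   = (u64)res.a2 << 32 | (u32)res.a3;
            return 0;
    }
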
index 6c304fd..d257edd 100644 (file)
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
     -EFAULT  Invalid guest ram access
     -EBUSY   One or more VCPUS are running
     -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-            state is not available
+            state is not available without GICv4.1
     =======  ==========================================================
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
index 5dd3bff..51e5e57 100644 (file)
@@ -228,7 +228,7 @@ Groups:
 
     KVM_DEV_ARM_VGIC_CTRL_INIT
       request the initialization of the VGIC, no additional parameter in
-      kvm_device_attr.addr.
+      kvm_device_attr.addr. Must be called after all VCPUs have been created.
     KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
       save all LPI pending bits into guest RAM pending tables.
 
index 0417ebf..1dd8fb4 100644 (file)
@@ -1761,6 +1761,8 @@ F:        Documentation/ABI/testing/sysfs-bus-coresight-devices-*
 F:     Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
 F:     Documentation/devicetree/bindings/arm/coresight-cti.yaml
 F:     Documentation/devicetree/bindings/arm/coresight.txt
+F:     Documentation/devicetree/bindings/arm/ete.yaml
+F:     Documentation/devicetree/bindings/arm/trbe.yaml
 F:     Documentation/trace/coresight/*
 F:     drivers/hwtracing/coresight/*
 F:     include/dt-bindings/arm/coresight-cti-dt.h
@@ -9765,10 +9767,10 @@ F:      virt/kvm/*
 KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
 M:     Marc Zyngier <maz@kernel.org>
 R:     James Morse <james.morse@arm.com>
-R:     Julien Thierry <julien.thierry.kdev@gmail.com>
+R:     Alexandru Elisei <alexandru.elisei@arm.com>
 R:     Suzuki K Poulose <suzuki.poulose@arm.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L:     kvmarm@lists.cs.columbia.edu
+L:     kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
 F:     arch/arm64/include/asm/kvm*
index df85243..bd61502 100644 (file)
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
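
These hooks let guest code discover KVM's vendor-specific hypervisor
services; a hedged sketch of a consumer follows. ARM_SMCCC_KVM_FUNC_PTP is
assumed to be the service bit added to arm-smccc.h elsewhere in this series,
and kvm_init_hyp_services() is expected to have run during early boot::

    /* Sketch: check for a KVM vendor hypervisor service before use. */
    #include <asm/hypervisor.h>
    #include <linux/arm-smccc.h>

    static bool ptp_kvm_service_present(void)
    {
            /* ARM_SMCCC_KVM_FUNC_PTP: service bit from this series (assumption) */
            return kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP);
    }
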
index e4e1b65..9ec09f9 100644 (file)
@@ -1426,19 +1426,6 @@ config ARM64_USE_LSE_ATOMICS
          built with binutils >= 2.25 in order for the new instructions
          to be used.
 
-config ARM64_VHE
-       bool "Enable support for Virtualization Host Extensions (VHE)"
-       default y
-       help
-         Virtualization Host Extensions (VHE) allow the kernel to run
-         directly at EL2 (instead of EL1) on processors that support
-         it. This leads to better performance for KVM, as they reduce
-         the cost of the world switch.
-
-         Selecting this option allows the VHE feature to be detected
-         at runtime, and does not affect processors that do not
-         implement this feature.
-
 endmenu
 
 menu "ARMv8.2 architectural features"
@@ -1694,7 +1681,6 @@ endmenu
 config ARM64_SVE
        bool "ARM Scalable Vector Extension support"
        default y
-       depends on !KVM || ARM64_VHE
        help
          The Scalable Vector Extension (SVE) is an extension to the AArch64
          execution state which complements and extends the SIMD functionality
@@ -1723,12 +1709,6 @@ config ARM64_SVE
          booting the kernel.  If unsure and you are not observing these
          symptoms, you should assume that it is safe to say Y.
 
-         CPUs that support SVE are architecturally required to support the
-         Virtualization Host Extensions (VHE), so the kernel makes no
-         provision for supporting SVE alongside KVM without VHE enabled.
-         Thus, you will need to enable CONFIG_ARM64_VHE if you want to support
-         KVM in the same kernel image.
-
 config ARM64_MODULE_PLTS
        bool "Use PLTs to allow module memory to spill over into vmalloc area"
        depends on MODULES
index bbdb547..ab6c14e 100644 (file)
@@ -700,7 +700,7 @@ AES_FUNC_START(aes_mac_update)
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
-       cond_yield      .Lmacout, x7
+       cond_yield      .Lmacout, x7, x8
        b               .Lmacloop4x
 .Lmac1x:
        add             w3, w3, #4
index 8c02bbc..889ca0f 100644 (file)
@@ -121,7 +121,7 @@ CPU_LE(     rev32           v11.16b, v11.16b        )
        add             dgav.4s, dgav.4s, dg0v.4s
 
        cbz             w2, 2f
-       cond_yield      3f, x5
+       cond_yield      3f, x5, x6
        b               0b
 
        /*
index 6cdea7d..4911799 100644 (file)
@@ -129,7 +129,7 @@ CPU_LE(     rev32           v19.16b, v19.16b        )
 
        /* handled all input blocks? */
        cbz             w2, 2f
-       cond_yield      3f, x5
+       cond_yield      3f, x5, x6
        b               0b
 
        /*
index 6f52084..9c77313 100644 (file)
@@ -184,11 +184,11 @@ SYM_FUNC_START(sha3_ce_transform)
        eor      v0.16b,  v0.16b, v31.16b
 
        cbnz    w8, 3b
-       cond_yield 3f, x8
+       cond_yield 4f, x8, x9
        cbnz    w2, 0b
 
        /* save state */
-3:     st1     { v0.1d- v3.1d}, [x0], #32
+4:     st1     { v0.1d- v3.1d}, [x0], #32
        st1     { v4.1d- v7.1d}, [x0], #32
        st1     { v8.1d-v11.1d}, [x0], #32
        st1     {v12.1d-v15.1d}, [x0], #32
index d6e7f6c..b6a3a36 100644 (file)
@@ -195,7 +195,7 @@ CPU_LE(     rev64           v19.16b, v19.16b        )
        add             v10.2d, v10.2d, v2.2d
        add             v11.2d, v11.2d, v3.2d
 
-       cond_yield      3f, x4
+       cond_yield      3f, x4, x5
        /* handled all input blocks? */
        cbnz            w2, 0b
 
index ca31594..ad9ccc4 100644 (file)
@@ -15,6 +15,8 @@
 #include <asm-generic/export.h>
 
 #include <asm/asm-offsets.h>
+#include <asm/alternative.h>
+#include <asm/asm-bug.h>
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/debug-monitors.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 
+       /*
+        * Provide a wxN alias for each wN register so that we can paste an xN
+        * reference after a 'w' to obtain the 32-bit version.
+        */
+       .irp    n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
+       wx\n    .req    w\n
+       .endr
+
        .macro save_and_disable_daif, flags
        mrs     \flags, daif
        msr     daifset, #0xf
@@ -270,12 +280,24 @@ alternative_endif
  * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
  */
        .macro  read_ctr, reg
+#ifndef __KVM_NVHE_HYPERVISOR__
 alternative_if_not ARM64_MISMATCHED_CACHE_TYPE
        mrs     \reg, ctr_el0                   // read CTR
        nop
 alternative_else
        ldr_l   \reg, arm64_ftr_reg_ctrel0 + ARM64_FTR_SYSVAL
 alternative_endif
+#else
+alternative_if_not ARM64_KVM_PROTECTED_MODE
+       ASM_BUG()
+alternative_else_nop_endif
+alternative_cb kvm_compute_final_ctr_el0
+       movz    \reg, #0
+       movk    \reg, #0, lsl #16
+       movk    \reg, #0, lsl #32
+       movk    \reg, #0, lsl #48
+alternative_cb_end
+#endif
        .endm
 
 
@@ -676,11 +698,11 @@ USER(\label, ic   ivau, \tmp2)                    // invalidate I line PoU
        .endm
 
 /*
- * Set SCTLR_EL1 to the passed value, and invalidate the local icache
+ * Set SCTLR_ELx to the @reg value, and invalidate the local icache
  * in the process. This is called when setting the MMU on.
  */
-.macro set_sctlr_el1, reg
-       msr     sctlr_el1, \reg
+.macro set_sctlr, sreg, reg
+       msr     \sreg, \reg
        isb
        /*
         * Invalidate the local I-cache so that any instructions fetched
@@ -692,90 +714,41 @@ USER(\label, ic   ivau, \tmp2)                    // invalidate I line PoU
        isb
 .endm
 
-/*
- * Check whether to yield to another runnable task from kernel mode NEON code
- * (which runs with preemption disabled).
- *
- * if_will_cond_yield_neon
- *        // pre-yield patchup code
- * do_cond_yield_neon
- *        // post-yield patchup code
- * endif_yield_neon    <label>
- *
- * where <label> is optional, and marks the point where execution will resume
- * after a yield has been performed. If omitted, execution resumes right after
- * the endif_yield_neon invocation. Note that the entire sequence, including
- * the provided patchup code, will be omitted from the image if
- * CONFIG_PREEMPTION is not defined.
- *
- * As a convenience, in the case where no patchup code is required, the above
- * sequence may be abbreviated to
- *
- * cond_yield_neon <label>
- *
- * Note that the patchup code does not support assembler directives that change
- * the output section, any use of such directives is undefined.
- *
- * The yield itself consists of the following:
- * - Check whether the preempt count is exactly 1 and a reschedule is also
- *   needed. If so, calling of preempt_enable() in kernel_neon_end() will
- *   trigger a reschedule. If it is not the case, yielding is pointless.
- * - Disable and re-enable kernel mode NEON, and branch to the yield fixup
- *   code.
- *
- * This macro sequence may clobber all CPU state that is not guaranteed by the
- * AAPCS to be preserved across an ordinary function call.
- */
-
-       .macro          cond_yield_neon, lbl
-       if_will_cond_yield_neon
-       do_cond_yield_neon
-       endif_yield_neon        \lbl
-       .endm
-
-       .macro          if_will_cond_yield_neon
-#ifdef CONFIG_PREEMPTION
-       get_current_task        x0
-       ldr             x0, [x0, #TSK_TI_PREEMPT]
-       sub             x0, x0, #PREEMPT_DISABLE_OFFSET
-       cbz             x0, .Lyield_\@
-       /* fall through to endif_yield_neon */
-       .subsection     1
-.Lyield_\@ :
-#else
-       .section        ".discard.cond_yield_neon", "ax"
-#endif
-       .endm
-
-       .macro          do_cond_yield_neon
-       bl              kernel_neon_end
-       bl              kernel_neon_begin
-       .endm
+.macro set_sctlr_el1, reg
+       set_sctlr sctlr_el1, \reg
+.endm
 
-       .macro          endif_yield_neon, lbl
-       .ifnb           \lbl
-       b               \lbl
-       .else
-       b               .Lyield_out_\@
-       .endif
-       .previous
-.Lyield_out_\@ :
-       .endm
+.macro set_sctlr_el2, reg
+       set_sctlr sctlr_el2, \reg
+.endm
 
        /*
-        * Check whether preempt-disabled code should yield as soon as it
-        * is able. This is the case if re-enabling preemption a single
-        * time results in a preempt count of zero, and the TIF_NEED_RESCHED
-        * flag is set. (Note that the latter is stored negated in the
-        * top word of the thread_info::preempt_count field)
+        * Check whether preempt/bh-disabled asm code should yield as soon as
+        * it is able. This is the case if we are currently running in task
+        * context, and either a softirq is pending, or the TIF_NEED_RESCHED
+        * flag is set and re-enabling preemption a single time would result in
+        * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
+        * stored negated in the top word of the thread_info::preempt_count
+        * field)
         */
-       .macro          cond_yield, lbl:req, tmp:req
-#ifdef CONFIG_PREEMPTION
+       .macro          cond_yield, lbl:req, tmp:req, tmp2:req
        get_current_task \tmp
        ldr             \tmp, [\tmp, #TSK_TI_PREEMPT]
+       /*
+        * If we are serving a softirq, there is no point in yielding: the
+        * softirq will not be preempted no matter what we do, so we should
+        * run to completion as quickly as we can.
+        */
+       tbnz            \tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
+#ifdef CONFIG_PREEMPTION
        sub             \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
        cbz             \tmp, \lbl
 #endif
+       adr_l           \tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
+       this_cpu_offset \tmp2
+       ldr             w\tmp, [\tmp, \tmp2]
+       cbnz            w\tmp, \lbl     // yield on pending softirq in task context
+.Lnoyield_\@:
        .endm
 
 /*
index c3009b0..5a8367a 100644 (file)
@@ -23,6 +23,7 @@
 #define dsb(opt)       asm volatile("dsb " #opt : : : "memory")
 
 #define psb_csync()    asm volatile("hint #17" : : : "memory")
+#define tsb_csync()    asm volatile("hint #18" : : : "memory")
 #define csdb()         asm volatile("hint #20" : : : "memory")
 
 #define spec_bar()     asm volatile(ALTERNATIVE("dsb nsh\nisb\n",              \
index 61177ba..338840c 100644 (file)
@@ -63,6 +63,23 @@ struct arm64_ftr_bits {
        s64             safe_val; /* safe value for FTR_EXACT features */
 };
 
+/*
+ * Describe the early feature override to the core override code:
+ *
+ * @val                        Values that are to be merged into the final
+ *                     sanitised value of the register. Only the bitfields
+ *                     set to 1 in @mask are valid
+ * @mask               Mask of the features that are overridden by @val
+ *
+ * A @mask field set to full-1 indicates that the corresponding field
+ * in @val is a valid override.
+ *
+ * A @mask field set to full-0 with the corresponding @val field set
+ * to full-0 denotes that this field has no override
+ *
+ * A @mask field set to full-0 with the corresponding @val field set
+ * to full-1 denotes that this field has an invalid override.
+ */
 struct arm64_ftr_override {
        u64             val;
        u64             mask;
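
Read concretely, the convention above expresses an override per bitfield; an
illustrative (made-up) instance forcing a single 4-bit field at bits [31:28]
to zero would look like the sketch below, with every other field left alone
because its @mask nibble stays clear::

    /* Illustrative values only: override one 4-bit field to 0. */
    #include <linux/bits.h>
    #include <asm/cpufeature.h>

    static const struct arm64_ftr_override example_override = {
            .val  = 0,                      /* value imposed on the field */
            .mask = GENMASK_ULL(31, 28),    /* only this field is overridden */
    };
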
index d77d358..bda9189 100644 (file)
                                                // use EL1&0 translation.
 
 .Lskip_spe_\@:
+       /* Trace buffer */
+       ubfx    x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4
+       cbz     x0, .Lskip_trace_\@             // Skip if TraceBuffer is not present
+
+       mrs_s   x0, SYS_TRBIDR_EL1
+       and     x0, x0, TRBIDR_PROG
+       cbnz    x0, .Lskip_trace_\@             // If TRBE is available at EL2
+
+       mov     x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+       orr     x2, x2, x0                      // allow the EL1&0 translation
+                                               // to own it.
+
+.Lskip_trace_\@:
        msr     mdcr_el2, x2                    // Configure debug traps
 .endm
 
index bec5f14..ff3879a 100644 (file)
@@ -130,6 +130,15 @@ static inline void sve_user_enable(void)
        sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN);
 }
 
+#define sve_cond_update_zcr_vq(val, reg)               \
+       do {                                            \
+               u64 __zcr = read_sysreg_s((reg));       \
+               u64 __new = __zcr & ~ZCR_ELx_LEN_MASK;  \
+               __new |= (val) & ZCR_ELx_LEN_MASK;      \
+               if (__zcr != __new)                     \
+                       write_sysreg_s(__new, (reg));   \
+       } while (0)
+
 /*
  * Probing and setup functions.
  * Calls to these functions must be serialised with one another.
@@ -159,6 +168,8 @@ static inline int sve_get_current_vl(void)
 static inline void sve_user_disable(void) { BUILD_BUG(); }
 static inline void sve_user_enable(void) { BUILD_BUG(); }
 
+#define sve_cond_update_zcr_vq(val, reg) do { } while (0)
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
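
A hedged sketch of the caller pattern this macro is aimed at: rewrite ZCR
only when the vector length actually changes, before touching guest SVE
state (the hyp code added later in this series does something very similar,
though the exact call site may differ)::

    /* Sketch: set the EL2 vector length for a guest with SVE. */
    #include <asm/fpsimd.h>
    #include <asm/kvm_host.h>
    #include <asm/sysreg.h>

    static inline void example_set_guest_sve_vl(struct kvm_vcpu *vcpu)
    {
            /* ZCR_ELx.LEN holds (vector quadwords - 1) */
            sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
    }
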
index af43367..a256399 100644 (file)
@@ -6,6 +6,8 @@
  * Author: Catalin Marinas <catalin.marinas@arm.com>
  */
 
+#include <asm/assembler.h>
+
 .macro fpsimd_save state, tmpnr
        stp     q0, q1, [\state, #16 * 0]
        stp     q2, q3, [\state, #16 * 2]
                str             w\nxtmp, [\xpfpsr, #4]
 .endm
 
-.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
-               sve_load_vq     \xvqminus1, x\nxtmp, \xtmp2
+.macro __sve_load nxbase, xpfpsr, nxtmp
  _for n, 0, 31,        _sve_ldr_v      \n, \nxbase, \n - 34
                _sve_ldr_p      0, \nxbase
                _sve_wrffr      0
                ldr             w\nxtmp, [\xpfpsr, #4]
                msr             fpcr, x\nxtmp
 .endm
+
+.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
+               sve_load_vq     \xvqminus1, x\nxtmp, \xtmp2
+               __sve_load      \nxbase, \xpfpsr, \nxtmp
+.endm
index 737ded6..b4b3076 100644 (file)
 #define __HYP_CONCAT(a, b)     a ## b
 #define HYP_CONCAT(a, b)       __HYP_CONCAT(a, b)
 
+#ifndef __KVM_NVHE_HYPERVISOR__
 /*
  * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
  * to separate it from the kernel proper.
  */
 #define kvm_nvhe_sym(sym)      __kvm_nvhe_##sym
+#else
+#define kvm_nvhe_sym(sym)      sym
+#endif
 
 #ifdef LINKER_SCRIPT
 
@@ -56,6 +60,9 @@
  */
 #define KVM_NVHE_ALIAS(sym)    kvm_nvhe_sym(sym) = sym;
 
+/* Defines a linker script alias for KVM nVHE hyp symbols */
+#define KVM_NVHE_ALIAS_HYP(first, sec) kvm_nvhe_sym(first) = kvm_nvhe_sym(sec);
+
 #endif /* LINKER_SCRIPT */
 
 #endif /* __ARM64_HYP_IMAGE_H__ */
index f9cc1d0..0ae427f 100644 (file)
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
index 94d4025..692c904 100644 (file)
 #define CPTR_EL2_DEFAULT       CPTR_EL2_RES1
 
 /* Hyp Debug Configuration Register bits */
+#define MDCR_EL2_E2TB_MASK     (UL(0x3))
+#define MDCR_EL2_E2TB_SHIFT    (UL(24))
 #define MDCR_EL2_TTRF          (1 << 19)
 #define MDCR_EL2_TPMS          (1 << 14)
 #define MDCR_EL2_E2PB_MASK     (UL(0x3))
index a7ab84f..cf8df03 100644 (file)
 #define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2               12
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs              13
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs           14
+#define __KVM_HOST_SMCCC_FUNC___pkvm_init                      15
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings           16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping    17
+#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector            18
+#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize             19
+#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp                  20
 
 #ifndef __ASSEMBLY__
 
@@ -154,6 +160,9 @@ struct kvm_nvhe_init_params {
        unsigned long tpidr_el2;
        unsigned long stack_hyp_va;
        phys_addr_t pgd_pa;
+       unsigned long hcr_el2;
+       unsigned long vttbr;
+       unsigned long vtcr;
 };
 
 /* Translate a kernel address @ptr into its equivalent linear mapping */
index 3a708be..7cd7d5c 100644 (file)
@@ -94,7 +94,7 @@ struct kvm_s2_mmu {
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
 
-       struct kvm *kvm;
+       struct kvm_arch *arch;
 };
 
 struct kvm_arch_memory_slot {
@@ -315,6 +315,8 @@ struct kvm_vcpu_arch {
                struct kvm_guest_debug_arch regs;
                /* Statistical profiling extension */
                u64 pmscr_el1;
+               /* Self-hosted trace */
+               u64 trfcr_el1;
        } host_debug_state;
 
        /* VGIC state */
@@ -372,8 +374,10 @@ struct kvm_vcpu_arch {
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
-#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
-                                     sve_ffr_offset((vcpu)->arch.sve_max_vl)))
+#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +     \
+                            sve_ffr_offset((vcpu)->arch.sve_max_vl))
+
+#define vcpu_sve_max_vq(vcpu)  sve_vq_from_vl((vcpu)->arch.sve_max_vl)
 
 #define vcpu_sve_state_size(vcpu) ({                                   \
        size_t __size_ret;                                              \
@@ -382,7 +386,7 @@ struct kvm_vcpu_arch {
        if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) {          \
                __size_ret = 0;                                         \
        } else {                                                        \
-               __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl);    \
+               __vcpu_vq = vcpu_sve_max_vq(vcpu);                      \
                __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq);              \
        }                                                               \
                                                                        \
@@ -400,6 +404,8 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_GUEST_HAS_PTRAUTH    (1 << 7) /* PTRAUTH exposed to guest */
 #define KVM_ARM64_PENDING_EXCEPTION    (1 << 8) /* Exception pending */
 #define KVM_ARM64_EXCEPT_MASK          (7 << 9) /* Target EL/MODE */
+#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active  */
+#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE        (1 << 13) /* Save TRBE context if active  */
 
 #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
                                 KVM_GUESTDBG_USE_SW_BP | \
@@ -590,6 +596,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
+#ifndef __KVM_NVHE_HYPERVISOR__
 #define kvm_call_hyp_nvhe(f, ...)                                              \
        ({                                                              \
                struct arm_smccc_res res;                               \
@@ -629,9 +636,13 @@ void kvm_arm_resume_guest(struct kvm *kvm);
                                                                        \
                ret;                                                    \
        })
+#else /* __KVM_NVHE_HYPERVISOR__ */
+#define kvm_call_hyp(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
+#endif /* __KVM_NVHE_HYPERVISOR__ */
 
 void force_vm_exit(const cpumask_t *mask);
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
 void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
@@ -691,19 +702,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
        ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
 }
 
-static inline bool kvm_arch_requires_vhe(void)
-{
-       /*
-        * The Arm architecture specifies that implementation of SVE
-        * requires VHE also to be implemented.  The KVM code for arm64
-        * relies on this when SVE is present:
-        */
-       if (system_supports_sve())
-               return true;
-
-       return false;
-}
-
 void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
 
 static inline void kvm_arch_hardware_unsetup(void) {}
@@ -712,6 +710,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 void kvm_arm_init_debug(void);
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
@@ -733,6 +732,10 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
        return (!has_vhe() && attr->exclude_host);
 }
 
+/* Flags for host debug state */
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);
+
 #ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */
 static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 {
@@ -770,5 +773,12 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
        (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
 int kvm_trng_call(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM
+extern phys_addr_t hyp_mem_base;
+extern phys_addr_t hyp_mem_size;
+void __init kvm_hyp_reserve(void);
+#else
+static inline void kvm_hyp_reserve(void) { }
+#endif
 
 #endif /* __ARM64_KVM_HOST_H__ */
index 32ae676..9d60b30 100644 (file)
@@ -90,6 +90,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
 
 void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+void __sve_save_state(void *sve_pffr, u32 *fpsr);
+void __sve_restore_state(void *sve_pffr, u32 *fpsr);
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
@@ -100,10 +102,20 @@ u64 __guest_enter(struct kvm_vcpu *vcpu);
 
 bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
 
-void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
                               u64 elr, u64 par);
 #endif
 
+#ifdef __KVM_NVHE_HYPERVISOR__
+void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size,
+                           phys_addr_t pgd, void *sp, void *cont_fn);
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+               unsigned long *per_cpu_base, u32 hyp_va_bits);
+void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+#endif
+
+extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
+extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
+
 #endif /* __ARM64_KVM_HYP_H__ */
index 9087385..25ed956 100644 (file)
@@ -121,6 +121,8 @@ void kvm_update_va_mask(struct alt_instr *alt,
 void kvm_compute_layout(void);
 void kvm_apply_hyp_relocations(void);
 
+#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)
+
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
        asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
@@ -166,7 +168,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
-int kvm_mmu_init(void);
+int kvm_mmu_init(u32 *hyp_va_bits);
+
+static inline void *__kvm_vector_slot2addr(void *base,
+                                          enum arm64_hyp_spectre_vector slot)
+{
+       int idx = slot - (slot != HYP_VECTOR_DIRECT);
+
+       return base + (idx * SZ_2K);
+}
 
 struct kvm;
 
@@ -262,9 +272,9 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
  * Must be called from hyp code running at EL2 with an updated VTTBR
  * and interrupts disabled.
  */
-static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
 {
-       write_sysreg(kern_hyp_va(mmu->kvm)->arch.vtcr, vtcr_el2);
+       write_sysreg(vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
 
        /*
@@ -275,5 +285,14 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
        asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
 }
 
+static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+{
+       __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
+}
+
+static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
+{
+       return container_of(mmu->arch, struct kvm, arch);
+}
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
index 8886d43..c3674c4 100644 (file)
 #include <linux/kvm_host.h>
 #include <linux/types.h>
 
+#define KVM_PGTABLE_MAX_LEVELS         4U
+
+static inline u64 kvm_get_parange(u64 mmfr0)
+{
+       u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
+                               ID_AA64MMFR0_PARANGE_SHIFT);
+       if (parange > ID_AA64MMFR0_PARANGE_MAX)
+               parange = ID_AA64MMFR0_PARANGE_MAX;
+
+       return parange;
+}
+
 typedef u64 kvm_pte_t;
 
 /**
+ * struct kvm_pgtable_mm_ops - Memory management callbacks.
+ * @zalloc_page:       Allocate a single zeroed memory page. The @arg parameter
+ *                     can be used by the walker to pass a memcache. The
+ *                     initial refcount of the page is 1.
+ * @zalloc_pages_exact:        Allocate an exact number of zeroed memory pages. The
+ *                     @size parameter is in bytes, and is rounded-up to the
+ *                     next page boundary. The resulting allocation is
+ *                     physically contiguous.
+ * @free_pages_exact:  Free an exact number of memory pages previously
+ *                     allocated by zalloc_pages_exact.
+ * @get_page:          Increment the refcount on a page.
+ * @put_page:          Decrement the refcount on a page. When the refcount
+ *                     reaches 0 the page is automatically freed.
+ * @page_count:                Return the refcount of a page.
+ * @phys_to_virt:      Convert a physical address into a virtual address mapped
+ *                     in the current context.
+ * @virt_to_phys:      Convert a virtual address mapped in the current context
+ *                     into a physical address.
+ */
+struct kvm_pgtable_mm_ops {
+       void*           (*zalloc_page)(void *arg);
+       void*           (*zalloc_pages_exact)(size_t size);
+       void            (*free_pages_exact)(void *addr, size_t size);
+       void            (*get_page)(void *addr);
+       void            (*put_page)(void *addr);
+       int             (*page_count)(void *addr);
+       void*           (*phys_to_virt)(phys_addr_t phys);
+       phys_addr_t     (*virt_to_phys)(void *addr);
+};
+
+/**
+ * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags.
+ * @KVM_PGTABLE_S2_NOFWB:      Don't enforce Normal-WB even if the CPUs have
+ *                             ARM64_HAS_STAGE2_FWB.
+ * @KVM_PGTABLE_S2_IDMAP:      Only use identity mappings.
+ */
+enum kvm_pgtable_stage2_flags {
+       KVM_PGTABLE_S2_NOFWB                    = BIT(0),
+       KVM_PGTABLE_S2_IDMAP                    = BIT(1),
+};
+
+/**
  * struct kvm_pgtable - KVM page-table.
  * @ia_bits:           Maximum input address size, in bits.
  * @start_level:       Level at which the page-table walk starts.
  * @pgd:               Pointer to the first top-level entry of the page-table.
+ * @mm_ops:            Memory management callbacks.
  * @mmu:               Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
  */
 struct kvm_pgtable {
        u32                                     ia_bits;
        u32                                     start_level;
        kvm_pte_t                               *pgd;
+       struct kvm_pgtable_mm_ops               *mm_ops;
 
        /* Stage-2 only */
        struct kvm_s2_mmu                       *mmu;
+       enum kvm_pgtable_stage2_flags           flags;
 };
 
 /**
@@ -50,6 +107,16 @@ enum kvm_pgtable_prot {
 #define PAGE_HYP_DEVICE                (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
 
 /**
+ * struct kvm_mem_range - Range of Intermediate Physical Addresses
+ * @start:     Start of the range.
+ * @end:       End of the range.
+ */
+struct kvm_mem_range {
+       u64 start;
+       u64 end;
+};
+
+/**
  * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
  * @KVM_PGTABLE_WALK_LEAF:             Visit leaf entries, including invalid
  *                                     entries.
@@ -86,10 +153,12 @@ struct kvm_pgtable_walker {
  * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
  * @va_bits:   Maximum virtual address bits.
+ * @mm_ops:    Memory management callbacks.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+                        struct kvm_pgtable_mm_ops *mm_ops);
 
 /**
  * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
@@ -123,17 +192,41 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                        enum kvm_pgtable_prot prot);
 
 /**
- * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * kvm_get_vtcr() - Helper to construct VTCR_EL2
+ * @mmfr0:     Sanitized value of SYS_ID_AA64MMFR0_EL1 register.
+ * @mmfr1:     Sanitized value of SYS_ID_AA64MMFR1_EL1 register.
+ * @phys_shift:        Value to set in VTCR_EL2.T0SZ.
+ *
+ * The VTCR value is common across all the physical CPUs on the system.
+ * We use system wide sanitised values to fill in different fields,
+ * except for Hardware Management of Access Flags. HA Flag is set
+ * unconditionally on all CPUs, as it is safe to run with or without
+ * the feature and the bit is RES0 on CPUs that don't support it.
+ *
+ * Return: VTCR_EL2 value
+ */
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
+
+/**
+ * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
- * @kvm:       KVM structure representing the guest virtual machine.
+ * @arch:      Arch-specific KVM structure representing the guest virtual
+ *             machine.
+ * @mm_ops:    Memory management callbacks.
+ * @flags:     Stage-2 configuration flags.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                                 struct kvm_pgtable_mm_ops *mm_ops,
+                                 enum kvm_pgtable_stage2_flags flags);
+
+#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
+       kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
 
 /**
  * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  *
  * The page-table is assumed to be unreachable by any hardware walkers prior
  * to freeing and therefore no TLB invalidation is performed.
@@ -142,13 +235,13 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
 /**
  * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address at which to place the mapping.
  * @size:      Size of the mapping.
  * @phys:      Physical address of the memory to map.
  * @prot:      Permissions and attributes for the mapping.
- * @mc:                Cache of pre-allocated GFP_PGTABLE_USER memory from which to
- *             allocate page-table pages.
+ * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
+ *             page-table pages.
  *
  * The offset of @addr within a page is ignored, @size is rounded-up to
  * the next page boundary and @phys is rounded-down to the previous page
@@ -170,11 +263,31 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  */
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
-                          struct kvm_mmu_memory_cache *mc);
+                          void *mc);
+
+/**
+ * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
+ *                                 track ownership.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:      Base intermediate physical address to annotate.
+ * @size:      Size of the annotated range.
+ * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
+ *             page-table pages.
+ * @owner_id:  Unique identifier for the owner of the page.
+ *
+ * By default, all page-tables are owned by identifier 0. This function can be
+ * used to mark portions of the IPA space as owned by other entities. When a
+ * stage 2 is used with identity-mappings, these annotations allow the
+ * page-table data structure to be used as a simple rmap.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                                void *mc, u8 owner_id);
 
 /**
  * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to remove the mapping.
  * @size:      Size of the mapping.
  *
@@ -194,7 +307,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
 /**
  * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
  *                                  without TLB invalidation.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to write-protect,
  * @size:      Size of the range.
  *
@@ -211,7 +324,7 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
 /**
  * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -225,7 +338,7 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
 
 /**
  * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -244,7 +357,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
 /**
  * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
  *                                   page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  * @prot:      Additional permissions to grant for the mapping.
  *
@@ -263,7 +376,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 /**
  * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
  *                                access flag set.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -276,7 +389,7 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
  * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
  *                                   of Coherency for guest stage-2 address
  *                                   range.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to flush.
  * @size:      Size of the range.
  *
@@ -311,4 +424,23 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
                     struct kvm_pgtable_walker *walker);
 
+/**
+ * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
+ *                                  Addresses with compatible permission
+ *                                  attributes.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:      Address that must be covered by the range.
+ * @prot:      Protection attributes that the range must be compatible with.
+ * @range:     Range structure used to limit the search space at call time and
+ *             that will hold the result.
+ *
+ * The offset of @addr within a page is ignored. An IPA is compatible with @prot
+ * iff its corresponding stage-2 page-table entry has default ownership and, if
+ * valid, is mapped with protection attributes identical to @prot.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+                                 enum kvm_pgtable_prot prot,
+                                 struct kvm_mem_range *range);
 #endif /* __ARM64_KVM_PGTABLE_H__ */
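
As a usage illustration only (not part of this merge), a caller wanting to know how much of the space around a given IPA shares compatible stage-2 attributes might drive the new helper as sketched below. The helper name, the seeding of the search window, and the assumption that struct kvm_mem_range carries u64 start/end bounds and that struct kvm_pgtable exposes its input-address size as ia_bits are all assumptions of the sketch:

/* Illustrative sketch, assuming struct kvm_mem_range has u64 start/end. */
static int probe_compatible_range(struct kvm_pgtable *pgt, u64 ipa)
{
        struct kvm_mem_range range = {
                .start  = 0,
                .end    = BIT_ULL(pgt->ia_bits),        /* whole IPA space */
        };
        int ret;

        ret = kvm_pgtable_stage2_find_range(pgt, ipa, KVM_PGTABLE_PROT_R,
                                            &range);
        if (ret)
                return ret;

        /*
         * On return, every IPA in [range.start, range.end) is compatible
         * with the requested read permission and the range covers 'ipa'.
         */
        return 0;
}
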
index 9a65fb5..079f4e9 100644 (file)
@@ -71,10 +71,10 @@ extern bool arm64_use_ng_mappings;
 #define PAGE_KERNEL_EXEC       __pgprot(PROT_NORMAL & ~PTE_PXN)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
 
-#define PAGE_S2_MEMATTR(attr)                                          \
+#define PAGE_S2_MEMATTR(attr, has_fwb)                                 \
        ({                                                              \
                u64 __val;                                              \
-               if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))          \
+               if (has_fwb)                                            \
                        __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr);     \
                else                                                    \
                        __val = PTE_S2_MEMATTR(MT_S2_ ## attr);         \
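
An illustrative sketch (not part of the patch): because the macro no longer queries the CPU capability itself, each caller now decides where the FWB information comes from; host code can keep using cpus_have_const_cap(ARM64_HAS_STAGE2_FWB), while EL2 code can derive it from state it already holds. The helper name below is made up:

static u64 stage2_normal_memattr(bool has_fwb)
{
        /* MT_S2_FWB_NORMAL when FWB is in use, MT_S2_NORMAL otherwise */
        return PAGE_S2_MEMATTR(NORMAL, has_fwb);
}

/*
 * A host-side call site would look roughly like (illustrative):
 *      stage2_normal_memattr(cpus_have_const_cap(ARM64_HAS_STAGE2_FWB));
 */
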
index 2f36b16..e4ad9db 100644 (file)
@@ -13,6 +13,7 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 extern char __hyp_text_start[], __hyp_text_end[];
 extern char __hyp_rodata_start[], __hyp_rodata_end[];
 extern char __hyp_reloc_begin[], __hyp_reloc_end[];
+extern char __hyp_bss_start[], __hyp_bss_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
index d4a5fca..f6a77f3 100644 (file)
 #define SYS_PMSIRR_EL1_INTERVAL_MASK   0xffffffUL
 
 /* Filtering controls */
+#define SYS_PMSNEVFR_EL1               sys_reg(3, 0, 9, 9, 1)
+
 #define SYS_PMSFCR_EL1                 sys_reg(3, 0, 9, 9, 4)
 #define SYS_PMSFCR_EL1_FE_SHIFT                0
 #define SYS_PMSFCR_EL1_FT_SHIFT                1
 
 /*** End of Statistical Profiling Extension ***/
 
+/*
+ * TRBE Registers
+ */
+#define SYS_TRBLIMITR_EL1              sys_reg(3, 0, 9, 11, 0)
+#define SYS_TRBPTR_EL1                 sys_reg(3, 0, 9, 11, 1)
+#define SYS_TRBBASER_EL1               sys_reg(3, 0, 9, 11, 2)
+#define SYS_TRBSR_EL1                  sys_reg(3, 0, 9, 11, 3)
+#define SYS_TRBMAR_EL1                 sys_reg(3, 0, 9, 11, 4)
+#define SYS_TRBTRG_EL1                 sys_reg(3, 0, 9, 11, 6)
+#define SYS_TRBIDR_EL1                 sys_reg(3, 0, 9, 11, 7)
+
+#define TRBLIMITR_LIMIT_MASK           GENMASK_ULL(51, 0)
+#define TRBLIMITR_LIMIT_SHIFT          12
+#define TRBLIMITR_NVM                  BIT(5)
+#define TRBLIMITR_TRIG_MODE_MASK       GENMASK(1, 0)
+#define TRBLIMITR_TRIG_MODE_SHIFT      3
+#define TRBLIMITR_FILL_MODE_MASK       GENMASK(1, 0)
+#define TRBLIMITR_FILL_MODE_SHIFT      1
+#define TRBLIMITR_ENABLE               BIT(0)
+#define TRBPTR_PTR_MASK                        GENMASK_ULL(63, 0)
+#define TRBPTR_PTR_SHIFT               0
+#define TRBBASER_BASE_MASK             GENMASK_ULL(51, 0)
+#define TRBBASER_BASE_SHIFT            12
+#define TRBSR_EC_MASK                  GENMASK(5, 0)
+#define TRBSR_EC_SHIFT                 26
+#define TRBSR_IRQ                      BIT(22)
+#define TRBSR_TRG                      BIT(21)
+#define TRBSR_WRAP                     BIT(20)
+#define TRBSR_ABORT                    BIT(18)
+#define TRBSR_STOP                     BIT(17)
+#define TRBSR_MSS_MASK                 GENMASK(15, 0)
+#define TRBSR_MSS_SHIFT                        0
+#define TRBSR_BSC_MASK                 GENMASK(5, 0)
+#define TRBSR_BSC_SHIFT                        0
+#define TRBSR_FSC_MASK                 GENMASK(5, 0)
+#define TRBSR_FSC_SHIFT                        0
+#define TRBMAR_SHARE_MASK              GENMASK(1, 0)
+#define TRBMAR_SHARE_SHIFT             8
+#define TRBMAR_OUTER_MASK              GENMASK(3, 0)
+#define TRBMAR_OUTER_SHIFT             4
+#define TRBMAR_INNER_MASK              GENMASK(3, 0)
+#define TRBMAR_INNER_SHIFT             0
+#define TRBTRG_TRG_MASK                        GENMASK(31, 0)
+#define TRBTRG_TRG_SHIFT               0
+#define TRBIDR_FLAG                    BIT(5)
+#define TRBIDR_PROG                    BIT(4)
+#define TRBIDR_ALIGN_MASK              GENMASK(3, 0)
+#define TRBIDR_ALIGN_SHIFT             0
+
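
A sketch of how these encodings are meant to be consumed (the real users are the TRBE driver and the hyp debug code later in this merge): the *_MASK values are field-width masks, applied after shifting the register value right by the matching *_SHIFT. The power-of-two interpretation of the Align field below is an assumption of the sketch, not stated by this patch:

static bool trbe_is_programmable_here(void)
{
        /* TRBIDR_EL1.P set means the buffer is owned by a higher EL */
        return !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG);
}

static unsigned long trbe_min_buffer_align(void)
{
        u64 trbidr = read_sysreg_s(SYS_TRBIDR_EL1);
        u64 align = (trbidr >> TRBIDR_ALIGN_SHIFT) & TRBIDR_ALIGN_MASK;

        return 1UL << align;    /* assumed: Align encodes log2(bytes) */
}
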
 #define SYS_PMINTENSET_EL1             sys_reg(3, 0, 9, 14, 1)
 #define SYS_PMINTENCLR_EL1             sys_reg(3, 0, 9, 14, 2)
 
 #define SCTLR_ELx_A    (BIT(1))
 #define SCTLR_ELx_M    (BIT(0))
 
-#define SCTLR_ELx_FLAGS        (SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
-                        SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
-
 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1 ((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
                         (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
 #define ENDIAN_SET_EL2         0
 #endif
 
+#define INIT_SCTLR_EL2_MMU_ON                                          \
+       (SCTLR_ELx_M  | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I |      \
+        SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
+
 #define INIT_SCTLR_EL2_MMU_OFF \
        (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
 
 #define ID_AA64MMFR2_CNP_SHIFT         0
 
 /* id_aa64dfr0 */
+#define ID_AA64DFR0_TRBE_SHIFT         44
 #define ID_AA64DFR0_TRACE_FILT_SHIFT   40
 #define ID_AA64DFR0_DOUBLELOCK_SHIFT   36
 #define ID_AA64DFR0_PMSVER_SHIFT       32
index a36e2fc..8060e58 100644 (file)
@@ -95,6 +95,8 @@ int main(void)
   DEFINE(DMA_FROM_DEVICE,      DMA_FROM_DEVICE);
   BLANK();
   DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
+  DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
+  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
   BLANK();
   DEFINE(CPU_BOOT_STACK,       offsetof(struct secondary_data, stack));
   DEFINE(CPU_BOOT_TASK,                offsetof(struct secondary_data, task));
@@ -120,6 +122,9 @@ int main(void)
   DEFINE(NVHE_INIT_TPIDR_EL2,  offsetof(struct kvm_nvhe_init_params, tpidr_el2));
   DEFINE(NVHE_INIT_STACK_HYP_VA,       offsetof(struct kvm_nvhe_init_params, stack_hyp_va));
   DEFINE(NVHE_INIT_PGD_PA,     offsetof(struct kvm_nvhe_init_params, pgd_pa));
+  DEFINE(NVHE_INIT_HCR_EL2,    offsetof(struct kvm_nvhe_init_params, hcr_el2));
+  DEFINE(NVHE_INIT_VTTBR,      offsetof(struct kvm_nvhe_init_params, vttbr));
+  DEFINE(NVHE_INIT_VTCR,       offsetof(struct kvm_nvhe_init_params, vtcr));
 #endif
 #ifdef CONFIG_CPU_PM
   DEFINE(CPU_CTX_SP,           offsetof(struct cpu_suspend_ctx, sp));
index 37721eb..d47ff63 100644 (file)
  * flat identity mapping.
  */
 SYM_CODE_START(__cpu_soft_restart)
-       /* Clear sctlr_el1 flags. */
-       mrs     x12, sctlr_el1
-       mov_q   x13, SCTLR_ELx_FLAGS
-       bic     x12, x12, x13
+       mov_q   x12, INIT_SCTLR_EL1_MMU_OFF
        pre_disable_mmu_workaround
        /*
         * either disable EL1&0 translation regime or disable EL2&0 translation
index e5281e1..e3e0dcb 100644 (file)
@@ -808,6 +808,12 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new)
                                        reg->name,
                                        ftrp->shift + ftrp->width - 1,
                                        ftrp->shift, str, tmp);
+               } else if ((ftr_mask & reg->override->val) == ftr_mask) {
+                       reg->override->val &= ~ftr_mask;
+                       pr_warn("%s[%d:%d]: impossible override, ignored\n",
+                               reg->name,
+                               ftrp->shift + ftrp->width - 1,
+                               ftrp->shift);
                }
 
                val = arm64_ftr_set_value(ftrp, val, ftr_new);
@@ -1619,7 +1625,6 @@ int get_cpu_with_amu_feat(void)
 }
 #endif
 
-#ifdef CONFIG_ARM64_VHE
 static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused)
 {
        return is_kernel_in_hyp_mode();
@@ -1638,7 +1643,6 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused)
        if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN))
                write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
 }
-#endif
 
 static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
 {
@@ -1841,7 +1845,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
                .matches = has_no_hw_prefetch,
        },
-#ifdef CONFIG_ARM64_VHE
        {
                .desc = "Virtualization Host Extensions",
                .capability = ARM64_HAS_VIRT_HOST_EXTN,
@@ -1849,7 +1852,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .matches = runs_at_el2,
                .cpu_enable = cpu_copy_el2regs,
        },
-#endif /* CONFIG_ARM64_VHE */
        {
                .desc = "32-bit EL0 Support",
                .capability = ARM64_HAS_32BIT_EL0,
index 062b21f..823e3a8 100644 (file)
@@ -180,7 +180,7 @@ static void __get_cpu_fpsimd_context(void)
  */
 static void get_cpu_fpsimd_context(void)
 {
-       preempt_disable();
+       local_bh_disable();
        __get_cpu_fpsimd_context();
 }
 
@@ -201,7 +201,7 @@ static void __put_cpu_fpsimd_context(void)
 static void put_cpu_fpsimd_context(void)
 {
        __put_cpu_fpsimd_context();
-       preempt_enable();
+       local_bh_enable();
 }
 
 static bool have_cpu_fpsimd_context(void)
index 840bda1..96873df 100644 (file)
@@ -477,14 +477,13 @@ EXPORT_SYMBOL(kimage_vaddr)
  * booted in EL1 or EL2 respectively.
  */
 SYM_FUNC_START(init_kernel_el)
-       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
-       msr     sctlr_el1, x0
-
        mrs     x0, CurrentEL
        cmp     x0, #CurrentEL_EL2
        b.eq    init_el2
 
 SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
+       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
+       msr     sctlr_el1, x0
        isb
        mov_q   x0, INIT_PSTATE_EL1
        msr     spsr_el1, x0
@@ -504,9 +503,43 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
        msr     vbar_el2, x0
        isb
 
+       /*
+        * Fruity CPUs seem to have HCR_EL2.E2H set to RES1,
+        * making it impossible to start in nVHE mode. Is that
+        * compliant with the architecture? Absolutely not!
+        */
+       mrs     x0, hcr_el2
+       and     x0, x0, #HCR_E2H
+       cbz     x0, 1f
+
+       /* Switching to VHE requires a sane SCTLR_EL1 as a start */
+       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
+       msr_s   SYS_SCTLR_EL12, x0
+
+       /*
+        * Force an eret into a helper "function", and let it return
+        * to our original caller... This makes sure that we have
+        * initialised the basic PSTATE state.
+        */
+       mov     x0, #INIT_PSTATE_EL2
+       msr     spsr_el1, x0
+       adr     x0, __cpu_stick_to_vhe
+       msr     elr_el1, x0
+       eret
+
+1:
+       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
+       msr     sctlr_el1, x0
+
        msr     elr_el2, lr
        mov     w0, #BOOT_CPU_MODE_EL2
        eret
+
+__cpu_stick_to_vhe:
+       mov     x0, #HVC_VHE_RESTART
+       hvc     #0
+       mov     x0, #BOOT_CPU_MODE_EL2
+       ret
 SYM_FUNC_END(init_kernel_el)
 
 /*
index 5eccbd6..43d2126 100644 (file)
@@ -27,12 +27,12 @@ SYM_CODE_START(__hyp_stub_vectors)
        ventry  el2_fiq_invalid                 // FIQ EL2t
        ventry  el2_error_invalid               // Error EL2t
 
-       ventry  el2_sync_invalid                // Synchronous EL2h
+       ventry  elx_sync                        // Synchronous EL2h
        ventry  el2_irq_invalid                 // IRQ EL2h
        ventry  el2_fiq_invalid                 // FIQ EL2h
        ventry  el2_error_invalid               // Error EL2h
 
-       ventry  el1_sync                        // Synchronous 64-bit EL1
+       ventry  elx_sync                        // Synchronous 64-bit EL1
        ventry  el1_irq_invalid                 // IRQ 64-bit EL1
        ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
        ventry  el1_error_invalid               // Error 64-bit EL1
@@ -45,7 +45,7 @@ SYM_CODE_END(__hyp_stub_vectors)
 
        .align 11
 
-SYM_CODE_START_LOCAL(el1_sync)
+SYM_CODE_START_LOCAL(elx_sync)
        cmp     x0, #HVC_SET_VECTORS
        b.ne    1f
        msr     vbar_el2, x1
@@ -71,7 +71,7 @@ SYM_CODE_START_LOCAL(el1_sync)
 
 9:     mov     x0, xzr
        eret
-SYM_CODE_END(el1_sync)
+SYM_CODE_END(elx_sync)
 
 // nVHE? No way! Give me the real thing!
 SYM_CODE_START_LOCAL(mutate_to_vhe)
@@ -115,9 +115,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
        mrs_s   x0, SYS_VBAR_EL12
        msr     vbar_el1, x0
 
-       // Use EL2 translations for SPE and disable access from EL1
+       // Use EL2 translations for SPE & TRBE and disable access from EL1
        mrs     x0, mdcr_el2
        bic     x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+       bic     x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
        msr     mdcr_el2, x0
 
        // Transfer the MM state from EL1 to EL2
@@ -224,7 +225,6 @@ SYM_FUNC_END(__hyp_reset_vectors)
  * Entry point to switch to VHE if deemed capable
  */
 SYM_FUNC_START(switch_to_vhe)
-#ifdef CONFIG_ARM64_VHE
        // Need to have booted at EL2
        adr_l   x1, __boot_cpu_mode
        ldr     w0, [x1]
@@ -240,6 +240,5 @@ SYM_FUNC_START(switch_to_vhe)
        mov     x0, #HVC_VHE_RESTART
        hvc     #0
 1:
-#endif
        ret
 SYM_FUNC_END(switch_to_vhe)
index 83f1c4b..e628c8c 100644 (file)
@@ -25,14 +25,26 @@ struct ftr_set_desc {
        struct {
                char                    name[FTR_DESC_FIELD_LEN];
                u8                      shift;
+               bool                    (*filter)(u64 val);
        }                               fields[];
 };
 
+static bool __init mmfr1_vh_filter(u64 val)
+{
+       /*
+        * If we ever reach this point while running VHE, we're
+        * guaranteed to be on one of these funky, VHE-stuck CPUs. If
+        * the user was trying to force nVHE on us, proceed with
+        * attitude adjustment.
+        */
+       return !(is_kernel_in_hyp_mode() && val == 0);
+}
+
 static const struct ftr_set_desc mmfr1 __initconst = {
        .name           = "id_aa64mmfr1",
        .override       = &id_aa64mmfr1_override,
        .fields         = {
-               { "vh", ID_AA64MMFR1_VHE_SHIFT },
+               { "vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter },
                {}
        },
 };
@@ -124,6 +136,18 @@ static void __init match_options(const char *cmdline)
                        if (find_field(cmdline, regs[i], f, &v))
                                continue;
 
+                       /*
+                        * If an override gets filtered out, advertise
+                        * it by setting the value to 0xf, but
+                        * clearing the mask... Yes, this is fragile.
+                        */
+                       if (regs[i]->fields[f].filter &&
+                           !regs[i]->fields[f].filter(v)) {
+                               regs[i]->override->val  |= mask;
+                               regs[i]->override->mask &= ~mask;
+                               continue;
+                       }
+
                        regs[i]->override->val  &= ~mask;
                        regs[i]->override->val  |= (v << shift) & mask;
                        regs[i]->override->mask |= mask;
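
To connect this with the init_cpu_ftr_reg() hunk earlier in this merge, here is a worked illustration of a rejected 4-bit override; the concrete field and the shape of the mask are examples for the sketch, not part of the patch:

/*
 * "id_aa64mmfr1.vh=0" rejected by mmfr1_vh_filter() on a VHE-stuck CPU,
 * with 'mask' covering the 4-bit field at ID_AA64MMFR1_VHE_SHIFT:
 *
 *      override->val  |=  mask;        // field reads back as all ones
 *      override->mask &= ~mask;        // but is never claimed as valid
 *
 * The (ftr_mask & reg->override->val) == ftr_mask test added to
 * init_cpu_ftr_reg() then catches exactly this pattern and logs
 * "impossible override, ignored" instead of applying the override.
 */
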
index 5aa9ed1..bcf3c27 100644 (file)
@@ -65,13 +65,13 @@ __efistub__ctype            = _ctype;
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
 KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
+KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0);
 
 /* Global kernel state accessed by nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
 /* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */
-KVM_NVHE_ALIAS(__hyp_panic_string);
-KVM_NVHE_ALIAS(panic);
+KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
 
 /* Vectors installed by hyp-init on reset HVC. */
 KVM_NVHE_ALIAS(__hyp_stub_vectors);
@@ -104,6 +104,36 @@ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
 /* PMU available static key */
 KVM_NVHE_ALIAS(kvm_arm_pmu_available);
 
+/* Position-independent library routines */
+KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page);
+KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page);
+KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(memset, __pi_memset);
+
+#ifdef CONFIG_KASAN
+KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
+#endif
+
+/* Kernel memory sections */
+KVM_NVHE_ALIAS(__start_rodata);
+KVM_NVHE_ALIAS(__end_rodata);
+KVM_NVHE_ALIAS(__bss_start);
+KVM_NVHE_ALIAS(__bss_stop);
+
+/* Hyp memory sections */
+KVM_NVHE_ALIAS(__hyp_idmap_text_start);
+KVM_NVHE_ALIAS(__hyp_idmap_text_end);
+KVM_NVHE_ALIAS(__hyp_text_start);
+KVM_NVHE_ALIAS(__hyp_text_end);
+KVM_NVHE_ALIAS(__hyp_bss_start);
+KVM_NVHE_ALIAS(__hyp_bss_end);
+KVM_NVHE_ALIAS(__hyp_rodata_start);
+KVM_NVHE_ALIAS(__hyp_rodata_end);
+
+/* pKVM static key */
+KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
+
 #endif /* CONFIG_KVM */
 
 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
index 7eea788..709d2c4 100644 (file)
@@ -5,24 +5,7 @@
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  */
 
-#define RO_EXCEPTION_TABLE_ALIGN       8
-#define RUNTIME_DISCARD_EXIT
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/cache.h>
 #include <asm/hyp_image.h>
-#include <asm/kernel-pgtable.h>
-#include <asm/memory.h>
-#include <asm/page.h>
-
-#include "image.h"
-
-OUTPUT_ARCH(aarch64)
-ENTRY(_text)
-
-jiffies = jiffies_64;
-
-
 #ifdef CONFIG_KVM
 #define HYPERVISOR_EXTABLE                                     \
        . = ALIGN(SZ_8);                                        \
@@ -32,9 +15,11 @@ jiffies = jiffies_64;
 
 #define HYPERVISOR_DATA_SECTIONS                               \
        HYP_SECTION_NAME(.rodata) : {                           \
+               . = ALIGN(PAGE_SIZE);                           \
                __hyp_rodata_start = .;                         \
                *(HYP_SECTION_NAME(.data..ro_after_init))       \
                *(HYP_SECTION_NAME(.rodata))                    \
+               . = ALIGN(PAGE_SIZE);                           \
                __hyp_rodata_end = .;                           \
        }
 
@@ -51,29 +36,52 @@ jiffies = jiffies_64;
                __hyp_reloc_end = .;                            \
        }
 
+#define BSS_FIRST_SECTIONS                                     \
+       __hyp_bss_start = .;                                    \
+       *(HYP_SECTION_NAME(.bss))                               \
+       . = ALIGN(PAGE_SIZE);                                   \
+       __hyp_bss_end = .;
+
+/*
+ * We require that __hyp_bss_start and __bss_start are aligned, and enforce it
+ * with an assertion. But the BSS_SECTION macro places an empty .sbss section
+ * between them, which can in some cases cause the linker to misalign them. To
+ * work around the issue, force a page alignment for __bss_start.
+ */
+#define SBSS_ALIGN                     PAGE_SIZE
 #else /* CONFIG_KVM */
 #define HYPERVISOR_EXTABLE
 #define HYPERVISOR_DATA_SECTIONS
 #define HYPERVISOR_PERCPU_SECTION
 #define HYPERVISOR_RELOC_SECTION
+#define SBSS_ALIGN                     0
 #endif
 
+#define RO_EXCEPTION_TABLE_ALIGN       8
+#define RUNTIME_DISCARD_EXIT
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+
+#include "image.h"
+
+OUTPUT_ARCH(aarch64)
+ENTRY(_text)
+
+jiffies = jiffies_64;
+
 #define HYPERVISOR_TEXT                                        \
-       /*                                              \
-        * Align to 4 KB so that                        \
-        * a) the HYP vector table is at its minimum    \
-        *    alignment of 2048 bytes                   \
-        * b) the HYP init code will not cross a page   \
-        *    boundary if its size does not exceed      \
-        *    4 KB (see related ASSERT() below)         \
-        */                                             \
-       . = ALIGN(SZ_4K);                               \
+       . = ALIGN(PAGE_SIZE);                           \
        __hyp_idmap_text_start = .;                     \
        *(.hyp.idmap.text)                              \
        __hyp_idmap_text_end = .;                       \
        __hyp_text_start = .;                           \
        *(.hyp.text)                                    \
        HYPERVISOR_EXTABLE                              \
+       . = ALIGN(PAGE_SIZE);                           \
        __hyp_text_end = .;
 
 #define IDMAP_TEXT                                     \
@@ -276,7 +284,7 @@ SECTIONS
        __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
        _edata = .;
 
-       BSS_SECTION(0, 0, 0)
+       BSS_SECTION(SBSS_ALIGN, 0, 0)
 
        . = ALIGN(PAGE_SIZE);
        init_pg_dir = .;
@@ -309,11 +317,12 @@ SECTIONS
 #include "image-vars.h"
 
 /*
- * The HYP init code and ID map text can't be longer than a page each,
- * and should not cross a page boundary.
+ * The HYP init code and ID map text can't be longer than a page each. The
+ * former is page-aligned, but the latter may not be with 16K or 64K pages, so
+ * it should also not cross a page boundary.
  */
-ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
-       "HYP init code too big or misaligned")
+ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE,
+       "HYP init code too big")
 ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
        "ID map text too big or misaligned")
 #ifdef CONFIG_HIBERNATION
@@ -324,6 +333,9 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
 ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
        "Entry trampoline text too big")
 #endif
+#ifdef CONFIG_KVM
+ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned")
+#endif
 /*
  * If padding is applied before .head.text, virt<->phys conversions will fail.
  */
index 0d92a4e..1cb39c0 100644 (file)
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ARM_INJECT_EXT_DABT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
+       case KVM_CAP_PTP_KVM:
                r = 1;
                break;
        case KVM_CAP_SET_GUEST_DEBUG2:
@@ -418,10 +419,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (vcpu_has_ptrauth(vcpu))
                vcpu_ptrauth_disable(vcpu);
+       kvm_arch_vcpu_load_debug_state_flags(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+       kvm_arch_vcpu_put_debug_state_flags(vcpu);
        kvm_arch_vcpu_put_fp(vcpu);
        if (has_vhe())
                kvm_vcpu_put_sysregs_vhe(vcpu);
@@ -582,6 +585,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.has_run_once = true;
 
+       kvm_arm_vcpu_init_debug(vcpu);
+
        if (likely(irqchip_in_kernel(kvm))) {
                /*
                 * Map the VGIC hardware resources before running a vcpu the
@@ -1352,16 +1357,9 @@ static unsigned long nvhe_percpu_order(void)
 /* A lookup table holding the hypervisor VA for each vector slot */
 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
 
-static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot)
-{
-       return slot - (slot != HYP_VECTOR_DIRECT);
-}
-
 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
 {
-       int idx = __kvm_vector_slot2idx(slot);
-
-       hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K);
+       hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
 }
 
 static int kvm_init_vector_slots(void)
@@ -1390,22 +1388,18 @@ static int kvm_init_vector_slots(void)
        return 0;
 }
 
-static void cpu_init_hyp_mode(void)
+static void cpu_prepare_hyp_mode(int cpu)
 {
-       struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params);
-       struct arm_smccc_res res;
+       struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
        unsigned long tcr;
 
-       /* Switch from the HYP stub to our own HYP init vector */
-       __hyp_set_vectors(kvm_get_idmap_vector());
-
        /*
         * Calculate the raw per-cpu offset without a translation from the
         * kernel's mapping to the linear mapping, and store it in tpidr_el2
         * so that we can use adr_l to access per-cpu variables in EL2.
         * Also drop the KASAN tag which gets in the way...
         */
-       params->tpidr_el2 = (unsigned long)kasan_reset_tag(this_cpu_ptr_nvhe_sym(__per_cpu_start)) -
+       params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
                            (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
 
        params->mair_el2 = read_sysreg(mair_el1);
@@ -1429,14 +1423,28 @@ static void cpu_init_hyp_mode(void)
        tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
        params->tcr_el2 = tcr;
 
-       params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE);
+       params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
        params->pgd_pa = kvm_mmu_get_httbr();
+       if (is_protected_kvm_enabled())
+               params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
+       else
+               params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+       params->vttbr = params->vtcr = 0;
 
        /*
         * Flush the init params from the data cache because the struct will
         * be read while the MMU is off.
         */
        kvm_flush_dcache_to_poc(params, sizeof(*params));
+}
+
+static void hyp_install_host_vector(void)
+{
+       struct kvm_nvhe_init_params *params;
+       struct arm_smccc_res res;
+
+       /* Switch from the HYP stub to our own HYP init vector */
+       __hyp_set_vectors(kvm_get_idmap_vector());
 
        /*
         * Call initialization code, and switch to the full blown HYP code.
@@ -1445,8 +1453,14 @@ static void cpu_init_hyp_mode(void)
         * cpus_have_const_cap() wrapper.
         */
        BUG_ON(!system_capabilities_finalized());
+       params = this_cpu_ptr_nvhe_sym(kvm_init_params);
        arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
        WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
+}
+
+static void cpu_init_hyp_mode(void)
+{
+       hyp_install_host_vector();
 
        /*
         * Disabling SSBD on a non-VHE system requires us to enable SSBS
@@ -1489,7 +1503,10 @@ static void cpu_set_hyp_vector(void)
        struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
        void *vector = hyp_spectre_vector_selector[data->slot];
 
-       *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+       if (!is_protected_kvm_enabled())
+               *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+       else
+               kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
 }
 
 static void cpu_hyp_reinit(void)
@@ -1497,13 +1514,14 @@ static void cpu_hyp_reinit(void)
        kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
 
        cpu_hyp_reset();
-       cpu_set_hyp_vector();
 
        if (is_kernel_in_hyp_mode())
                kvm_timer_init_vhe();
        else
                cpu_init_hyp_mode();
 
+       cpu_set_hyp_vector();
+
        kvm_arm_init_debug();
 
        if (vgic_present)
@@ -1699,18 +1717,62 @@ static void teardown_hyp_mode(void)
        }
 }
 
+static int do_pkvm_init(u32 hyp_va_bits)
+{
+       void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
+       int ret;
+
+       preempt_disable();
+       hyp_install_host_vector();
+       ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
+                               num_possible_cpus(), kern_hyp_va(per_cpu_base),
+                               hyp_va_bits);
+       preempt_enable();
+
+       return ret;
+}
+
+static int kvm_hyp_init_protection(u32 hyp_va_bits)
+{
+       void *addr = phys_to_virt(hyp_mem_base);
+       int ret;
+
+       kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+       kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+
+       ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       ret = do_pkvm_init(hyp_va_bits);
+       if (ret)
+               return ret;
+
+       free_hyp_pgds();
+
+       return 0;
+}
+
 /**
  * Inits Hyp-mode on all online CPUs
  */
 static int init_hyp_mode(void)
 {
+       u32 hyp_va_bits;
        int cpu;
-       int err = 0;
+       int err = -ENOMEM;
+
+       /*
+        * The protected Hyp-mode cannot be initialized if the memory pool
+        * allocation has failed.
+        */
+       if (is_protected_kvm_enabled() && !hyp_mem_base)
+               goto out_err;
 
        /*
         * Allocate Hyp PGD and setup Hyp identity mapping
         */
-       err = kvm_mmu_init();
+       err = kvm_mmu_init(&hyp_va_bits);
        if (err)
                goto out_err;
 
@@ -1771,7 +1833,19 @@ static int init_hyp_mode(void)
                goto out_err;
        }
 
-       err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+       /*
+        * .hyp.bss is guaranteed to be placed at the beginning of the .bss
+        * section thanks to an assertion in the linker script. Map it RW and
+        * the rest of .bss RO.
+        */
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
+                                 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
+       if (err) {
+               kvm_err("Cannot map hyp bss section: %d\n", err);
+               goto out_err;
+       }
+
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
                                  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map bss section\n");
@@ -1792,26 +1866,36 @@ static int init_hyp_mode(void)
                }
        }
 
-       /*
-        * Map Hyp percpu pages
-        */
        for_each_possible_cpu(cpu) {
                char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
                char *percpu_end = percpu_begin + nvhe_percpu_size();
 
+               /* Map Hyp percpu pages */
                err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
-
                if (err) {
                        kvm_err("Cannot map hyp percpu region\n");
                        goto out_err;
                }
+
+               /* Prepare the CPU initialization parameters */
+               cpu_prepare_hyp_mode(cpu);
        }
 
        if (is_protected_kvm_enabled()) {
                init_cpu_logical_map();
 
-               if (!init_psci_relay())
+               if (!init_psci_relay()) {
+                       err = -ENODEV;
+                       goto out_err;
+               }
+       }
+
+       if (is_protected_kvm_enabled()) {
+               err = kvm_hyp_init_protection(hyp_va_bits);
+               if (err) {
+                       kvm_err("Failed to init hyp memory protection\n");
                        goto out_err;
+               }
        }
 
        return 0;
@@ -1822,6 +1906,72 @@ out_err:
        return err;
 }
 
+static void _kvm_host_prot_finalize(void *discard)
+{
+       WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
+}
+
+static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+       return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
+}
+
+#define pkvm_mark_hyp_section(__section)               \
+       pkvm_mark_hyp(__pa_symbol(__section##_start),   \
+                       __pa_symbol(__section##_end))
+
+static int finalize_hyp_mode(void)
+{
+       int cpu, ret;
+
+       if (!is_protected_kvm_enabled())
+               return 0;
+
+       ret = pkvm_mark_hyp_section(__hyp_idmap_text);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_text);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_rodata);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_bss);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
+       if (ret)
+               return ret;
+
+       for_each_possible_cpu(cpu) {
+               phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
+               phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
+
+               ret = pkvm_mark_hyp(start, end);
+               if (ret)
+                       return ret;
+
+               start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
+               end = start + PAGE_SIZE;
+               ret = pkvm_mark_hyp(start, end);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Flip the static key upfront as that may no longer be possible
+        * once the host stage 2 is installed.
+        */
+       static_branch_enable(&kvm_protected_mode_initialized);
+       on_each_cpu(_kvm_host_prot_finalize, NULL, 1);
+
+       return 0;
+}
+
 static void check_kvm_target_cpu(void *ret)
 {
        *(int *)ret = kvm_target_cpu();
@@ -1896,11 +2046,6 @@ int kvm_arch_init(void *opaque)
 
        in_hyp_mode = is_kernel_in_hyp_mode();
 
-       if (!in_hyp_mode && kvm_arch_requires_vhe()) {
-               kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
-               return -ENODEV;
-       }
-
        if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
            cpus_have_final_cap(ARM64_WORKAROUND_1508412))
                kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
@@ -1938,8 +2083,15 @@ int kvm_arch_init(void *opaque)
        if (err)
                goto out_hyp;
 
+       if (!in_hyp_mode) {
+               err = finalize_hyp_mode();
+               if (err) {
+                       kvm_err("Failed to finalize Hyp protection\n");
+                       goto out_hyp;
+               }
+       }
+
        if (is_protected_kvm_enabled()) {
-               static_branch_enable(&kvm_protected_mode_initialized);
                kvm_info("Protected nVHE mode initialized successfully\n");
        } else if (in_hyp_mode) {
                kvm_info("VHE mode initialized successfully\n");
index dbc8905..d5e79d7 100644 (file)
@@ -69,6 +69,65 @@ void kvm_arm_init_debug(void)
 }
 
 /**
+ * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * This ensures we will trap access to:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
+ *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
+ */
+static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+{
+       /*
+        * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
+        * to disable guest access to the profiling and trace buffers
+        */
+       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+                               MDCR_EL2_TPMS |
+                               MDCR_EL2_TTRF |
+                               MDCR_EL2_TPMCR |
+                               MDCR_EL2_TDRA |
+                               MDCR_EL2_TDOSA);
+
+       /* Is the VM being debugged by userspace? */
+       if (vcpu->guest_debug)
+               /* Route all software debug exceptions to EL2 */
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+       /*
+        * Trap debug register access when one of the following is true:
+        *  - Userspace is using the hardware to debug the guest
+        *  (KVM_GUESTDBG_USE_HW is set).
+        *  - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear).
+        */
+       if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) ||
+           !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+}
+
+/**
+ * kvm_arm_vcpu_init_debug - setup vcpu debug traps
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * Set vcpu initial mdcr_el2 value.
+ */
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       kvm_arm_setup_mdcr_el2(vcpu);
+       preempt_enable();
+}
+
+/**
  * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
  */
 
@@ -83,13 +142,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
  * @vcpu:      the vcpu pointer
  *
  * This is called before each entry into the hypervisor to setup any
- * debug related registers. Currently this just ensures we will trap
- * access to:
- *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
- *  - Debug ROM Address (MDCR_EL2_TDRA)
- *  - OS related registers (MDCR_EL2_TDOSA)
- *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
- *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ * debug related registers.
  *
  * Additionally, KVM only traps guest accesses to the debug registers if
  * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
@@ -101,28 +154,14 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
 
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 {
-       bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
        unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2;
 
        trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
 
-       /*
-        * This also clears MDCR_EL2_E2PB_MASK to disable guest access
-        * to the profiling buffer.
-        */
-       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
-       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
-                               MDCR_EL2_TPMS |
-                               MDCR_EL2_TTRF |
-                               MDCR_EL2_TPMCR |
-                               MDCR_EL2_TDRA |
-                               MDCR_EL2_TDOSA);
+       kvm_arm_setup_mdcr_el2(vcpu);
 
        /* Is Guest debugging in effect? */
        if (vcpu->guest_debug) {
-               /* Route all software debug exceptions to EL2 */
-               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
-
                /* Save guest debug state */
                save_guest_debug_regs(vcpu);
 
@@ -176,7 +215,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 
                        vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
                        vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
-                       trap_debug = true;
 
                        trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
                                                &vcpu->arch.debug_ptr->dbg_bcr[0],
@@ -191,10 +229,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
        BUG_ON(!vcpu->guest_debug &&
                vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
 
-       /* Trap debug register access */
-       if (trap_debug)
-               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
-
        /* If KDE or MDE are set, perform a full save/restore cycle. */
        if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
                vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
@@ -203,7 +237,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
        if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2)
                write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 
-       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
        trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
 }
 
@@ -231,3 +264,32 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
                }
        }
 }
+
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+       u64 dfr0;
+
+       /* For VHE, there is nothing to do */
+       if (has_vhe())
+               return;
+
+       dfr0 = read_sysreg(id_aa64dfr0_el1);
+       /*
+        * If SPE is present on this CPU and is available at current EL,
+        * we may need to check if the host state needs to be saved.
+        */
+       if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) &&
+           !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT)))
+               vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE;
+
+       /* Check if we have TRBE implemented and available at the host */
+       if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) &&
+           !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG))
+               vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE;
+}
+
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE |
+                             KVM_ARM64_DEBUG_STATE_SAVE_TRBE);
+}
index 3e081d5..5621020 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kvm_host.h>
 #include <asm/fpsimd.h>
 #include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 #include <asm/sysreg.h>
 
@@ -42,6 +43,17 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
        if (ret)
                goto error;
 
+       if (vcpu->arch.sve_state) {
+               void *sve_end;
+
+               sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu);
+
+               ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end,
+                                         PAGE_HYP);
+               if (ret)
+                       goto error;
+       }
+
        vcpu->arch.host_thread_info = kern_hyp_va(ti);
        vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
 error:
@@ -109,11 +121,17 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
        local_irq_save(flags);
 
        if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
-               fpsimd_save_and_flush_cpu_state();
+               if (guest_has_sve) {
+                       __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+
+                       /* Restore the VL that was saved when bound to the CPU */
+                       if (!has_vhe())
+                               sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
+                                                      SYS_ZCR_EL1);
+               }
 
-               if (guest_has_sve)
-                       __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_s(SYS_ZCR_EL12);
-       } else if (host_has_sve) {
+               fpsimd_save_and_flush_cpu_state();
+       } else if (has_vhe() && host_has_sve) {
                /*
                 * The FPSIMD/SVE state in the CPU has not been touched, and we
                 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
index 6cb39ee..5cb4a1c 100644 (file)
@@ -299,7 +299,7 @@ static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 
        memset(vqs, 0, sizeof(vqs));
 
-       max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+       max_vq = vcpu_sve_max_vq(vcpu);
        for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
                if (sve_vq_available(vq))
                        vqs[vq_word(vq)] |= vq_mask(vq);
@@ -427,7 +427,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
                if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                        return -ENOENT;
 
-               vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+               vq = vcpu_sve_max_vq(vcpu);
 
                reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
@@ -437,7 +437,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
                if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                        return -ENOENT;
 
-               vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+               vq = vcpu_sve_max_vq(vcpu);
 
                reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
index cebe39f..6f48336 100644 (file)
@@ -291,3 +291,48 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
        if (exception_index == ARM_EXCEPTION_EL1_SERROR)
                kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
 }
+
+void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
+                                             u64 par, uintptr_t vcpu,
+                                             u64 far, u64 hpfar) {
+       u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
+       u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
+       u64 mode = spsr & PSR_MODE_MASK;
+
+       /*
+        * The nVHE hyp symbols are not included by kallsyms to avoid issues
+        * with aliasing. That means that the symbols cannot be printed with the
+        * "%pS" format specifier, so fall back to the vmlinux address if
+        * there's no better option.
+        */
+       if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
+               kvm_err("Invalid host exception to nVHE hyp!\n");
+       } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
+                  (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+               struct bug_entry *bug = find_bug(elr_in_kimg);
+               const char *file = NULL;
+               unsigned int line = 0;
+
+               /* All hyp bugs, including warnings, are treated as fatal. */
+               if (bug)
+                       bug_get_file_line(bug, &file, &line);
+
+               if (file)
+                       kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
+               else
+                       kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
+       } else {
+               kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
+       }
+
+       /*
+        * Hyp has panicked and we're going to handle that by panicking the
+        * kernel. The kernel offset will be revealed in the panic so we're
+        * also safe to reveal the hyp offset as a debugging aid for translating
+        * hyp VAs to vmlinux addresses.
+        */
+       kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
+
+       panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
+             spsr, elr, esr, far, hpfar, par, vcpu);
+}
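
A usage note, illustrative rather than part of the patch: hyp_offset is computed above so that a faulting hyp VA plus the printed offset yields the link-time vmlinux address, with KASLR already removed. The raw PC from the final "HYP panic:" banner can therefore be resolved as:

        vmlinux_addr = hyp_pc + hyp_offset;     /* both values taken from the panic output */

The result can then be fed to a symboliser such as addr2line -e vmlinux to locate the faulting source line.
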
index 687598e..b726332 100644 (file)
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir)                               \
                    -DDISABLE_BRANCH_PROFILING          \
                    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o
index 01f114a..3c63592 100644 (file)
@@ -19,3 +19,13 @@ SYM_FUNC_START(__fpsimd_restore_state)
        fpsimd_restore  x0, 1
        ret
 SYM_FUNC_END(__fpsimd_restore_state)
+
+SYM_FUNC_START(__sve_restore_state)
+       __sve_load 0, x1, 2
+       ret
+SYM_FUNC_END(__sve_restore_state)
+
+SYM_FUNC_START(__sve_save_state)
+       sve_save 0, x1, 2
+       ret
+SYM_FUNC_END(__sve_save_state)
index 6c1f51f..e4a2f29 100644 (file)
@@ -30,8 +30,6 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
-extern const char __hyp_panic_string[];
-
 extern struct exception_table_entry __start___kvm_ex_table;
 extern struct exception_table_entry __stop___kvm_ex_table;
 
@@ -160,18 +158,10 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
        return true;
 }
 
-static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
 {
-       u8 ec;
-       u64 esr;
        u64 hpfar, far;
 
-       esr = vcpu->arch.fault.esr_el2;
-       ec = ESR_ELx_EC(esr);
-
-       if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
-               return true;
-
        far = read_sysreg_el2(SYS_FAR);
 
        /*
@@ -194,33 +184,59 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
                hpfar = read_sysreg(hpfar_el2);
        }
 
-       vcpu->arch.fault.far_el2 = far;
-       vcpu->arch.fault.hpfar_el2 = hpfar;
+       fault->far_el2 = far;
+       fault->hpfar_el2 = hpfar;
        return true;
 }
 
+static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+{
+       u8 ec;
+       u64 esr;
+
+       esr = vcpu->arch.fault.esr_el2;
+       ec = ESR_ELx_EC(esr);
+
+       if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
+               return true;
+
+       return __get_fault_info(esr, &vcpu->arch.fault);
+}
+
+static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu)
+{
+       struct thread_struct *thread;
+
+       thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct,
+                             uw.fpsimd_state);
+
+       __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr);
+}
+
+static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
+{
+       sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
+       __sve_restore_state(vcpu_sve_pffr(vcpu),
+                           &vcpu->arch.ctxt.fp_regs.fpsr);
+       write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+}
+
 /* Check for an FPSIMD/SVE trap and handle as appropriate */
 static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
 {
-       bool vhe, sve_guest, sve_host;
+       bool sve_guest, sve_host;
        u8 esr_ec;
+       u64 reg;
 
        if (!system_supports_fpsimd())
                return false;
 
-       /*
-        * Currently system_supports_sve() currently implies has_vhe(),
-        * so the check is redundant. However, has_vhe() can be determined
-        * statically and helps the compiler remove dead code.
-        */
-       if (has_vhe() && system_supports_sve()) {
+       if (system_supports_sve()) {
                sve_guest = vcpu_has_sve(vcpu);
                sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
-               vhe = true;
        } else {
                sve_guest = false;
                sve_host = false;
-               vhe = has_vhe();
        }
 
        esr_ec = kvm_vcpu_trap_get_class(vcpu);
@@ -229,53 +245,38 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
                return false;
 
        /* Don't handle SVE traps for non-SVE vcpus here: */
-       if (!sve_guest)
-               if (esr_ec != ESR_ELx_EC_FP_ASIMD)
-                       return false;
+       if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
+               return false;
 
        /* Valid trap.  Switch the context: */
-
-       if (vhe) {
-               u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN;
-
+       if (has_vhe()) {
+               reg = CPACR_EL1_FPEN;
                if (sve_guest)
                        reg |= CPACR_EL1_ZEN;
 
-               write_sysreg(reg, cpacr_el1);
+               sysreg_clear_set(cpacr_el1, 0, reg);
        } else {
-               write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP,
-                            cptr_el2);
-       }
+               reg = CPTR_EL2_TFP;
+               if (sve_guest)
+                       reg |= CPTR_EL2_TZ;
 
+               sysreg_clear_set(cptr_el2, reg, 0);
+       }
        isb();
 
        if (vcpu->arch.flags & KVM_ARM64_FP_HOST) {
-               /*
-                * In the SVE case, VHE is assumed: it is enforced by
-                * Kconfig and kvm_arch_init().
-                */
-               if (sve_host) {
-                       struct thread_struct *thread = container_of(
-                               vcpu->arch.host_fpsimd_state,
-                               struct thread_struct, uw.fpsimd_state);
-
-                       sve_save_state(sve_pffr(thread),
-                                      &vcpu->arch.host_fpsimd_state->fpsr);
-               } else {
+               if (sve_host)
+                       __hyp_sve_save_host(vcpu);
+               else
                        __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
-               }
 
                vcpu->arch.flags &= ~KVM_ARM64_FP_HOST;
        }
 
-       if (sve_guest) {
-               sve_load_state(vcpu_sve_pffr(vcpu),
-                              &vcpu->arch.ctxt.fp_regs.fpsr,
-                              sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1);
-               write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12);
-       } else {
+       if (sve_guest)
+               __hyp_sve_restore_guest(vcpu);
+       else
                __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
-       }
 
        /* Skip restoring fpexc32 for AArch64 guests */
        if (!(read_sysreg(hcr_el2) & HCR_RW))
diff --git a/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h
new file mode 100644 (file)
index 0000000..dc61aaa
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_EARLY_ALLOC_H
+#define __KVM_HYP_EARLY_ALLOC_H
+
+#include <asm/kvm_pgtable.h>
+
+void hyp_early_alloc_init(void *virt, unsigned long size);
+unsigned long hyp_early_alloc_nr_used_pages(void);
+void *hyp_early_alloc_page(void *arg);
+void *hyp_early_alloc_contig(unsigned int nr_pages);
+
+extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+
+#endif /* __KVM_HYP_EARLY_ALLOC_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
new file mode 100644 (file)
index 0000000..18a4494
--- /dev/null
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_GFP_H
+#define __KVM_HYP_GFP_H
+
+#include <linux/list.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_NO_ORDER   UINT_MAX
+
+struct hyp_pool {
+       /*
+        * Spinlock protecting concurrent changes to the memory pool as well as
+        * the struct hyp_page of the pool's pages until we have a proper atomic
+        * API at EL2.
+        */
+       hyp_spinlock_t lock;
+       struct list_head free_area[MAX_ORDER];
+       phys_addr_t range_start;
+       phys_addr_t range_end;
+       unsigned int max_order;
+};
+
+static inline void hyp_page_ref_inc(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&pool->lock);
+       p->refcount++;
+       hyp_spin_unlock(&pool->lock);
+}
+
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+       int ret;
+
+       hyp_spin_lock(&pool->lock);
+       p->refcount--;
+       ret = (p->refcount == 0);
+       hyp_spin_unlock(&pool->lock);
+
+       return ret;
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&pool->lock);
+       if (p->refcount) {
+               hyp_spin_unlock(&pool->lock);
+               BUG();
+       }
+       p->refcount = 1;
+       hyp_spin_unlock(&pool->lock);
+}
+
+/* Allocation */
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
+void hyp_get_page(void *addr);
+void hyp_put_page(void *addr);
+
+/* Used pages cannot be freed */
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+                 unsigned int reserved_pages);
+#endif /* __KVM_HYP_GFP_H */
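
As a usage sketch only (the identifiers below are invented for illustration), EL2 setup code would seed a pool with a physical range, mark the already-consumed head of that range as reserved, and then allocate from the free lists using the declarations above:

static struct hyp_pool example_pool;

static void *example_pool_bringup(phys_addr_t base, unsigned int nr_pages,
                                  unsigned int used_pages)
{
        void *page;

        /* Pages [0, used_pages) are already in use and stay reserved */
        if (hyp_pool_init(&example_pool, hyp_phys_to_pfn(base),
                          nr_pages, used_pages))
                return NULL;

        page = hyp_alloc_pages(&example_pool, 0);       /* one order-0 page */

        /*
         * ... use it; hyp_put_page(page) drops the reference and, once it
         * reaches zero, returns the page to the pool.
         */
        return page;
}
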
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
new file mode 100644 (file)
index 0000000..42d81ec
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#ifndef __KVM_NVHE_MEM_PROTECT__
+#define __KVM_NVHE_MEM_PROTECT__
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/virt.h>
+#include <nvhe/spinlock.h>
+
+struct host_kvm {
+       struct kvm_arch arch;
+       struct kvm_pgtable pgt;
+       struct kvm_pgtable_mm_ops mm_ops;
+       hyp_spinlock_t lock;
+};
+extern struct host_kvm host_kvm;
+
+int __pkvm_prot_finalize(void);
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
+
+static __always_inline void __load_host_stage2(void)
+{
+       if (static_branch_likely(&kvm_protected_mode_initialized))
+               __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+       else
+               write_sysreg(0, vttbr_el2);
+}
+#endif /* __KVM_NVHE_MEM_PROTECT__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
new file mode 100644 (file)
index 0000000..fd78bde
--- /dev/null
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MEMORY_H
+#define __KVM_HYP_MEMORY_H
+
+#include <asm/kvm_mmu.h>
+#include <asm/page.h>
+
+#include <linux/types.h>
+
+struct hyp_pool;
+struct hyp_page {
+       unsigned int refcount;
+       unsigned int order;
+       struct hyp_pool *pool;
+       struct list_head node;
+};
+
+extern u64 __hyp_vmemmap;
+#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap)
+
+#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset))
+
+static inline void *hyp_phys_to_virt(phys_addr_t phys)
+{
+       return __hyp_va(phys);
+}
+
+static inline phys_addr_t hyp_virt_to_phys(void *addr)
+{
+       return __hyp_pa(addr);
+}
+
+#define hyp_phys_to_pfn(phys)  ((phys) >> PAGE_SHIFT)
+#define hyp_pfn_to_phys(pfn)   ((phys_addr_t)((pfn) << PAGE_SHIFT))
+#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)])
+#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt))
+#define hyp_virt_to_pfn(virt)  hyp_phys_to_pfn(__hyp_pa(virt))
+
+#define hyp_page_to_pfn(page)  ((struct hyp_page *)(page) - hyp_vmemmap)
+#define hyp_page_to_phys(page)  hyp_pfn_to_phys((hyp_page_to_pfn(page)))
+#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
+#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
+
+static inline int hyp_page_count(void *addr)
+{
+       struct hyp_page *p = hyp_virt_to_page(addr);
+
+       return p->refcount;
+}
+
+#endif /* __KVM_HYP_MEMORY_H */
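
As a sketch of the conversions above: hyp_vmemmap is a flat array with one struct hyp_page per PFN, so physical address, PFN and metadata pointer convert into each other with shifts and pointer arithmetic. The standalone example below assumes 4 KiB pages; struct page_meta, vmemmap and the helper names are illustrative, not the kernel's.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

struct page_meta {                      /* stand-in for struct hyp_page */
        unsigned int refcount;
        unsigned int order;
};

static struct page_meta *vmemmap;       /* one entry per PFN, like hyp_vmemmap */

static uint64_t phys_to_pfn(uint64_t phys)
{
        return phys >> PAGE_SHIFT;
}

static struct page_meta *phys_to_page(uint64_t phys)
{
        return &vmemmap[phys_to_pfn(phys)];
}

static uint64_t page_to_phys(struct page_meta *p)
{
        return (uint64_t)(p - vmemmap) << PAGE_SHIFT;
}

int main(void)
{
        /* Pretend we manage 16 pages of "physical" memory starting at 0. */
        vmemmap = calloc(16, sizeof(*vmemmap));

        struct page_meta *p = phys_to_page(3 * PAGE_SIZE);

        p->refcount = 1;
        assert(page_to_phys(p) == 3 * PAGE_SIZE);
        printf("pfn=%llu refcount=%u\n",
               (unsigned long long)phys_to_pfn(page_to_phys(p)), p->refcount);

        free(vmemmap);
        return 0;
}
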
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
new file mode 100644 (file)
index 0000000..0095f62
--- /dev/null
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MM_H
+#define __KVM_HYP_MM_H
+
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+#include <linux/memblock.h>
+#include <linux/types.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_MEMBLOCK_REGIONS 128
+extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
+extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
+extern struct kvm_pgtable pkvm_pgtable;
+extern hyp_spinlock_t pkvm_pgd_lock;
+extern struct hyp_pool hpool;
+extern u64 __io_map_base;
+
+int hyp_create_idmap(u32 hyp_va_bits);
+int hyp_map_vectors(void);
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                          unsigned long phys, enum kvm_pgtable_prot prot);
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                           enum kvm_pgtable_prot prot);
+
+static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
+                                    unsigned long *start, unsigned long *end)
+{
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct hyp_page *p = hyp_phys_to_page(phys);
+
+       *start = (unsigned long)p;
+       *end = *start + nr_pages * sizeof(struct hyp_page);
+       *start = ALIGN_DOWN(*start, PAGE_SIZE);
+       *end = ALIGN(*end, PAGE_SIZE);
+}
+
+static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
+{
+       unsigned long total = 0, i;
+
+       /* Provision the worst case scenario */
+       for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) {
+               nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
+               total += nr_pages;
+       }
+
+       return total;
+}
+
+static inline unsigned long __hyp_pgtable_total_pages(void)
+{
+       unsigned long res = 0, i;
+
+       /* Cover all of memory with page-granularity */
+       for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
+               struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i];
+               res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT);
+       }
+
+       return res;
+}
+
+static inline unsigned long hyp_s1_pgtable_pages(void)
+{
+       unsigned long res;
+
+       res = __hyp_pgtable_total_pages();
+
+       /* Allow 1 GiB for private mappings */
+       res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+
+       return res;
+}
+
+static inline unsigned long host_s2_mem_pgtable_pages(void)
+{
+       /*
+        * Include an extra 16 pages to safely upper-bound the worst case of
+        * concatenated pgds.
+        */
+       return __hyp_pgtable_total_pages() + 16;
+}
+
+static inline unsigned long host_s2_dev_pgtable_pages(void)
+{
+       /* Allow 1 GiB for MMIO mappings */
+       return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+}
+
+#endif /* __KVM_HYP_MM_H */
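
The worst-case provisioning in __hyp_pgtable_max_pages() is a sum of one DIV_ROUND_UP per page-table level: each level needs one table page per 512 entries of the level below it. A standalone sketch of the same arithmetic, assuming 4 KiB pages, 512 entries per table and 4 levels (pgtable_max_pages is an illustrative name):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define PTRS_PER_PTE            512UL   /* 4 KiB pages, 8-byte descriptors */
#define MAX_LEVELS              4

/* Worst-case number of page-table pages needed to map nr_pages of memory. */
static unsigned long pgtable_max_pages(unsigned long nr_pages)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < MAX_LEVELS; i++) {
                nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
                total += nr_pages;
        }

        return total;
}

int main(void)
{
        /* 1 GiB of 4 KiB pages = 262144 pages: 512 PTE tables + 1 PMD + 1 PUD + 1 PGD = 515. */
        printf("table pages for 1 GiB: %lu\n", pgtable_max_pages(262144UL));
        return 0;
}
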
diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
new file mode 100644 (file)
index 0000000..76b537f
--- /dev/null
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * A stand-alone ticket spinlock implementation for use by the non-VHE
+ * KVM hypervisor code running at EL2.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ *
+ * Heavily based on the implementation removed by c11090474d70 which was:
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__
+#define __ARM64_KVM_NVHE_SPINLOCK_H__
+
+#include <asm/alternative.h>
+#include <asm/lse.h>
+
+typedef union hyp_spinlock {
+       u32     __val;
+       struct {
+#ifdef __AARCH64EB__
+               u16 next, owner;
+#else
+               u16 owner, next;
+#endif
+       };
+} hyp_spinlock_t;
+
+#define hyp_spin_lock_init(l)                                          \
+do {                                                                   \
+       *(l) = (hyp_spinlock_t){ .__val = 0 };                          \
+} while (0)
+
+static inline void hyp_spin_lock(hyp_spinlock_t *lock)
+{
+       u32 tmp;
+       hyp_spinlock_t lockval, newval;
+
+       asm volatile(
+       /* Atomically increment the next ticket. */
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+"      prfm    pstl1strm, %3\n"
+"1:    ldaxr   %w0, %3\n"
+"      add     %w1, %w0, #(1 << 16)\n"
+"      stxr    %w2, %w1, %3\n"
+"      cbnz    %w2, 1b\n",
+       /* LSE atomics */
+"      mov     %w2, #(1 << 16)\n"
+"      ldadda  %w2, %w0, %3\n"
+       __nops(3))
+
+       /* Did we get the lock? */
+"      eor     %w1, %w0, %w0, ror #16\n"
+"      cbz     %w1, 3f\n"
+       /*
+        * No: spin on the owner. Send a local event to avoid missing an
+        * unlock before the exclusive load.
+        */
+"      sevl\n"
+"2:    wfe\n"
+"      ldaxrh  %w2, %4\n"
+"      eor     %w1, %w2, %w0, lsr #16\n"
+"      cbnz    %w1, 2b\n"
+       /* We got the lock. Critical section starts here. */
+"3:"
+       : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock)
+       : "Q" (lock->owner)
+       : "memory");
+}
+
+static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
+{
+       u64 tmp;
+
+       asm volatile(
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+       "       ldrh    %w1, %0\n"
+       "       add     %w1, %w1, #1\n"
+       "       stlrh   %w1, %0",
+       /* LSE atomics */
+       "       mov     %w1, #1\n"
+       "       staddlh %w1, %0\n"
+       __nops(1))
+       : "=Q" (lock->owner), "=&r" (tmp)
+       :
+       : "memory");
+}
+
+#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
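
A rough user-space model of the ticket lock above, for readers who prefer C to the inline asm: the kernel version packs next/owner into a single 32-bit word and grabs the ticket plus a snapshot of owner in one atomic (and waits with sevl/wfe), whereas this sketch keeps the two halves as separate C11 atomics and simply busy-waits. All names below are illustrative.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* The two halves of the lock word: next (tickets handed out) and owner
 * (the ticket currently allowed to run). */
typedef struct {
        _Atomic uint16_t next;
        _Atomic uint16_t owner;
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
        /* Take a ticket; this mirrors the "add #(1 << 16)" / ldadda step. */
        uint16_t ticket = atomic_fetch_add_explicit(&l->next, 1,
                                                    memory_order_relaxed);

        /* Spin until the owner half reaches our ticket (the wfe loop). */
        while (atomic_load_explicit(&l->owner, memory_order_acquire) != ticket)
                ;
}

static void ticket_unlock(ticket_lock_t *l)
{
        /* Hand the lock to the next ticket, like the stlrh/staddlh epilogue. */
        atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
}

int main(void)
{
        ticket_lock_t l = { 0, 0 };

        ticket_lock(&l);
        puts("in the critical section");
        ticket_unlock(&l);
        return 0;
}
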
index a6707df..f55201a 100644 (file)
@@ -9,10 +9,15 @@ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
 hostprogs := gen-hyprel
 HOST_EXTRACFLAGS += -I$(objtree)/include
 
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+
 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
-        hyp-main.o hyp-smp.o psci-relay.o
+        hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
+        cache.o setup.o mm.o mem_protect.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
-        ../fpsimd.o ../hyp-entry.o ../exception.o
+        ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+obj-y += $(lib-objs)
 
 ##
 ## Build rules for compiling nVHE hyp code
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
new file mode 100644 (file)
index 0000000..36cef69
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Code copied from arch/arm64/mm/cache.S.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/alternative.h>
+
+SYM_FUNC_START_PI(__flush_dcache_area)
+       dcache_by_line_op civac, sy, x0, x1, x2, x3
+       ret
+SYM_FUNC_END_PI(__flush_dcache_area)
index f401724..7d3f258 100644 (file)
@@ -21,17 +21,11 @@ static void __debug_save_spe(u64 *pmscr_el1)
        /* Clear pmscr in case of early return */
        *pmscr_el1 = 0;
 
-       /* SPE present on this CPU? */
-       if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
-                                                 ID_AA64DFR0_PMSVER_SHIFT))
-               return;
-
-       /* Yes; is it owned by EL3? */
-       reg = read_sysreg_s(SYS_PMBIDR_EL1);
-       if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
-               return;
-
-       /* No; is the host actually using the thing? */
+       /*
+        * At this point, we know that SPE is implemented on this
+        * CPU and is available to the host.
+        * Check whether the host is actually using it.
+        */
        reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
        if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))
                return;
@@ -58,10 +52,43 @@ static void __debug_restore_spe(u64 pmscr_el1)
        write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
 }
 
+static void __debug_save_trace(u64 *trfcr_el1)
+{
+       *trfcr_el1 = 0;
+
+       /* Check if the TRBE is enabled */
+       if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE))
+               return;
+       /*
+        * Prohibit trace generation while we are in the guest.
+        * Since access to TRFCR_EL1 is trapped, the guest can't
+        * modify the filtering set by the host.
+        */
+       *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1);
+       write_sysreg_s(0, SYS_TRFCR_EL1);
+       isb();
+       /* Drain the trace buffer to memory */
+       tsb_csync();
+       dsb(nsh);
+}
+
+static void __debug_restore_trace(u64 trfcr_el1)
+{
+       if (!trfcr_el1)
+               return;
+
+       /* Restore trace filter controls */
+       write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1);
+}
+
 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
        /* Disable and flush SPE data generation */
-       __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+               __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+       /* Disable and flush Self-Hosted Trace generation */
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+               __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
 }
 
 void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -71,7 +98,10 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
 
 void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
-       __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+               __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+               __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
 }
 
 void __debug_switch_to_host(struct kvm_vcpu *vcpu)
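
The hunk above gates the SPE and TRBE paths on per-vcpu flags and stashes the host's TRFCR_EL1 value so trace generation can be shut off across the guest run and restored afterwards. A toy standalone sketch of that flag-gated save/zero/restore pattern, with a plain variable standing in for the trace filter register; every identifier below is purely illustrative.

#include <stdint.h>
#include <stdio.h>

#define DEBUG_SAVE_TRACE        (1u << 0)       /* illustrative flag bit */

struct vcpu {
        unsigned int flags;
        uint64_t saved_trfcr;                   /* host_debug_state-style slot */
};

static uint64_t fake_trfcr = 0x3;               /* stands in for TRFCR_EL1 */

static void save_trace(uint64_t *slot)
{
        *slot = 0;
        if (!fake_trfcr)                        /* tracing disabled: nothing to save */
                return;
        *slot = fake_trfcr;                     /* stash the host's filter controls */
        fake_trfcr = 0;                         /* prohibit tracing while in the guest */
}

static void restore_trace(uint64_t slot)
{
        if (!slot)                              /* nothing was saved on entry */
                return;
        fake_trfcr = slot;
}

static void switch_to_guest(struct vcpu *v)
{
        if (v->flags & DEBUG_SAVE_TRACE)
                save_trace(&v->saved_trfcr);
}

static void switch_to_host(struct vcpu *v)
{
        if (v->flags & DEBUG_SAVE_TRACE)
                restore_trace(v->saved_trfcr);
}

int main(void)
{
        struct vcpu v = { .flags = DEBUG_SAVE_TRACE };

        switch_to_guest(&v);
        printf("in guest: trfcr=%#llx\n", (unsigned long long)fake_trfcr);  /* 0 */
        switch_to_host(&v);
        printf("in host:  trfcr=%#llx\n", (unsigned long long)fake_trfcr);  /* 0x3 */
        return 0;
}
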
diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
new file mode 100644 (file)
index 0000000..1306c43
--- /dev/null
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/memory.h>
+
+struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+s64 __ro_after_init hyp_physvirt_offset;
+
+static unsigned long base;
+static unsigned long end;
+static unsigned long cur;
+
+unsigned long hyp_early_alloc_nr_used_pages(void)
+{
+       return (cur - base) >> PAGE_SHIFT;
+}
+
+void *hyp_early_alloc_contig(unsigned int nr_pages)
+{
+       unsigned long size = (nr_pages << PAGE_SHIFT);
+       void *ret = (void *)cur;
+
+       if (!nr_pages)
+               return NULL;
+
+       if (end - cur < size)
+               return NULL;
+
+       cur += size;
+       memset(ret, 0, size);
+
+       return ret;
+}
+
+void *hyp_early_alloc_page(void *arg)
+{
+       return hyp_early_alloc_contig(1);
+}
+
+void hyp_early_alloc_init(void *virt, unsigned long size)
+{
+       base = cur = (unsigned long)virt;
+       end = base + size;
+
+       hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page;
+       hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt;
+       hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys;
+}
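
The early allocator above is a plain bump allocator: it hands out zeroed pages from a fixed region, never frees, and reports how many pages were consumed so they can later be treated as reserved by the buddy allocator. A standalone sketch under the assumption of 4 KiB pages (all identifiers are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* Bump allocator over a fixed buffer: zeroed pages go out, nothing ever comes back. */
static uintptr_t base, cur, end;

static void early_init(void *buf, unsigned long size)
{
        base = cur = (uintptr_t)buf;
        end = base + size;
}

static void *early_alloc_contig(unsigned int nr_pages)
{
        unsigned long size = (unsigned long)nr_pages << PAGE_SHIFT;
        void *ret = (void *)cur;

        if (!nr_pages || end - cur < size)
                return NULL;

        cur += size;
        memset(ret, 0, size);
        return ret;
}

static unsigned long early_nr_used_pages(void)
{
        return (cur - base) >> PAGE_SHIFT;
}

int main(void)
{
        static unsigned char pool[8 * PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
        void *a, *b;

        early_init(pool, sizeof(pool));
        a = early_alloc_contig(2);
        b = early_alloc_contig(1);
        printf("a=%p b=%p used=%lu pages\n", a, b, early_nr_used_pages());
        printf("oversized request: %p\n", early_alloc_contig(16));      /* NULL */
        return 0;
}
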
index ead02c6..6bc88a7 100644 (file)
 #ifndef R_AARCH64_ABS64
 #define R_AARCH64_ABS64                        257
 #endif
+#ifndef R_AARCH64_PREL64
+#define R_AARCH64_PREL64               260
+#endif
+#ifndef R_AARCH64_PREL32
+#define R_AARCH64_PREL32               261
+#endif
+#ifndef R_AARCH64_PREL16
+#define R_AARCH64_PREL16               262
+#endif
+#ifndef R_AARCH64_PLT32
+#define R_AARCH64_PLT32                        314
+#endif
 #ifndef R_AARCH64_LD_PREL_LO19
 #define R_AARCH64_LD_PREL_LO19         273
 #endif
@@ -371,6 +383,12 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
                case R_AARCH64_ABS64:
                        emit_rela_abs64(rela, sh_orig_name);
                        break;
+               /* Allow position-relative data relocations. */
+               case R_AARCH64_PREL64:
+               case R_AARCH64_PREL32:
+               case R_AARCH64_PREL16:
+               case R_AARCH64_PLT32:
+                       break;
                /* Allow relocations to generate PC-relative addressing. */
                case R_AARCH64_LD_PREL_LO19:
                case R_AARCH64_ADR_PREL_LO21:
index 5d94584..2b23400 100644 (file)
@@ -79,22 +79,18 @@ SYM_FUNC_START(__hyp_do_panic)
        mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
                      PSR_MODE_EL1h)
        msr     spsr_el2, lr
-       ldr     lr, =panic
+       ldr     lr, =nvhe_hyp_panic_handler
        hyp_kimg_va lr, x6
        msr     elr_el2, lr
 
        mov     x29, x0
 
-       /* Load the format string into x0 and arguments into x1-7 */
-       ldr     x0, =__hyp_panic_string
-       hyp_kimg_va x0, x6
-
-       /* Load the format arguments into x1-7. */
-       mov     x6, x3
-       get_vcpu_ptr x7, x3
-       mrs     x3, esr_el2
-       mrs     x4, far_el2
-       mrs     x5, hpfar_el2
+       /* Load the panic arguments into x0-7 */
+       mrs     x0, esr_el2
+       get_vcpu_ptr x4, x5
+       mrs     x5, far_el2
+       mrs     x6, hpfar_el2
+       mov     x7, xzr                 // Unused argument
 
        /* Enter the host, conditionally restoring the host context. */
        cbz     x29, __host_enter_without_restoring
index c631e29..c953fb4 100644 (file)
@@ -83,11 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
  * x0: struct kvm_nvhe_init_params PA
  */
 SYM_CODE_START_LOCAL(___kvm_hyp_init)
-alternative_if ARM64_KVM_PROTECTED_MODE
-       mov_q   x1, HCR_HOST_NVHE_PROTECTED_FLAGS
-       msr     hcr_el2, x1
-alternative_else_nop_endif
-
        ldr     x1, [x0, #NVHE_INIT_TPIDR_EL2]
        msr     tpidr_el2, x1
 
@@ -97,6 +92,15 @@ alternative_else_nop_endif
        ldr     x1, [x0, #NVHE_INIT_MAIR_EL2]
        msr     mair_el2, x1
 
+       ldr     x1, [x0, #NVHE_INIT_HCR_EL2]
+       msr     hcr_el2, x1
+
+       ldr     x1, [x0, #NVHE_INIT_VTTBR]
+       msr     vttbr_el2, x1
+
+       ldr     x1, [x0, #NVHE_INIT_VTCR]
+       msr     vtcr_el2, x1
+
        ldr     x1, [x0, #NVHE_INIT_PGD_PA]
        phys_to_ttbr x2, x1
 alternative_if ARM64_HAS_CNP
@@ -115,15 +119,10 @@ alternative_else_nop_endif
 
        /* Invalidate the stale TLBs from Bootloader */
        tlbi    alle2
+       tlbi    vmalls12e1
        dsb     sy
 
-       /*
-        * Preserve all the RES1 bits while setting the default flags,
-        * as well as the EE bit on BE. Drop the A flag since the compiler
-        * is allowed to generate unaligned accesses.
-        */
-       mov_q   x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE(        orr     x0, x0, #SCTLR_ELx_EE)
+       mov_q   x0, INIT_SCTLR_EL2_MMU_ON
 alternative_if ARM64_HAS_ADDRESS_AUTH
        mov_q   x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
                     SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
@@ -221,9 +220,7 @@ SYM_CODE_START(__kvm_handle_stub_hvc)
        mov     x0, xzr
 reset:
        /* Reset kvm back to the hyp stub. */
-       mrs     x5, sctlr_el2
-       mov_q   x6, SCTLR_ELx_FLAGS
-       bic     x5, x5, x6              // Clear SCTL_M and etc
+       mov_q   x5, INIT_SCTLR_EL2_MMU_OFF
        pre_disable_mmu_workaround
        msr     sctlr_el2, x5
        isb
@@ -244,4 +241,31 @@ alternative_else_nop_endif
 
 SYM_CODE_END(__kvm_handle_stub_hvc)
 
+SYM_FUNC_START(__pkvm_init_switch_pgd)
+       /* Turn the MMU off */
+       pre_disable_mmu_workaround
+       mrs     x2, sctlr_el2
+       bic     x3, x2, #SCTLR_ELx_M
+       msr     sctlr_el2, x3
+       isb
+
+       tlbi    alle2
+
+       /* Install the new pgtables */
+       ldr     x3, [x0, #NVHE_INIT_PGD_PA]
+       phys_to_ttbr x4, x3
+alternative_if ARM64_HAS_CNP
+       orr     x4, x4, #TTBR_CNP_BIT
+alternative_else_nop_endif
+       msr     ttbr0_el2, x4
+
+       /* Set the new stack pointer */
+       ldr     x0, [x0, #NVHE_INIT_STACK_HYP_VA]
+       mov     sp, x0
+
+       /* And turn the MMU back on! */
+       set_sctlr_el2   x2
+       ret     x1
+SYM_FUNC_END(__pkvm_init_switch_pgd)
+
        .popsection
index 9363282..f36420a 100644 (file)
@@ -6,12 +6,15 @@
 
 #include <hyp/switch.h>
 
+#include <asm/pgtable-types.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
 #include <nvhe/trap_handler.h>
 
 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -106,6 +109,61 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
        __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
 }
 
+static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+       DECLARE_REG(unsigned long, size, host_ctxt, 2);
+       DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
+       DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
+       DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+
+       /*
+        * __pkvm_init() will return only if an error occurred; otherwise it
+        * will tail-call into __pkvm_init_finalise(), which will have to deal
+        * with the host context directly.
+        */
+       cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
+                                           hyp_va_bits);
+}
+
+static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1);
+
+       cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
+}
+
+static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(unsigned long, start, host_ctxt, 1);
+       DECLARE_REG(unsigned long, size, host_ctxt, 2);
+       DECLARE_REG(unsigned long, phys, host_ctxt, 3);
+       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+}
+
+static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+       DECLARE_REG(size_t, size, host_ctxt, 2);
+       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
+}
+
+static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
+{
+       cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
+}
+
+static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
+       DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
+}
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -125,6 +183,12 @@ static const hcall_t host_hcall[] = {
        HANDLE_FUNC(__kvm_get_mdcr_el2),
        HANDLE_FUNC(__vgic_v3_save_aprs),
        HANDLE_FUNC(__vgic_v3_restore_aprs),
+       HANDLE_FUNC(__pkvm_init),
+       HANDLE_FUNC(__pkvm_cpu_set_vector),
+       HANDLE_FUNC(__pkvm_create_mappings),
+       HANDLE_FUNC(__pkvm_create_private_mapping),
+       HANDLE_FUNC(__pkvm_prot_finalize),
+       HANDLE_FUNC(__pkvm_mark_hyp),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -177,7 +241,16 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
        case ESR_ELx_EC_SMC64:
                handle_host_smc(host_ctxt);
                break;
+       case ESR_ELx_EC_SVE:
+               sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+               isb();
+               sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
+               break;
+       case ESR_ELx_EC_IABT_LOW:
+       case ESR_ELx_EC_DABT_LOW:
+               handle_host_mem_abort(host_ctxt);
+               break;
        default:
-               hyp_panic();
+               BUG();
        }
 }
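
The host hypercall path above is an ID-indexed table of handlers, each of which reads its arguments from, and writes its result back into, the saved host context registers. A standalone sketch of that dispatch pattern; struct ctx, DECLARE_ARG and the handler names below are illustrative, not the kernel's API.

#include <stdio.h>

/* Stand-in for kvm_cpu_context: the registers the host passed in. */
struct ctx {
        unsigned long regs[8];
};

#define DECLARE_ARG(type, name, ctx, n) type name = (type)(ctx)->regs[n]

static void handle_add(struct ctx *c)
{
        DECLARE_ARG(unsigned long, a, c, 1);
        DECLARE_ARG(unsigned long, b, c, 2);

        c->regs[1] = a + b;             /* the return value goes back in reg 1 */
}

static void handle_nop(struct ctx *c)
{
        c->regs[1] = 0;
}

typedef void (*hcall_t)(struct ctx *);

enum { HCALL_NOP, HCALL_ADD, NR_HCALLS };

static const hcall_t hcalls[] = {
        [HCALL_NOP] = handle_nop,
        [HCALL_ADD] = handle_add,
};

static void dispatch(struct ctx *c)
{
        unsigned long id = c->regs[0];

        if (id >= NR_HCALLS || !hcalls[id]) {
                c->regs[0] = -1UL;      /* reject unknown call IDs */
                return;
        }

        hcalls[id](c);
        c->regs[0] = 0;                 /* SMCCC-style "success" in reg 0 */
}

int main(void)
{
        struct ctx c = { .regs = { HCALL_ADD, 2, 3 } };

        dispatch(&c);
        printf("status=%lu result=%lu\n", c.regs[0], c.regs[1]);
        return 0;
}
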
index 8795590..9f54833 100644 (file)
@@ -18,8 +18,7 @@ u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID
 
 u64 cpu_logical_map(unsigned int cpu)
 {
-       if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map))
-               hyp_panic();
+       BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map));
 
        return hyp_cpu_logical_map[cpu];
 }
@@ -30,8 +29,7 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
        unsigned long this_cpu_base;
        unsigned long elf_base;
 
-       if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
-               hyp_panic();
+       BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base));
 
        cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
        this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
index cd119d8..f4562f4 100644 (file)
@@ -25,4 +25,5 @@ SECTIONS {
        BEGIN_HYP_SECTION(.data..percpu)
                PERCPU_INPUT(L1_CACHE_BYTES)
        END_HYP_SECTION
+       HYP_SECTION(.bss)
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
new file mode 100644 (file)
index 0000000..e342f7f
--- /dev/null
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/stage2_pgtable.h>
+
+#include <hyp/switch.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+
+#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
+
+extern unsigned long hyp_nr_cpus;
+struct host_kvm host_kvm;
+
+struct hyp_pool host_s2_mem;
+struct hyp_pool host_s2_dev;
+
+/*
+ * Copies of the host's CPU feature registers holding sanitized values.
+ */
+u64 id_aa64mmfr0_el1_sys_val;
+u64 id_aa64mmfr1_el1_sys_val;
+
+static const u8 pkvm_hyp_id = 1;
+
+static void *host_s2_zalloc_pages_exact(size_t size)
+{
+       return hyp_alloc_pages(&host_s2_mem, get_order(size));
+}
+
+static void *host_s2_zalloc_page(void *pool)
+{
+       return hyp_alloc_pages(pool, 0);
+}
+
+static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+       unsigned long nr_pages, pfn;
+       int ret;
+
+       pfn = hyp_virt_to_pfn(mem_pgt_pool);
+       nr_pages = host_s2_mem_pgtable_pages();
+       ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
+       if (ret)
+               return ret;
+
+       pfn = hyp_virt_to_pfn(dev_pgt_pool);
+       nr_pages = host_s2_dev_pgtable_pages();
+       ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+       if (ret)
+               return ret;
+
+       host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
+               .zalloc_pages_exact = host_s2_zalloc_pages_exact,
+               .zalloc_page = host_s2_zalloc_page,
+               .phys_to_virt = hyp_phys_to_virt,
+               .virt_to_phys = hyp_virt_to_phys,
+               .page_count = hyp_page_count,
+               .get_page = hyp_get_page,
+               .put_page = hyp_put_page,
+       };
+
+       return 0;
+}
+
+static void prepare_host_vtcr(void)
+{
+       u32 parange, phys_shift;
+
+       /* The host stage 2 is id-mapped, so use parange for T0SZ */
+       parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
+       phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+       host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+                                         id_aa64mmfr1_el1_sys_val, phys_shift);
+}
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+       struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+       int ret;
+
+       prepare_host_vtcr();
+       hyp_spin_lock_init(&host_kvm.lock);
+
+       ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+       if (ret)
+               return ret;
+
+       ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
+                                           &host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
+       if (ret)
+               return ret;
+
+       mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
+       mmu->arch = &host_kvm.arch;
+       mmu->pgt = &host_kvm.pgt;
+       mmu->vmid.vmid_gen = 0;
+       mmu->vmid.vmid = 0;
+
+       return 0;
+}
+
+int __pkvm_prot_finalize(void)
+{
+       struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+       struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+       params->vttbr = kvm_get_vttbr(mmu);
+       params->vtcr = host_kvm.arch.vtcr;
+       params->hcr_el2 |= HCR_VM;
+       kvm_flush_dcache_to_poc(params, sizeof(*params));
+
+       write_sysreg(params->hcr_el2, hcr_el2);
+       __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+
+       /*
+        * Make sure to have an ISB before the TLB maintenance below but only
+        * when __load_stage2() doesn't include one already.
+        */
+       asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+       /* Invalidate stale HCR bits that may be cached in TLBs */
+       __tlbi(vmalls12e1);
+       dsb(nsh);
+       isb();
+
+       return 0;
+}
+
+static int host_stage2_unmap_dev_all(void)
+{
+       struct kvm_pgtable *pgt = &host_kvm.pgt;
+       struct memblock_region *reg;
+       u64 addr = 0;
+       int i, ret;
+
+       /* Unmap all non-memory regions to recycle the pages */
+       for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
+               reg = &hyp_memory[i];
+               ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
+               if (ret)
+                       return ret;
+       }
+       return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
+}
+
+static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+       int cur, left = 0, right = hyp_memblock_nr;
+       struct memblock_region *reg;
+       phys_addr_t end;
+
+       range->start = 0;
+       range->end = ULONG_MAX;
+
+       /* The list of memblock regions is sorted; binary search it */
+       while (left < right) {
+               cur = (left + right) >> 1;
+               reg = &hyp_memory[cur];
+               end = reg->base + reg->size;
+               if (addr < reg->base) {
+                       right = cur;
+                       range->end = reg->base;
+               } else if (addr >= end) {
+                       left = cur + 1;
+                       range->start = end;
+               } else {
+                       range->start = reg->base;
+                       range->end = end;
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+static bool range_is_memory(u64 start, u64 end)
+{
+       struct kvm_mem_range r1, r2;
+
+       if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2))
+               return false;
+       if (r1.start != r2.start)
+               return false;
+
+       return true;
+}
+
+static inline int __host_stage2_idmap(u64 start, u64 end,
+                                     enum kvm_pgtable_prot prot,
+                                     struct hyp_pool *pool)
+{
+       return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
+                                     prot, pool);
+}
+
+static int host_stage2_idmap(u64 addr)
+{
+       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
+       struct kvm_mem_range range;
+       bool is_memory = find_mem_range(addr, &range);
+       struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
+       int ret;
+
+       if (is_memory)
+               prot |= KVM_PGTABLE_PROT_X;
+
+       hyp_spin_lock(&host_kvm.lock);
+       ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+       if (ret)
+               goto unlock;
+
+       ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+       if (is_memory || ret != -ENOMEM)
+               goto unlock;
+
+       /*
+        * host_s2_mem has been provided with enough pages to cover all of
+        * memory with page granularity, so we should never hit the ENOMEM case.
+        * However, it is difficult to know how much of the MMIO range we will
+        * need to cover upfront, so we may need to 'recycle' the pages if we
+        * run out.
+        */
+       ret = host_stage2_unmap_dev_all();
+       if (ret)
+               goto unlock;
+
+       ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+
+unlock:
+       hyp_spin_unlock(&host_kvm.lock);
+
+       return ret;
+}
+
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+       int ret;
+
+       /*
+        * host_stage2_unmap_dev_all() currently relies on MMIO mappings being
+        * non-persistent, so don't allow changing page ownership in MMIO range.
+        */
+       if (!range_is_memory(start, end))
+               return -EINVAL;
+
+       hyp_spin_lock(&host_kvm.lock);
+       ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
+                                          &host_s2_mem, pkvm_hyp_id);
+       hyp_spin_unlock(&host_kvm.lock);
+
+       return ret != -EAGAIN ? ret : 0;
+}
+
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+       struct kvm_vcpu_fault_info fault;
+       u64 esr, addr;
+       int ret = 0;
+
+       esr = read_sysreg_el2(SYS_ESR);
+       BUG_ON(!__get_fault_info(esr, &fault));
+
+       addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+       ret = host_stage2_idmap(addr);
+       BUG_ON(ret && ret != -EAGAIN);
+}
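
find_mem_range() above binary-searches the sorted memblock list and, on a miss, narrows the returned range to the surrounding hole so the fault handler can map the whole MMIO gap at once. A standalone sketch of that lookup; the region layout and names are made up for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct region { uint64_t base, size; };
struct range  { uint64_t start, end; };

/*
 * Binary-search a sorted, non-overlapping region list. On a hit, return the
 * containing region; on a miss, return the hole between the neighbours.
 */
static bool find_mem_range(const struct region *regs, int nr,
                           uint64_t addr, struct range *range)
{
        int cur, left = 0, right = nr;
        uint64_t end;

        range->start = 0;
        range->end = UINT64_MAX;

        while (left < right) {
                cur = (left + right) / 2;
                end = regs[cur].base + regs[cur].size;
                if (addr < regs[cur].base) {
                        right = cur;
                        range->end = regs[cur].base;
                } else if (addr >= end) {
                        left = cur + 1;
                        range->start = end;
                } else {
                        range->start = regs[cur].base;
                        range->end = end;
                        return true;
                }
        }

        return false;
}

int main(void)
{
        const struct region mem[] = { { 0x1000, 0x1000 }, { 0x4000, 0x2000 } };
        struct range r;

        printf("0x1800 is memory: %d, range [%#lx, %#lx)\n",
               find_mem_range(mem, 2, 0x1800, &r),
               (unsigned long)r.start, (unsigned long)r.end);
        printf("0x3000 is memory: %d, range [%#lx, %#lx)\n",
               find_mem_range(mem, 2, 0x3000, &r),
               (unsigned long)r.start, (unsigned long)r.end);
        return 0;
}
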
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
new file mode 100644 (file)
index 0000000..a8efdf0
--- /dev/null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+#include <nvhe/spinlock.h>
+
+struct kvm_pgtable pkvm_pgtable;
+hyp_spinlock_t pkvm_pgd_lock;
+u64 __io_map_base;
+
+struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
+unsigned int hyp_memblock_nr;
+
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                         unsigned long phys, enum kvm_pgtable_prot prot)
+{
+       int err;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+       err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot);
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return err;
+}
+
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                           enum kvm_pgtable_prot prot)
+{
+       unsigned long addr;
+       int err;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+
+       size = PAGE_ALIGN(size + offset_in_page(phys));
+       addr = __io_map_base;
+       __io_map_base += size;
+
+       /* Are we overflowing into the vmemmap? */
+       if (__io_map_base > __hyp_vmemmap) {
+               __io_map_base -= size;
+               addr = (unsigned long)ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot);
+       if (err) {
+               addr = (unsigned long)ERR_PTR(err);
+               goto out;
+       }
+
+       addr = addr + offset_in_page(phys);
+out:
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return addr;
+}
+
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+       unsigned long start = (unsigned long)from;
+       unsigned long end = (unsigned long)to;
+       unsigned long virt_addr;
+       phys_addr_t phys;
+
+       start = start & PAGE_MASK;
+       end = PAGE_ALIGN(end);
+
+       for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+               int err;
+
+               phys = hyp_virt_to_phys((void *)virt_addr);
+               err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
+{
+       unsigned long start, end;
+
+       hyp_vmemmap_range(phys, size, &start, &end);
+
+       return __pkvm_create_mappings(start, end - start, back, PAGE_HYP);
+}
+
+static void *__hyp_bp_vect_base;
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot)
+{
+       void *vector;
+
+       switch (slot) {
+       case HYP_VECTOR_DIRECT: {
+               vector = __kvm_hyp_vector;
+               break;
+       }
+       case HYP_VECTOR_SPECTRE_DIRECT: {
+               vector = __bp_harden_hyp_vecs;
+               break;
+       }
+       case HYP_VECTOR_INDIRECT:
+       case HYP_VECTOR_SPECTRE_INDIRECT: {
+               vector = (void *)__hyp_bp_vect_base;
+               break;
+       }
+       default:
+               return -EINVAL;
+       }
+
+       vector = __kvm_vector_slot2addr(vector, slot);
+       *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector;
+
+       return 0;
+}
+
+int hyp_map_vectors(void)
+{
+       phys_addr_t phys;
+       void *bp_base;
+
+       if (!cpus_have_const_cap(ARM64_SPECTRE_V3A))
+               return 0;
+
+       phys = __hyp_pa(__bp_harden_hyp_vecs);
+       bp_base = (void *)__pkvm_create_private_mapping(phys,
+                                                       __BP_HARDEN_HYP_VECS_SZ,
+                                                       PAGE_HYP_EXEC);
+       if (IS_ERR_OR_NULL(bp_base))
+               return PTR_ERR(bp_base);
+
+       __hyp_bp_vect_base = bp_base;
+
+       return 0;
+}
+
+int hyp_create_idmap(u32 hyp_va_bits)
+{
+       unsigned long start, end;
+
+       start = hyp_virt_to_phys((void *)__hyp_idmap_text_start);
+       start = ALIGN_DOWN(start, PAGE_SIZE);
+
+       end = hyp_virt_to_phys((void *)__hyp_idmap_text_end);
+       end = ALIGN(end, PAGE_SIZE);
+
+       /*
+        * One half of the VA space is reserved to linearly map portions of
+        * memory -- see va_layout.c for more details. The other half of the VA
+        * space contains the trampoline page, and needs some care. Split that
+        * second half in two and find the quarter of VA space not conflicting
+        * with the idmap to place the IOs and the vmemmap. IOs use the lower
+        * half of the quarter and the vmemmap the upper half.
+        */
+       __io_map_base = start & BIT(hyp_va_bits - 2);
+       __io_map_base ^= BIT(hyp_va_bits - 2);
+       __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3);
+
+       return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
+}
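
hyp_create_idmap() above keeps only bit (hyp_va_bits - 2) of the idmap address and flips it, yielding a base on the opposite side of the idmap; IO mappings then take the lower half of that region and the vmemmap the upper half (bit hyp_va_bits - 3). A standalone sketch of that bit arithmetic, using a hypothetical 48-bit hyp VA space and idmap address:

#include <stdint.h>
#include <stdio.h>

#define BIT(n)  (1ULL << (n))

/*
 * Mirror of the layout computation: isolate bit (va_bits - 2) of the idmap
 * address, flip it to land away from the idmap, then carve that region into
 * an IO half and a vmemmap half.
 */
static void split_va_space(uint64_t idmap_addr, unsigned int va_bits,
                           uint64_t *io_base, uint64_t *vmemmap_base)
{
        uint64_t base = idmap_addr & BIT(va_bits - 2);

        base ^= BIT(va_bits - 2);
        *io_base = base;
        *vmemmap_base = base | BIT(va_bits - 3);
}

int main(void)
{
        uint64_t io, vmemmap;

        split_va_space(0x40080000000ULL, 48, &io, &vmemmap);
        printf("io_map_base = %#llx\n", (unsigned long long)io);
        printf("vmemmap     = %#llx\n", (unsigned long long)vmemmap);
        return 0;
}
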
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
new file mode 100644 (file)
index 0000000..237e03b
--- /dev/null
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_hyp.h>
+#include <nvhe/gfp.h>
+
+u64 __hyp_vmemmap;
+
+/*
+ * Index the hyp_vmemmap to find a potential buddy page, but make no assumption
+ * about its current state.
+ *
+ * Example buddy-tree for a 4-pages physically contiguous pool:
+ *
+ *                 o : Page 3
+ *                /
+ *               o-o : Page 2
+ *              /
+ *             /   o : Page 1
+ *            /   /
+ *           o---o-o : Page 0
+ *    Order  2   1 0
+ *
+ * Example of requests on this pool:
+ *   __find_buddy_nocheck(pool, page 0, order 0) => page 1
+ *   __find_buddy_nocheck(pool, page 0, order 1) => page 2
+ *   __find_buddy_nocheck(pool, page 1, order 0) => page 0
+ *   __find_buddy_nocheck(pool, page 2, order 0) => page 3
+ */
+static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
+                                            struct hyp_page *p,
+                                            unsigned int order)
+{
+       phys_addr_t addr = hyp_page_to_phys(p);
+
+       addr ^= (PAGE_SIZE << order);
+
+       /*
+        * Don't return a page outside the pool range -- it belongs to
+        * something else and may not be mapped in hyp_vmemmap.
+        */
+       if (addr < pool->range_start || addr >= pool->range_end)
+               return NULL;
+
+       return hyp_phys_to_page(addr);
+}
+
+/* Find a buddy page currently available for allocation */
+static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
+                                          struct hyp_page *p,
+                                          unsigned int order)
+{
+       struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
+
+       if (!buddy || buddy->order != order || list_empty(&buddy->node))
+               return NULL;
+
+       return buddy;
+
+}
+
+static void __hyp_attach_page(struct hyp_pool *pool,
+                             struct hyp_page *p)
+{
+       unsigned int order = p->order;
+       struct hyp_page *buddy;
+
+       memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+
+       /*
+        * Only the first struct hyp_page of a high-order page (otherwise known
+        * as the 'head') should have p->order set. The non-head pages should
+        * have p->order = HYP_NO_ORDER. Here @p may no longer be the head
+        * after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
+        */
+       p->order = HYP_NO_ORDER;
+       for (; (order + 1) < pool->max_order; order++) {
+               buddy = __find_buddy_avail(pool, p, order);
+               if (!buddy)
+                       break;
+
+               /* Take the buddy out of its list, and coalesce with @p */
+               list_del_init(&buddy->node);
+               buddy->order = HYP_NO_ORDER;
+               p = min(p, buddy);
+       }
+
+       /* Mark the new head, and insert it */
+       p->order = order;
+       list_add_tail(&p->node, &pool->free_area[order]);
+}
+
+static void hyp_attach_page(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&pool->lock);
+       __hyp_attach_page(pool, p);
+       hyp_spin_unlock(&pool->lock);
+}
+
+static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
+                                          struct hyp_page *p,
+                                          unsigned int order)
+{
+       struct hyp_page *buddy;
+
+       list_del_init(&p->node);
+       while (p->order > order) {
+               /*
+                * The buddy of order n - 1 currently has HYP_NO_ORDER as it
+                * is covered by a higher-level page (whose head is @p). Use
+                * __find_buddy_nocheck() to find it and inject it in the
+                * free_list[n - 1], effectively splitting @p in half.
+                */
+               p->order--;
+               buddy = __find_buddy_nocheck(pool, p, p->order);
+               buddy->order = p->order;
+               list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+       }
+
+       return p;
+}
+
+void hyp_put_page(void *addr)
+{
+       struct hyp_page *p = hyp_virt_to_page(addr);
+
+       if (hyp_page_ref_dec_and_test(p))
+               hyp_attach_page(p);
+}
+
+void hyp_get_page(void *addr)
+{
+       struct hyp_page *p = hyp_virt_to_page(addr);
+
+       hyp_page_ref_inc(p);
+}
+
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+{
+       unsigned int i = order;
+       struct hyp_page *p;
+
+       hyp_spin_lock(&pool->lock);
+
+       /* Look for a high-enough-order page */
+       while (i < pool->max_order && list_empty(&pool->free_area[i]))
+               i++;
+       if (i >= pool->max_order) {
+               hyp_spin_unlock(&pool->lock);
+               return NULL;
+       }
+
+       /* Extract it from the tree at the right order */
+       p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
+       p = __hyp_extract_page(pool, p, order);
+
+       hyp_spin_unlock(&pool->lock);
+       hyp_set_page_refcounted(p);
+
+       return hyp_page_to_virt(p);
+}
+
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+                 unsigned int reserved_pages)
+{
+       phys_addr_t phys = hyp_pfn_to_phys(pfn);
+       struct hyp_page *p;
+       int i;
+
+       hyp_spin_lock_init(&pool->lock);
+       pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+       for (i = 0; i < pool->max_order; i++)
+               INIT_LIST_HEAD(&pool->free_area[i]);
+       pool->range_start = phys;
+       pool->range_end = phys + (nr_pages << PAGE_SHIFT);
+
+       /* Init the vmemmap portion */
+       p = hyp_phys_to_page(phys);
+       memset(p, 0, sizeof(*p) * nr_pages);
+       for (i = 0; i < nr_pages; i++) {
+               p[i].pool = pool;
+               INIT_LIST_HEAD(&p[i].node);
+       }
+
+       /* Attach the unused pages to the buddy tree */
+       for (i = reserved_pages; i < nr_pages; i++)
+               __hyp_attach_page(pool, &p[i]);
+
+       return 0;
+}
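
The buddy of a block is found by flipping the single address bit that distinguishes the two halves of the next-higher order, exactly as in the worked example in the __find_buddy_nocheck() comment. A standalone sketch reproducing that example with 4 KiB pages (buddy_of is an illustrative name):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1ULL << PAGE_SHIFT)

/* The buddy of a block at addr and order n lives at addr with bit
 * (PAGE_SHIFT + n) flipped: the other half of the order (n + 1) block. */
static uint64_t buddy_of(uint64_t addr, unsigned int order)
{
        return addr ^ (PAGE_SIZE << order);
}

int main(void)
{
        /* The 4-page pool from the comment above, starting at physical 0. */
        printf("buddy(page 0, order 0) -> page %llu\n",
               (unsigned long long)(buddy_of(0 * PAGE_SIZE, 0) / PAGE_SIZE));
        printf("buddy(page 0, order 1) -> page %llu\n",
               (unsigned long long)(buddy_of(0 * PAGE_SIZE, 1) / PAGE_SIZE));
        printf("buddy(page 1, order 0) -> page %llu\n",
               (unsigned long long)(buddy_of(1 * PAGE_SIZE, 0) / PAGE_SIZE));
        printf("buddy(page 2, order 0) -> page %llu\n",
               (unsigned long long)(buddy_of(2 * PAGE_SIZE, 0) / PAGE_SIZE));
        return 0;
}
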
index 63de71c..0850878 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kvm_host.h>
 #include <uapi/linux/psci.h>
 
+#include <nvhe/memory.h>
 #include <nvhe/trap_handler.h>
 
 void kvm_hyp_cpu_entry(unsigned long r0);
@@ -20,9 +21,6 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
 
 /* Config options set by the host. */
 struct kvm_host_psci_config __ro_after_init kvm_host_psci_config;
-s64 __ro_after_init hyp_physvirt_offset;
-
-#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)
 
 #define INVALID_CPU_ID UINT_MAX
 
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
new file mode 100644 (file)
index 0000000..7488f53
--- /dev/null
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
+
+struct hyp_pool hpool;
+struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
+unsigned long hyp_nr_cpus;
+
+#define hyp_percpu_size ((unsigned long)__per_cpu_end - \
+                        (unsigned long)__per_cpu_start)
+
+static void *vmemmap_base;
+static void *hyp_pgt_base;
+static void *host_s2_mem_pgt_base;
+static void *host_s2_dev_pgt_base;
+
+static int divide_memory_pool(void *virt, unsigned long size)
+{
+       unsigned long vstart, vend, nr_pages;
+
+       hyp_early_alloc_init(virt, size);
+
+       hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend);
+       nr_pages = (vend - vstart) >> PAGE_SHIFT;
+       vmemmap_base = hyp_early_alloc_contig(nr_pages);
+       if (!vmemmap_base)
+               return -ENOMEM;
+
+       nr_pages = hyp_s1_pgtable_pages();
+       hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
+       if (!hyp_pgt_base)
+               return -ENOMEM;
+
+       nr_pages = host_s2_mem_pgtable_pages();
+       host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages);
+       if (!host_s2_mem_pgt_base)
+               return -ENOMEM;
+
+       nr_pages = host_s2_dev_pgtable_pages();
+       host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages);
+       if (!host_s2_dev_pgt_base)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
+                                unsigned long *per_cpu_base,
+                                u32 hyp_va_bits)
+{
+       void *start, *end, *virt = hyp_phys_to_virt(phys);
+       unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+       int ret, i;
+
+       /* Recreate the hyp page-table using the early page allocator */
+       hyp_early_alloc_init(hyp_pgt_base, pgt_size);
+       ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits,
+                                  &hyp_early_alloc_mm_ops);
+       if (ret)
+               return ret;
+
+       ret = hyp_create_idmap(hyp_va_bits);
+       if (ret)
+               return ret;
+
+       ret = hyp_map_vectors();
+       if (ret)
+               return ret;
+
+       ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base));
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < hyp_nr_cpus; i++) {
+               start = (void *)kern_hyp_va(per_cpu_base[i]);
+               end = start + PAGE_ALIGN(hyp_percpu_size);
+               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               if (ret)
+                       return ret;
+
+               end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va;
+               start = end - PAGE_SIZE;
+               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static void update_nvhe_init_params(void)
+{
+       struct kvm_nvhe_init_params *params;
+       unsigned long i;
+
+       for (i = 0; i < hyp_nr_cpus; i++) {
+               params = per_cpu_ptr(&kvm_init_params, i);
+               params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
+               __flush_dcache_area(params, sizeof(*params));
+       }
+}
+
+static void *hyp_zalloc_hyp_page(void *arg)
+{
+       return hyp_alloc_pages(&hpool, 0);
+}
+
+void __noreturn __pkvm_init_finalise(void)
+{
+       struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
+       struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+       unsigned long nr_pages, reserved_pages, pfn;
+       int ret;
+
+       /* Now that the vmemmap is backed, install the full-fledged allocator */
+       pfn = hyp_virt_to_pfn(hyp_pgt_base);
+       nr_pages = hyp_s1_pgtable_pages();
+       reserved_pages = hyp_early_alloc_nr_used_pages();
+       ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages);
+       if (ret)
+               goto out;
+
+       ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+       if (ret)
+               goto out;
+
+       pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
+               .zalloc_page = hyp_zalloc_hyp_page,
+               .phys_to_virt = hyp_phys_to_virt,
+               .virt_to_phys = hyp_virt_to_phys,
+               .get_page = hyp_get_page,
+               .put_page = hyp_put_page,
+       };
+       pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
+
+out:
+       /*
+        * We tail-called here from handle___pkvm_init() and will not return,
+        * so make sure to propagate the return value to the host.
+        */
+       cpu_reg(host_ctxt, 1) = ret;
+
+       __host_enter(host_ctxt);
+}
+
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+               unsigned long *per_cpu_base, u32 hyp_va_bits)
+{
+       struct kvm_nvhe_init_params *params;
+       void *virt = hyp_phys_to_virt(phys);
+       void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+       int ret;
+
+       if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+               return -EINVAL;
+
+       hyp_spin_lock_init(&pkvm_pgd_lock);
+       hyp_nr_cpus = nr_cpus;
+
+       ret = divide_memory_pool(virt, size);
+       if (ret)
+               return ret;
+
+       ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits);
+       if (ret)
+               return ret;
+
+       update_nvhe_init_params();
+
+       /* Jump in the idmap page to switch to the new page-tables */
+       params = this_cpu_ptr(&kvm_init_params);
+       fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
+       fn(__hyp_pa(params), __pkvm_init_finalise);
+
+       unreachable();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/stub.c b/arch/arm64/kvm/hyp/nvhe/stub.c
new file mode 100644 (file)
index 0000000..c0aa6bb
--- /dev/null
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stubs for out-of-line function calls caused by re-using kernel
+ * infrastructure at EL2.
+ *
+ * Copyright (C) 2020 - Google LLC
+ */
+
+#include <linux/list.h>
+
+#ifdef CONFIG_DEBUG_LIST
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+                     struct list_head *next)
+{
+               return true;
+}
+
+bool __list_del_entry_valid(struct list_head *entry)
+{
+               return true;
+}
+#endif
index 68ab6b4..e9f6ea7 100644 (file)
@@ -28,6 +28,8 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
+#include <nvhe/mem_protect.h>
+
 /* Non-VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -41,9 +43,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
        __activate_traps_common(vcpu);
 
        val = CPTR_EL2_DEFAULT;
-       val |= CPTR_EL2_TTA | CPTR_EL2_TZ | CPTR_EL2_TAM;
+       val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
        if (!update_fp_enabled(vcpu)) {
-               val |= CPTR_EL2_TFP;
+               val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
                __activate_traps_fpsimd32(vcpu);
        }
 
@@ -68,7 +70,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 static void __deactivate_traps(struct kvm_vcpu *vcpu)
 {
        extern char __kvm_hyp_host_vector[];
-       u64 mdcr_el2;
+       u64 mdcr_el2, cptr;
 
        ___deactivate_traps(vcpu);
 
@@ -95,19 +97,17 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
 
        mdcr_el2 &= MDCR_EL2_HPMN_MASK;
        mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+       mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
 
        write_sysreg(mdcr_el2, mdcr_el2);
-       if (is_protected_kvm_enabled())
-               write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2);
-       else
-               write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
-       write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
-       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
-}
+       write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
 
-static void __load_host_stage2(void)
-{
-       write_sysreg(0, vttbr_el2);
+       cptr = CPTR_EL2_DEFAULT;
+       if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED))
+               cptr |= CPTR_EL2_TZ;
+
+       write_sysreg(cptr, cptr_el2);
+       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
 }
 
 /* Save VGICv3 state on non-VHE systems */
index 229b067..83dc3b2 100644 (file)
@@ -8,6 +8,8 @@
 #include <asm/kvm_mmu.h>
 #include <asm/tlbflush.h>
 
+#include <nvhe/mem_protect.h>
+
 struct tlb_inv_context {
        u64             tcr;
 };
@@ -43,7 +45,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
 
 static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
 {
-       write_sysreg(0, vttbr_el2);
+       __load_host_stage2();
 
        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                /* Ensure write of the host VMID */
index 926fc07..c37c1dc 100644 (file)
@@ -9,8 +9,7 @@
 
 #include <linux/bitfield.h>
 #include <asm/kvm_pgtable.h>
-
-#define KVM_PGTABLE_MAX_LEVELS         4U
+#include <asm/stage2_pgtable.h>
 
 #define KVM_PTE_VALID                  BIT(0)
 
                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
                                         KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
+#define KVM_PTE_LEAF_ATTR_S2_IGNORED   GENMASK(58, 55)
+
+#define KVM_INVALID_PTE_OWNER_MASK     GENMASK(63, 56)
+#define KVM_MAX_OWNER_ID               1
+
 struct kvm_pgtable_walk_data {
        struct kvm_pgtable              *pgt;
        struct kvm_pgtable_walker       *walker;
@@ -68,21 +72,36 @@ static u64 kvm_granule_size(u32 level)
        return BIT(kvm_granule_shift(level));
 }
 
-static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+#define KVM_PHYS_INVALID (-1ULL)
+
+static bool kvm_phys_is_valid(u64 phys)
 {
-       u64 granule = kvm_granule_size(level);
+       return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
+}
 
+static bool kvm_level_supports_block_mapping(u32 level)
+{
        /*
         * Reject invalid block mappings and don't bother with 4TB mappings for
         * 52-bit PAs.
         */
-       if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+       return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+       u64 granule = kvm_granule_size(level);
+
+       if (!kvm_level_supports_block_mapping(level))
                return false;
 
        if (granule > (end - addr))
                return false;
 
-       return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+       if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+               return false;
+
+       return IS_ALIGNED(addr, granule);
 }
 
 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
@@ -152,20 +171,20 @@ static kvm_pte_t kvm_phys_to_pte(u64 pa)
        return pte;
 }
 
-static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
 {
-       return __va(kvm_pte_to_phys(pte));
+       return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
 }
 
-static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+static void kvm_clear_pte(kvm_pte_t *ptep)
 {
-       kvm_pte_t pte = *ptep;
-       WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+       WRITE_ONCE(*ptep, 0);
 }
 
-static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
+                             struct kvm_pgtable_mm_ops *mm_ops)
 {
-       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
 
        pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
        pte |= KVM_PTE_VALID;
@@ -187,6 +206,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
        return pte;
 }
 
+static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
+{
+       return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
+}
+
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
                                  u32 level, kvm_pte_t *ptep,
                                  enum kvm_pgtable_walk_flags flag)
@@ -228,7 +252,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
                goto out;
        }
 
-       childp = kvm_pte_follow(pte);
+       childp = kvm_pte_follow(pte, data->pgt->mm_ops);
        ret = __kvm_pgtable_walk(data, childp, level + 1);
        if (ret)
                goto out;
@@ -303,12 +327,12 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
 }
 
 struct hyp_map_data {
-       u64             phys;
-       kvm_pte_t       attr;
+       u64                             phys;
+       kvm_pte_t                       attr;
+       struct kvm_pgtable_mm_ops       *mm_ops;
 };
 
-static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
-                                struct hyp_map_data *data)
+static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
 {
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
        u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
@@ -333,7 +357,8 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
-       data->attr = attr;
+       *ptep = attr;
+
        return 0;
 }
 
@@ -359,6 +384,8 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                          enum kvm_pgtable_walk_flags flag, void * const arg)
 {
        kvm_pte_t *childp;
+       struct hyp_map_data *data = arg;
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
        if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
                return 0;
@@ -366,11 +393,11 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
                return -EINVAL;
 
-       childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!childp)
                return -ENOMEM;
 
-       kvm_set_table_pte(ptep, childp);
+       kvm_set_table_pte(ptep, childp, mm_ops);
        return 0;
 }
 
@@ -380,6 +407,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
        int ret;
        struct hyp_map_data map_data = {
                .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
+               .mm_ops = pgt->mm_ops,
        };
        struct kvm_pgtable_walker walker = {
                .cb     = hyp_map_walker,
@@ -387,7 +415,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                .arg    = &map_data,
        };
 
-       ret = hyp_map_set_prot_attr(prot, &map_data);
+       ret = hyp_set_prot_attr(prot, &map_data.attr);
        if (ret)
                return ret;
 
@@ -397,16 +425,18 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
        return ret;
 }
 
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+                        struct kvm_pgtable_mm_ops *mm_ops)
 {
        u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
 
-       pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!pgt->pgd)
                return -ENOMEM;
 
        pgt->ia_bits            = va_bits;
        pgt->start_level        = KVM_PGTABLE_MAX_LEVELS - levels;
+       pgt->mm_ops             = mm_ops;
        pgt->mmu                = NULL;
        return 0;
 }
@@ -414,7 +444,9 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
 static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                           enum kvm_pgtable_walk_flags flag, void * const arg)
 {
-       free_page((unsigned long)kvm_pte_follow(*ptep));
+       struct kvm_pgtable_mm_ops *mm_ops = arg;
+
+       mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops));
        return 0;
 }
 
@@ -423,29 +455,75 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
        struct kvm_pgtable_walker walker = {
                .cb     = hyp_free_walker,
                .flags  = KVM_PGTABLE_WALK_TABLE_POST,
+               .arg    = pgt->mm_ops,
        };
 
        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
-       free_page((unsigned long)pgt->pgd);
+       pgt->mm_ops->put_page(pgt->pgd);
        pgt->pgd = NULL;
 }
 
 struct stage2_map_data {
        u64                             phys;
        kvm_pte_t                       attr;
+       u8                              owner_id;
 
        kvm_pte_t                       *anchor;
+       kvm_pte_t                       *childp;
 
        struct kvm_s2_mmu               *mmu;
-       struct kvm_mmu_memory_cache     *memcache;
+       void                            *memcache;
+
+       struct kvm_pgtable_mm_ops       *mm_ops;
 };
 
-static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
-                                   struct stage2_map_data *data)
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
+{
+       u64 vtcr = VTCR_EL2_FLAGS;
+       u8 lvls;
+
+       vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
+       vtcr |= VTCR_EL2_T0SZ(phys_shift);
+       /*
+        * Use a minimum of 2 page-table levels to prevent splitting
+        * host PMD huge pages at stage2.
+        */
+       lvls = stage2_pgtable_levels(phys_shift);
+       if (lvls < 2)
+               lvls = 2;
+       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+       /*
+        * Enable the Hardware Access Flag management, unconditionally
+        * on all CPUs. The feature is RES0 on CPUs without support
+        * and must be ignored by those CPUs.
+        */
+       vtcr |= VTCR_EL2_HA;
+
+       /* Set the vmid bits */
+       vtcr |= (get_vmid_bits(mmfr1) == 16) ?
+               VTCR_EL2_VS_16BIT :
+               VTCR_EL2_VS_8BIT;
+
+       return vtcr;
+}
+
+static bool stage2_has_fwb(struct kvm_pgtable *pgt)
+{
+       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+               return false;
+
+       return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
+}
+
+#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+
+static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
+                               kvm_pte_t *ptep)
 {
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
-       kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
-                           PAGE_S2_MEMATTR(NORMAL);
+       kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
+                           KVM_S2_MEMATTR(pgt, NORMAL);
        u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
 
        if (!(prot & KVM_PGTABLE_PROT_X))
@@ -461,44 +539,78 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
 
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
-       data->attr = attr;
+       *ptep = attr;
+
        return 0;
 }
 
+static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+       if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
+               return true;
+
+       return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
+}
+
+static bool stage2_pte_is_counted(kvm_pte_t pte)
+{
+       /*
+        * The refcount tracks valid entries as well as invalid entries if
+        * they encode ownership of a page by an entity other than the
+        * page-table owner, whose id is 0.
+        */
+       return !!pte;
+}
+
+static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
+                          u32 level, struct kvm_pgtable_mm_ops *mm_ops)
+{
+       /*
+        * Clear the existing PTE, and perform break-before-make with
+        * TLB maintenance if it was valid.
+        */
+       if (kvm_pte_valid(*ptep)) {
+               kvm_clear_pte(ptep);
+               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+       }
+
+       mm_ops->put_page(ptep);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
 {
        kvm_pte_t new, old = *ptep;
        u64 granule = kvm_granule_size(level), phys = data->phys;
-       struct page *page = virt_to_page(ptep);
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
        if (!kvm_block_mapping_supported(addr, end, phys, level))
                return -E2BIG;
 
-       new = kvm_init_valid_leaf_pte(phys, data->attr, level);
-       if (kvm_pte_valid(old)) {
+       if (kvm_phys_is_valid(phys))
+               new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+       else
+               new = kvm_init_invalid_leaf_owner(data->owner_id);
+
+       if (stage2_pte_is_counted(old)) {
                /*
                 * Skip updating the PTE if we are trying to recreate the exact
                 * same mapping or only change the access permissions. Instead,
                 * the vCPU will exit one more time from guest if still needed
                 * and then go through the path of relaxing permissions.
                 */
-               if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+               if (!stage2_pte_needs_update(old, new))
                        return -EAGAIN;
 
-               /*
-                * There's an existing different valid leaf entry, so perform
-                * break-before-make.
-                */
-               kvm_set_invalid_pte(ptep);
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-               put_page(page);
+               stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
        }
 
        smp_store_release(ptep, new);
-       get_page(page);
-       data->phys += granule;
+       if (stage2_pte_is_counted(new))
+               mm_ops->get_page(ptep);
+       if (kvm_phys_is_valid(phys))
+               data->phys += granule;
        return 0;
 }
 
@@ -512,7 +624,8 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
        if (!kvm_block_mapping_supported(addr, end, data->phys, level))
                return 0;
 
-       kvm_set_invalid_pte(ptep);
+       data->childp = kvm_pte_follow(*ptep, data->mm_ops);
+       kvm_clear_pte(ptep);
 
        /*
         * Invalidate the whole stage-2, as we may have numerous leaf
@@ -527,13 +640,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                                struct stage2_map_data *data)
 {
-       int ret;
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
        kvm_pte_t *childp, pte = *ptep;
-       struct page *page = virt_to_page(ptep);
+       int ret;
 
        if (data->anchor) {
-               if (kvm_pte_valid(pte))
-                       put_page(page);
+               if (stage2_pte_is_counted(pte))
+                       mm_ops->put_page(ptep);
 
                return 0;
        }
@@ -548,7 +661,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        if (!data->memcache)
                return -ENOMEM;
 
-       childp = kvm_mmu_memory_cache_alloc(data->memcache);
+       childp = mm_ops->zalloc_page(data->memcache);
        if (!childp)
                return -ENOMEM;
 
@@ -557,14 +670,11 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         * a table. Accesses beyond 'end' that fall within the new table
         * will be mapped lazily.
         */
-       if (kvm_pte_valid(pte)) {
-               kvm_set_invalid_pte(ptep);
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-               put_page(page);
-       }
+       if (stage2_pte_is_counted(pte))
+               stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
 
-       kvm_set_table_pte(ptep, childp);
-       get_page(page);
+       kvm_set_table_pte(ptep, childp, mm_ops);
+       mm_ops->get_page(ptep);
 
        return 0;
 }
@@ -573,19 +683,25 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
 {
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+       kvm_pte_t *childp;
        int ret = 0;
 
        if (!data->anchor)
                return 0;
 
-       free_page((unsigned long)kvm_pte_follow(*ptep));
-       put_page(virt_to_page(ptep));
-
        if (data->anchor == ptep) {
+               childp = data->childp;
                data->anchor = NULL;
+               data->childp = NULL;
                ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+       } else {
+               childp = kvm_pte_follow(*ptep, mm_ops);
        }
 
+       mm_ops->put_page(childp);
+       mm_ops->put_page(ptep);
+
        return ret;
 }
 
@@ -627,13 +743,14 @@ static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
-                          struct kvm_mmu_memory_cache *mc)
+                          void *mc)
 {
        int ret;
        struct stage2_map_data map_data = {
                .phys           = ALIGN_DOWN(phys, PAGE_SIZE),
                .mmu            = pgt->mmu,
                .memcache       = mc,
+               .mm_ops         = pgt->mm_ops,
        };
        struct kvm_pgtable_walker walker = {
                .cb             = stage2_map_walker,
@@ -643,7 +760,10 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                .arg            = &map_data,
        };
 
-       ret = stage2_map_set_prot_attr(prot, &map_data);
+       if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
+               return -EINVAL;
+
+       ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
        if (ret)
                return ret;
 
@@ -652,38 +772,63 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
        return ret;
 }
 
-static void stage2_flush_dcache(void *addr, u64 size)
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                                void *mc, u8 owner_id)
 {
-       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-               return;
+       int ret;
+       struct stage2_map_data map_data = {
+               .phys           = KVM_PHYS_INVALID,
+               .mmu            = pgt->mmu,
+               .memcache       = mc,
+               .mm_ops         = pgt->mm_ops,
+               .owner_id       = owner_id,
+       };
+       struct kvm_pgtable_walker walker = {
+               .cb             = stage2_map_walker,
+               .flags          = KVM_PGTABLE_WALK_TABLE_PRE |
+                                 KVM_PGTABLE_WALK_LEAF |
+                                 KVM_PGTABLE_WALK_TABLE_POST,
+               .arg            = &map_data,
+       };
+
+       if (owner_id > KVM_MAX_OWNER_ID)
+               return -EINVAL;
 
-       __flush_dcache_area(addr, size);
+       ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+       return ret;
 }
 
-static bool stage2_pte_cacheable(kvm_pte_t pte)
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
 {
        u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-       return memattr == PAGE_S2_MEMATTR(NORMAL);
+       return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
 }
 
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                               enum kvm_pgtable_walk_flags flag,
                               void * const arg)
 {
-       struct kvm_s2_mmu *mmu = arg;
+       struct kvm_pgtable *pgt = arg;
+       struct kvm_s2_mmu *mmu = pgt->mmu;
+       struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep, *childp = NULL;
        bool need_flush = false;
 
-       if (!kvm_pte_valid(pte))
+       if (!kvm_pte_valid(pte)) {
+               if (stage2_pte_is_counted(pte)) {
+                       kvm_clear_pte(ptep);
+                       mm_ops->put_page(ptep);
+               }
                return 0;
+       }
 
        if (kvm_pte_table(pte, level)) {
-               childp = kvm_pte_follow(pte);
+               childp = kvm_pte_follow(pte, mm_ops);
 
-               if (page_count(virt_to_page(childp)) != 1)
+               if (mm_ops->page_count(childp) != 1)
                        return 0;
-       } else if (stage2_pte_cacheable(pte)) {
-               need_flush = true;
+       } else if (stage2_pte_cacheable(pgt, pte)) {
+               need_flush = !stage2_has_fwb(pgt);
        }
 
        /*
@@ -691,17 +836,15 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         * block entry and rely on the remaining portions being faulted
         * back lazily.
         */
-       kvm_set_invalid_pte(ptep);
-       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
-       put_page(virt_to_page(ptep));
+       stage2_put_pte(ptep, mmu, addr, level, mm_ops);
 
        if (need_flush) {
-               stage2_flush_dcache(kvm_pte_follow(pte),
+               __flush_dcache_area(kvm_pte_follow(pte, mm_ops),
                                    kvm_granule_size(level));
        }
 
        if (childp)
-               free_page((unsigned long)childp);
+               mm_ops->put_page(childp);
 
        return 0;
 }
@@ -710,7 +853,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
        struct kvm_pgtable_walker walker = {
                .cb     = stage2_unmap_walker,
-               .arg    = pgt->mmu,
+               .arg    = pgt,
                .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
        };
 
@@ -842,12 +985,14 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                               enum kvm_pgtable_walk_flags flag,
                               void * const arg)
 {
+       struct kvm_pgtable *pgt = arg;
+       struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep;
 
-       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
                return 0;
 
-       stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+       __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level));
        return 0;
 }
 
@@ -856,30 +1001,35 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
        struct kvm_pgtable_walker walker = {
                .cb     = stage2_flush_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF,
+               .arg    = pgt,
        };
 
-       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+       if (stage2_has_fwb(pgt))
                return 0;
 
        return kvm_pgtable_walk(pgt, addr, size, &walker);
 }
 
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                                 struct kvm_pgtable_mm_ops *mm_ops,
+                                 enum kvm_pgtable_stage2_flags flags)
 {
        size_t pgd_sz;
-       u64 vtcr = kvm->arch.vtcr;
+       u64 vtcr = arch->vtcr;
        u32 ia_bits = VTCR_EL2_IPA(vtcr);
        u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
 
        pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
-       pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
        if (!pgt->pgd)
                return -ENOMEM;
 
        pgt->ia_bits            = ia_bits;
        pgt->start_level        = start_level;
-       pgt->mmu                = &kvm->arch.mmu;
+       pgt->mm_ops             = mm_ops;
+       pgt->mmu                = &arch->mmu;
+       pgt->flags              = flags;
 
        /* Ensure zeroed PGD pages are visible to the hardware walker */
        dsb(ishst);
@@ -890,15 +1040,16 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                              enum kvm_pgtable_walk_flags flag,
                              void * const arg)
 {
+       struct kvm_pgtable_mm_ops *mm_ops = arg;
        kvm_pte_t pte = *ptep;
 
-       if (!kvm_pte_valid(pte))
+       if (!stage2_pte_is_counted(pte))
                return 0;
 
-       put_page(virt_to_page(ptep));
+       mm_ops->put_page(ptep);
 
        if (kvm_pte_table(pte, level))
-               free_page((unsigned long)kvm_pte_follow(pte));
+               mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
 
        return 0;
 }
@@ -910,10 +1061,85 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
                .cb     = stage2_free_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF |
                          KVM_PGTABLE_WALK_TABLE_POST,
+               .arg    = pgt->mm_ops,
        };
 
        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
        pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
-       free_pages_exact(pgt->pgd, pgd_sz);
+       pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
        pgt->pgd = NULL;
 }
+
+#define KVM_PTE_LEAF_S2_COMPAT_MASK    (KVM_PTE_LEAF_ATTR_S2_PERMS | \
+                                        KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
+                                        KVM_PTE_LEAF_ATTR_S2_IGNORED)
+
+static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
+                                         kvm_pte_t *ptep,
+                                         enum kvm_pgtable_walk_flags flag,
+                                         void * const arg)
+{
+       kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
+
+       /*
+        * Compatible mappings are either invalid and owned by the page-table
+        * owner (whose id is 0), or valid with matching permission attributes.
+        */
+       if (kvm_pte_valid(pte)) {
+               old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
+               if (old_attr != *new_attr)
+                       return -EEXIST;
+       } else if (pte) {
+               return -EEXIST;
+       }
+
+       return 0;
+}
+
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+                                 enum kvm_pgtable_prot prot,
+                                 struct kvm_mem_range *range)
+{
+       kvm_pte_t attr;
+       struct kvm_pgtable_walker check_perm_walker = {
+               .cb             = stage2_check_permission_walker,
+               .flags          = KVM_PGTABLE_WALK_LEAF,
+               .arg            = &attr,
+       };
+       u64 granule, start, end;
+       u32 level;
+       int ret;
+
+       ret = stage2_set_prot_attr(pgt, prot, &attr);
+       if (ret)
+               return ret;
+       attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
+
+       for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
+               granule = kvm_granule_size(level);
+               start = ALIGN_DOWN(addr, granule);
+               end = start + granule;
+
+               if (!kvm_level_supports_block_mapping(level))
+                       continue;
+
+               if (start < range->start || range->end < end)
+                       continue;
+
+               /*
+                * Check the presence of existing mappings with incompatible
+                * permissions within the current block range, and try one level
+                * deeper if one is found.
+                */
+               ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
+               if (ret != -EEXIST)
+                       break;
+       }
+
+       if (!ret) {
+               range->start = start;
+               range->end = end;
+       }
+
+       return ret;
+}
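
A minimal user-space sketch of the invalid-PTE owner annotation introduced above (KVM_INVALID_PTE_OWNER_MASK, kvm_init_invalid_leaf_owner() and stage2_pte_is_counted()): an otherwise invalid entry carries an owner id in bits 63:56, and any non-zero entry holds a reference. The DEMO_* and demo_* names are illustrative stand-ins, not the kernel macros.

#include <stdint.h>
#include <stdio.h>

#define DEMO_OWNER_SHIFT   56
#define DEMO_OWNER_MASK    (0xffULL << DEMO_OWNER_SHIFT)   /* bits 63:56 */
#define DEMO_PTE_VALID     (1ULL << 0)

/* Mirrors kvm_init_invalid_leaf_owner(): VALID bit clear, owner id in 63:56 */
static uint64_t demo_init_invalid_leaf_owner(uint8_t owner_id)
{
        return ((uint64_t)owner_id << DEMO_OWNER_SHIFT) & DEMO_OWNER_MASK;
}

/* Mirrors stage2_pte_is_counted(): any non-zero entry holds a reference */
static int demo_pte_is_counted(uint64_t pte)
{
        return pte != 0;
}

int main(void)
{
        uint64_t pte = demo_init_invalid_leaf_owner(1);

        printf("pte=%#llx valid=%d counted=%d owner=%u\n",
               (unsigned long long)pte, (int)(pte & DEMO_PTE_VALID),
               demo_pte_is_counted(pte), (unsigned)(pte >> DEMO_OWNER_SHIFT));
        return 0;
}
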
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
new file mode 100644 (file)
index 0000000..83ca23a
--- /dev/null
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/kvm_host.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+
+static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
+static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
+
+phys_addr_t hyp_mem_base;
+phys_addr_t hyp_mem_size;
+
+static int cmp_hyp_memblock(const void *p1, const void *p2)
+{
+       const struct memblock_region *r1 = p1;
+       const struct memblock_region *r2 = p2;
+
+       return r1->base < r2->base ? -1 : (r1->base > r2->base);
+}
+
+static void __init sort_memblock_regions(void)
+{
+       sort(hyp_memory,
+            *hyp_memblock_nr_ptr,
+            sizeof(struct memblock_region),
+            cmp_hyp_memblock,
+            NULL);
+}
+
+static int __init register_memblock_regions(void)
+{
+       struct memblock_region *reg;
+
+       for_each_mem_region(reg) {
+               if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
+                       return -ENOMEM;
+
+               hyp_memory[*hyp_memblock_nr_ptr] = *reg;
+               (*hyp_memblock_nr_ptr)++;
+       }
+       sort_memblock_regions();
+
+       return 0;
+}
+
+void __init kvm_hyp_reserve(void)
+{
+       u64 nr_pages, prev, hyp_mem_pages = 0;
+       int ret;
+
+       if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
+               return;
+
+       if (kvm_get_mode() != KVM_MODE_PROTECTED)
+               return;
+
+       ret = register_memblock_regions();
+       if (ret) {
+               *hyp_memblock_nr_ptr = 0;
+               kvm_err("Failed to register hyp memblocks: %d\n", ret);
+               return;
+       }
+
+       hyp_mem_pages += hyp_s1_pgtable_pages();
+       hyp_mem_pages += host_s2_mem_pgtable_pages();
+       hyp_mem_pages += host_s2_dev_pgtable_pages();
+
+       /*
+        * The hyp_vmemmap needs to be backed by pages, but these pages
+        * themselves need to be present in the vmemmap, so compute the number
+        * of pages needed by looking for a fixed point.
+        */
+       nr_pages = 0;
+       do {
+               prev = nr_pages;
+               nr_pages = hyp_mem_pages + prev;
+               nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE);
+               nr_pages += __hyp_pgtable_max_pages(nr_pages);
+       } while (nr_pages != prev);
+       hyp_mem_pages += nr_pages;
+
+       /*
+        * Try to allocate a PMD-aligned region to reduce TLB pressure once
+        * this is unmapped from the host stage-2, and fall back to PAGE_SIZE.
+        */
+       hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
+       hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+                                             ALIGN(hyp_mem_size, PMD_SIZE),
+                                             PMD_SIZE);
+       if (!hyp_mem_base)
+               hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+                                                     hyp_mem_size, PAGE_SIZE);
+       else
+               hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
+
+       if (!hyp_mem_base) {
+               kvm_err("Failed to reserve hyp memory\n");
+               return;
+       }
+       memblock_reserve(hyp_mem_base, hyp_mem_size);
+
+       kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
+                hyp_mem_base);
+}
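
The fixed-point loop in kvm_hyp_reserve() above is easier to follow with concrete numbers. Below is a standalone sketch under assumed values: 4 KiB pages, a 16-byte struct hyp_page, and a crude stand-in for __hyp_pgtable_max_pages(); the DEMO_* constants and demo_* helpers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE          4096ULL
#define DEMO_SIZEOF_HYP_PAGE    16ULL   /* assumed sizeof(struct hyp_page) */

/* Crude stand-in: one table page per 512 mapped pages, summed per level */
static uint64_t demo_pgtable_max_pages(uint64_t nr_pages)
{
        uint64_t pages = 0;

        while (nr_pages > 1) {
                nr_pages = (nr_pages + 511) / 512;
                pages += nr_pages;
        }
        return pages + 1;       /* plus a root page */
}

int main(void)
{
        uint64_t hyp_mem_pages = 100000, nr_pages = 0, prev;

        /* Same shape as the loop in kvm_hyp_reserve(): iterate to a fixed point */
        do {
                prev = nr_pages;
                nr_pages = hyp_mem_pages + prev;
                nr_pages = (nr_pages * DEMO_SIZEOF_HYP_PAGE + DEMO_PAGE_SIZE - 1) /
                           DEMO_PAGE_SIZE;
                nr_pages += demo_pgtable_max_pages(nr_pages);
        } while (nr_pages != prev);

        printf("extra pages for vmemmap + page-tables: %llu\n",
               (unsigned long long)nr_pages);
        return 0;
}
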
index af8e940..7b8f7db 100644 (file)
@@ -27,8 +27,6 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
-const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
-
 /* VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -207,7 +205,7 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
        __deactivate_traps(vcpu);
        sysreg_restore_host_state_vhe(host_ctxt);
 
-       panic(__hyp_panic_string,
+       panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n",
              spsr, elr,
              read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR),
              read_sysreg(hpfar_el2), par, vcpu);
index ead21b9..30da78f 100644 (file)
@@ -9,16 +9,65 @@
 #include <kvm/arm_hypercalls.h>
 #include <kvm/arm_psci.h>
 
+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+       struct system_time_snapshot systime_snapshot;
+       u64 cycles = ~0UL;
+       u32 feature;
+
+       /*
+        * The system time and counter value must be captured at the same
+        * time to keep consistency and precision.
+        */
+       ktime_get_snapshot(&systime_snapshot);
+
+       /*
+        * This is only valid if the current clocksource is the
+        * architected counter, as this is the only one the guest
+        * can see.
+        */
+       if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+               return;
+
+       /*
+        * The guest selects one of the two reference counters
+        * (virtual or physical) with the first argument of the SMCCC
+        * call. In case the identifier is not supported, error out.
+        */
+       feature = smccc_get_arg1(vcpu);
+       switch (feature) {
+       case KVM_PTP_VIRT_COUNTER:
+               cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+               break;
+       case KVM_PTP_PHYS_COUNTER:
+               cycles = systime_snapshot.cycles;
+               break;
+       default:
+               return;
+       }
+
+       /*
+        * This relies on the top bit of val[0] never being set for
+        * valid values of system time, because that is *really* far
+        * in the future (about 292 years from 1970, and at that stage
+        * nobody will give a damn about it).
+        */
+       val[0] = upper_32_bits(systime_snapshot.real);
+       val[1] = lower_32_bits(systime_snapshot.real);
+       val[2] = upper_32_bits(cycles);
+       val[3] = lower_32_bits(cycles);
+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
        u32 func_id = smccc_get_function(vcpu);
-       long val = SMCCC_RET_NOT_SUPPORTED;
+       u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
        u32 feature;
        gpa_t gpa;
 
        switch (func_id) {
        case ARM_SMCCC_VERSION_FUNC_ID:
-               val = ARM_SMCCC_VERSION_1_1;
+               val[0] = ARM_SMCCC_VERSION_1_1;
                break;
        case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
                feature = smccc_get_arg1(vcpu);
@@ -28,10 +77,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                        case SPECTRE_VULNERABLE:
                                break;
                        case SPECTRE_MITIGATED:
-                               val = SMCCC_RET_SUCCESS;
+                               val[0] = SMCCC_RET_SUCCESS;
                                break;
                        case SPECTRE_UNAFFECTED:
-                               val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+                               val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
                                break;
                        }
                        break;
@@ -54,22 +103,35 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                                        break;
                                fallthrough;
                        case SPECTRE_UNAFFECTED:
-                               val = SMCCC_RET_NOT_REQUIRED;
+                               val[0] = SMCCC_RET_NOT_REQUIRED;
                                break;
                        }
                        break;
                case ARM_SMCCC_HV_PV_TIME_FEATURES:
-                       val = SMCCC_RET_SUCCESS;
+                       val[0] = SMCCC_RET_SUCCESS;
                        break;
                }
                break;
        case ARM_SMCCC_HV_PV_TIME_FEATURES:
-               val = kvm_hypercall_pv_features(vcpu);
+               val[0] = kvm_hypercall_pv_features(vcpu);
                break;
        case ARM_SMCCC_HV_PV_TIME_ST:
                gpa = kvm_init_stolen_time(vcpu);
                if (gpa != GPA_INVALID)
-                       val = gpa;
+                       val[0] = gpa;
+               break;
+       case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+               val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
+               val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
+               val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
+               val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+               break;
+       case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+               val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+               val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+               break;
+       case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+               kvm_ptp_get_time(vcpu, val);
                break;
        case ARM_SMCCC_TRNG_VERSION:
        case ARM_SMCCC_TRNG_FEATURES:
@@ -81,6 +143,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                return kvm_psci_call(vcpu);
        }
 
-       smccc_set_retval(vcpu, val, 0, 0, 0);
+       smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
        return 1;
 }
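
On the guest side, the ptp_kvm driver consumes the four return registers filled in by kvm_ptp_get_time() above: wall-clock time split across the first pair, counter value across the second. A minimal sketch of the reassembly only (the SMCCC invocation itself is elided, and the demo_ names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct demo_smccc_res {
        uint64_t a0, a1, a2, a3;        /* as returned in x0..x3 */
};

/* Rebuild 64-bit values from the upper/lower 32-bit halves */
static void demo_ptp_unpack(const struct demo_smccc_res *res,
                            uint64_t *ns, uint64_t *cycles)
{
        *ns     = ((uint64_t)(uint32_t)res->a0 << 32) | (uint32_t)res->a1;
        *cycles = ((uint64_t)(uint32_t)res->a2 << 32) | (uint32_t)res->a3;
}

int main(void)
{
        /* Example values a hypervisor could have returned */
        struct demo_smccc_res res = {
                .a0 = 0x17, .a1 = 0x12345678,
                .a2 = 0x01, .a3 = 0x9abcdef0,
        };
        uint64_t ns, cycles;

        demo_ptp_unpack(&res, &ns, &cycles);
        printf("time=%#llx ns, cycles=%#llx\n",
               (unsigned long long)ns, (unsigned long long)cycles);
        return 0;
}
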
index 3572823..c5d1f3c 100644 (file)
@@ -88,6 +88,44 @@ static bool kvm_is_device_pfn(unsigned long pfn)
        return !pfn_valid(pfn);
 }
 
+static void *stage2_memcache_zalloc_page(void *arg)
+{
+       struct kvm_mmu_memory_cache *mc = arg;
+
+       /* Allocated with __GFP_ZERO, so no need to zero */
+       return kvm_mmu_memory_cache_alloc(mc);
+}
+
+static void *kvm_host_zalloc_pages_exact(size_t size)
+{
+       return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+}
+
+static void kvm_host_get_page(void *addr)
+{
+       get_page(virt_to_page(addr));
+}
+
+static void kvm_host_put_page(void *addr)
+{
+       put_page(virt_to_page(addr));
+}
+
+static int kvm_host_page_count(void *addr)
+{
+       return page_count(virt_to_page(addr));
+}
+
+static phys_addr_t kvm_host_pa(void *addr)
+{
+       return __pa(addr);
+}
+
+static void *kvm_host_va(phys_addr_t phys)
+{
+       return __va(phys);
+}
+
 /*
  * Unmapping vs dcache management:
  *
@@ -127,7 +165,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
                                 bool may_block)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        phys_addr_t end = start + size;
 
        assert_spin_locked(&kvm->mmu_lock);
@@ -183,15 +221,39 @@ void free_hyp_pgds(void)
        if (hyp_pgtable) {
                kvm_pgtable_hyp_destroy(hyp_pgtable);
                kfree(hyp_pgtable);
+               hyp_pgtable = NULL;
        }
        mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
+static bool kvm_host_owns_hyp_mappings(void)
+{
+       if (static_branch_likely(&kvm_protected_mode_initialized))
+               return false;
+
+       /*
+        * This can happen at boot time when __create_hyp_mappings() is called
+        * after the hyp protection has been enabled, but the static key has
+        * not been flipped yet.
+        */
+       if (!hyp_pgtable && is_protected_kvm_enabled())
+               return false;
+
+       WARN_ON(!hyp_pgtable);
+
+       return true;
+}
+
 static int __create_hyp_mappings(unsigned long start, unsigned long size,
                                 unsigned long phys, enum kvm_pgtable_prot prot)
 {
        int err;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               return kvm_call_hyp_nvhe(__pkvm_create_mappings,
+                                        start, size, phys, prot);
+       }
+
        mutex_lock(&kvm_hyp_pgd_mutex);
        err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
        mutex_unlock(&kvm_hyp_pgd_mutex);
@@ -253,6 +315,16 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
        unsigned long base;
        int ret = 0;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+                                        phys_addr, size, prot);
+               if (IS_ERR_OR_NULL((void *)base))
+                       return PTR_ERR((void *)base);
+               *haddr = base;
+
+               return 0;
+       }
+
        mutex_lock(&kvm_hyp_pgd_mutex);
 
        /*
@@ -351,6 +423,17 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
        return 0;
 }
 
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
+       .zalloc_page            = stage2_memcache_zalloc_page,
+       .zalloc_pages_exact     = kvm_host_zalloc_pages_exact,
+       .free_pages_exact       = free_pages_exact,
+       .get_page               = kvm_host_get_page,
+       .put_page               = kvm_host_put_page,
+       .page_count             = kvm_host_page_count,
+       .phys_to_virt           = kvm_host_va,
+       .virt_to_phys           = kvm_host_pa,
+};
+
 /**
  * kvm_init_stage2_mmu - Initialise an S2 MMU structure
  * @kvm:       The pointer to the KVM structure
@@ -374,7 +457,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        if (!pgt)
                return -ENOMEM;
 
-       err = kvm_pgtable_stage2_init(pgt, kvm);
+       err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops);
        if (err)
                goto out_free_pgtable;
 
@@ -387,7 +470,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 
-       mmu->kvm = kvm;
+       mmu->arch = &kvm->arch;
        mmu->pgt = pgt;
        mmu->pgd_phys = __pa(pgt->pgd);
        mmu->vmid.vmid_gen = 0;
@@ -421,10 +504,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
         *     +--------------------------------------------+
         */
        do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               struct vm_area_struct *vma;
                hva_t vm_start, vm_end;
 
-               if (!vma || vma->vm_start >= reg_end)
+               vma = find_vma_intersection(current->mm, hva, reg_end);
+               if (!vma)
                        break;
 
                /*
@@ -469,7 +553,7 @@ void stage2_unmap_vm(struct kvm *kvm)
 
 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        struct kvm_pgtable *pgt = NULL;
 
        spin_lock(&kvm->mmu_lock);
@@ -538,7 +622,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  */
 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
 }
 
@@ -555,7 +639,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_
  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
  * serializing operations for VM memory regions.
  */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
@@ -842,10 +926,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * unmapped afterwards, the call to kvm_unmap_gfn will take it away
         * from us again properly. This smp_rmb() interacts with the smp_wmb()
         * in kvm_mmu_notifier_invalidate_<page|range_end>.
+        *
+        * Besides, __gfn_to_pfn_memslot() is used instead of gfn_to_pfn_prot()
+        * to avoid the unnecessary overhead of locating the memory slot, which
+        * stays fixed even when @gfn is adjusted for huge pages.
         */
        smp_rmb();
 
-       pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+                                  write_fault, &writable, NULL);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
@@ -911,7 +1000,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        /* Mark the page dirty only if the fault is handled successfully */
        if (writable && !ret) {
                kvm_set_pfn_dirty(pfn);
-               mark_page_dirty(kvm, gfn);
+               mark_page_dirty_in_slot(kvm, memslot, gfn);
        }
 
 out_unlock:
@@ -1152,10 +1241,22 @@ static int kvm_map_idmap_text(void)
        return err;
 }
 
-int kvm_mmu_init(void)
+static void *kvm_hyp_zalloc_page(void *arg)
+{
+       return (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
+       .zalloc_page            = kvm_hyp_zalloc_page,
+       .get_page               = kvm_host_get_page,
+       .put_page               = kvm_host_put_page,
+       .phys_to_virt           = kvm_host_va,
+       .virt_to_phys           = kvm_host_pa,
+};
+
+int kvm_mmu_init(u32 *hyp_va_bits)
 {
        int err;
-       u32 hyp_va_bits;
 
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1169,8 +1270,8 @@ int kvm_mmu_init(void)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
-       kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
+       *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
@@ -1195,7 +1296,7 @@ int kvm_mmu_init(void)
                goto out;
        }
 
-       err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+       err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;
 
@@ -1273,10 +1374,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         *     +--------------------------------------------+
         */
        do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               struct vm_area_struct *vma;
                hva_t vm_start, vm_end;
 
-               if (!vma || vma->vm_start >= reg_end)
+               vma = find_vma_intersection(current->mm, hva, reg_end);
+               if (!vma)
                        break;
 
                /*
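
The kvm_s2_mm_ops and kvm_hyp_mm_ops tables above are what let the shared page-table code run both in the host and, with a different allocator, in the nVHE hypervisor: the walker only ever goes through the callbacks, never through __va()/__pa() or get_page() directly. A small user-space sketch of that indirection (demo_ names only, not the kernel API):

#include <stdint.h>
#include <stdio.h>

struct demo_mm_ops {
        void *(*phys_to_virt)(uint64_t phys);
        uint64_t (*virt_to_phys)(void *virt);
};

/* "Host" backend: identity translation is good enough for the demo */
static void *demo_host_va(uint64_t phys)   { return (void *)(uintptr_t)phys; }
static uint64_t demo_host_pa(void *virt)   { return (uint64_t)(uintptr_t)virt; }

static const struct demo_mm_ops demo_host_ops = {
        .phys_to_virt = demo_host_va,
        .virt_to_phys = demo_host_pa,
};

/* Same shape as kvm_pte_follow(): mask the attributes, ask the backend */
static void *demo_pte_follow(uint64_t pte, const struct demo_mm_ops *ops)
{
        uint64_t pa = pte & 0x0000fffffffff000ULL;      /* PA bits 47:12 */

        return ops->phys_to_virt(pa);
}

int main(void)
{
        uint64_t pte = 0x40001000ULL | 0x3;             /* valid table entry */

        printf("child table at %p\n", demo_pte_follow(pte, &demo_host_ops));
        return 0;
}
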
index 7391643..151c31f 100644 (file)
@@ -50,12 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 int kvm_perf_init(void)
 {
-       /*
-        * Check if HW_PERF_EVENTS are supported by checking the number of
-        * hardware performance counters. This could ensure the presence of
-        * a physical PMU and CONFIG_PERF_EVENT is selected.
-        */
-       if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0)
+       if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled())
                static_branch_enable(&kvm_arm_pmu_available);
 
        return perf_register_guest_info_callbacks(&kvm_guest_cbs);
index e32c6e1..fd167d4 100644 (file)
@@ -739,7 +739,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
        kvm_pmu_create_perf_event(vcpu, select_idx);
 }
 
-static int kvm_pmu_probe_pmuver(void)
+int kvm_pmu_probe_pmuver(void)
 {
        struct perf_event_attr attr = { };
        struct perf_event *event;
index faf32a4..03a6c1f 100644 (file)
@@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
        struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-       if (!ctx || !kvm_pmu_switch_needed(attr))
+       if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr))
                return;
 
        if (!attr->exclude_host)
@@ -49,7 +49,7 @@ void kvm_clr_pmu_events(u32 clr)
 {
        struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-       if (!ctx)
+       if (!kvm_arm_support_pmu_v3() || !ctx)
                return;
 
        ctx->pmu_events.events_host &= ~clr;
@@ -172,7 +172,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
        struct kvm_host_data *host;
        u32 events_guest, events_host;
 
-       if (!has_vhe())
+       if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        preempt_disable();
@@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
        struct kvm_host_data *host;
        u32 events_guest, events_host;
 
-       if (!has_vhe())
+       if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        host = this_cpu_ptr_hyp_sym(kvm_host_data);
index bd354cd..956cdc2 100644 (file)
@@ -74,10 +74,6 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
        if (!system_supports_sve())
                return -EINVAL;
 
-       /* Verify that KVM startup enforced this when SVE was detected: */
-       if (WARN_ON(!has_vhe()))
-               return -EINVAL;
-
        vcpu->arch.sve_max_vl = kvm_sve_max_vl;
 
        /*
@@ -242,6 +238,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 
        /* Reset core registers */
        memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
+       memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
+       vcpu->arch.ctxt.spsr_abt = 0;
+       vcpu->arch.ctxt.spsr_und = 0;
+       vcpu->arch.ctxt.spsr_irq = 0;
+       vcpu->arch.ctxt.spsr_fiq = 0;
        vcpu_gp_regs(vcpu)->pstate = pstate;
 
        /* Reset system registers */
@@ -333,19 +334,10 @@ int kvm_set_ipa_limit(void)
        return 0;
 }
 
-/*
- * Configure the VTCR_EL2 for this VM. The VTCR value is common
- * across all the physical CPUs on the system. We use system wide
- * sanitised values to fill in different fields, except for Hardware
- * Management of Access Flags. HA Flag is set unconditionally on
- * all CPUs, as it is safe to run with or without the feature and
- * the bit is RES0 on CPUs that don't support it.
- */
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
 {
-       u64 vtcr = VTCR_EL2_FLAGS, mmfr0;
-       u32 parange, phys_shift;
-       u8 lvls;
+       u64 mmfr0, mmfr1;
+       u32 phys_shift;
 
        if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
                return -EINVAL;
@@ -365,33 +357,8 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
        }
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
-       parange = cpuid_feature_extract_unsigned_field(mmfr0,
-                               ID_AA64MMFR0_PARANGE_SHIFT);
-       if (parange > ID_AA64MMFR0_PARANGE_MAX)
-               parange = ID_AA64MMFR0_PARANGE_MAX;
-       vtcr |= parange << VTCR_EL2_PS_SHIFT;
-
-       vtcr |= VTCR_EL2_T0SZ(phys_shift);
-       /*
-        * Use a minimum 2 level page table to prevent splitting
-        * host PMD huge pages at stage2.
-        */
-       lvls = stage2_pgtable_levels(phys_shift);
-       if (lvls < 2)
-               lvls = 2;
-       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
-
-       /*
-        * Enable the Hardware Access Flag management, unconditionally
-        * on all CPUs. The features is RES0 on CPUs without the support
-        * and must be ignored by the CPUs.
-        */
-       vtcr |= VTCR_EL2_HA;
+       mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
-       /* Set the vmid bits */
-       vtcr |= (kvm_get_vmid_bits() == 16) ?
-               VTCR_EL2_VS_16BIT :
-               VTCR_EL2_VS_8BIT;
-       kvm->arch.vtcr = vtcr;
        return 0;
 }
index 4f2f1e3..76ea280 100644 (file)
@@ -1063,6 +1063,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
                val = cpuid_feature_cap_perfmon_field(val,
                                                      ID_AA64DFR0_PMUVER_SHIFT,
                                                      kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
+               /* Hide SPE from guests */
+               val &= ~FEATURE(ID_AA64DFR0_PMSVER);
                break;
        case SYS_ID_DFR0_EL1:
                /* Limit guests to PMUv3 for ARMv8.4 */
@@ -1472,6 +1474,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        { SYS_DESC(SYS_GCR_EL1), undef_access },
 
        { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
+       { SYS_DESC(SYS_TRFCR_EL1), undef_access },
        { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
        { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
        { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
@@ -1501,6 +1504,19 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
        { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
 
+       { SYS_DESC(SYS_PMSCR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSICR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSIRR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSFCR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSEVFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSLATFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSIDR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBPTR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBSR_EL1), undef_access },
+       /* PMBIDR_EL1 is not trapped */
+
        { PMU_SYS_REG(SYS_PMINTENSET_EL1),
          .access = access_pminten, .reg = PMINTENSET_EL1 },
        { PMU_SYS_REG(SYS_PMINTENCLR_EL1),
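
Hiding SPE from guests, as done for ID_AA64DFR0_EL1 above, amounts to clearing one 4-bit ID-register field in the sanitised value before it is returned to the guest. A tiny sketch of that masking (the PMSVer shift used here is an assumption, and the DEMO_* names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define DEMO_PMSVER_SHIFT   32                          /* assumed field position */
#define DEMO_FIELD(shift)   (0xfULL << (shift))         /* 4-bit ID register field */

int main(void)
{
        uint64_t val = 1ULL << DEMO_PMSVER_SHIFT;       /* host advertises SPE */

        val &= ~DEMO_FIELD(DEMO_PMSVER_SHIFT);          /* guest sees PMSVer == 0 */
        printf("guest ID_AA64DFR0_EL1=%#llx\n", (unsigned long long)val);
        return 0;
}
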
index 9783013..acdb7b3 100644 (file)
@@ -288,3 +288,10 @@ void kvm_get_kimage_voffset(struct alt_instr *alt,
 {
        generate_mov_q(kimage_voffset, origptr, updptr, nr_inst);
 }
+
+void kvm_compute_final_ctr_el0(struct alt_instr *alt,
+                              __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+       generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0),
+                      origptr, updptr, nr_inst);
+}
index 052917d..58cbda0 100644 (file)
@@ -335,13 +335,14 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
        kfree(dist->spis);
        dist->spis = NULL;
        dist->nr_spis = 0;
+       dist->vgic_dist_base = VGIC_ADDR_UNDEF;
 
-       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
-                       list_del(&rdreg->list);
-                       kfree(rdreg);
-               }
+       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
+                       vgic_v3_free_redist_region(rdreg);
                INIT_LIST_HEAD(&dist->rd_regions);
+       } else {
+               dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
        }
 
        if (vgic_has_its(kvm))
@@ -362,6 +363,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        vgic_flush_pending_lpis(vcpu);
 
        INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+       vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
 }
 
 /* To be called with kvm->lock held */
index 40cbaca..ec7543a 100644 (file)
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
                /*
                 * If an LPI carries the HW bit, this means that this
                 * interrupt is controlled by GICv4, and we do not
-                * have direct access to that state. Let's simply fail
-                * the save operation...
+                * have direct access to that state without GICv4.1.
+                * Let's simply fail the save operation...
                 */
-               if (ite->irq->hw)
+               if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
                        return -EACCES;
 
                ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
index 4441967..7740995 100644 (file)
@@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                        r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
                        goto out;
                }
-               rdreg = list_first_entry(&vgic->rd_regions,
-                                        struct vgic_redist_region, list);
+               rdreg = list_first_entry_or_null(&vgic->rd_regions,
+                                                struct vgic_redist_region, list);
                if (!rdreg)
                        addr_ptr = &undef_value;
                else
@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
                u64 addr;
                unsigned long type = (unsigned long)attr->attr;
 
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
                r = kvm_vgic_addr(dev->kvm, type, &addr, false);
                if (r)
                        return (r == -ENODEV) ? -ENXIO : r;
index 15a6c98..03a2537 100644 (file)
@@ -251,30 +251,35 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
                vgic_enable_lpis(vcpu);
 }
 
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                             gpa_t addr, unsigned int len)
+static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu)
 {
-       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
-       int target_vcpu_id = vcpu->vcpu_id;
-       gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
-                       (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
-       u64 value;
+       struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg;
 
-       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
-       value |= ((target_vcpu_id & 0xffff) << 8);
+       if (!rdreg)
+               return false;
 
-       if (addr == last_rdist_typer)
-               value |= GICR_TYPER_LAST;
-       if (vgic_has_its(vcpu->kvm))
-               value |= GICR_TYPER_PLPIS;
+       if (vgic_cpu->rdreg_index < rdreg->free_index - 1) {
+               return false;
+       } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) {
+               struct list_head *rd_regions = &vgic->rd_regions;
+               gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
 
-       return extract_bytes(value, addr & 7, len);
+               /*
+                * The rdist is the last one of its redist region; check
+                * that no other redist region is contiguous with it.
+                */
+               list_for_each_entry(iter, rd_regions, list) {
+                       if (iter->base == end && iter->free_index > 0)
+                               return false;
+               }
+       }
+       return true;
 }
 
-static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                                gpa_t addr, unsigned int len)
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
 {
        unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
        int target_vcpu_id = vcpu->vcpu_id;
@@ -286,7 +291,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
        if (vgic_has_its(vcpu->kvm))
                value |= GICR_TYPER_PLPIS;
 
-       /* reporting of the Last bit is not supported for userspace */
+       if (vgic_mmio_vcpu_rdist_is_last(vcpu))
+               value |= GICR_TYPER_LAST;
+
        return extract_bytes(value, addr & 7, len);
 }
 
@@ -612,7 +619,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
                vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
-               vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8,
+               NULL, vgic_mmio_uaccess_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
@@ -714,6 +721,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        vgic_cpu->rdreg = rdreg;
+       vgic_cpu->rdreg_index = rdreg->free_index;
 
        rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
 
@@ -768,7 +776,7 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
 }
 
 /**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
+ * vgic_v3_alloc_redist_region - Allocate a new redistributor region
  *
  * Performs various checks before inserting the rdist region in the list.
  * Those tests depend on whether the size of the rdist region is known
@@ -782,8 +790,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
  *
  * Return 0 on success, < 0 otherwise
  */
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
-                                       gpa_t base, uint32_t count)
+static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index,
+                                      gpa_t base, uint32_t count)
 {
        struct vgic_dist *d = &kvm->arch.vgic;
        struct vgic_redist_region *rdreg;
@@ -791,10 +799,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
        size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
        int ret;
 
-       /* single rdist region already set ?*/
-       if (!count && !list_empty(rd_regions))
-               return -EINVAL;
-
        /* cross the end of memory ? */
        if (base + size < base)
                return -EINVAL;
@@ -805,11 +809,15 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
        } else {
                rdreg = list_last_entry(rd_regions,
                                        struct vgic_redist_region, list);
-               if (index != rdreg->index + 1)
+
+               /* Don't mix single region and discrete redist regions */
+               if (!count && rdreg->count)
                        return -EINVAL;
 
-               /* Cannot add an explicitly sized regions after legacy region */
-               if (!rdreg->count)
+               if (!count)
+                       return -EEXIST;
+
+               if (index != rdreg->index + 1)
                        return -EINVAL;
        }
 
@@ -848,11 +856,17 @@ free:
        return ret;
 }
 
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg)
+{
+       list_del(&rdreg->list);
+       kfree(rdreg);
+}
+
 int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
 {
        int ret;
 
-       ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+       ret = vgic_v3_alloc_redist_region(kvm, index, addr, count);
        if (ret)
                return ret;
 
@@ -861,8 +875,13 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
         * afterwards will register the iodevs when needed.
         */
        ret = vgic_register_all_redist_iodevs(kvm);
-       if (ret)
+       if (ret) {
+               struct vgic_redist_region *rdreg;
+
+               rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+               vgic_v3_free_redist_region(rdreg);
                return ret;
+       }
 
        return 0;
 }
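
The new error path in vgic_v3_set_redist_base() pairs the renamed vgic_v3_alloc_redist_region() with vgic_v3_free_redist_region(), so a region whose iodev registration fails is unlinked again rather than left dangling on rd_regions. The stand-alone sketch below only illustrates that allocate/register/roll-back shape; the list handling and the register_iodevs() callback are made-up stand-ins, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct region { struct region *next; unsigned int index; };
static struct region *regions;                 /* stand-in for vgic->rd_regions */

static struct region *alloc_region(unsigned int index)
{
	struct region *r = calloc(1, sizeof(*r));

	if (r) {
		r->index = index;
		r->next = regions;             /* link into the list */
		regions = r;
	}
	return r;
}

static void free_region(struct region *r)      /* mirrors vgic_v3_free_redist_region() */
{
	struct region **p;

	for (p = &regions; *p; p = &(*p)->next) {
		if (*p == r) {
			*p = r->next;          /* unlink, then free */
			free(r);
			return;
		}
	}
}

static int set_redist_base(unsigned int index, int (*register_iodevs)(void))
{
	struct region *r = alloc_region(index);
	int ret;

	if (!r)
		return -1;
	ret = register_iodevs();
	if (ret)
		free_region(r);                /* undo the allocation on failure */
	return ret;
}

static int fail_registration(void) { return -1; }

int main(void)
{
	printf("set_redist_base: %d, list empty: %d\n",
	       set_redist_base(0, fail_registration), regions == NULL);
	return 0;
}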
index b2d73fc..48c6067 100644 (file)
@@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
        return region;
 }
 
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                             gpa_t addr, u32 *val)
 {
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        struct kvm_vcpu *r_vcpu;
 
@@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
        return 0;
 }
 
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                              gpa_t addr, const u32 *val)
 {
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        struct kvm_vcpu *r_vcpu;
 
@@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
                 bool is_write, int offset, u32 *val)
 {
        if (is_write)
-               return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+               return vgic_uaccess_write(vcpu, dev, offset, val);
        else
-               return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+               return vgic_uaccess_read(vcpu, dev, offset, val);
 }
 
 static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
index 6f53092..41ecf21 100644 (file)
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <kvm/arm_vgic.h>
@@ -356,6 +358,32 @@ retry:
        return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+               desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+               irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+       }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+               desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+               irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+       }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *irq;
        gpa_t last_ptr = ~(gpa_t)0;
-       int ret;
+       bool vlpi_avail = false;
+       int ret = 0;
        u8 val;
 
+       if (unlikely(!vgic_initialized(kvm)))
+               return -ENXIO;
+
+       /*
+        * Prepare for reading any VLPI state.
+        * The vgic-initialized check above also ensures that the doorbells
+        * have already been allocated and enabled.
+        */
+       if (kvm_vgic_global_state.has_gicv4_1) {
+               unmap_all_vpes(dist);
+               vlpi_avail = true;
+       }
+
        list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
                int byte_offset, bit_nr;
                struct kvm_vcpu *vcpu;
                gpa_t pendbase, ptr;
+               bool is_pending;
                bool stored;
 
                vcpu = irq->target_vcpu;
@@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
                if (ptr != last_ptr) {
                        ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
                        if (ret)
-                               return ret;
+                               goto out;
                        last_ptr = ptr;
                }
 
                stored = val & (1U << bit_nr);
-               if (stored == irq->pending_latch)
+
+               is_pending = irq->pending_latch;
+
+               if (irq->hw && vlpi_avail)
+                       vgic_v4_get_vlpi_state(irq, &is_pending);
+
+               if (stored == is_pending)
                        continue;
 
-               if (irq->pending_latch)
+               if (is_pending)
                        val |= 1 << bit_nr;
                else
                        val &= ~(1 << bit_nr);
 
                ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
                if (ret)
-                       return ret;
+                       goto out;
        }
-       return 0;
+
+out:
+       if (vlpi_avail)
+               map_all_vpes(dist);
+
+       return ret;
 }
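
With GICv4.1 available, the save path above merges the hardware view of an LPI into the byte it writes back to the guest's pending table: for a hardware-forwarded LPI, the bit read from the VPT replaces the software pending_latch. Below is a small self-contained model of that per-LPI read/compare/write-back step; a local array stands in for the guest table and for kvm_read_guest_lock()/kvm_write_guest_lock(), and the indexing is simplified.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t pending_table[32];       /* stand-in for the guest's pending table */

static void save_one_lpi(unsigned int intid, bool pending_latch,
			 bool hw_forwarded, bool vlpi_pending)
{
	unsigned int byte_offset = intid / 8;          /* intid / BITS_PER_BYTE */
	unsigned int bit_nr = intid % 8;
	bool stored = pending_table[byte_offset] & (1U << bit_nr);
	/* For a hardware-forwarded LPI, the VPT bit overrides the software latch. */
	bool is_pending = hw_forwarded ? vlpi_pending : pending_latch;

	if (stored == is_pending)
		return;                                /* nothing to write back */

	if (is_pending)
		pending_table[byte_offset] |= 1U << bit_nr;
	else
		pending_table[byte_offset] &= ~(1U << bit_nr);
}

int main(void)
{
	save_one_lpi(13, false, true, true);           /* pending only in the VPT */
	printf("byte 1 = %#x\n", pending_table[1]);    /* bit 5 of byte 1 is now set */
	return 0;
}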
 
 /**
index 66508b0..c1845d8 100644 (file)
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
        kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called with GICv4.1 and with the vPE unmapped, which
+ * guarantees that any VPT caches associated with the vPE have
+ * been invalidated, so the VLPI state can be read by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+       struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+       int mask = BIT(irq->intid % BITS_PER_BYTE);
+       void *va;
+       u8 *ptr;
+
+       va = page_address(vpe->vpt_page);
+       ptr = va + irq->intid / BITS_PER_BYTE;
+
+       *val = !!(*ptr & mask);
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:       Pointer to the VM being initialized
@@ -385,6 +404,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
        struct vgic_its *its;
        struct vgic_irq *irq;
        struct its_vlpi_map map;
+       unsigned long flags;
        int ret;
 
        if (!vgic_supports_direct_msis(kvm))
@@ -430,6 +450,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
        irq->host_irq   = virq;
        atomic_inc(&map.vpe->vlpi_count);
 
+       /* Transfer pending state */
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       if (irq->pending_latch) {
+               ret = irq_set_irqchip_state(irq->host_irq,
+                                           IRQCHIP_STATE_PENDING,
+                                           irq->pending_latch);
+               WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+               /*
+                * Clear pending_latch and communicate this state
+                * change via vgic_queue_irq_unlock.
+                */
+               irq->pending_latch = false;
+               vgic_queue_irq_unlock(kvm, irq, flags);
+       } else {
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       }
+
 out:
        mutex_unlock(&its->its_lock);
        return ret;
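
vgic_v4_get_vlpi_state() treats the vPE's Virtual Pending Table as a byte-addressed bitmap indexed by INTID. The snippet below only illustrates that indexing; a local buffer replaces page_address(vpe->vpt_page) and the buffer size is arbitrary.

#include <limits.h>     /* CHAR_BIT */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool vpt_test_bit(const uint8_t *vpt, unsigned int intid)
{
	uint8_t mask = 1U << (intid % CHAR_BIT);       /* BIT(intid % BITS_PER_BYTE) */

	return !!(vpt[intid / CHAR_BIT] & mask);       /* ptr = va + intid / BITS_PER_BYTE */
}

int main(void)
{
	static uint8_t vpt[2048];                      /* stand-in for the VPT page(s) */

	vpt[8192 / CHAR_BIT] |= 1U << (8192 % CHAR_BIT);   /* mark LPI 8192 as pending */
	printf("LPI 8192 pending: %d\n", vpt_test_bit(vpt, 8192));
	printf("LPI 8193 pending: %d\n", vpt_test_bit(vpt, 8193));
	return 0;
}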
index 64fcd75..dc1f3d1 100644 (file)
@@ -293,6 +293,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
 
 struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
                                                           u32 index);
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg);
 
 bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
 
@@ -317,5 +318,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
index 073acbf..b84b179 100644 (file)
@@ -14,7 +14,7 @@
  * Parameters:
  *     x0 - dest
  */
-SYM_FUNC_START(clear_page)
+SYM_FUNC_START_PI(clear_page)
        mrs     x1, dczid_el0
        and     w1, w1, #0xf
        mov     x2, #4
@@ -25,5 +25,5 @@ SYM_FUNC_START(clear_page)
        tst     x0, #(PAGE_SIZE - 1)
        b.ne    1b
        ret
-SYM_FUNC_END(clear_page)
+SYM_FUNC_END_PI(clear_page)
 EXPORT_SYMBOL(clear_page)
index e7a7939..29144f4 100644 (file)
@@ -17,7 +17,7 @@
  *     x0 - dest
  *     x1 - src
  */
-SYM_FUNC_START(copy_page)
+SYM_FUNC_START_PI(copy_page)
 alternative_if ARM64_HAS_NO_HW_PREFETCH
        // Prefetch three cache lines ahead.
        prfm    pldl1strm, [x1, #128]
@@ -75,5 +75,5 @@ alternative_else_nop_endif
        stnp    x16, x17, [x0, #112 - 256]
 
        ret
-SYM_FUNC_END(copy_page)
+SYM_FUNC_END_PI(copy_page)
 EXPORT_SYMBOL(copy_page)
index 3685e12..6cb22da 100644 (file)
@@ -35,6 +35,7 @@
 #include <asm/fixmap.h>
 #include <asm/kasan.h>
 #include <asm/kernel-pgtable.h>
+#include <asm/kvm_host.h>
 #include <asm/memory.h>
 #include <asm/numa.h>
 #include <asm/sections.h>
@@ -452,6 +453,8 @@ void __init bootmem_init(void)
 
        dma_pernuma_cma_reserve();
 
+       kvm_hyp_reserve();
+
        /*
         * sparse_init() tries to allocate memory from memblock, so must be
         * done after the fixed reservations
index 1e75cc9..ea7729b 100644 (file)
 #include <asm/sysinfo.h>
 #include <asm/unwind.h>
 
-const char *perf_pmu_name(void)
-{
-       if (cpum_cf_avail() || cpum_sf_avail())
-               return "CPU-Measurement Facilities (CPU-MF)";
-       return "pmu";
-}
-EXPORT_SYMBOL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       int num = 0;
-
-       if (cpum_cf_avail())
-               num += PERF_CPUM_CF_MAX_CTR;
-       if (cpum_sf_avail())
-               num += PERF_CPUM_SF_MAX_CTR;
-
-       return num;
-}
-EXPORT_SYMBOL(perf_num_counters);
-
 static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
 {
        struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
index 445e3ec..1d2507f 100644 (file)
@@ -57,24 +57,6 @@ static inline int sh_pmu_initialized(void)
        return !!sh_pmu;
 }
 
-const char *perf_pmu_name(void)
-{
-       if (!sh_pmu)
-               return NULL;
-
-       return sh_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       if (!sh_pmu)
-               return 0;
-
-       return sh_pmu->num_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
 /*
  * Release the PMU if this is the last perf_event.
  */
index d017782..e0f167e 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/cpu_pm.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
+#include <linux/clocksource_ids.h>
 #include <linux/interrupt.h>
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
@@ -24,6 +25,8 @@
 #include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/acpi.h>
+#include <linux/arm-smccc.h>
+#include <linux/ptp_kvm.h>
 
 #include <asm/arch_timer.h>
 #include <asm/virt.h>
@@ -191,6 +194,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter *cc)
 
 static struct clocksource clocksource_counter = {
        .name   = "arch_sys_counter",
+       .id     = CSID_ARM_ARCH_COUNTER,
        .rating = 400,
        .read   = arch_counter_read,
        .mask   = CLOCKSOURCE_MASK(56),
@@ -1657,3 +1661,35 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 }
 TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
 #endif
+
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts,
+                                struct clocksource **cs)
+{
+       struct arm_smccc_res hvc_res;
+       u32 ptp_counter;
+       ktime_t ktime;
+
+       if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY))
+               return -EOPNOTSUPP;
+
+       if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
+               ptp_counter = KVM_PTP_VIRT_COUNTER;
+       else
+               ptp_counter = KVM_PTP_PHYS_COUNTER;
+
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+                            ptp_counter, &hvc_res);
+
+       if ((int)(hvc_res.a0) < 0)
+               return -EOPNOTSUPP;
+
+       ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1;
+       *ts = ktime_to_timespec64(ktime);
+       if (cycle)
+               *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3;
+       if (cs)
+               *cs = &clocksource_counter;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);
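
kvm_arch_ptp_get_crosststamp() rebuilds two 64-bit quantities, the host wall-clock time and the counter value, from the high/low register pairs returned by the PTP hypercall (a0/a1 and a2/a3). A minimal sketch of that packing, using made-up register values:

#include <stdint.h>
#include <stdio.h>

/* Combine a high/low 32-bit register pair into one 64-bit value. */
static uint64_t pair_to_u64(uint64_t hi, uint64_t lo)
{
	return hi << 32 | lo;          /* matches (u64)res.a0 << 32 | res.a1 */
}

int main(void)
{
	/* Hypothetical SMCCC results: a0/a1 carry the time, a2/a3 the counter. */
	uint64_t a0 = 0x17c3f2a1, a1 = 0x89abcdef, a2 = 0x1, a3 = 0x2345;

	printf("ktime  = %#llx\n", (unsigned long long)pair_to_u64(a0, a1));
	printf("cycles = %#llx\n", (unsigned long long)pair_to_u64(a2, a3));
	return 0;
}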
index f5fc429..69e296f 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <asm/cpuidle.h>
 #include <asm/cputype.h>
+#include <asm/hypervisor.h>
 #include <asm/system_misc.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
@@ -498,6 +499,7 @@ static int __init psci_probe(void)
                psci_init_cpu_suspend();
                psci_init_system_suspend();
                psci_init_system_reset2();
+               kvm_init_hyp_services();
        }
 
        return 0;
index 72ab840..40d1914 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o
+obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o kvm_guest.o
 obj-$(CONFIG_ARM_SMCCC_SOC_ID) += soc_id.o
diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c
new file mode 100644 (file)
index 0000000..2d3e866
--- /dev/null
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "smccc: KVM: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/hypervisor.h>
+
+static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { };
+
+void __init kvm_init_hyp_services(void)
+{
+       struct arm_smccc_res res;
+       u32 val[4];
+
+       if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC)
+               return;
+
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
+       if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
+           res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
+           res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
+           res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
+               return;
+
+       memset(&res, 0, sizeof(res));
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
+
+       val[0] = lower_32_bits(res.a0);
+       val[1] = lower_32_bits(res.a1);
+       val[2] = lower_32_bits(res.a2);
+       val[3] = lower_32_bits(res.a3);
+
+       bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS);
+
+       pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n",
+                res.a3, res.a2, res.a1, res.a0);
+}
+
+bool kvm_arm_hyp_service_available(u32 func_id)
+{
+       if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS)
+               return false;
+
+       return test_bit(func_id, __kvm_arm_hyp_services);
+}
+EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available);
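
kvm_init_hyp_services() packs the four 32-bit feature words returned by the KVM_FEATURES hypercall into one bitmap, which kvm_arm_hyp_service_available() then queries per function ID. A rough user-space approximation of that bookkeeping follows; plain arrays replace DECLARE_BITMAP()/bitmap_from_arr32(), and the size is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_FUNCS 128                          /* illustrative; stands in for ARM_SMCCC_KVM_NUM_FUNCS */

static uint32_t hyp_services[NUM_FUNCS / 32];

static void init_hyp_services(const uint32_t val[4])
{
	for (int i = 0; i < 4; i++)
		hyp_services[i] = val[i];      /* bitmap_from_arr32() equivalent */
}

static bool hyp_service_available(uint32_t func_id)
{
	if (func_id >= NUM_FUNCS)
		return false;
	return hyp_services[func_id / 32] & (1U << (func_id % 32));
}

int main(void)
{
	/* Pretend the hypervisor advertised function IDs 0 and 1. */
	uint32_t val[4] = { 0x3, 0, 0, 0 };

	init_hyp_services(val);
	printf("func 1 available: %d\n", hyp_service_available(1));
	printf("func 5 available: %d\n", hyp_service_available(5));
	return 0;
}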
index d52bfc5..028f81d 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/arm-smccc.h>
+#include <linux/kernel.h>
 #include <asm/archrandom.h>
 
 static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
index 7b44ba2..84530fd 100644 (file)
@@ -97,15 +97,15 @@ config CORESIGHT_SOURCE_ETM3X
          module will be called coresight-etm3x.
 
 config CORESIGHT_SOURCE_ETM4X
-       tristate "CoreSight Embedded Trace Macrocell 4.x driver"
+       tristate "CoreSight ETMv4.x / ETE driver"
        depends on ARM64
        select CORESIGHT_LINKS_AND_SINKS
        select PID_IN_CONTEXTIDR
        help
-         This driver provides support for the ETM4.x tracer module, tracing the
-         instructions that a processor is executing. This is primarily useful
-         for instruction level tracing. Depending on the implemented version
-         data tracing may also be available.
+         This driver provides support for the CoreSight Embedded Trace Macrocell
+         version 4.x and the Embedded Trace Extensions (ETE). Both are CPU tracer
+         modules, tracing the instructions that a processor is executing. This is
+         primarily useful for instruction level tracing.
 
          To compile this driver as a module, choose M here: the
          module will be called coresight-etm4x.
@@ -173,4 +173,18 @@ config CORESIGHT_CTI_INTEGRATION_REGS
          CTI trigger connections between this and other devices.These
          registers are not used in normal operation and can leave devices in
          an inconsistent state.
+
+config CORESIGHT_TRBE
+       tristate "Trace Buffer Extension (TRBE) driver"
+       depends on ARM64 && CORESIGHT_SOURCE_ETM4X
+       help
+         This driver provides support for the per-CPU Trace Buffer Extension (TRBE).
+         TRBE always needs to be used along with its corresponding per-CPU ETE
+         component. ETE generates trace data which is then captured with TRBE.
+         Unlike traditional sink devices, TRBE is a CPU feature accessible via
+         system registers. But its explicit dependency on the trace unit (ETE)
+         requires it to be plugged in as a coresight sink device.
+
+         To compile this driver as a module, choose M here: the module will be
+         called coresight-trbe.
 endif
index f20e357..d608165 100644 (file)
@@ -21,5 +21,6 @@ obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o
 obj-$(CONFIG_CORESIGHT_CPU_DEBUG) += coresight-cpu-debug.o
 obj-$(CONFIG_CORESIGHT_CATU) += coresight-catu.o
 obj-$(CONFIG_CORESIGHT_CTI) += coresight-cti.o
+obj-$(CONFIG_CORESIGHT_TRBE) += coresight-trbe.o
 coresight-cti-y := coresight-cti-core.o        coresight-cti-platform.o \
                   coresight-cti-sysfs.o
index 0062c89..ca75b0b 100644 (file)
@@ -23,6 +23,7 @@
 #include "coresight-priv.h"
 
 static DEFINE_MUTEX(coresight_mutex);
+static DEFINE_PER_CPU(struct coresight_device *, csdev_sink);
 
 /**
  * struct coresight_node - elements of a path, from source to sink
@@ -70,6 +71,18 @@ void coresight_remove_cti_ops(void)
 }
 EXPORT_SYMBOL_GPL(coresight_remove_cti_ops);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev)
+{
+       per_cpu(csdev_sink, cpu) = csdev;
+}
+EXPORT_SYMBOL_GPL(coresight_set_percpu_sink);
+
+struct coresight_device *coresight_get_percpu_sink(int cpu)
+{
+       return per_cpu(csdev_sink, cpu);
+}
+EXPORT_SYMBOL_GPL(coresight_get_percpu_sink);
+
 static int coresight_id_match(struct device *dev, void *data)
 {
        int trace_id, i_trace_id;
@@ -784,6 +797,14 @@ static int _coresight_build_path(struct coresight_device *csdev,
        if (csdev == sink)
                goto out;
 
+       if (coresight_is_percpu_source(csdev) && coresight_is_percpu_sink(sink) &&
+           sink == per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev))) {
+               if (_coresight_build_path(sink, sink, path) == 0) {
+                       found = true;
+                       goto out;
+               }
+       }
+
        /* Not a sink - recursively explore each port found on this element */
        for (i = 0; i < csdev->pdata->nr_outport; i++) {
                struct coresight_device *child_dev;
@@ -999,8 +1020,12 @@ coresight_find_default_sink(struct coresight_device *csdev)
        int depth = 0;
 
        /* look for a default sink if we have not found for this device */
-       if (!csdev->def_sink)
-               csdev->def_sink = coresight_find_sink(csdev, &depth);
+       if (!csdev->def_sink) {
+               if (coresight_is_percpu_source(csdev))
+                       csdev->def_sink = per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev));
+               if (!csdev->def_sink)
+                       csdev->def_sink = coresight_find_sink(csdev, &depth);
+       }
        return csdev->def_sink;
 }
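
coresight_set_percpu_sink()/coresight_get_percpu_sink() are thin accessors around a per-CPU pointer, and coresight_find_default_sink() now tries that per-CPU sink for per-CPU sources before falling back to the usual topology walk. A condensed sketch of that lookup order; the array, the structs and find_topology_sink() are illustrative stand-ins only.

#include <stddef.h>
#include <stdio.h>

#define NR_CPUS 4

struct sink { const char *name; };
struct source { int cpu; struct sink *def_sink; int is_percpu; };

static struct sink *percpu_sink[NR_CPUS];            /* csdev_sink equivalent */

static struct sink *find_topology_sink(struct source *src)
{
	(void)src;
	return NULL;                                  /* stand-in for coresight_find_sink() */
}

static struct sink *find_default_sink(struct source *src)
{
	if (!src->def_sink) {
		if (src->is_percpu)                   /* try the per-CPU sink first */
			src->def_sink = percpu_sink[src->cpu];
		if (!src->def_sink)                   /* otherwise walk the topology */
			src->def_sink = find_topology_sink(src);
	}
	return src->def_sink;
}

int main(void)
{
	static struct sink trbe1 = { "trbe1" };
	struct source ete1 = { .cpu = 1, .is_percpu = 1 };

	percpu_sink[1] = &trbe1;                      /* coresight_set_percpu_sink(1, ...) */
	printf("default sink: %s\n", find_default_sink(&ete1)->name);
	return 0;
}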
 
index 0f603b4..f123c26 100644 (file)
 static struct pmu etm_pmu;
 static bool etm_perf_up;
 
-static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
+/*
+ * An ETM context for a running event includes the perf aux handle
+ * and aux_data. For ETM, the aux_data (etm_event_data) consists of
+ * the trace path and the sink configuration. The event data is accessible
+ * via perf_get_aux(handle). However, a sink could "end" a perf output
+ * handle via the IRQ handler. If the sink then fails to "begin"
+ * another session (e.g. due to lack of space in the buffer),
+ * the handle will be cleared. Thus, the event_data may not be accessible
+ * from the handle when we get to the etm_event_stop(), which is required
+ * for stopping the trace path. The event_data is guaranteed to stay alive
+ * until "free_aux()", which cannot happen as long as the event is active on
+ * the ETM. Thus the event_data for the session must be part of the ETM context
+ * to make sure we can disable the trace path.
+ */
+struct etm_ctxt {
+       struct perf_output_handle handle;
+       struct etm_event_data *event_data;
+};
+
+static DEFINE_PER_CPU(struct etm_ctxt, etm_ctxt);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
 /*
@@ -232,6 +251,25 @@ static void etm_free_aux(void *data)
        schedule_work(&event_data->work);
 }
 
+/*
+ * Check if two given sinks are compatible with each other,
+ * so that they can use the same sink buffers, when an event
+ * moves around.
+ */
+static bool sinks_compatible(struct coresight_device *a,
+                            struct coresight_device *b)
+{
+       if (!a || !b)
+               return false;
+       /*
+        * If the sinks are of the same subtype and driven
+        * by the same driver, we can use the same buffer
+        * on these sinks.
+        */
+       return (a->subtype.sink_subtype == b->subtype.sink_subtype) &&
+              (sink_ops(a) == sink_ops(b));
+}
+
 static void *etm_setup_aux(struct perf_event *event, void **pages,
                           int nr_pages, bool overwrite)
 {
@@ -239,6 +277,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        int cpu = event->cpu;
        cpumask_t *mask;
        struct coresight_device *sink = NULL;
+       struct coresight_device *user_sink = NULL, *last_sink = NULL;
        struct etm_event_data *event_data = NULL;
 
        event_data = alloc_event_data(cpu);
@@ -249,7 +288,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        /* First get the selected sink from user space. */
        if (event->attr.config2) {
                id = (u32)event->attr.config2;
-               sink = coresight_get_sink_by_id(id);
+               sink = user_sink = coresight_get_sink_by_id(id);
        }
 
        mask = &event_data->mask;
@@ -277,14 +316,33 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
                }
 
                /*
-                * No sink provided - look for a default sink for one of the
-                * devices. At present we only support topology where all CPUs
-                * use the same sink [N:1], so only need to find one sink. The
-                * coresight_build_path later will remove any CPU that does not
-                * attach to the sink, or if we have not found a sink.
+                * No sink provided - look for a default sink for all the ETMs
+                * where this event can be scheduled.
+                * We allocate the sink specific buffers only once for this
+                * event. If the ETMs have different default sink devices, we
+                * can only use a single "type" of sink as the event can carry
+                * only one sink specific buffer. Thus we have to make sure
+                * that the sinks are of the same type and driven by the same
+                * driver, as the one we allocate the buffer for. As such
+                * we choose the first sink and check if the remaining ETMs
+                * have a compatible default sink. We don't trace on a CPU
+                * if the sink is not compatible.
                 */
-               if (!sink)
+               if (!user_sink) {
+                       /* Find the default sink for this ETM */
                        sink = coresight_find_default_sink(csdev);
+                       if (!sink) {
+                               cpumask_clear_cpu(cpu, mask);
+                               continue;
+                       }
+
+                       /* Check if this sink is compatible with the last sink */
+                       if (last_sink && !sinks_compatible(last_sink, sink)) {
+                               cpumask_clear_cpu(cpu, mask);
+                               continue;
+                       }
+                       last_sink = sink;
+               }
 
                /*
                 * Building a path doesn't enable it, it simply builds a
@@ -312,7 +370,12 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        if (!sink_ops(sink)->alloc_buffer || !sink_ops(sink)->free_buffer)
                goto err;
 
-       /* Allocate the sink buffer for this session */
+       /*
+        * Allocate the sink buffer for this session. All the sinks
+        * on which this event can be scheduled are guaranteed to be of
+        * the same type, so they all share the same sink configuration.
+        */
        event_data->snk_config =
                        sink_ops(sink)->alloc_buffer(sink, event, pages,
                                                     nr_pages, overwrite);
@@ -332,13 +395,18 @@ static void etm_event_start(struct perf_event *event, int flags)
 {
        int cpu = smp_processor_id();
        struct etm_event_data *event_data;
-       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
+       struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt);
+       struct perf_output_handle *handle = &ctxt->handle;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
        struct list_head *path;
 
        if (!csdev)
                goto fail;
 
+       /* Have we messed up our tracking ? */
+       if (WARN_ON(ctxt->event_data))
+               goto fail;
+
        /*
         * Deal with the ring buffer API and get a handle on the
         * session's information.
@@ -374,6 +442,8 @@ static void etm_event_start(struct perf_event *event, int flags)
        if (source_ops(csdev)->enable(csdev, event, CS_MODE_PERF))
                goto fail_disable_path;
 
+       /* Save the event_data for this ETM */
+       ctxt->event_data = event_data;
 out:
        return;
 
@@ -392,13 +462,30 @@ static void etm_event_stop(struct perf_event *event, int mode)
        int cpu = smp_processor_id();
        unsigned long size;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
-       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
-       struct etm_event_data *event_data = perf_get_aux(handle);
+       struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt);
+       struct perf_output_handle *handle = &ctxt->handle;
+       struct etm_event_data *event_data;
        struct list_head *path;
 
+       /*
+        * If we still have access to the event_data via handle,
+        * confirm that we haven't messed up the tracking.
+        */
+       if (handle->event &&
+           WARN_ON(perf_get_aux(handle) != ctxt->event_data))
+               return;
+
+       event_data = ctxt->event_data;
+       /* Clear the event_data as this ETM is stopping the trace. */
+       ctxt->event_data = NULL;
+
        if (event->hw.state == PERF_HES_STOPPED)
                return;
 
+       /* We must have a valid event_data for a running event */
+       if (WARN_ON(!event_data))
+               return;
+
        if (!csdev)
                return;
 
@@ -416,7 +503,13 @@ static void etm_event_stop(struct perf_event *event, int mode)
        /* tell the core */
        event->hw.state = PERF_HES_STOPPED;
 
-       if (mode & PERF_EF_UPDATE) {
+       /*
+        * If the handle is not bound to an event anymore
+        * (e.g., the sink driver was unable to restart the
+        * handle due to lack of buffer space), we don't
+        * have to do anything here.
+        */
+       if (handle->event && (mode & PERF_EF_UPDATE)) {
                if (WARN_ON_ONCE(handle->event != event))
                        return;
 
index 15016f7..efb84ce 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/property.h>
 
+#include <asm/barrier.h>
 #include <asm/sections.h>
 #include <asm/sysreg.h>
 #include <asm/local.h>
@@ -114,30 +115,91 @@ void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit)
        }
 }
 
-static void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata, struct csdev_access *csa)
+static u64 ete_sysreg_read(u32 offset, bool _relaxed, bool _64bit)
 {
-       /* Writing 0 to TRCOSLAR unlocks the trace registers */
-       etm4x_relaxed_write32(csa, 0x0, TRCOSLAR);
-       drvdata->os_unlock = true;
+       u64 res = 0;
+
+       switch (offset) {
+       ETE_READ_CASES(res)
+       default:
+               pr_warn_ratelimited("ete: trying to read unsupported register @%x\n",
+                                   offset);
+       }
+
+       if (!_relaxed)
+               __iormb(res);   /* Imitate the !relaxed I/O helpers */
+
+       return res;
+}
+
+static void ete_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit)
+{
+       if (!_relaxed)
+               __iowmb();      /* Imitate the !relaxed I/O helpers */
+       if (!_64bit)
+               val &= GENMASK(31, 0);
+
+       switch (offset) {
+       ETE_WRITE_CASES(val)
+       default:
+               pr_warn_ratelimited("ete: trying to write to unsupported register @%x\n",
+                                   offset);
+       }
+}
+
+static void etm_detect_os_lock(struct etmv4_drvdata *drvdata,
+                              struct csdev_access *csa)
+{
+       u32 oslsr = etm4x_relaxed_read32(csa, TRCOSLSR);
+
+       drvdata->os_lock_model = ETM_OSLSR_OSLM(oslsr);
+}
+
+static void etm_write_os_lock(struct etmv4_drvdata *drvdata,
+                             struct csdev_access *csa, u32 val)
+{
+       val = !!val;
+
+       switch (drvdata->os_lock_model) {
+       case ETM_OSLOCK_PRESENT:
+               etm4x_relaxed_write32(csa, val, TRCOSLAR);
+               break;
+       case ETM_OSLOCK_PE:
+               write_sysreg_s(val, SYS_OSLAR_EL1);
+               break;
+       default:
+               pr_warn_once("CPU%d: Unsupported Trace OSLock model: %x\n",
+                            smp_processor_id(), drvdata->os_lock_model);
+               fallthrough;
+       case ETM_OSLOCK_NI:
+               return;
+       }
        isb();
 }
 
+static inline void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata,
+                                     struct csdev_access *csa)
+{
+       WARN_ON(drvdata->cpu != smp_processor_id());
+
+       /* Writing 0 to OS Lock unlocks the trace unit registers */
+       etm_write_os_lock(drvdata, csa, 0x0);
+       drvdata->os_unlock = true;
+}
+
 static void etm4_os_unlock(struct etmv4_drvdata *drvdata)
 {
        if (!WARN_ON(!drvdata->csdev))
                etm4_os_unlock_csa(drvdata, &drvdata->csdev->access);
-
 }
 
 static void etm4_os_lock(struct etmv4_drvdata *drvdata)
 {
        if (WARN_ON(!drvdata->csdev))
                return;
-
-       /* Writing 0x1 to TRCOSLAR locks the trace registers */
-       etm4x_relaxed_write32(&drvdata->csdev->access, 0x1, TRCOSLAR);
+       /* Writing 0x1 to OS Lock locks the trace registers */
+       etm_write_os_lock(drvdata, &drvdata->csdev->access, 0x1);
        drvdata->os_unlock = false;
-       isb();
 }
 
 static void etm4_cs_lock(struct etmv4_drvdata *drvdata,
@@ -371,6 +433,13 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
                etm4x_relaxed_write32(csa, trcpdcr | TRCPDCR_PU, TRCPDCR);
        }
 
+       /*
+        * ETE mandates that the TRCRSR is written to before
+        * enabling it.
+        */
+       if (etm4x_is_ete(drvdata))
+               etm4x_relaxed_write32(csa, TRCRSR_TA, TRCRSR);
+
        /* Enable the trace unit */
        etm4x_relaxed_write32(csa, 1, TRCPRGCTLR);
 
@@ -654,6 +723,7 @@ static int etm4_enable(struct coresight_device *csdev,
 static void etm4_disable_hw(void *info)
 {
        u32 control;
+       u64 trfcr;
        struct etmv4_drvdata *drvdata = info;
        struct etmv4_config *config = &drvdata->config;
        struct coresight_device *csdev = drvdata->csdev;
@@ -677,18 +747,32 @@ static void etm4_disable_hw(void *info)
        control &= ~0x1;
 
        /*
+        * If the CPU supports the v8.4 Trace Filter Controls,
+        * set the ETM to a trace-prohibited region.
+        */
+       if (drvdata->trfc) {
+               trfcr = read_sysreg_s(SYS_TRFCR_EL1);
+               write_sysreg_s(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE),
+                              SYS_TRFCR_EL1);
+               isb();
+       }
+       /*
         * Make sure everything completes before disabling, as recommended
         * by section 7.3.77 ("TRCVICTLR, ViewInst Main Control Register,
         * SSTATUS") of ARM IHI 0064D
         */
        dsb(sy);
        isb();
+       /* Trace synchronization barrier, is a nop if not supported */
+       tsb_csync();
        etm4x_relaxed_write32(csa, control, TRCPRGCTLR);
 
        /* wait for TRCSTATR.PMSTABLE to go to '1' */
        if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1))
                dev_err(etm_dev,
                        "timeout while waiting for PM stable Trace Status\n");
+       if (drvdata->trfc)
+               write_sysreg_s(trfcr, SYS_TRFCR_EL1);
 
        /* read the status of the single shot comparators */
        for (i = 0; i < drvdata->nr_ss_cmp; i++) {
@@ -817,13 +901,24 @@ static bool etm4_init_sysreg_access(struct etmv4_drvdata *drvdata,
         * ETMs implementing sysreg access must implement TRCDEVARCH.
         */
        devarch = read_etm4x_sysreg_const_offset(TRCDEVARCH);
-       if ((devarch & ETM_DEVARCH_ID_MASK) != ETM_DEVARCH_ETMv4x_ARCH)
+       switch (devarch & ETM_DEVARCH_ID_MASK) {
+       case ETM_DEVARCH_ETMv4x_ARCH:
+               *csa = (struct csdev_access) {
+                       .io_mem = false,
+                       .read   = etm4x_sysreg_read,
+                       .write  = etm4x_sysreg_write,
+               };
+               break;
+       case ETM_DEVARCH_ETE_ARCH:
+               *csa = (struct csdev_access) {
+                       .io_mem = false,
+                       .read   = ete_sysreg_read,
+                       .write  = ete_sysreg_write,
+               };
+               break;
+       default:
                return false;
-       *csa = (struct csdev_access) {
-               .io_mem = false,
-               .read   = etm4x_sysreg_read,
-               .write  = etm4x_sysreg_write,
-       };
+       }
 
        drvdata->arch = etm_devarch_to_arch(devarch);
        return true;
@@ -873,7 +968,7 @@ static bool etm4_init_csdev_access(struct etmv4_drvdata *drvdata,
        return false;
 }
 
-static void cpu_enable_tracing(void)
+static void cpu_enable_tracing(struct etmv4_drvdata *drvdata)
 {
        u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
        u64 trfcr;
@@ -881,6 +976,7 @@ static void cpu_enable_tracing(void)
        if (!cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRACE_FILT_SHIFT))
                return;
 
+       drvdata->trfc = true;
        /*
         * If the CPU supports v8.4 SelfHosted Tracing, enable
         * tracing at the kernel EL and EL0, forcing to use the
@@ -920,6 +1016,9 @@ static void etm4_init_arch_data(void *info)
        if (!etm4_init_csdev_access(drvdata, csa))
                return;
 
+       /* Detect the support for OS Lock before we actually use it */
+       etm_detect_os_lock(drvdata, csa);
+
        /* Make sure all registers are accessible */
        etm4_os_unlock_csa(drvdata, csa);
        etm4_cs_unlock(drvdata, csa);
@@ -1082,7 +1181,7 @@ static void etm4_init_arch_data(void *info)
        /* NUMCNTR, bits[30:28] number of counters available for tracing */
        drvdata->nr_cntr = BMVAL(etmidr5, 28, 30);
        etm4_cs_lock(drvdata, csa);
-       cpu_enable_tracing();
+       cpu_enable_tracing(drvdata);
 }
 
 static inline u32 etm4_get_victlr_access_type(struct etmv4_config *config)
@@ -1760,6 +1859,8 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
        struct etmv4_drvdata *drvdata;
        struct coresight_desc desc = { 0 };
        struct etm4_init_arg init_arg = { 0 };
+       u8 major, minor;
+       char *type_name;
 
        drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
        if (!drvdata)
@@ -1786,10 +1887,6 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
        if (drvdata->cpu < 0)
                return drvdata->cpu;
 
-       desc.name = devm_kasprintf(dev, GFP_KERNEL, "etm%d", drvdata->cpu);
-       if (!desc.name)
-               return -ENOMEM;
-
        init_arg.drvdata = drvdata;
        init_arg.csa = &desc.access;
        init_arg.pid = etm_pid;
@@ -1806,6 +1903,22 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
            fwnode_property_present(dev_fwnode(dev), "qcom,skip-power-up"))
                drvdata->skip_power_up = true;
 
+       major = ETM_ARCH_MAJOR_VERSION(drvdata->arch);
+       minor = ETM_ARCH_MINOR_VERSION(drvdata->arch);
+
+       if (etm4x_is_ete(drvdata)) {
+               type_name = "ete";
+               /* ETE v1 has major version == 0b101. Adjust this for logging. */
+               major -= 4;
+       } else {
+               type_name = "etm";
+       }
+
+       desc.name = devm_kasprintf(dev, GFP_KERNEL,
+                                  "%s%d", type_name, drvdata->cpu);
+       if (!desc.name)
+               return -ENOMEM;
+
        etm4_init_trace_id(drvdata);
        etm4_set_default(&drvdata->config);
 
@@ -1833,9 +1946,8 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
 
        etmdrvdata[drvdata->cpu] = drvdata;
 
-       dev_info(&drvdata->csdev->dev, "CPU%d: ETM v%d.%d initialized\n",
-                drvdata->cpu, ETM_ARCH_MAJOR_VERSION(drvdata->arch),
-                ETM_ARCH_MINOR_VERSION(drvdata->arch));
+       dev_info(&drvdata->csdev->dev, "CPU%d: %s v%d.%d initialized\n",
+                drvdata->cpu, type_name, major, minor);
 
        if (boot_enable) {
                coresight_enable(drvdata->csdev);
@@ -1978,6 +2090,7 @@ static struct amba_driver etm4x_amba_driver = {
 
 static const struct of_device_id etm4_sysreg_match[] = {
        { .compatible   = "arm,coresight-etm4x-sysreg" },
+       { .compatible   = "arm,embedded-trace-extension" },
        {}
 };
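
The probe path now derives both the printable name and the logged version from the architecture field: anything at or above the ETE architecture version is reported as "ete" with the major version reduced by 4, everything else stays "etm". The sketch below assumes a simple major/minor nibble split along the lines of the ETM_ARCH_MAJOR/MINOR_VERSION macros; the encoding here is illustrative, not the driver's exact one.

#include <stdint.h>
#include <stdio.h>

#define ARCH_MAJOR(arch)  (((arch) >> 4) & 0xfU)   /* assumed split: major in [7:4] */
#define ARCH_MINOR(arch)  ((arch) & 0xfU)          /* minor in [3:0] */
#define ARCH_ETE          ((5 << 4) | 0)           /* ETE is "v5.0" internally */

static void describe(uint8_t arch)
{
	uint8_t major = ARCH_MAJOR(arch), minor = ARCH_MINOR(arch);
	const char *type_name = "etm";

	if (arch >= ARCH_ETE) {
		type_name = "ete";
		major -= 4;            /* ETE v1 has major version 0b101; adjust for logging */
	}
	printf("%s v%u.%u\n", type_name, major, minor);
}

int main(void)
{
	describe((4 << 4) | 2);        /* etm v4.2 */
	describe((5 << 4) | 0);        /* ete v1.0 */
	return 0;
}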
 
index 0995a10..007bad9 100644 (file)
@@ -2374,12 +2374,20 @@ static inline bool
 etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset)
 {
        switch (offset) {
-       ETM4x_SYSREG_LIST_CASES
+       ETM_COMMON_SYSREG_LIST_CASES
                /*
-                * Registers accessible via system instructions are always
-                * implemented.
+                * Registers common to ETE & ETM4x and accessible via system
+                * instructions are always implemented.
                 */
                return true;
+
+       ETM4x_ONLY_SYSREG_LIST_CASES
+               /*
+                * We only support etm4x and ete. So if the device is not
+                * ETE, it must be ETMv4x.
+                */
+               return !etm4x_is_ete(drvdata);
+
        ETM4x_MMAP_LIST_CASES
                /*
                 * Registers accessible only via memory-mapped registers
@@ -2389,8 +2397,13 @@ etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset)
                 * coresight_register() and the csdev is not initialized
                 * until that is done. So rely on the drvdata->base to
                 * detect if we have a memory mapped access.
+                * Also ETE doesn't implement memory mapped access, thus
+                * it is sufficient to check that we are using mmio.
                 */
                return !!drvdata->base;
+
+       ETE_ONLY_SYSREG_LIST_CASES
+               return etm4x_is_ete(drvdata);
        }
 
        return false;
index 0af6057..e5b79bd 100644 (file)
@@ -29,6 +29,7 @@
 #define TRCAUXCTLR                     0x018
 #define TRCEVENTCTL0R                  0x020
 #define TRCEVENTCTL1R                  0x024
+#define TRCRSR                         0x028
 #define TRCSTALLCTLR                   0x02C
 #define TRCTSCTLR                      0x030
 #define TRCSYNCPR                      0x034
@@ -49,6 +50,7 @@
 #define TRCSEQRSTEVR                   0x118
 #define TRCSEQSTR                      0x11C
 #define TRCEXTINSELR                   0x120
+#define TRCEXTINSELRn(n)               (0x120 + (n * 4)) /* n = 0-3 */
 #define TRCCNTRLDVRn(n)                        (0x140 + (n * 4)) /* n = 0-3 */
 #define TRCCNTCTLRn(n)                 (0x150 + (n * 4)) /* n = 0-3 */
 #define TRCCNTVRn(n)                   (0x160 + (n * 4)) /* n = 0-3 */
 #define TRCCIDR2                       0xFF8
 #define TRCCIDR3                       0xFFC
 
+#define TRCRSR_TA                      BIT(12)
+
 /*
  * System instructions to access ETM registers.
  * See ETMv4.4 spec ARM IHI0064F section 4.3.6 System instructions
 #define CASE_NOP(__unused, x)                                  \
        case (x):       /* fall through */
 
+#define ETE_ONLY_SYSREG_LIST(op, val)          \
+       CASE_##op((val), TRCRSR)                \
+       CASE_##op((val), TRCEXTINSELRn(1))      \
+       CASE_##op((val), TRCEXTINSELRn(2))      \
+       CASE_##op((val), TRCEXTINSELRn(3))
+
 /* List of registers accessible via System instructions */
-#define ETM_SYSREG_LIST(op, val)               \
-       CASE_##op((val), TRCPRGCTLR)            \
+#define ETM4x_ONLY_SYSREG_LIST(op, val)                \
        CASE_##op((val), TRCPROCSELR)           \
+       CASE_##op((val), TRCVDCTLR)             \
+       CASE_##op((val), TRCVDSACCTLR)          \
+       CASE_##op((val), TRCVDARCCTLR)          \
+       CASE_##op((val), TRCOSLAR)
+
+#define ETM_COMMON_SYSREG_LIST(op, val)                \
+       CASE_##op((val), TRCPRGCTLR)            \
        CASE_##op((val), TRCSTATR)              \
        CASE_##op((val), TRCCONFIGR)            \
        CASE_##op((val), TRCAUXCTLR)            \
        CASE_##op((val), TRCVIIECTLR)           \
        CASE_##op((val), TRCVISSCTLR)           \
        CASE_##op((val), TRCVIPCSSCTLR)         \
-       CASE_##op((val), TRCVDCTLR)             \
-       CASE_##op((val), TRCVDSACCTLR)          \
-       CASE_##op((val), TRCVDARCCTLR)          \
        CASE_##op((val), TRCSEQEVRn(0))         \
        CASE_##op((val), TRCSEQEVRn(1))         \
        CASE_##op((val), TRCSEQEVRn(2))         \
        CASE_##op((val), TRCSSPCICRn(5))        \
        CASE_##op((val), TRCSSPCICRn(6))        \
        CASE_##op((val), TRCSSPCICRn(7))        \
-       CASE_##op((val), TRCOSLAR)              \
        CASE_##op((val), TRCOSLSR)              \
        CASE_##op((val), TRCACVRn(0))           \
        CASE_##op((val), TRCACVRn(1))           \
        CASE_##op((val), TRCPIDR2)              \
        CASE_##op((val), TRCPIDR3)
 
-#define ETM4x_READ_SYSREG_CASES(res)   ETM_SYSREG_LIST(READ, (res))
-#define ETM4x_WRITE_SYSREG_CASES(val)  ETM_SYSREG_LIST(WRITE, (val))
+#define ETM4x_READ_SYSREG_CASES(res)           \
+       ETM_COMMON_SYSREG_LIST(READ, (res))     \
+       ETM4x_ONLY_SYSREG_LIST(READ, (res))
+
+#define ETM4x_WRITE_SYSREG_CASES(val)          \
+       ETM_COMMON_SYSREG_LIST(WRITE, (val))    \
+       ETM4x_ONLY_SYSREG_LIST(WRITE, (val))
+
+#define ETM_COMMON_SYSREG_LIST_CASES           \
+       ETM_COMMON_SYSREG_LIST(NOP, __unused)
+
+#define ETM4x_ONLY_SYSREG_LIST_CASES           \
+       ETM4x_ONLY_SYSREG_LIST(NOP, __unused)
+
+#define ETM4x_SYSREG_LIST_CASES                        \
+       ETM_COMMON_SYSREG_LIST_CASES            \
+       ETM4x_ONLY_SYSREG_LIST(NOP, __unused)
 
-#define ETM4x_SYSREG_LIST_CASES                ETM_SYSREG_LIST(NOP, __unused)
 #define ETM4x_MMAP_LIST_CASES          ETM_MMAP_LIST(NOP, __unused)
 
+/* ETE only supports system register access */
+#define ETE_READ_CASES(res)                    \
+       ETM_COMMON_SYSREG_LIST(READ, (res))     \
+       ETE_ONLY_SYSREG_LIST(READ, (res))
+
+#define ETE_WRITE_CASES(val)                   \
+       ETM_COMMON_SYSREG_LIST(WRITE, (val))    \
+       ETE_ONLY_SYSREG_LIST(WRITE, (val))
+
+#define ETE_ONLY_SYSREG_LIST_CASES             \
+       ETE_ONLY_SYSREG_LIST(NOP, __unused)
+
 #define read_etm4x_sysreg_offset(offset, _64bit)                               \
        ({                                                                      \
                u64 __val;                                                      \
                                         ETM_MODE_EXCL_USER)
 
 /*
+ * TRCOSLSR.OSLM advertises the OS Lock model.
+ * OSLM[2:0] = TRCOSLSR[4:3,0]
+ *
+ *     0b000 - Trace OS Lock is not implemented.
+ *     0b010 - Trace OS Lock is implemented.
+ *     0b100 - Trace OS Lock is not implemented, unit is controlled by PE OS Lock.
+ */
+#define ETM_OSLOCK_NI          0b000
+#define ETM_OSLOCK_PRESENT     0b010
+#define ETM_OSLOCK_PE          0b100
+
+#define ETM_OSLSR_OSLM(oslsr)  ((((oslsr) & GENMASK(4, 3)) >> 2) | (oslsr & 0x1))
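
ETM_OSLSR_OSLM() stitches the OSLM field back together from TRCOSLSR bits [4:3] and bit [0]. A stand-alone check of the three documented encodings; GENMASK32() is a local 32-bit re-implementation for the example.

#include <assert.h>
#include <stdio.h>

#define GENMASK32(h, l)   (((~0U) << (l)) & (~0U >> (31 - (h))))
#define OSLSR_OSLM(oslsr) ((((oslsr) & GENMASK32(4, 3)) >> 2) | ((oslsr) & 0x1))

int main(void)
{
	assert(OSLSR_OSLM(0x00) == 0x0);   /* 0b000: Trace OS Lock not implemented */
	assert(OSLSR_OSLM(0x08) == 0x2);   /* bit 3 set -> 0b010: OS Lock implemented */
	assert(OSLSR_OSLM(0x10) == 0x4);   /* bit 4 set -> 0b100: controlled by PE OS Lock */
	printf("OSLM decoding checks passed\n");
	return 0;
}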
+
+/*
  * TRCDEVARCH Bit field definitions
  * Bits[31:21] - ARCHITECT = Always Arm Ltd.
  *                * Bits[31:28] = 0x4
        ((ETM_DEVARCH_MAKE_ARCHID_ARCH_VER(major)) | ETM_DEVARCH_ARCHID_ARCH_PART(0xA13))
 
 #define ETM_DEVARCH_ARCHID_ETMv4x              ETM_DEVARCH_MAKE_ARCHID(0x4)
+#define ETM_DEVARCH_ARCHID_ETE                 ETM_DEVARCH_MAKE_ARCHID(0x5)
 
 #define ETM_DEVARCH_ID_MASK                                            \
        (ETM_DEVARCH_ARCHITECT_MASK | ETM_DEVARCH_ARCHID_MASK | ETM_DEVARCH_PRESENT)
 #define ETM_DEVARCH_ETMv4x_ARCH                                                \
        (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETMv4x | ETM_DEVARCH_PRESENT)
+#define ETM_DEVARCH_ETE_ARCH                                           \
+       (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETE | ETM_DEVARCH_PRESENT)
 
 #define TRCSTATR_IDLE_BIT              0
 #define TRCSTATR_PMSTABLE_BIT          1
 #define ETM_ARCH_MINOR_VERSION(arch)   ((arch) & 0xfU)
 
 #define ETM_ARCH_V4    ETM_ARCH_VERSION(4, 0)
+#define ETM_ARCH_ETE   ETM_ARCH_VERSION(5, 0)
+
 /* Interpretation of resource numbers change at ETM v4.3 architecture */
 #define ETM_ARCH_V4_3  ETM_ARCH_VERSION(4, 3)
 
@@ -862,6 +919,7 @@ struct etmv4_save_state {
  * @nooverflow:        Indicate if overflow prevention is supported.
  * @atbtrig:   If the implementation can support ATB triggers
  * @lpoverride:        If the implementation can support low-power state over.
+ * @trfc:      If the implementation supports Arm v8.4 trace filter controls.
  * @config:    structure holding configuration parameters.
  * @save_state:        State to be preserved across power loss
  * @state_needs_restore: True when there is context to restore after PM exit
@@ -897,6 +955,7 @@ struct etmv4_drvdata {
        u8                              s_ex_level;
        u8                              ns_ex_level;
        u8                              q_support;
+       u8                              os_lock_model;
        bool                            sticky_enable;
        bool                            boot_enable;
        bool                            os_unlock;
@@ -912,6 +971,7 @@ struct etmv4_drvdata {
        bool                            nooverflow;
        bool                            atbtrig;
        bool                            lpoverride;
+       bool                            trfc;
        struct etmv4_config             config;
        struct etmv4_save_state         *save_state;
        bool                            state_needs_restore;
@@ -940,4 +1000,9 @@ void etm4_config_trace_mode(struct etmv4_config *config);
 
 u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);
 void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
+
+static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)
+{
+       return drvdata->arch >= ETM_ARCH_ETE;
+}
 #endif
index 3629b78..c594f45 100644 (file)
@@ -90,6 +90,12 @@ static void of_coresight_get_ports_legacy(const struct device_node *node,
        struct of_endpoint endpoint;
        int in = 0, out = 0;
 
+       /*
+        * Avoid warnings in of_graph_get_next_endpoint()
+        * if the device doesn't have any graph connections
+        */
+       if (!of_graph_is_present(node))
+               return;
        do {
                ep = of_graph_get_next_endpoint(node, ep);
                if (!ep)
index f5f654e..ff1dd20 100644 (file)
@@ -232,4 +232,7 @@ coresight_find_csdev_by_fwnode(struct fwnode_handle *r_fwnode);
 void coresight_set_assoc_ectdev_mutex(struct coresight_device *csdev,
                                      struct coresight_device *ect_csdev);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev);
+struct coresight_device *coresight_get_percpu_sink(int cpu);
+
 #endif
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
new file mode 100644 (file)
index 0000000..1768684
--- /dev/null
@@ -0,0 +1,1157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This driver enables the Trace Buffer Extension (TRBE) as a per-cpu coresight
+ * sink device, which then pairs with an appropriate per-cpu coresight source
+ * device (ETE) to capture the generated trace data. Trace can be enabled
+ * via the perf framework.
+ *
+ * The AUX buffer handling is inspired from Arm SPE PMU driver.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define DRVNAME "arm_trbe"
+
+#define pr_fmt(fmt) DRVNAME ": " fmt
+
+#include <asm/barrier.h>
+#include "coresight-trbe.h"
+
+#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
+
+/*
+ * A padding packet that helps the user space tools skip
+ * the sections of the captured trace data which could not
+ * be decoded. Unlike the legacy CoreSight sinks, TRBE doesn't
+ * format the trace data, so we use ETE trace packets to pad
+ * the affected sections of the buffer.
+ */
+#define ETE_IGNORE_PACKET              0x70
+
+/*
+ * The minimum amount of meaningful trace will contain:
+ * A-Sync, Trace Info, Trace On, Address, Atom.
+ * This is about 44 bytes of ETE trace. To be on
+ * the safe side, we assume 64 bytes is the minimum
+ * space required for a meaningful session, before
+ * we hit a "WRAP" event.
+ */
+#define TRBE_TRACE_MIN_BUF_SIZE                64
+
+enum trbe_fault_action {
+       TRBE_FAULT_ACT_WRAP,
+       TRBE_FAULT_ACT_SPURIOUS,
+       TRBE_FAULT_ACT_FATAL,
+};
+
+struct trbe_buf {
+       /*
+        * Even though trbe_base represents the start address of the
+        * vmap()'ed trace buffer, it is kept as an unsigned long for
+        * various arithmetic and comparison operations and also to be
+        * consistent with its trbe_write and trbe_limit sibling
+        * pointers.
+        */
+       unsigned long trbe_base;
+       unsigned long trbe_limit;
+       unsigned long trbe_write;
+       int nr_pages;
+       void **pages;
+       bool snapshot;
+       struct trbe_cpudata *cpudata;
+};
+
+struct trbe_cpudata {
+       bool trbe_flag;
+       u64 trbe_align;
+       int cpu;
+       enum cs_mode mode;
+       struct trbe_buf *buf;
+       struct trbe_drvdata *drvdata;
+};
+
+struct trbe_drvdata {
+       struct trbe_cpudata __percpu *cpudata;
+       struct perf_output_handle * __percpu *handle;
+       struct hlist_node hotplug_node;
+       int irq;
+       cpumask_t supported_cpus;
+       enum cpuhp_state trbe_online;
+       struct platform_device *pdev;
+};
+
+static int trbe_alloc_node(struct perf_event *event)
+{
+       if (event->cpu == -1)
+               return NUMA_NO_NODE;
+       return cpu_to_node(event->cpu);
+}
+
+static void trbe_drain_buffer(void)
+{
+       tsb_csync();
+       dsb(nsh);
+}
+
+static void trbe_drain_and_disable_local(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       trbe_drain_buffer();
+
+       /*
+        * Disable the TRBE without clearing LIMITPTR which
+        * might be required for fetching the buffer limits.
+        */
+       trblimitr &= ~TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+       isb();
+}
+
+static void trbe_reset_local(void)
+{
+       trbe_drain_and_disable_local();
+       write_sysreg_s(0, SYS_TRBLIMITR_EL1);
+       write_sysreg_s(0, SYS_TRBPTR_EL1);
+       write_sysreg_s(0, SYS_TRBBASER_EL1);
+       write_sysreg_s(0, SYS_TRBSR_EL1);
+}
+
+static void trbe_stop_and_truncate_event(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       /*
+        * We cannot proceed with the buffer collection and we
+        * do not have any data for the current session. The
+        * etm_perf driver expects to close out the aux_buffer
+        * at event_stop(). So disable the TRBE here and leave
+        * the update_buffer() to return a 0 size.
+        */
+       trbe_drain_and_disable_local();
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL;
+}
+
+/*
+ * TRBE Buffer Management
+ *
+ * The TRBE buffer spans from the base pointer till the limit pointer. When enabled,
+ * it starts writing trace data from the write pointer onward till the limit pointer.
+ * When the write pointer reaches the address just before the limit pointer, it gets
+ * wrapped around again to the base pointer. This is called a TRBE wrap event, which
+ * generates a maintenance interrupt when operated in WRAP or FILL mode. This driver
+ * uses FILL mode, where the TRBE stops the trace collection at wrap event. The IRQ
+ * handler updates the AUX buffer and re-enables the TRBE with updated WRITE and
+ * LIMIT pointers.
+ *
+ *     Wrap around with an IRQ
+ *     ------ < ------ < ------- < ----- < -----
+ *     |                                       |
+ *     ------ > ------ > ------- > ----- > -----
+ *
+ *     +---------------+-----------------------+
+ *     |               |                       |
+ *     +---------------+-----------------------+
+ *     Base Pointer    Write Pointer           Limit Pointer
+ *
+ * The base and limit pointers always need to be PAGE_SIZE aligned. But the write
+ * pointer can be aligned to the implementation-defined TRBE trace buffer alignment
+ * as captured in trbe_cpudata->trbe_align.
+ *
+ *
+ *             head            tail            wakeup
+ *     +---------------------------------------+----- ~ ~ ------
+ *     |$$$$$$$|################|$$$$$$$$$$$$$$|               |
+ *     +---------------------------------------+----- ~ ~ ------
+ *     Base Pointer    Write Pointer           Limit Pointer
+ *
+ * The perf_output_handle indices (head, tail, wakeup) are monotonically increasing
+ * values which track all the driver writes and user reads from the perf auxiliary
+ * buffer. Generally [head..tail] is the area where the driver can write into, unless
+ * the wakeup is behind the tail. The enabled TRBE buffer span needs to be adjusted
+ * and configured depending on the perf_output_handle indices, so that the driver
+ * does not overwrite areas of the perf auxiliary buffer which are being, or are yet
+ * to be, consumed by user space. The enabled TRBE buffer area is a moving subset of
+ * the allocated perf auxiliary buffer.
+ */
+static void trbe_pad_buf(struct perf_output_handle *handle, int len)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       u64 head = PERF_IDX2OFF(handle->head, buf);
+
+       memset((void *)buf->trbe_base + head, ETE_IGNORE_PACKET, len);
+       if (!buf->snapshot)
+               perf_aux_output_skip(handle, len);
+}
+
+static unsigned long trbe_snapshot_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       /*
+        * The ETE trace has alignment synchronization packets allowing
+        * the decoder to reset in case of an overflow or corruption.
+        * So we can use the entire buffer for the snapshot mode.
+        */
+       return buf->nr_pages * PAGE_SIZE;
+}
+
+/*
+ * TRBE Limit Calculation
+ *
+ * The following markers are used to illustrate various TRBE buffer situations.
+ *
+ * $$$$ - Data area, unconsumed captured trace data, not to be overwritten
+ * #### - Free area, enabled, trace will be written
+ * %%%% - Free area, disabled, trace will not be written
+ * ==== - Free area, padded with ETE_IGNORE_PACKET, trace will be skipped
+ */
+static unsigned long __trbe_normal_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       struct trbe_cpudata *cpudata = buf->cpudata;
+       const u64 bufsize = buf->nr_pages * PAGE_SIZE;
+       u64 limit = bufsize;
+       u64 head, tail, wakeup;
+
+       head = PERF_IDX2OFF(handle->head, buf);
+
+       /*
+        *              head
+        *      ------->|
+        *      |
+        *      head    TRBE align      tail
+        * +----|-------|---------------|-------+
+        * |$$$$|=======|###############|$$$$$$$|
+        * +----|-------|---------------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * The perf aux buffer output head position can be misaligned depending on
+        * various factors, including user space reads. If misaligned, the head
+        * needs to be aligned before the TRBE can be configured. Pad the alignment
+        * gap with ETE_IGNORE_PACKET bytes, which user space tools will ignore,
+        * thus skipping this section and advancing the head.
+        */
+       if (!IS_ALIGNED(head, cpudata->trbe_align)) {
+               unsigned long delta = roundup(head, cpudata->trbe_align) - head;
+
+               delta = min(delta, handle->size);
+               trbe_pad_buf(handle, delta);
+               head = PERF_IDX2OFF(handle->head, buf);
+       }
+
+       /*
+        *      head = tail (size = 0)
+        * +----|-------------------------------+
+        * |$$$$|$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ |
+        * +----|-------------------------------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * Perf aux buffer does not have any space for the driver to write into.
+        * Just communicate trace truncation event to the user space by marking
+        * it with PERF_AUX_FLAG_TRUNCATED.
+        */
+       if (!handle->size) {
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+               return 0;
+       }
+
+       /* Compute the tail and wakeup indices now that we've aligned head */
+       tail = PERF_IDX2OFF(handle->head + handle->size, buf);
+       wakeup = PERF_IDX2OFF(handle->wakeup, buf);
+
+       /*
+        * Let's calculate the buffer area which the TRBE could write into. There
+        * are three possible scenarios here. Limit needs to be aligned with
+        * PAGE_SIZE per the TRBE requirement. Always avoid clobbering the
+        * unconsumed data.
+        *
+        * 1) head < tail
+        *
+        *      head                    tail
+        * +----|-----------------------|-------+
+        * |$$$$|#######################|$$$$$$$|
+        * +----|-----------------------|-------+
+        * trbe_base                    limit   trbe_base + nr_pages
+        *
+        * The TRBE could write into the [head..tail] area. Unless the tail is right at
+        * the end of the buffer, neither a wrap around nor an IRQ is expected
+        * while it is enabled.
+        *
+        * 2) head == tail
+        *
+        *      head = tail (size > 0)
+        * +----|-------------------------------+
+        * |%%%%|###############################|
+        * +----|-------------------------------+
+        * trbe_base                            limit = trbe_base + nr_pages
+        *
+        * The TRBE should just write into the [head..base + nr_pages] area even though
+        * the entire buffer is empty. The reason is that when the trace reaches the
+        * end of the buffer, it will just wrap around with an IRQ, giving an
+        * opportunity to reconfigure the buffer.
+        *
+        * 3) tail < head
+        *
+        *      tail                    head
+        * +----|-----------------------|-------+
+        * |%%%%|$$$$$$$$$$$$$$$$$$$$$$$|#######|
+        * +----|-----------------------|-------+
+        * trbe_base                            limit = trbe_base + nr_pages
+        *
+        * The TRBE should just write into the [head..base + nr_pages] area even though
+        * [trbe_base..tail] is also empty. The reason is that when the trace
+        * reaches the end of the buffer, it will just wrap around with an IRQ,
+        * giving an opportunity to reconfigure the buffer.
+        */
+       if (head < tail)
+               limit = round_down(tail, PAGE_SIZE);
+
+       /*
+        * Wakeup may be arbitrarily far into the future. If it's not in the
+        * current generation, either we'll wrap before hitting it, or it's
+        * in the past and has been handled already.
+        *
+        * If there's a wakeup before we wrap, arrange to be woken up by the
+        * page boundary following it. Keep the tail boundary if that's lower.
+        *
+        *      head            wakeup  tail
+        * +----|---------------|-------|-------+
+        * |$$$$|###############|%%%%%%%|$$$$$$$|
+        * +----|---------------|-------|-------+
+        * trbe_base            limit           trbe_base + nr_pages
+        */
+       if (handle->wakeup < (handle->head + handle->size) && head <= wakeup)
+               limit = min(limit, round_up(wakeup, PAGE_SIZE));
+
+       /*
+        * There are two situations when this can happen, i.e. the limit is before
+        * the head and hence the TRBE cannot be configured.
+        *
+        * 1) head < tail (aligned down to PAGE_SIZE) and they are both
+        * within the same PAGE_SIZE range.
+        *
+        *                      PAGE_SIZE
+        *              |----------------------|
+        *
+        *              limit   head    tail
+        * +------------|------|--------|-------+
+        * |$$$$$$$$$$$$$$$$$$$|========|$$$$$$$|
+        * +------------|------|--------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * 2) head < wakeup (aligned up to PAGE_SIZE) < tail and both
+        * head and wakeup are within the same PAGE_SIZE range.
+        *
+        *              PAGE_SIZE
+        *      |----------------------|
+        *
+        *      limit   head    wakeup  tail
+        * +----|------|-------|--------|-------+
+        * |$$$$$$$$$$$|=======|========|$$$$$$$|
+        * +----|------|-------|--------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        */
+       if (limit > head)
+               return limit;
+
+       trbe_pad_buf(handle, handle->size);
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       return 0;
+}
+
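
The decision tree above can be replayed with concrete numbers. A small stand-alone sketch (not from the patch), assuming a PAGE_SIZE of 4096, a TRBE write alignment of 64 bytes, a 4-page buffer, a head of 100, 8000 bytes of free handle space and a wakeup far in the future; all values are made up:

#include <stdio.h>

#define PAGE_SZ		4096UL
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned long trbe_align = 64, bufsize = 4 * PAGE_SZ;
	unsigned long head = 100, size = 8000;

	/* Step 1: pad the head up to the TRBE write-pointer alignment (100 -> 128) */
	unsigned long delta = ALIGN_UP(head, trbe_align) - head;
	head += delta;
	size -= delta;

	/* Step 2: head < tail, so the limit is the tail rounded down to a page */
	unsigned long tail = (head + size) % bufsize;		/* 8100 */
	unsigned long limit = ALIGN_DOWN(tail, PAGE_SZ);	/* 4096 */

	/* limit (4096) > head (128), so the TRBE may trace into [128..4096) */
	printf("head=%lu tail=%lu limit=%lu\n", head, tail, limit);
	return 0;
}
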
+static unsigned long trbe_normal_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = perf_get_aux(handle);
+       u64 limit = __trbe_normal_offset(handle);
+       u64 head = PERF_IDX2OFF(handle->head, buf);
+
+       /*
+        * If the head is too close to the limit and we don't
+        * have space for a meaningful run, we'd rather pad it
+        * and start fresh.
+        */
+       if (limit && (limit - head < TRBE_TRACE_MIN_BUF_SIZE)) {
+               trbe_pad_buf(handle, limit - head);
+               limit = __trbe_normal_offset(handle);
+       }
+       return limit;
+}
+
+static unsigned long compute_trbe_buffer_limit(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long offset;
+
+       if (buf->snapshot)
+               offset = trbe_snapshot_offset(handle);
+       else
+               offset = trbe_normal_offset(handle);
+       return buf->trbe_base + offset;
+}
+
+static void clr_trbe_status(void)
+{
+       u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1);
+
+       WARN_ON(is_trbe_enabled());
+       trbsr &= ~TRBSR_IRQ;
+       trbsr &= ~TRBSR_TRG;
+       trbsr &= ~TRBSR_WRAP;
+       trbsr &= ~(TRBSR_EC_MASK << TRBSR_EC_SHIFT);
+       trbsr &= ~(TRBSR_BSC_MASK << TRBSR_BSC_SHIFT);
+       trbsr &= ~TRBSR_STOP;
+       write_sysreg_s(trbsr, SYS_TRBSR_EL1);
+}
+
+static void set_trbe_limit_pointer_enabled(unsigned long addr)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       WARN_ON(!IS_ALIGNED(addr, (1UL << TRBLIMITR_LIMIT_SHIFT)));
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+
+       trblimitr &= ~TRBLIMITR_NVM;
+       trblimitr &= ~(TRBLIMITR_FILL_MODE_MASK << TRBLIMITR_FILL_MODE_SHIFT);
+       trblimitr &= ~(TRBLIMITR_TRIG_MODE_MASK << TRBLIMITR_TRIG_MODE_SHIFT);
+       trblimitr &= ~(TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT);
+
+       /*
+        * Fill trace buffer mode is used here while configuring the
+        * TRBE for trace capture. In this particular mode, the trace
+        * collection is stopped and a maintenance interrupt is raised
+        * when the current write pointer wraps. This pause in trace
+        * collection gives the software an opportunity to capture the
+        * trace data in the interrupt handler, before reconfiguring
+        * the TRBE.
+        */
+       trblimitr |= (TRBE_FILL_MODE_FILL & TRBLIMITR_FILL_MODE_MASK) << TRBLIMITR_FILL_MODE_SHIFT;
+
+       /*
+        * Trigger mode is not used here while configuring the TRBE for
+        * the trace capture. Hence just keep this in the ignore mode.
+        */
+       trblimitr |= (TRBE_TRIG_MODE_IGNORE & TRBLIMITR_TRIG_MODE_MASK) <<
+                     TRBLIMITR_TRIG_MODE_SHIFT;
+       trblimitr |= (addr & PAGE_MASK);
+
+       trblimitr |= TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+
+       /* Synchronize the TRBE enable event */
+       isb();
+}
+
+static void trbe_enable_hw(struct trbe_buf *buf)
+{
+       WARN_ON(buf->trbe_write < buf->trbe_base);
+       WARN_ON(buf->trbe_write >= buf->trbe_limit);
+       set_trbe_disabled();
+       isb();
+       clr_trbe_status();
+       set_trbe_base_pointer(buf->trbe_base);
+       set_trbe_write_pointer(buf->trbe_write);
+
+       /*
+        * Synchronize all the register updates
+        * till now before enabling the TRBE.
+        */
+       isb();
+       set_trbe_limit_pointer_enabled(buf->trbe_limit);
+}
+
+static enum trbe_fault_action trbe_get_fault_act(u64 trbsr)
+{
+       int ec = get_trbe_ec(trbsr);
+       int bsc = get_trbe_bsc(trbsr);
+
+       WARN_ON(is_trbe_running(trbsr));
+       if (is_trbe_trg(trbsr) || is_trbe_abort(trbsr))
+               return TRBE_FAULT_ACT_FATAL;
+
+       if ((ec == TRBE_EC_STAGE1_ABORT) || (ec == TRBE_EC_STAGE2_ABORT))
+               return TRBE_FAULT_ACT_FATAL;
+
+       if (is_trbe_wrap(trbsr) && (ec == TRBE_EC_OTHERS) && (bsc == TRBE_BSC_FILLED)) {
+               if (get_trbe_write_pointer() == get_trbe_base_pointer())
+                       return TRBE_FAULT_ACT_WRAP;
+       }
+       return TRBE_FAULT_ACT_SPURIOUS;
+}
+
+static void *arm_trbe_alloc_buffer(struct coresight_device *csdev,
+                                  struct perf_event *event, void **pages,
+                                  int nr_pages, bool snapshot)
+{
+       struct trbe_buf *buf;
+       struct page **pglist;
+       int i;
+
+       /*
+        * The TRBE LIMIT and TRBE WRITE pointers must be page aligned. But with
+        * just a single page, there would not be any room left to write into a
+        * partially filled TRBE buffer after the page size alignment.
+        * Hence restrict the minimum buffer size to two pages.
+        */
+       if (nr_pages < 2)
+               return NULL;
+
+       buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, trbe_alloc_node(event));
+       if (!buf)
+               return ERR_PTR(-ENOMEM);
+
+       pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
+       if (!pglist) {
+               kfree(buf);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       for (i = 0; i < nr_pages; i++)
+               pglist[i] = virt_to_page(pages[i]);
+
+       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
+       if (!buf->trbe_base) {
+               kfree(pglist);
+               kfree(buf);
+               return ERR_PTR(-ENOMEM);
+       }
+       buf->trbe_limit = buf->trbe_base + nr_pages * PAGE_SIZE;
+       buf->trbe_write = buf->trbe_base;
+       buf->snapshot = snapshot;
+       buf->nr_pages = nr_pages;
+       buf->pages = pages;
+       kfree(pglist);
+       return buf;
+}
+
+static void arm_trbe_free_buffer(void *config)
+{
+       struct trbe_buf *buf = config;
+
+       vunmap((void *)buf->trbe_base);
+       kfree(buf);
+}
+
+static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev,
+                                           struct perf_output_handle *handle,
+                                           void *config)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct trbe_buf *buf = config;
+       enum trbe_fault_action act;
+       unsigned long size, offset;
+       unsigned long write, base, status;
+       unsigned long flags;
+
+       WARN_ON(buf->cpudata != cpudata);
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (cpudata->mode != CS_MODE_PERF)
+               return 0;
+
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW);
+
+       /*
+        * We are about to disable the TRBE, and this could in turn
+        * fill up the buffer, triggering an IRQ. This could be consumed
+        * by the PE asynchronously, causing a race here against
+        * the IRQ handler closing out the handle. So, let us
+        * make sure the IRQ can't trigger while we are collecting
+        * the buffer. We also make sure that a WRAP event is handled
+        * accordingly.
+        */
+       local_irq_save(flags);
+
+       /*
+        * If the TRBE was disabled due to lack of space in the AUX buffer or a
+        * spurious fault, the driver leaves it disabled, truncating the buffer.
+        * Since the etm_perf driver expects to close out the AUX buffer, the
+        * driver skips it. Thus, just pass in 0 size here to indicate that the
+        * buffer was truncated.
+        */
+       if (!is_trbe_enabled()) {
+               size = 0;
+               goto done;
+       }
+       /*
+        * The perf handle structure needs to be shared with the TRBE IRQ handler for
+        * capturing trace data and restarting the handle. There is a possibility
+        * of a crash via a dangling reference when the etm event is being stopped
+        * while a TRBE IRQ is also being processed. This happens due to the release
+        * of the perf handle via perf_aux_output_end() in etm_event_stop(). Stopping
+        * the TRBE here ensures that no IRQ can be generated when the perf
+        * handle gets freed in etm_event_stop().
+        */
+       trbe_drain_and_disable_local();
+       write = get_trbe_write_pointer();
+       base = get_trbe_base_pointer();
+
+       /* Check if there is a pending interrupt and handle it here */
+       status = read_sysreg_s(SYS_TRBSR_EL1);
+       if (is_trbe_irq(status)) {
+
+               /*
+                * Now that we are handling the IRQ here, clear the IRQ
+                * from the status, to let the irq handler know that it
+                * is taken care of.
+                */
+               clr_trbe_irq();
+               isb();
+
+               act = trbe_get_fault_act(status);
+               /*
+                * If this was not due to a WRAP event, we have some
+                * errors and as such the buffer is empty.
+                */
+               if (act != TRBE_FAULT_ACT_WRAP) {
+                       size = 0;
+                       goto done;
+               }
+
+               /*
+                * Otherwise, the buffer is full and the write pointer
+                * has reached base. Adjust this back to the Limit pointer
+                * for correct size. Also, mark the buffer truncated.
+                */
+               write = get_trbe_limit_pointer();
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       }
+
+       offset = write - base;
+       if (WARN_ON_ONCE(offset < PERF_IDX2OFF(handle->head, buf)))
+               size = 0;
+       else
+               size = offset - PERF_IDX2OFF(handle->head, buf);
+
+done:
+       local_irq_restore(flags);
+
+       if (buf->snapshot)
+               handle->head += size;
+       return size;
+}
+
+static int arm_trbe_enable(struct coresight_device *csdev, u32 mode, void *data)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct perf_output_handle *handle = data;
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (mode != CS_MODE_PERF)
+               return -EINVAL;
+
+       *this_cpu_ptr(drvdata->handle) = handle;
+       cpudata->buf = buf;
+       cpudata->mode = mode;
+       buf->cpudata = cpudata;
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_stop_and_truncate_event(handle);
+               return 0;
+       }
+       trbe_enable_hw(buf);
+       return 0;
+}
+
+static int arm_trbe_disable(struct coresight_device *csdev)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct trbe_buf *buf = cpudata->buf;
+
+       WARN_ON(buf->cpudata != cpudata);
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (cpudata->mode != CS_MODE_PERF)
+               return -EINVAL;
+
+       trbe_drain_and_disable_local();
+       buf->cpudata = NULL;
+       cpudata->buf = NULL;
+       cpudata->mode = CS_MODE_DISABLED;
+       return 0;
+}
+
+static void trbe_handle_spurious(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_drain_and_disable_local();
+               return;
+       }
+       trbe_enable_hw(buf);
+}
+
+static void trbe_handle_overflow(struct perf_output_handle *handle)
+{
+       struct perf_event *event = handle->event;
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long offset, size;
+       struct etm_event_data *event_data;
+
+       offset = get_trbe_limit_pointer() - get_trbe_base_pointer();
+       size = offset - PERF_IDX2OFF(handle->head, buf);
+       if (buf->snapshot)
+               handle->head += size;
+
+       /*
+        * Mark the buffer as truncated, as we have stopped the trace
+        * collection upon the WRAP event, without stopping the source.
+        */
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW |
+                                    PERF_AUX_FLAG_TRUNCATED);
+       perf_aux_output_end(handle, size);
+       event_data = perf_aux_output_begin(handle, event);
+       if (!event_data) {
+               /*
+                * We are unable to restart the trace collection,
+                * thus leave the TRBE disabled. The etm-perf driver
+                * is able to detect this with a disconnected handle
+                * (handle->event = NULL).
+                */
+               trbe_drain_and_disable_local();
+               *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL;
+               return;
+       }
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_stop_and_truncate_event(handle);
+               return;
+       }
+       *this_cpu_ptr(buf->cpudata->drvdata->handle) = handle;
+       trbe_enable_hw(buf);
+}
+
+static bool is_perf_trbe(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       struct trbe_cpudata *cpudata = buf->cpudata;
+       struct trbe_drvdata *drvdata = cpudata->drvdata;
+       int cpu = smp_processor_id();
+
+       WARN_ON(buf->trbe_base != get_trbe_base_pointer());
+       WARN_ON(buf->trbe_limit != get_trbe_limit_pointer());
+
+       if (cpudata->mode != CS_MODE_PERF)
+               return false;
+
+       if (cpudata->cpu != cpu)
+               return false;
+
+       if (!cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+               return false;
+
+       return true;
+}
+
+static irqreturn_t arm_trbe_irq_handler(int irq, void *dev)
+{
+       struct perf_output_handle **handle_ptr = dev;
+       struct perf_output_handle *handle = *handle_ptr;
+       enum trbe_fault_action act;
+       u64 status;
+
+       /*
+        * Ensure the trace is visible to the CPUs and
+        * any external aborts have been resolved.
+        */
+       trbe_drain_and_disable_local();
+
+       status = read_sysreg_s(SYS_TRBSR_EL1);
+       /*
+        * If the pending IRQ was handled by update_buffer callback
+        * we have nothing to do here.
+        */
+       if (!is_trbe_irq(status))
+               return IRQ_NONE;
+
+       clr_trbe_irq();
+       isb();
+
+       if (WARN_ON_ONCE(!handle) || !perf_get_aux(handle))
+               return IRQ_NONE;
+
+       if (!is_perf_trbe(handle))
+               return IRQ_NONE;
+
+       /*
+        * Ensure perf callbacks have completed, which may disable
+        * the trace buffer in response to a TRUNCATION flag.
+        */
+       irq_work_run();
+
+       act = trbe_get_fault_act(status);
+       switch (act) {
+       case TRBE_FAULT_ACT_WRAP:
+               trbe_handle_overflow(handle);
+               break;
+       case TRBE_FAULT_ACT_SPURIOUS:
+               trbe_handle_spurious(handle);
+               break;
+       case TRBE_FAULT_ACT_FATAL:
+               trbe_stop_and_truncate_event(handle);
+               break;
+       }
+       return IRQ_HANDLED;
+}
+
+static const struct coresight_ops_sink arm_trbe_sink_ops = {
+       .enable         = arm_trbe_enable,
+       .disable        = arm_trbe_disable,
+       .alloc_buffer   = arm_trbe_alloc_buffer,
+       .free_buffer    = arm_trbe_free_buffer,
+       .update_buffer  = arm_trbe_update_buffer,
+};
+
+static const struct coresight_ops arm_trbe_cs_ops = {
+       .sink_ops       = &arm_trbe_sink_ops,
+};
+
+static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct trbe_cpudata *cpudata = dev_get_drvdata(dev);
+
+       return sprintf(buf, "%llx\n", cpudata->trbe_align);
+}
+static DEVICE_ATTR_RO(align);
+
+static ssize_t flag_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct trbe_cpudata *cpudata = dev_get_drvdata(dev);
+
+       return sprintf(buf, "%d\n", cpudata->trbe_flag);
+}
+static DEVICE_ATTR_RO(flag);
+
+static struct attribute *arm_trbe_attrs[] = {
+       &dev_attr_align.attr,
+       &dev_attr_flag.attr,
+       NULL,
+};
+
+static const struct attribute_group arm_trbe_group = {
+       .attrs = arm_trbe_attrs,
+};
+
+static const struct attribute_group *arm_trbe_groups[] = {
+       &arm_trbe_group,
+       NULL,
+};
+
+static void arm_trbe_enable_cpu(void *info)
+{
+       struct trbe_drvdata *drvdata = info;
+
+       trbe_reset_local();
+       enable_percpu_irq(drvdata->irq, IRQ_TYPE_NONE);
+}
+
+static void arm_trbe_register_coresight_cpu(struct trbe_drvdata *drvdata, int cpu)
+{
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       struct coresight_device *trbe_csdev = coresight_get_percpu_sink(cpu);
+       struct coresight_desc desc = { 0 };
+       struct device *dev;
+
+       if (WARN_ON(trbe_csdev))
+               return;
+
+       dev = &cpudata->drvdata->pdev->dev;
+       desc.name = devm_kasprintf(dev, GFP_KERNEL, "trbe%d", cpu);
+       if (!desc.name)
+               goto cpu_clear;
+
+       desc.type = CORESIGHT_DEV_TYPE_SINK;
+       desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM;
+       desc.ops = &arm_trbe_cs_ops;
+       desc.pdata = dev_get_platdata(dev);
+       desc.groups = arm_trbe_groups;
+       desc.dev = dev;
+       trbe_csdev = coresight_register(&desc);
+       if (IS_ERR(trbe_csdev))
+               goto cpu_clear;
+
+       dev_set_drvdata(&trbe_csdev->dev, cpudata);
+       coresight_set_percpu_sink(cpu, trbe_csdev);
+       return;
+cpu_clear:
+       cpumask_clear_cpu(cpu, &drvdata->supported_cpus);
+}
+
+static void arm_trbe_probe_cpu(void *info)
+{
+       struct trbe_drvdata *drvdata = info;
+       int cpu = smp_processor_id();
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       u64 trbidr;
+
+       if (WARN_ON(!cpudata))
+               goto cpu_clear;
+
+       if (!is_trbe_available()) {
+               pr_err("TRBE is not implemented on cpu %d\n", cpu);
+               goto cpu_clear;
+       }
+
+       trbidr = read_sysreg_s(SYS_TRBIDR_EL1);
+       if (!is_trbe_programmable(trbidr)) {
+               pr_err("TRBE is owned in higher exception level on cpu %d\n", cpu);
+               goto cpu_clear;
+       }
+
+       cpudata->trbe_align = 1ULL << get_trbe_address_align(trbidr);
+       if (cpudata->trbe_align > SZ_2K) {
+               pr_err("Unsupported alignment on cpu %d\n", cpu);
+               goto cpu_clear;
+       }
+       cpudata->trbe_flag = get_trbe_flag_update(trbidr);
+       cpudata->cpu = cpu;
+       cpudata->drvdata = drvdata;
+       return;
+cpu_clear:
+       cpumask_clear_cpu(cpu, &drvdata->supported_cpus);
+}
+
+static void arm_trbe_remove_coresight_cpu(void *info)
+{
+       int cpu = smp_processor_id();
+       struct trbe_drvdata *drvdata = info;
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       struct coresight_device *trbe_csdev = coresight_get_percpu_sink(cpu);
+
+       disable_percpu_irq(drvdata->irq);
+       trbe_reset_local();
+       if (trbe_csdev) {
+               coresight_unregister(trbe_csdev);
+               cpudata->drvdata = NULL;
+               coresight_set_percpu_sink(cpu, NULL);
+       }
+}
+
+static int arm_trbe_probe_coresight(struct trbe_drvdata *drvdata)
+{
+       int cpu;
+
+       drvdata->cpudata = alloc_percpu(typeof(*drvdata->cpudata));
+       if (!drvdata->cpudata)
+               return -ENOMEM;
+
+       for_each_cpu(cpu, &drvdata->supported_cpus) {
+               smp_call_function_single(cpu, arm_trbe_probe_cpu, drvdata, 1);
+               if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                       arm_trbe_register_coresight_cpu(drvdata, cpu);
+               if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                       smp_call_function_single(cpu, arm_trbe_enable_cpu, drvdata, 1);
+       }
+       return 0;
+}
+
+static int arm_trbe_remove_coresight(struct trbe_drvdata *drvdata)
+{
+       int cpu;
+
+       for_each_cpu(cpu, &drvdata->supported_cpus)
+               smp_call_function_single(cpu, arm_trbe_remove_coresight_cpu, drvdata, 1);
+       free_percpu(drvdata->cpudata);
+       return 0;
+}
+
+static int arm_trbe_cpu_startup(unsigned int cpu, struct hlist_node *node)
+{
+       struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node);
+
+       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) {
+
+               /*
+                * If this CPU was not probed for TRBE,
+                * initialize it now.
+                */
+               if (!coresight_get_percpu_sink(cpu)) {
+                       arm_trbe_probe_cpu(drvdata);
+                       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                               arm_trbe_register_coresight_cpu(drvdata, cpu);
+                       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                               arm_trbe_enable_cpu(drvdata);
+               } else {
+                       arm_trbe_enable_cpu(drvdata);
+               }
+       }
+       return 0;
+}
+
+static int arm_trbe_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+       struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node);
+
+       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) {
+               disable_percpu_irq(drvdata->irq);
+               trbe_reset_local();
+       }
+       return 0;
+}
+
+static int arm_trbe_probe_cpuhp(struct trbe_drvdata *drvdata)
+{
+       enum cpuhp_state trbe_online;
+       int ret;
+
+       trbe_online = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME,
+                                             arm_trbe_cpu_startup, arm_trbe_cpu_teardown);
+       if (trbe_online < 0)
+               return trbe_online;
+
+       ret = cpuhp_state_add_instance(trbe_online, &drvdata->hotplug_node);
+       if (ret) {
+               cpuhp_remove_multi_state(trbe_online);
+               return ret;
+       }
+       drvdata->trbe_online = trbe_online;
+       return 0;
+}
+
+static void arm_trbe_remove_cpuhp(struct trbe_drvdata *drvdata)
+{
+       cpuhp_remove_multi_state(drvdata->trbe_online);
+}
+
+static int arm_trbe_probe_irq(struct platform_device *pdev,
+                             struct trbe_drvdata *drvdata)
+{
+       int ret;
+
+       drvdata->irq = platform_get_irq(pdev, 0);
+       if (drvdata->irq < 0) {
+               pr_err("IRQ not found for the platform device\n");
+               return drvdata->irq;
+       }
+
+       if (!irq_is_percpu(drvdata->irq)) {
+               pr_err("IRQ is not a PPI\n");
+               return -EINVAL;
+       }
+
+       if (irq_get_percpu_devid_partition(drvdata->irq, &drvdata->supported_cpus))
+               return -EINVAL;
+
+       drvdata->handle = alloc_percpu(struct perf_output_handle *);
+       if (!drvdata->handle)
+               return -ENOMEM;
+
+       ret = request_percpu_irq(drvdata->irq, arm_trbe_irq_handler, DRVNAME, drvdata->handle);
+       if (ret) {
+               free_percpu(drvdata->handle);
+               return ret;
+       }
+       return 0;
+}
+
+static void arm_trbe_remove_irq(struct trbe_drvdata *drvdata)
+{
+       free_percpu_irq(drvdata->irq, drvdata->handle);
+       free_percpu(drvdata->handle);
+}
+
+static int arm_trbe_device_probe(struct platform_device *pdev)
+{
+       struct coresight_platform_data *pdata;
+       struct trbe_drvdata *drvdata;
+       struct device *dev = &pdev->dev;
+       int ret;
+
+       drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
+       if (!drvdata)
+               return -ENOMEM;
+
+       pdata = coresight_get_platform_data(dev);
+       if (IS_ERR(pdata))
+               return PTR_ERR(pdata);
+
+       dev_set_drvdata(dev, drvdata);
+       dev->platform_data = pdata;
+       drvdata->pdev = pdev;
+       ret = arm_trbe_probe_irq(pdev, drvdata);
+       if (ret)
+               return ret;
+
+       ret = arm_trbe_probe_coresight(drvdata);
+       if (ret)
+               goto probe_failed;
+
+       ret = arm_trbe_probe_cpuhp(drvdata);
+       if (ret)
+               goto cpuhp_failed;
+
+       return 0;
+cpuhp_failed:
+       arm_trbe_remove_coresight(drvdata);
+probe_failed:
+       arm_trbe_remove_irq(drvdata);
+       return ret;
+}
+
+static int arm_trbe_device_remove(struct platform_device *pdev)
+{
+       struct trbe_drvdata *drvdata = platform_get_drvdata(pdev);
+
+       arm_trbe_remove_cpuhp(drvdata);
+       arm_trbe_remove_coresight(drvdata);
+       arm_trbe_remove_irq(drvdata);
+       return 0;
+}
+
+static const struct of_device_id arm_trbe_of_match[] = {
+       { .compatible = "arm,trace-buffer-extension"},
+       {},
+};
+MODULE_DEVICE_TABLE(of, arm_trbe_of_match);
+
+static struct platform_driver arm_trbe_driver = {
+       .driver = {
+               .name = DRVNAME,
+               .of_match_table = of_match_ptr(arm_trbe_of_match),
+               .suppress_bind_attrs = true,
+       },
+       .probe  = arm_trbe_device_probe,
+       .remove = arm_trbe_device_remove,
+};
+
+static int __init arm_trbe_init(void)
+{
+       int ret;
+
+       if (arm64_kernel_unmapped_at_el0()) {
+               pr_err("TRBE wouldn't work if kernel gets unmapped at EL0\n");
+               return -EOPNOTSUPP;
+       }
+
+       ret = platform_driver_register(&arm_trbe_driver);
+       if (!ret)
+               return 0;
+
+       pr_err("Error registering %s platform driver\n", DRVNAME);
+       return ret;
+}
+
+static void __exit arm_trbe_exit(void)
+{
+       platform_driver_unregister(&arm_trbe_driver);
+}
+module_init(arm_trbe_init);
+module_exit(arm_trbe_exit);
+
+MODULE_AUTHOR("Anshuman Khandual <anshuman.khandual@arm.com>");
+MODULE_DESCRIPTION("Arm Trace Buffer Extension (TRBE) driver");
+MODULE_LICENSE("GPL v2");
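
With the sink registered as "trbe<cpu>" (see arm_trbe_register_coresight_cpu() above), trace collection is driven entirely from perf. Assuming the CPU 0 instance comes up as trbe0, an invocation along these lines selects it explicitly:

perf record -e cs_etm/@trbe0/u -C 0 -- sleep 1

The per-CPU sink subtype advertised above is what lets the etm-perf layer pair each ETE source with its local TRBE instead of a shared system-memory sink.
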
diff --git a/drivers/hwtracing/coresight/coresight-trbe.h b/drivers/hwtracing/coresight/coresight-trbe.h
new file mode 100644 (file)
index 0000000..abf3e36
--- /dev/null
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This contains all the required hardware-related helper functions for
+ * the Trace Buffer Extension (TRBE) driver in the coresight framework.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include <linux/coresight.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+
+#include "coresight-etm-perf.h"
+
+static inline bool is_trbe_available(void)
+{
+       u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
+       unsigned int trbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_TRBE_SHIFT);
+
+       return trbe >= 0b0001;
+}
+
+static inline bool is_trbe_enabled(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       return trblimitr & TRBLIMITR_ENABLE;
+}
+
+#define TRBE_EC_OTHERS         0
+#define TRBE_EC_STAGE1_ABORT   36
+#define TRBE_EC_STAGE2_ABORT   37
+
+static inline int get_trbe_ec(u64 trbsr)
+{
+       return (trbsr >> TRBSR_EC_SHIFT) & TRBSR_EC_MASK;
+}
+
+#define TRBE_BSC_NOT_STOPPED 0
+#define TRBE_BSC_FILLED      1
+#define TRBE_BSC_TRIGGERED   2
+
+static inline int get_trbe_bsc(u64 trbsr)
+{
+       return (trbsr >> TRBSR_BSC_SHIFT) & TRBSR_BSC_MASK;
+}
+
+static inline void clr_trbe_irq(void)
+{
+       u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1);
+
+       trbsr &= ~TRBSR_IRQ;
+       write_sysreg_s(trbsr, SYS_TRBSR_EL1);
+}
+
+static inline bool is_trbe_irq(u64 trbsr)
+{
+       return trbsr & TRBSR_IRQ;
+}
+
+static inline bool is_trbe_trg(u64 trbsr)
+{
+       return trbsr & TRBSR_TRG;
+}
+
+static inline bool is_trbe_wrap(u64 trbsr)
+{
+       return trbsr & TRBSR_WRAP;
+}
+
+static inline bool is_trbe_abort(u64 trbsr)
+{
+       return trbsr & TRBSR_ABORT;
+}
+
+static inline bool is_trbe_running(u64 trbsr)
+{
+       return !(trbsr & TRBSR_STOP);
+}
+
+#define TRBE_TRIG_MODE_STOP            0
+#define TRBE_TRIG_MODE_IRQ             1
+#define TRBE_TRIG_MODE_IGNORE          3
+
+#define TRBE_FILL_MODE_FILL            0
+#define TRBE_FILL_MODE_WRAP            1
+#define TRBE_FILL_MODE_CIRCULAR_BUFFER 3
+
+static inline void set_trbe_disabled(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       trblimitr &= ~TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+}
+
+static inline bool get_trbe_flag_update(u64 trbidr)
+{
+       return trbidr & TRBIDR_FLAG;
+}
+
+static inline bool is_trbe_programmable(u64 trbidr)
+{
+       return !(trbidr & TRBIDR_PROG);
+}
+
+static inline int get_trbe_address_align(u64 trbidr)
+{
+       return (trbidr >> TRBIDR_ALIGN_SHIFT) & TRBIDR_ALIGN_MASK;
+}
+
+static inline unsigned long get_trbe_write_pointer(void)
+{
+       return read_sysreg_s(SYS_TRBPTR_EL1);
+}
+
+static inline void set_trbe_write_pointer(unsigned long addr)
+{
+       WARN_ON(is_trbe_enabled());
+       write_sysreg_s(addr, SYS_TRBPTR_EL1);
+}
+
+static inline unsigned long get_trbe_limit_pointer(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+       unsigned long addr = trblimitr & (TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT);
+
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+       return addr;
+}
+
+static inline unsigned long get_trbe_base_pointer(void)
+{
+       u64 trbbaser = read_sysreg_s(SYS_TRBBASER_EL1);
+       unsigned long addr = trbbaser & (TRBBASER_BASE_MASK << TRBBASER_BASE_SHIFT);
+
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+       return addr;
+}
+
+static inline void set_trbe_base_pointer(unsigned long addr)
+{
+       WARN_ON(is_trbe_enabled());
+       WARN_ON(!IS_ALIGNED(addr, (1UL << TRBBASER_BASE_SHIFT)));
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+       write_sysreg_s(addr, SYS_TRBBASER_EL1);
+}
index ed46e60..d205faf 100644 (file)
@@ -794,8 +794,13 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its,
 
        its_encode_alloc(cmd, alloc);
 
-       /* We can only signal PTZ when alloc==1. Why do we have two bits? */
-       its_encode_ptz(cmd, alloc);
+       /*
+        * GICv4.1 provides a way to get the VLPI state, which needs the vPE
+        * to be unmapped first, and in this case, we may remap the vPE
+        * back while the VPT is not empty. So we can't assume that the
+        * VPT is empty on map. This is why we never advertise PTZ.
+        */
+       its_encode_ptz(cmd, false);
        its_encode_vconf_addr(cmd, vconf_addr);
        its_encode_vmapp_default_db(cmd, desc->its_vmapp_cmd.vpe->vpe_db_lpi);
 
@@ -4554,6 +4559,15 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain,
 
                its_send_vmapp(its, vpe, false);
        }
+
+       /*
+        * There may be a direct read to the VPT after unmapping the
+        * vPE; to guarantee the validity of this read, we make the VPT
+        * memory coherent with the CPU caches here.
+        */
+       if (find_4_1_its() && !atomic_read(&vpe->vmapp_count))
+               gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+                                       LPI_PENDBASE_SZ);
 }
 
 static const struct irq_domain_ops its_vpe_domain_ops = {
index 2d10d84..d4f7f1f 100644 (file)
@@ -581,33 +581,6 @@ static const struct attribute_group armpmu_common_attr_group = {
        .attrs = armpmu_common_attrs,
 };
 
-/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *__oprofile_cpu_pmu;
-
-/*
- * Despite the names, these two functions are CPU-specific and are used
- * by the OProfile/perf code.
- */
-const char *perf_pmu_name(void)
-{
-       if (!__oprofile_cpu_pmu)
-               return NULL;
-
-       return __oprofile_cpu_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       int max_events = 0;
-
-       if (__oprofile_cpu_pmu != NULL)
-               max_events = __oprofile_cpu_pmu->num_events;
-
-       return max_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
 static int armpmu_count_irq_users(const int irq)
 {
        int cpu, count = 0;
@@ -979,9 +952,6 @@ int armpmu_register(struct arm_pmu *pmu)
        if (ret)
                goto out_destroy;
 
-       if (!__oprofile_cpu_pmu)
-               __oprofile_cpu_pmu = pmu;
-
        pr_info("enabled with %s PMU driver, %d counters available%s\n",
                pmu->name, pmu->num_events,
                has_nmi ? ", using NMIs" : "");
index f2edef0..8c20e52 100644 (file)
@@ -108,7 +108,7 @@ config PTP_1588_CLOCK_PCH
 config PTP_1588_CLOCK_KVM
        tristate "KVM virtual PTP clock"
        depends on PTP_1588_CLOCK
-       depends on KVM_GUEST && X86
+       depends on (KVM_GUEST && X86) || (HAVE_ARM_SMCCC_DISCOVERY && ARM_ARCH_TIMER)
        default y
        help
          This driver adds support for using kvm infrastructure as a PTP
index db5aef3..8673d17 100644 (file)
@@ -4,6 +4,8 @@
 #
 
 ptp-y                                  := ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-$(CONFIG_X86)                  := ptp_kvm_x86.o ptp_kvm_common.o
+ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC)       := ptp_kvm_arm.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)           += ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)       += ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)      += ptp_ines.o
diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c
new file mode 100644 (file)
index 0000000..b7d28c8
--- /dev/null
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/ptp_kvm.h>
+
+#include <asm/arch_timer.h>
+#include <asm/hypervisor.h>
+
+int kvm_arch_ptp_init(void)
+{
+       int ret;
+
+       ret = kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP);
+       if (ret <= 0)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+       return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL);
+}
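
The crosststamp helper used above is provided by the arch timer code elsewhere in this series. A rough sketch of the guest side of the ABI, assuming the host wall clock in nanoseconds is returned in a0/a1 and the counter value in a2/a3 (see the SMCCC definitions added later in this patch); this is an illustration, not the shipped implementation:

#include <linux/arm-smccc.h>
#include <linux/ktime.h>

/* Illustrative only: the real version lives with the arch timer driver. */
static int example_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts)
{
	struct arm_smccc_res res;

	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
			     KVM_PTP_VIRT_COUNTER, &res);
	if ((long)res.a0 < 0)
		return -EOPNOTSUPP;

	/* a0/a1 carry the host wall clock (ns), a2/a3 the counter snapshot */
	*ts = ktime_to_timespec64((u64)res.a0 << 32 | res.a1);
	if (cycle)
		*cycle = (u64)res.a2 << 32 | res.a3;
	return 0;
}
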
similarity index 60%
rename from drivers/ptp/ptp_kvm.c
rename to drivers/ptp/ptp_kvm_common.c
index 658d33f..fcae32f 100644 (file)
@@ -8,11 +8,11 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/ptp_kvm.h>
 #include <uapi/linux/kvm_para.h>
 #include <asm/kvm_para.h>
-#include <asm/pvclock.h>
-#include <asm/kvmclock.h>
 #include <uapi/asm/kvm_para.h>
 
 #include <linux/ptp_clock_kernel.h>
@@ -24,56 +24,29 @@ struct kvm_ptp_clock {
 
 static DEFINE_SPINLOCK(kvm_ptp_lock);
 
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
 static int ptp_kvm_get_time_fn(ktime_t *device_time,
                               struct system_counterval_t *system_counter,
                               void *ctx)
 {
-       unsigned long ret;
+       long ret;
+       u64 cycle;
        struct timespec64 tspec;
-       unsigned version;
-       int cpu;
-       struct pvclock_vcpu_time_info *src;
+       struct clocksource *cs;
 
        spin_lock(&kvm_ptp_lock);
 
        preempt_disable_notrace();
-       cpu = smp_processor_id();
-       src = &hv_clock[cpu].pvti;
-
-       do {
-               /*
-                * We are using a TSC value read in the hosts
-                * kvm_hc_clock_pairing handling.
-                * So any changes to tsc_to_system_mul
-                * and tsc_shift or any other pvclock
-                * data invalidate that measurement.
-                */
-               version = pvclock_read_begin(src);
-
-               ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-                                    clock_pair_gpa,
-                                    KVM_CLOCK_PAIRING_WALLCLOCK);
-               if (ret != 0) {
-                       pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
-                       spin_unlock(&kvm_ptp_lock);
-                       preempt_enable_notrace();
-                       return -EOPNOTSUPP;
-               }
-
-               tspec.tv_sec = clock_pair.sec;
-               tspec.tv_nsec = clock_pair.nsec;
-               ret = __pvclock_read_cycles(src, clock_pair.tsc);
-       } while (pvclock_read_retry(src, version));
+       ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs);
+       if (ret) {
+               spin_unlock(&kvm_ptp_lock);
+               preempt_enable_notrace();
+               return ret;
+       }
 
        preempt_enable_notrace();
 
-       system_counter->cycles = ret;
-       system_counter->cs = &kvm_clock;
+       system_counter->cycles = cycle;
+       system_counter->cs = cs;
 
        *device_time = timespec64_to_ktime(tspec);
 
@@ -111,22 +84,17 @@ static int ptp_kvm_settime(struct ptp_clock_info *ptp,
 
 static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 {
-       unsigned long ret;
+       long ret;
        struct timespec64 tspec;
 
        spin_lock(&kvm_ptp_lock);
 
-       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-                            clock_pair_gpa,
-                            KVM_CLOCK_PAIRING_WALLCLOCK);
-       if (ret != 0) {
-               pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
+       ret = kvm_arch_ptp_get_clock(&tspec);
+       if (ret) {
                spin_unlock(&kvm_ptp_lock);
-               return -EOPNOTSUPP;
+               return ret;
        }
 
-       tspec.tv_sec = clock_pair.sec;
-       tspec.tv_nsec = clock_pair.nsec;
        spin_unlock(&kvm_ptp_lock);
 
        memcpy(ts, &tspec, sizeof(struct timespec64));
@@ -168,19 +136,12 @@ static int __init ptp_kvm_init(void)
 {
        long ret;
 
-       if (!kvm_para_available())
-               return -ENODEV;
-
-       clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-       hv_clock = pvclock_get_pvti_cpu0_va();
-
-       if (!hv_clock)
-               return -ENODEV;
-
-       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
-                       KVM_CLOCK_PAIRING_WALLCLOCK);
-       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
-               return -ENODEV;
+       ret = kvm_arch_ptp_init();
+       if (ret) {
+               if (ret != -EOPNOTSUPP)
+                       pr_err("failed to initialize ptp_kvm\n");
+               return ret;
+       }
 
        kvm_ptp_clock.caps = ptp_kvm_caps;
 
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
new file mode 100644 (file)
index 0000000..3dd519d
--- /dev/null
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <linux/module.h>
+#include <uapi/asm/kvm_para.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/ptp_kvm.h>
+
+struct pvclock_vsyscall_time_info *hv_clock;
+
+static phys_addr_t clock_pair_gpa;
+static struct kvm_clock_pairing clock_pair;
+
+int kvm_arch_ptp_init(void)
+{
+       long ret;
+
+       if (!kvm_para_available())
+               return -ENODEV;
+
+       clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+       hv_clock = pvclock_get_pvti_cpu0_va();
+       if (!hv_clock)
+               return -ENODEV;
+
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+                            KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+               return -ENODEV;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+       long ret;
+
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+                            clock_pair_gpa,
+                            KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret != 0) {
+               pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
+               return -EOPNOTSUPP;
+       }
+
+       ts->tv_sec = clock_pair.sec;
+       ts->tv_nsec = clock_pair.nsec;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
+                             struct clocksource **cs)
+{
+       struct pvclock_vcpu_time_info *src;
+       unsigned int version;
+       long ret;
+       int cpu;
+
+       cpu = smp_processor_id();
+       src = &hv_clock[cpu].pvti;
+
+       do {
+               /*
+                * We are using a TSC value read in the host's
+                * kvm_hc_clock_pairing handling.
+                * So any changes to tsc_to_system_mul
+                * and tsc_shift or any other pvclock
+                * data invalidate that measurement.
+                */
+               version = pvclock_read_begin(src);
+
+               ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+                                    clock_pair_gpa,
+                                    KVM_CLOCK_PAIRING_WALLCLOCK);
+               if (ret != 0) {
+                       pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+                       return -EOPNOTSUPP;
+               }
+               tspec->tv_sec = clock_pair.sec;
+               tspec->tv_nsec = clock_pair.nsec;
+               *cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+       } while (pvclock_read_retry(src, version));
+
+       *cs = &kvm_clock;
+
+       return 0;
+}
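
Whichever backend is built, the result is a regular PTP clock device that user space can consume directly. Assuming it registers as /dev/ptp0 (the node name is an assumption here), a chrony refclock line such as the following keeps a guest synchronized to its host:

refclock PHC /dev/ptp0 poll 3 dpoll -2 offset 0
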
index 6fd3cda..864b999 100644 (file)
@@ -61,6 +61,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
+int kvm_pmu_probe_pmuver(void);
 #else
 struct kvm_pmu {
 };
@@ -116,6 +117,9 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
 {
        return 0;
 }
+
+static inline int kvm_pmu_probe_pmuver(void) { return 0xf; }
+
 #endif
 
 #endif
index 3d74f10..ec62118 100644 (file)
@@ -322,6 +322,7 @@ struct vgic_cpu {
         */
        struct vgic_io_device   rd_iodev;
        struct vgic_redist_region *rdreg;
+       u32 rdreg_index;
 
        /* Contains the attributes and gpa of the LPI pending tables. */
        u64 pendbaser;
index 62c5423..6861489 100644 (file)
@@ -55,6 +55,8 @@
 #define ARM_SMCCC_OWNER_TRUSTED_OS     50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
 
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
 #define ARM_SMCCC_QUIRK_NONE           0
 #define ARM_SMCCC_QUIRK_QCOM_A6                1 /* Save/restore register a6 */
 
                           ARM_SMCCC_SMC_32,                            \
                           0, 0x7fff)
 
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID                          \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0     0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1     0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2     0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3     0x743a004dU
+
+/* KVM "vendor specific" services */
+#define ARM_SMCCC_KVM_FUNC_FEATURES            0
+#define ARM_SMCCC_KVM_FUNC_PTP                 1
+#define ARM_SMCCC_KVM_FUNC_FEATURES_2          127
+#define ARM_SMCCC_KVM_NUM_FUNCS                        128
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID                      \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_FEATURES)
+
 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED   1
 
+/*
+ * ptp_kvm is a feature used for time synchronization between the VM and
+ * the host. The ptp_kvm module in the guest kernel gets this service from
+ * the host using this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID                           \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_PTP)
+
+/* ptp_kvm counter type ID */
+#define KVM_PTP_VIRT_COUNTER                   0
+#define KVM_PTP_PHYS_COUNTER                   1
+
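
For context, a guest decides whether these services exist by comparing the UID words above and then reading the feature bitmap. A condensed sketch of that probe, modelled loosely on the SMCCC guest support added elsewhere in this merge (the function name is hypothetical):

#include <linux/arm-smccc.h>
#include <linux/bits.h>

static bool example_kvm_hyp_ptp_supported(void)
{
	struct arm_smccc_res res;

	/* Ask the hypervisor for its UID and match it against KVM's */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
	if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
	    res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
	    res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
	    res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
		return false;

	/* Functions 0-31 are reported as a bitmap in the first result word */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
	return res.a0 & BIT(ARM_SMCCC_KVM_FUNC_PTP);
}
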
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES                          \
        ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
index f639bd0..348acf2 100644 (file)
@@ -36,6 +36,9 @@ static inline int is_warning_bug(const struct bug_entry *bug)
        return bug->flags & BUGFLAG_WARNING;
 }
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+                      unsigned int *line);
+
 struct bug_entry *find_bug(unsigned long bugaddr);
 
 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
@@ -58,6 +61,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
        return BUG_TRAP_TYPE_BUG;
 }
 
+struct bug_entry;
+static inline void bug_get_file_line(struct bug_entry *bug, const char **file,
+                                    unsigned int *line)
+{
+       *file = NULL;
+       *line = 0;
+}
 
 static inline void generic_bug_clear_once(void) {}
 
index 86d143d..1290d0d 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/of.h>
+#include <linux/clocksource_ids.h>
 #include <asm/div64.h>
 #include <asm/io.h>
 
@@ -62,6 +63,10 @@ struct module;
  *                     400-499: Perfect
  *                             The ideal clocksource. A must-use where
  *                             available.
+ * @id:                        Defaults to CSID_GENERIC. The id value is captured
+ *                     in certain snapshot functions to allow callers to
+ *                     validate the clocksource from which the snapshot was
+ *                     taken.
  * @flags:             Flags describing special properties
  * @enable:            Optional function to enable the clocksource
  * @disable:           Optional function to disable the clocksource
@@ -100,6 +105,7 @@ struct clocksource {
        const char              *name;
        struct list_head        list;
        int                     rating;
+       enum clocksource_ids    id;
        enum vdso_clock_mode    vdso_clock_mode;
        unsigned long           flags;
 
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
new file mode 100644 (file)
index 0000000..16775d7
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOCKSOURCE_IDS_H
+#define _LINUX_CLOCKSOURCE_IDS_H
+
+/* Enum to give clocksources a unique identifier */
+enum clocksource_ids {
+       CSID_GENERIC            = 0,
+       CSID_ARM_ARCH_COUNTER,
+       CSID_MAX,
+};
+
+#endif
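A clocksource opts into identification simply by setting the new field; drivers that do nothing keep the zero-initialized CSID_GENERIC default. A hedged sketch follows; the example_* names and the placeholder counter read are illustrative only.

#include <linux/clocksource.h>
#include <linux/clocksource_ids.h>

static u64 example_counter_read(struct clocksource *cs)
{
	return 0;	/* a real driver would read its hardware counter here */
}

/* Minimal sketch: a driver tagging its clocksource so that snapshot
 * consumers can later verify where a timestamp came from. */
static struct clocksource example_counter_cs = {
	.name	= "example_arch_counter",
	.rating	= 400,
	.read	= example_counter_read,
	.mask	= CLOCKSOURCE_MASK(56),
	.id	= CSID_ARM_ARCH_COUNTER,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};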
index 976ec26..85008a6 100644 (file)
@@ -50,6 +50,7 @@ enum coresight_dev_subtype_sink {
        CORESIGHT_DEV_SUBTYPE_SINK_PORT,
        CORESIGHT_DEV_SUBTYPE_SINK_BUFFER,
        CORESIGHT_DEV_SUBTYPE_SINK_SYSMEM,
+       CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM,
 };
 
 enum coresight_dev_subtype_link {
@@ -455,6 +456,18 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o
 }
 #endif /* CONFIG_64BIT */
 
+static inline bool coresight_is_percpu_source(struct coresight_device *csdev)
+{
+       return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) &&
+              (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC);
+}
+
+static inline bool coresight_is_percpu_sink(struct coresight_device *csdev)
+{
+       return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SINK) &&
+              (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM);
+}
+
 extern struct coresight_device *
 coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
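The two helpers make the per-CPU pairing decision explicit for callers. A minimal, hedged sketch of how a caller might use them; the function name is illustrative.

#include <linux/coresight.h>
#include <linux/types.h>

/* Only pair a per-CPU source (ETM/ETE) with a per-CPU system-memory sink
 * such as TRBE; any other combination falls back to the usual sink
 * selection logic. */
static bool example_use_percpu_path(struct coresight_device *source,
				    struct coresight_device *sink)
{
	return coresight_is_percpu_source(source) &&
	       coresight_is_percpu_sink(sink);
}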
index 3f7f89e..51154ed 100644 (file)
@@ -951,8 +951,6 @@ extern void perf_event_itrace_started(struct perf_event *event);
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
-extern int perf_num_counters(void);
-extern const char *perf_pmu_name(void);
 extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
new file mode 100644 (file)
index 0000000..f960a71
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#ifndef _PTP_KVM_H_
+#define _PTP_KVM_H_
+
+struct timespec64;
+struct clocksource;
+
+int kvm_arch_ptp_init(void);
+int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+int kvm_arch_ptp_get_crosststamp(u64 *cycle,
+               struct timespec64 *tspec, struct clocksource **cs);
+
+#endif /* _PTP_KVM_H_ */
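These three hooks are what an architecture provides to the generic ptp_kvm driver. Below is a hedged sketch of what the arm64 clock hook could look like on top of the hypercall IDs added earlier; the a0/a1 reply split is an assumption for illustration (the authoritative layout is in Documentation/virt/kvm/arm/ptp_kvm.rst), and the example_ prefix marks the function as illustrative.

#include <linux/arm-smccc.h>
#include <linux/errno.h>
#include <linux/ptp_kvm.h>
#include <linux/time64.h>

int example_kvm_arch_ptp_get_clock(struct timespec64 *ts)
{
	struct arm_smccc_res res;

	/* Ask the host for its wall-clock time, based on the virtual counter */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
			     KVM_PTP_VIRT_COUNTER, &res);
	if ((long)res.a0 < 0)
		return -EOPNOTSUPP;

	/* assumed reply layout: a0 = seconds, a1 = nanoseconds */
	ts->tv_sec = res.a0;
	ts->tv_nsec = res.a1;
	return 0;
}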
index c6792cf..78a98bd 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_TIMEKEEPING_H
 
 #include <linux/errno.h>
+#include <linux/clocksource_ids.h>
 
 /* Included from linux/ktime.h */
 
@@ -243,11 +244,12 @@ struct ktime_timestamps {
  * @cs_was_changed_seq:        The sequence number of clocksource change events
  */
 struct system_time_snapshot {
-       u64             cycles;
-       ktime_t         real;
-       ktime_t         raw;
-       unsigned int    clock_was_set_seq;
-       u8              cs_was_changed_seq;
+       u64                     cycles;
+       ktime_t                 real;
+       ktime_t                 raw;
+       enum clocksource_ids    cs_id;
+       unsigned int            clock_was_set_seq;
+       u8                      cs_was_changed_seq;
 };
 
 /**
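With the new cs_id field, consumers of ktime_get_snapshot() can verify which clocksource a snapshot was derived from instead of guessing. A minimal sketch; the function name is illustrative.

#include <linux/clocksource_ids.h>
#include <linux/timekeeping.h>
#include <linux/types.h>

/* Return true only if the snapshot came from the Arm architected counter,
 * which is what counter-based cross-timestamping relies on. */
static bool example_snapshot_uses_arch_counter(void)
{
	struct system_time_snapshot snap;

	ktime_get_snapshot(&snap);
	return snap.cs_id == CSID_ARM_ARCH_COUNTER;
}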
index d765334..3fd9a7e 100644 (file)
@@ -1081,6 +1081,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SET_GUEST_DEBUG2 195
 #define KVM_CAP_SGX_ATTRIBUTE 196
 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
+#define KVM_CAP_PTP_KVM 198
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index ad15e40..63971ea 100644 (file)
@@ -1156,10 +1156,15 @@ enum perf_callchain_context {
 /**
  * PERF_RECORD_AUX::flags bits
  */
-#define PERF_AUX_FLAG_TRUNCATED                0x01    /* record was truncated to fit */
-#define PERF_AUX_FLAG_OVERWRITE                0x02    /* snapshot from overwrite mode */
-#define PERF_AUX_FLAG_PARTIAL          0x04    /* record contains gaps */
-#define PERF_AUX_FLAG_COLLISION                0x08    /* sample collided with another */
+#define PERF_AUX_FLAG_TRUNCATED                        0x01    /* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE                        0x02    /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL                  0x04    /* record contains gaps */
+#define PERF_AUX_FLAG_COLLISION                        0x08    /* sample collided with another */
+#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK     0xff00  /* PMU specific trace format type */
+
+/* CoreSight PMU AUX buffer formats */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT       0x0000 /* Default for backward compatibility */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW             0x0100 /* Raw format of the source */
 
 #define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
 #define PERF_FLAG_FD_OUTPUT            (1UL << 1)
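The new PMU-format bits are meant to be set by the trace sink driver on the AUX record it just produced. A minimal sketch using the existing perf_aux_output_flag() helper; the wrapper name is illustrative.

#include <linux/perf_event.h>

/* Tag the AUX data just written as raw CoreSight trace so the perf tool
 * selects the matching decoder. */
static void example_mark_aux_raw(struct perf_output_handle *handle)
{
	perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW);
}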
index 03db40f..88cb0ba 100644 (file)
@@ -580,11 +580,6 @@ static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)       { }
 
-extern __weak const char *perf_pmu_name(void)
-{
-       return "pmu";
-}
-
 static inline u64 perf_clock(void)
 {
        return local_clock();
index cce484a..4fe1df8 100644 (file)
@@ -920,6 +920,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 
        clocksource_arch_init(cs);
 
+       if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+               cs->id = CSID_GENERIC;
        if (cs->vdso_clock_mode < 0 ||
            cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
                pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
index 6aee576..06f55f9 100644 (file)
@@ -1048,6 +1048,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                now = tk_clock_read(&tk->tkr_mono);
+               systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                base_real = ktime_add(tk->tkr_mono.base,
index 8f9d537..45a0584 100644 (file)
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -127,6 +127,22 @@ static inline struct bug_entry *module_find_bug(unsigned long bugaddr)
 }
 #endif
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+                      unsigned int *line)
+{
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+       *file = bug->file;
+#else
+       *file = (const char *)bug + bug->file_disp;
+#endif
+       *line = bug->line;
+#else
+       *file = NULL;
+       *line = 0;
+#endif
+}
+
 struct bug_entry *find_bug(unsigned long bugaddr)
 {
        struct bug_entry *bug;
@@ -153,32 +169,20 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
        disable_trace_on_warning();
 
-       file = NULL;
-       line = 0;
-       warning = 0;
+       bug_get_file_line(bug, &file, &line);
 
-       if (bug) {
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-               file = bug->file;
-#else
-               file = (const char *)bug + bug->file_disp;
-#endif
-               line = bug->line;
-#endif
-               warning = (bug->flags & BUGFLAG_WARNING) != 0;
-               once = (bug->flags & BUGFLAG_ONCE) != 0;
-               done = (bug->flags & BUGFLAG_DONE) != 0;
-
-               if (warning && once) {
-                       if (done)
-                               return BUG_TRAP_TYPE_WARN;
-
-                       /*
-                        * Since this is the only store, concurrency is not an issue.
-                        */
-                       bug->flags |= BUGFLAG_DONE;
-               }
+       warning = (bug->flags & BUGFLAG_WARNING) != 0;
+       once = (bug->flags & BUGFLAG_ONCE) != 0;
+       done = (bug->flags & BUGFLAG_DONE) != 0;
+
+       if (warning && once) {
+               if (done)
+                       return BUG_TRAP_TYPE_WARN;
+
+               /*
+                * Since this is the only store, concurrency is not an issue.
+                */
+               bug->flags |= BUGFLAG_DONE;
        }
 
        /*
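Factoring the file/line extraction into bug_get_file_line() lets other BUG() consumers (such as the nVHE hypervisor's exception path) reuse it. A minimal, hedged sketch of such a caller; the function name is illustrative.

#include <linux/bug.h>
#include <linux/printk.h>

/* Resolve a trapped PC to its bug_entry and print where the BUG() lives.
 * Degrades gracefully when BUGVERBOSE data is not available. */
static void example_print_bug_location(unsigned long bugaddr)
{
	struct bug_entry *bug = find_bug(bugaddr);
	const char *file = NULL;
	unsigned int line = 0;

	if (bug)
		bug_get_file_line(bug, &file, &line);

	pr_err("BUG at %s:%u\n", file ? file : "<unknown>", line);
}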
index 34414e8..bd83158 100644 (file)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /aarch64/get-reg-list
 /aarch64/get-reg-list-sve
+/aarch64/vgic_init
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
index 6b0a9e7..ea5c428 100644 (file)
@@ -79,6 +79,7 @@ TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c
new file mode 100644 (file)
index 0000000..623f31a
--- /dev/null
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic init sequence tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS               4
+
+#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) << 52) | \
+       ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | (index))
+#define REG_OFFSET(vcpu, offset) (((uint64_t)(vcpu) << 32) | (offset))
+
+#define GICR_TYPER 0x8
+
+struct vm_gic {
+       struct kvm_vm *vm;
+       int gic_fd;
+};
+
+static int max_ipa_bits;
+
+/* helper to access a redistributor register */
+static int access_redist_reg(int gicv3_fd, int vcpu, int offset,
+                            uint32_t *val, bool write)
+{
+       uint64_t attr = REG_OFFSET(vcpu, offset);
+
+       return _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
+                                 attr, val, write);
+}
+
+/* dummy guest code */
+static void guest_code(void)
+{
+       GUEST_SYNC(0);
+       GUEST_SYNC(1);
+       GUEST_SYNC(2);
+       GUEST_DONE();
+}
+
+/* we don't want to assert on run execution, hence this helper */
+static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       ucall_init(vm, NULL);
+       int ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+       if (ret)
+               return -errno;
+       return 0;
+}
+
+static struct vm_gic vm_gic_create(void)
+{
+       struct vm_gic v;
+
+       v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       return v;
+}
+
+static void vm_gic_destroy(struct vm_gic *v)
+{
+       close(v->gic_fd);
+       kvm_vm_free(v->vm);
+}
+
+/**
+ * Helper routine that performs KVM device tests in general, and
+ * ARM_VGIC_V3 ones in particular. On return, the ARM_VGIC_V3 device
+ * has a legacy RDIST region set at 0x0 and a DIST region at 0x60000.
+ */
+static void subtest_dist_rdist(struct vm_gic *v)
+{
+       int ret;
+       uint64_t addr;
+
+       /* Check existing group/attributes */
+       kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                             KVM_VGIC_V3_ADDR_TYPE_DIST);
+
+       kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                             KVM_VGIC_V3_ADDR_TYPE_REDIST);
+
+       /* check a non-existent attribute */
+       ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0);
+       TEST_ASSERT(ret && errno == ENXIO, "attribute not supported");
+
+       /* misaligned DIST and REDIST address settings */
+       addr = 0x1000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "GICv3 dist base not 64kB aligned");
+
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "GICv3 redist base not 64kB aligned");
+
+       /* out of range address */
+       if (max_ipa_bits) {
+               addr = 1ULL << max_ipa_bits;
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit");
+
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit");
+       }
+
+       /* set REDIST base address @0x0 */
+       addr = 0x00000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+
+       /* Attempt to create a second legacy redistributor region */
+       addr = 0xE0000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EEXIST, "GICv3 redist base set again");
+
+       /* Attempt to mix legacy and new redistributor regions */
+       addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION");
+
+       /*
+        * Set overlapping DIST / REDIST, cannot be detected here. Will be detected
+        * on first vcpu run instead.
+        */
+       addr = 3 * 2 * 0x10000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST,
+                         &addr, true);
+}
+
+/* Test the new REDIST region API */
+static void subtest_redist_regions(struct vm_gic *v)
+{
+       uint64_t addr, expected_addr;
+       int ret;
+
+       ret = kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                    KVM_VGIC_V3_ADDR_TYPE_REDIST);
+       TEST_ASSERT(!ret, "Multiple redist regions advertised");
+
+       addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count == 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "attempt to register the first rdist region with index != 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "register an rdist region overlapping with another one");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1);
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 1ULL << max_ipa_bits, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == E2BIG,
+                   "register redist region with base address beyond IPA range");
+
+       addr = 0x260000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION");
+
+       /*
+        * Now there are 2 redist regions:
+        * region 0 @ 0x200000 2 redists
+        * region 1 @ 0x240000 1 redist
+        * Attempt to read their characteristics
+        */
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0);
+       expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1);
+       expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non-existent region");
+
+       addr = 0x260000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist");
+}
+
+/*
+ * The VGIC KVM device is created and initialized before the secondary
+ * VCPUs get created
+ */
+static void test_vgic_then_vcpus(void)
+{
+       struct vm_gic v;
+       int ret, i;
+
+       v.vm = vm_create_default(0, 0, guest_code);
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       subtest_dist_rdist(&v);
+
+       /* Add the rest of the VCPUs */
+       for (i = 1; i < NR_VCPUS; ++i)
+               vm_vcpu_add_default(v.vm, i, guest_code);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+/* All the VCPUs are created before the VGIC KVM device gets initialized */
+static void test_vcpus_then_vgic(void)
+{
+       struct vm_gic v;
+       int ret;
+
+       v = vm_gic_create();
+
+       subtest_dist_rdist(&v);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+static void test_new_redist_regions(void)
+{
+       void *dummy = NULL;
+       struct vm_gic v;
+       uint64_t addr;
+       int ret;
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists");
+       vm_gic_destroy(&v);
+
+       /* step2 */
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init");
+
+       vm_gic_destroy(&v);
+
+       /* step 3 */
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+
+       ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true);
+       TEST_ASSERT(ret && errno == EFAULT,
+                   "register a region from a faulty (NULL) userspace address");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(!ret, "vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+static void test_typer_accesses(void)
+{
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret, i;
+
+       v.vm = vm_create_default(0, 0, guest_code);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       vm_vcpu_add_default(v.vm, 3, guest_code);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu");
+
+       vm_vcpu_add_default(v.vm, 1, guest_code);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized");
+
+       vm_vcpu_add_default(v.vm, 2, guest_code);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       for (i = 0; i < NR_VCPUS ; i++) {
+               ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+               TEST_ASSERT(!ret && !val, "read GICR_TYPER before rdist region setting");
+       }
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       /* The first 2 rdists should be put there (vcpu 0 and 3) */
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && !val, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1");
+
+       addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1);
+       ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100,
+                   "no redist region attached to vcpu #1 yet, last cannot be returned");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x200,
+                   "no redist region attached to vcpu #2, last cannot be returned");
+
+       addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x210,
+                   "read typer of rdist #1, last properly returned");
+
+       vm_gic_destroy(&v);
+}
+
+/**
+ * Test GICR_TYPER last bit with new redist regions
+ * rdist regions #1 and #2 are contiguous
+ * rdist region #0 @0x100000 2 rdist capacity
+ *     rdists: 0, 3 (Last)
+ * rdist region #1 @0x240000 2 rdist capacity
+ *     rdists:  5, 4 (Last)
+ * rdist region #2 @0x200000 2 rdist capacity
+ *     rdists: 1, 2
+ */
+static void test_last_bit_redist_regions(void)
+{
+       uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 };
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret;
+
+       v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3");
+
+       ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5");
+
+       ret = access_redist_reg(v.gic_fd, 4, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4");
+
+       vm_gic_destroy(&v);
+}
+
+/* Test last bit with legacy region */
+static void test_last_bit_single_rdist(void)
+{
+       uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 };
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret;
+
+       v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       addr = 0x10000;
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #3");
+
+       ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #2");
+
+       vm_gic_destroy(&v);
+}
+
+void test_kvm_device(void)
+{
+       struct vm_gic v;
+       int ret, fd;
+
+       v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
+
+       /* try to create a non-existent KVM device */
+       ret = _kvm_create_device(v.vm, 0, true, &fd);
+       TEST_ASSERT(ret && errno == ENODEV, "unsupported device");
+
+       /* trial mode with VGIC_V3 device */
+       ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true, &fd);
+       if (ret) {
+               print_skip("GICv3 not supported");
+               exit(KSFT_SKIP);
+       }
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false, &fd);
+       TEST_ASSERT(ret && errno == EEXIST, "create GICv3 device twice");
+
+       kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true);
+
+       if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true, &fd)) {
+               ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false, &fd);
+               TEST_ASSERT(ret && errno == EINVAL, "create GICv2 while v3 exists");
+       }
+
+       vm_gic_destroy(&v);
+}
+
+int main(int ac, char **av)
+{
+       max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+       test_kvm_device();
+       test_vcpus_then_vgic();
+       test_vgic_then_vcpus();
+       test_new_redist_regions();
+       test_typer_accesses();
+       test_last_bit_redist_regions();
+       test_last_bit_single_rdist();
+
+       return 0;
+}
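For reference, the REDIST_REGION_ATTR_ADDR() values used throughout this test follow the KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attribute layout documented in Documentation/virt/kvm/devices/arm-vgic-v3.rst; the breakdown below is a worked example of that encoding, not new code in this series.

/*
 * REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0) expands to an attribute where:
 *   bits [63:52] = 2         - the region can hold two redistributors
 *   bits [51:16]             - bits [51:16] of the 64kB-aligned guest
 *                              physical base address (here 0x200000)
 *   bits [15:12] = 0         - flags, must currently be zero
 *   bits [11:0]  = 0         - index of the redistributor region
 */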
index 0e6cc25..a8f0227 100644 (file)
@@ -223,6 +223,15 @@ int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
 #endif
 void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
 
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd);
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test);
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                      void *val, bool write);
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write);
+
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
index 35247db..8926f91 100644 (file)
@@ -1731,6 +1731,81 @@ int _kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
 }
 
 /*
+ * Device Ioctl
+ */
+
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
+{
+       struct kvm_device_attr attribute = {
+               .group = group,
+               .attr = attr,
+               .flags = 0,
+       };
+
+       return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
+}
+
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
+{
+       int ret = _kvm_device_check_attr(dev_fd, group, attr);
+
+       TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno);
+       return ret;
+}
+
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd)
+{
+       struct kvm_create_device create_dev;
+       int ret;
+
+       create_dev.type = type;
+       create_dev.fd = -1;
+       create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
+       ret = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &create_dev);
+       *fd = create_dev.fd;
+       return ret;
+}
+
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test)
+{
+       int fd, ret;
+
+       ret = _kvm_create_device(vm, type, test, &fd);
+
+       if (!test) {
+               TEST_ASSERT(ret >= 0,
+                           "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno);
+               return fd;
+       }
+       return ret;
+}
+
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write)
+{
+       struct kvm_device_attr kvmattr = {
+               .group = group,
+               .attr = attr,
+               .flags = 0,
+               .addr = (uintptr_t)val,
+       };
+       int ret;
+
+       ret = ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
+                   &kvmattr);
+       return ret;
+}
+
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write)
+{
+       int ret = _kvm_device_access(dev_fd, group, attr, val, write);
+
+       TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno);
+       return ret;
+}
+
+/*
  * VM Dump
  *
  * Input Args: