Merge tag 'kvmarm-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmar...
author     Paolo Bonzini <pbonzini@redhat.com>
           Fri, 23 Apr 2021 11:41:17 +0000 (07:41 -0400)
committer  Paolo Bonzini <pbonzini@redhat.com>
           Fri, 23 Apr 2021 11:41:17 +0000 (07:41 -0400)
KVM/arm64 updates for Linux 5.13

New features:

- Stage-2 isolation for the host kernel when running in protected mode
- Guest SVE support when running in nVHE mode
- Force W^X hypervisor mappings in nVHE mode
- ITS save/restore for guests using direct injection with GICv4.1
- nVHE panics now produce readable backtraces
- Guest support for PTP using the ptp_kvm driver
- Performance improvements in the S2 fault handler
- Alexandru is now a reviewer (not really a new feature...)

Fixes:

- Proper emulation of the GICR_TYPER register
- Handle the complete set of relocations in the nVHE EL2 object
- Get rid of the oprofile dependency in the PMU code (and of the
  oprofile body parts at the same time)
- Debug and SPE fixes
- Fix vcpu reset

144 files changed:
Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe [new file with mode: 0644]
Documentation/admin-guide/kernel-parameters.txt
Documentation/devicetree/bindings/arm/ete.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/arm/trbe.yaml [new file with mode: 0644]
Documentation/trace/coresight/coresight-trbe.rst [new file with mode: 0644]
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/arm/index.rst
Documentation/virt/kvm/arm/ptp_kvm.rst [new file with mode: 0644]
Documentation/virt/kvm/devices/arm-vgic-its.rst
Documentation/virt/kvm/devices/arm-vgic-v3.rst
MAINTAINERS
arch/arm/include/asm/hypervisor.h
arch/arm64/Kconfig
arch/arm64/crypto/aes-modes.S
arch/arm64/crypto/sha1-ce-core.S
arch/arm64/crypto/sha2-ce-core.S
arch/arm64/crypto/sha3-ce-core.S
arch/arm64/crypto/sha512-ce-core.S
arch/arm64/include/asm/assembler.h
arch/arm64/include/asm/barrier.h
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/el2_setup.h
arch/arm64/include/asm/fpsimd.h
arch/arm64/include/asm/fpsimdmacros.h
arch/arm64/include/asm/hyp_image.h
arch/arm64/include/asm/hypervisor.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_hyp.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/kvm_pgtable.h
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/sections.h
arch/arm64/include/asm/sysreg.h
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/cpu-reset.S
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/fpsimd.c
arch/arm64/kernel/head.S
arch/arm64/kernel/hyp-stub.S
arch/arm64/kernel/idreg-override.c
arch/arm64/kernel/image-vars.h
arch/arm64/kernel/vmlinux.lds.S
arch/arm64/kvm/arm.c
arch/arm64/kvm/debug.c
arch/arm64/kvm/fpsimd.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/fpsimd.S
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/hyp/include/nvhe/early_alloc.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/gfp.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/memory.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/mm.h [new file with mode: 0644]
arch/arm64/kvm/hyp/include/nvhe/spinlock.h [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/Makefile
arch/arm64/kvm/hyp/nvhe/cache.S [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/debug-sr.c
arch/arm64/kvm/hyp/nvhe/early_alloc.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
arch/arm64/kvm/hyp/nvhe/host.S
arch/arm64/kvm/hyp/nvhe/hyp-init.S
arch/arm64/kvm/hyp/nvhe/hyp-main.c
arch/arm64/kvm/hyp/nvhe/hyp-smp.c
arch/arm64/kvm/hyp/nvhe/hyp.lds.S
arch/arm64/kvm/hyp/nvhe/mem_protect.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/mm.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/page_alloc.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/psci-relay.c
arch/arm64/kvm/hyp/nvhe/setup.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/stub.c [new file with mode: 0644]
arch/arm64/kvm/hyp/nvhe/switch.c
arch/arm64/kvm/hyp/nvhe/tlb.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/hyp/reserved_mem.c [new file with mode: 0644]
arch/arm64/kvm/hyp/vhe/switch.c
arch/arm64/kvm/hypercalls.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/perf.c
arch/arm64/kvm/pmu-emul.c
arch/arm64/kvm/pmu.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/va_layout.c
arch/arm64/kvm/vgic/vgic-init.c
arch/arm64/kvm/vgic/vgic-its.c
arch/arm64/kvm/vgic/vgic-kvm-device.c
arch/arm64/kvm/vgic/vgic-mmio-v3.c
arch/arm64/kvm/vgic/vgic-mmio.c
arch/arm64/kvm/vgic/vgic-v3.c
arch/arm64/kvm/vgic/vgic-v4.c
arch/arm64/kvm/vgic/vgic.h
arch/arm64/lib/clear_page.S
arch/arm64/lib/copy_page.S
arch/arm64/mm/init.c
arch/s390/kernel/perf_event.c
arch/sh/kernel/perf_event.c
drivers/clocksource/arm_arch_timer.c
drivers/firmware/psci/psci.c
drivers/firmware/smccc/Makefile
drivers/firmware/smccc/kvm_guest.c [new file with mode: 0644]
drivers/firmware/smccc/smccc.c
drivers/hwtracing/coresight/Kconfig
drivers/hwtracing/coresight/Makefile
drivers/hwtracing/coresight/coresight-core.c
drivers/hwtracing/coresight/coresight-etm-perf.c
drivers/hwtracing/coresight/coresight-etm4x-core.c
drivers/hwtracing/coresight/coresight-etm4x-sysfs.c
drivers/hwtracing/coresight/coresight-etm4x.h
drivers/hwtracing/coresight/coresight-platform.c
drivers/hwtracing/coresight/coresight-priv.h
drivers/hwtracing/coresight/coresight-trbe.c [new file with mode: 0644]
drivers/hwtracing/coresight/coresight-trbe.h [new file with mode: 0644]
drivers/irqchip/irq-gic-v3-its.c
drivers/perf/arm_pmu.c
drivers/ptp/Kconfig
drivers/ptp/Makefile
drivers/ptp/ptp_kvm_arm.c [new file with mode: 0644]
drivers/ptp/ptp_kvm_common.c [moved from drivers/ptp/ptp_kvm.c with 60% similarity]
drivers/ptp/ptp_kvm_x86.c [new file with mode: 0644]
include/kvm/arm_pmu.h
include/kvm/arm_vgic.h
include/linux/arm-smccc.h
include/linux/bug.h
include/linux/clocksource.h
include/linux/clocksource_ids.h [new file with mode: 0644]
include/linux/coresight.h
include/linux/perf_event.h
include/linux/ptp_kvm.h [new file with mode: 0644]
include/linux/timekeeping.h
include/uapi/linux/kvm.h
include/uapi/linux/perf_event.h
kernel/events/core.c
kernel/time/clocksource.c
kernel/time/timekeeping.c
lib/bug.c
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/aarch64/vgic_init.c [new file with mode: 0644]
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/lib/kvm_util.c

diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe b/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe
new file mode 100644 (file)
index 0000000..ad3bbc6
--- /dev/null
@@ -0,0 +1,14 @@
+What:          /sys/bus/coresight/devices/trbe<cpu>/align
+Date:          March 2021
+KernelVersion: 5.13
+Contact:       Anshuman Khandual <anshuman.khandual@arm.com>
+Description:   (Read) Shows the TRBE write pointer alignment. This value
+               is fetched from the TRBIDR register.
+
+What:          /sys/bus/coresight/devices/trbe<cpu>/flag
+Date:          March 2021
+KernelVersion: 5.13
+Contact:       Anshuman Khandual <anshuman.khandual@arm.com>
+Description:   (Read) Shows whether TRBE updates to memory are made with
+               access and dirty flag updates as well. This value is fetched
+               from the TRBIDR register.
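
For a quick sense of how these attributes are consumed, here is a minimal
userspace sketch that dumps the alignment value for one instance; the trbe0
device name and the plain read are assumptions for illustration, not part of
the patch::

    /* Minimal sketch: print the write pointer alignment reported by one
     * TRBE instance. Assumes a trbe0 device exists on this system. */
    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            FILE *f = fopen("/sys/bus/coresight/devices/trbe0/align", "r");

            if (!f) {
                    perror("trbe0/align");
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("trbe0 align: %s", buf);
            fclose(f);
            return 0;
    }
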
index 0454572..18f8bb3 100644 (file)
                                   state is kept private from the host.
                                   Not valid if the kernel is running in EL2.
 
-                       Defaults to VHE/nVHE based on hardware support and
-                       the value of CONFIG_ARM64_VHE.
+                       Defaults to VHE/nVHE based on hardware support.
 
        kvm-arm.vgic_v3_group0_trap=
                        [KVM,ARM] Trap guest accesses to GICv3 group-0
diff --git a/Documentation/devicetree/bindings/arm/ete.yaml b/Documentation/devicetree/bindings/arm/ete.yaml
new file mode 100644 (file)
index 0000000..7f9b2d1
--- /dev/null
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/ete.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Embedded Trace Extensions
+
+maintainers:
+  - Suzuki K Poulose <suzuki.poulose@arm.com>
+  - Mathieu Poirier <mathieu.poirier@linaro.org>
+
+description: |
+  Arm Embedded Trace Extension (ETE) is a per-CPU trace component that
+  allows tracing the CPU execution. It overlaps with the CoreSight ETMv4
+  architecture and has extended support for future architecture changes.
+  The trace generated by the ETE could be stored via legacy CoreSight
+  components (e.g., TMC-ETR) or other means (e.g., using a per-CPU buffer,
+  the Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
+  legacy CoreSight components, a node must be listed per instance, along
+  with any optional connection graph as per the coresight bindings.
+  See bindings/arm/coresight.txt.
+
+properties:
+  $nodename:
+    pattern: "^ete([0-9a-f]+)$"
+  compatible:
+    items:
+      - const: arm,embedded-trace-extension
+
+  cpu:
+    description: |
+      Handle to the cpu this ETE is bound to.
+    $ref: /schemas/types.yaml#/definitions/phandle
+
+  out-ports:
+    description: |
+      Output connections from the ETE to legacy CoreSight trace bus.
+    $ref: /schemas/graph.yaml#/properties/ports
+    properties:
+      port:
+        description: Output connection from the ETE to legacy CoreSight Trace bus.
+        $ref: /schemas/graph.yaml#/properties/port
+
+required:
+  - compatible
+  - cpu
+
+additionalProperties: false
+
+examples:
+
+# An ETE node without legacy CoreSight connections
+  - |
+    ete0 {
+      compatible = "arm,embedded-trace-extension";
+      cpu = <&cpu_0>;
+    };
+# An ETE node with legacy CoreSight connections
+  - |
+   ete1 {
+      compatible = "arm,embedded-trace-extension";
+      cpu = <&cpu_1>;
+
+      out-ports {        /* legacy coresight connection */
+         port {
+             ete1_out_port: endpoint {
+                remote-endpoint = <&funnel_in_port0>;
+             };
+         };
+      };
+   };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/trbe.yaml b/Documentation/devicetree/bindings/arm/trbe.yaml
new file mode 100644 (file)
index 0000000..4402d7b
--- /dev/null
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/trbe.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Trace Buffer Extensions
+
+maintainers:
+  - Anshuman Khandual <anshuman.khandual@arm.com>
+
+description: |
+  Arm Trace Buffer Extension (TRBE) is a per CPU component
+  for storing trace generated on the CPU to memory. It is
+  accessed via CPU system registers. The software can verify
+  if it is permitted to use the component by checking the
+  TRBIDR register.
+
+properties:
+  $nodename:
+    const: "trbe"
+  compatible:
+    items:
+      - const: arm,trace-buffer-extension
+
+  interrupts:
+    description: |
+       Exactly 1 PPI must be listed. For heterogeneous systems where
+       TRBE is only supported on a subset of the CPUs, please consult
+       the arm,gic-v3 binding for details on describing a PPI partition.
+    maxItems: 1
+
+required:
+  - compatible
+  - interrupts
+
+additionalProperties: false
+
+examples:
+
+  - |
+   #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+   trbe {
+     compatible = "arm,trace-buffer-extension";
+     interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
+   };
+...
diff --git a/Documentation/trace/coresight/coresight-trbe.rst b/Documentation/trace/coresight/coresight-trbe.rst
new file mode 100644 (file)
index 0000000..b9928ef
--- /dev/null
@@ -0,0 +1,38 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Trace Buffer Extension (TRBE).
+==============================
+
+    :Author:   Anshuman Khandual <anshuman.khandual@arm.com>
+    :Date:     November 2020
+
+Hardware Description
+--------------------
+
+Trace Buffer Extension (TRBE) is a per-CPU hardware block which captures, in
+system memory, CPU traces generated by a corresponding per-CPU tracing unit.
+It is plugged in as a CoreSight sink device because the corresponding trace
+generators (ETE) are plugged in as source devices.
+
+The TRBE is not compliant with the CoreSight architecture specification, but
+is driven via the CoreSight driver framework to support integration with the
+ETE (which is CoreSight compliant).
+
+Sysfs files and directories
+---------------------------
+
+The TRBE devices appear on the existing coresight bus alongside the other
+coresight devices::
+
+       >$ ls /sys/bus/coresight/devices
+       trbe0  trbe1  trbe2 trbe3
+
+The ``trbe<N>`` named TRBEs are associated with a CPU::
+
+       >$ ls /sys/bus/coresight/devices/trbe0/
+        align flag
+
+*Key file items are:-*
+   * ``align``: TRBE write pointer alignment
+   * ``flag``: TRBE updates memory with access and dirty flags
index 56c6fca..94804c2 100644 (file)
@@ -3116,6 +3116,18 @@ optional features it should have.  This will cause a reset of the cpu
 registers to their initial values.  If this is not called, KVM_RUN will
 return ENOEXEC for that vcpu.
 
+The initial values are defined as:
+       - Processor state:
+               * AArch64: EL1h, D, A, I and F bits set. All other bits
+                 are cleared.
+               * AArch32: SVC, A, I and F bits set. All other bits are
+                 cleared.
+       - General Purpose registers, including PC and SP: set to 0
+       - FPSIMD/NEON registers: set to 0
+       - SVE registers: set to 0
+       - System registers: Reset to their architecturally defined
+         values as for a warm reset to EL1 (resp. SVC)
+
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.
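
As a point of reference, a minimal VMM-side sketch of the call being
documented here; the vm_fd and vcpu_fd handles are assumed to have been
obtained via KVM_CREATE_VM and KVM_CREATE_VCPU, and error handling is
trimmed::

    /* Sketch: (re)initialise a vcpu to the initial state described above.
     * vm_fd and vcpu_fd are assumed to already exist. */
    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int vcpu_reset(int vm_fd, int vcpu_fd)
    {
            struct kvm_vcpu_init init;

            memset(&init, 0, sizeof(init));
            /* Ask KVM for the preferred target for this host... */
            if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) < 0)
                    return -1;
            /* ...then reset the vcpu: GPRs/PC/SP, FPSIMD/NEON and SVE come
             * back zeroed, system registers as for a warm reset to EL1. */
            return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
    }
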
 
@@ -3335,7 +3347,8 @@ The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
   - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_HW:        using hardware debug events [arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
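
A hedged sketch of how a VMM might use the new arm64 flag; the vcpu_fd handle
is assumed, and a real debugger would also populate the architecture-specific
registers in dbg.arch before this call::

    /* Sketch: enable hardware debug events for an arm64 vcpu. */
    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int enable_hw_debug(int vcpu_fd)
    {
            struct kvm_guest_debug dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;
            return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
    }
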
@@ -6869,3 +6882,12 @@ they will get passed on to user space. So user space still has to have
 an implementation for these despite the in kernel acceleration.
 
 This capability is always enabled.
+
+8.32 KVM_CAP_PTP_KVM
+--------------------
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is
+supported in the host. A VMM can check whether the service is
+available to the guest on migration.
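
A VMM probe for this capability could look like the following sketch, where
kvm_fd is assumed to be an open descriptor on /dev/kvm::

    /* Sketch: check whether the host exposes the virtual PTP service. */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int host_has_ptp_kvm(int kvm_fd)
    {
            /* KVM_CHECK_EXTENSION returns a positive value if present. */
            return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PTP_KVM) > 0;
    }
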
index 3e2b2ab..78a9b67 100644 (file)
@@ -10,3 +10,4 @@ ARM
    hyp-abi
    psci
    pvtime
+   ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644 (file)
index 0000000..aecdc80
--- /dev/null
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=============================
+
+PTP_KVM is used for high precision time sync between host and guests.
+It relies on transferring the wall clock and counter value from the
+host to the guest using a KVM-specific hypercall.
+
+* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+This hypercall uses the SMC32/HVC32 calling convention:
+
+ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
+    ==============    ========    =====================================
+    Function ID:      (uint32)    0x86000001
+    Arguments:        (uint32)    KVM_PTP_VIRT_COUNTER(0)
+                                  KVM_PTP_PHYS_COUNTER(1)
+    Return Values:    (int32)     NOT_SUPPORTED(-1) on error, or
+                      (uint32)    Upper 32 bits of wall clock time (r0)
+                      (uint32)    Lower 32 bits of wall clock time (r1)
+                      (uint32)    Upper 32 bits of counter (r2)
+                      (uint32)    Lower 32 bits of counter (r3)
+    Endianness:                   No Restrictions.
+    ==============    ========    =====================================
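
In the guest, the ptp_kvm driver issues this call through the SMCCC helpers;
the following condensed sketch shows the register packing implied by the
table above. The counter_id argument corresponds to KVM_PTP_VIRT_COUNTER(0)
or KVM_PTP_PHYS_COUNTER(1); the actual code in drivers/ptp/ptp_kvm_arm.c may
be structured differently::

    /* Sketch: issue the PTP hypercall and reassemble the 64-bit wall
     * clock and counter values from the four 32-bit return registers. */
    #include <linux/arm-smccc.h>
    #include <linux/errno.h>
    #include <linux/types.h>

    static int kvm_ptp_read(u32 counter_id, u64 *wallclock, u64 *counter)
    {
            struct arm_smccc_res res;

            arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
                                 counter_id, &res);
            if ((s32)res.a0 < 0)            /* NOT_SUPPORTED(-1) */
                    return -EOPNOTSUPP;

            *wallclock = (u64)res.a0 << 32 | (u32)res.a1;
            *counter   = (u64)res.a2 << 32 | (u32)res.a3;
            return 0;
    }
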
index 6c304fd..d257edd 100644 (file)
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
     -EFAULT  Invalid guest ram access
     -EBUSY   One or more VCPUS are running
     -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-            state is not available
+            state is not available without GICv4.1
     =======  ==========================================================
 
 KVM_DEV_ARM_VGIC_GRP_ITS_REGS
index 5dd3bff..51e5e57 100644 (file)
@@ -228,7 +228,7 @@ Groups:
 
     KVM_DEV_ARM_VGIC_CTRL_INIT
       request the initialization of the VGIC, no additional parameter in
-      kvm_device_attr.addr.
+      kvm_device_attr.addr. Must be called after all VCPUs have been created.
     KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
       save all LPI pending bits into guest RAM pending tables.
 
index 0417ebf..1dd8fb4 100644 (file)
@@ -1761,6 +1761,8 @@ F:        Documentation/ABI/testing/sysfs-bus-coresight-devices-*
 F:     Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
 F:     Documentation/devicetree/bindings/arm/coresight-cti.yaml
 F:     Documentation/devicetree/bindings/arm/coresight.txt
+F:     Documentation/devicetree/bindings/arm/ete.yaml
+F:     Documentation/devicetree/bindings/arm/trbe.yaml
 F:     Documentation/trace/coresight/*
 F:     drivers/hwtracing/coresight/*
 F:     include/dt-bindings/arm/coresight-cti-dt.h
@@ -9765,10 +9767,10 @@ F:      virt/kvm/*
 KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
 M:     Marc Zyngier <maz@kernel.org>
 R:     James Morse <james.morse@arm.com>
-R:     Julien Thierry <julien.thierry.kdev@gmail.com>
+R:     Alexandru Elisei <alexandru.elisei@arm.com>
 R:     Suzuki K Poulose <suzuki.poulose@arm.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L:     kvmarm@lists.cs.columbia.edu
+L:     kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
 F:     arch/arm64/include/asm/kvm*
index df85243..bd61502 100644 (file)
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
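
These hooks let guest code discover KVM's vendor-specific hypervisor
services; a hedged sketch of a consumer follows. ARM_SMCCC_KVM_FUNC_PTP is
assumed to be the service bit added to arm-smccc.h elsewhere in this series,
and kvm_init_hyp_services() is expected to have run during early boot::

    /* Sketch: check for a KVM vendor hypervisor service before use. */
    #include <asm/hypervisor.h>
    #include <linux/arm-smccc.h>

    static bool ptp_kvm_service_present(void)
    {
            /* ARM_SMCCC_KVM_FUNC_PTP: service bit from this series (assumption) */
            return kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP);
    }
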
index e4e1b65..9ec09f9 100644 (file)
@@ -1426,19 +1426,6 @@ config ARM64_USE_LSE_ATOMICS
          built with binutils >= 2.25 in order for the new instructions
          to be used.
 
-config ARM64_VHE
-       bool "Enable support for Virtualization Host Extensions (VHE)"
-       default y
-       help
-         Virtualization Host Extensions (VHE) allow the kernel to run
-         directly at EL2 (instead of EL1) on processors that support
-         it. This leads to better performance for KVM, as they reduce
-         the cost of the world switch.
-
-         Selecting this option allows the VHE feature to be detected
-         at runtime, and does not affect processors that do not
-         implement this feature.
-
 endmenu
 
 menu "ARMv8.2 architectural features"
@@ -1694,7 +1681,6 @@ endmenu
 config ARM64_SVE
        bool "ARM Scalable Vector Extension support"
        default y
-       depends on !KVM || ARM64_VHE
        help
          The Scalable Vector Extension (SVE) is an extension to the AArch64
          execution state which complements and extends the SIMD functionality
@@ -1723,12 +1709,6 @@ config ARM64_SVE
          booting the kernel.  If unsure and you are not observing these
          symptoms, you should assume that it is safe to say Y.
 
-         CPUs that support SVE are architecturally required to support the
-         Virtualization Host Extensions (VHE), so the kernel makes no
-         provision for supporting SVE alongside KVM without VHE enabled.
-         Thus, you will need to enable CONFIG_ARM64_VHE if you want to support
-         KVM in the same kernel image.
-
 config ARM64_MODULE_PLTS
        bool "Use PLTs to allow module memory to spill over into vmalloc area"
        depends on MODULES
index bbdb547..ab6c14e 100644 (file)
@@ -700,7 +700,7 @@ AES_FUNC_START(aes_mac_update)
        cbz             w5, .Lmacout
        encrypt_block   v0, w2, x1, x7, w8
        st1             {v0.16b}, [x4]                  /* return dg */
-       cond_yield      .Lmacout, x7
+       cond_yield      .Lmacout, x7, x8
        b               .Lmacloop4x
 .Lmac1x:
        add             w3, w3, #4
index 8c02bbc..889ca0f 100644 (file)
@@ -121,7 +121,7 @@ CPU_LE(     rev32           v11.16b, v11.16b        )
        add             dgav.4s, dgav.4s, dg0v.4s
 
        cbz             w2, 2f
-       cond_yield      3f, x5
+       cond_yield      3f, x5, x6
        b               0b
 
        /*
index 6cdea7d..4911799 100644 (file)
@@ -129,7 +129,7 @@ CPU_LE(     rev32           v19.16b, v19.16b        )
 
        /* handled all input blocks? */
        cbz             w2, 2f
-       cond_yield      3f, x5
+       cond_yield      3f, x5, x6
        b               0b
 
        /*
index 6f52084..9c77313 100644 (file)
@@ -184,11 +184,11 @@ SYM_FUNC_START(sha3_ce_transform)
        eor      v0.16b,  v0.16b, v31.16b
 
        cbnz    w8, 3b
-       cond_yield 3f, x8
+       cond_yield 4f, x8, x9
        cbnz    w2, 0b
 
        /* save state */
-3:     st1     { v0.1d- v3.1d}, [x0], #32
+4:     st1     { v0.1d- v3.1d}, [x0], #32
        st1     { v4.1d- v7.1d}, [x0], #32
        st1     { v8.1d-v11.1d}, [x0], #32
        st1     {v12.1d-v15.1d}, [x0], #32
index d6e7f6c..b6a3a36 100644 (file)
@@ -195,7 +195,7 @@ CPU_LE(     rev64           v19.16b, v19.16b        )
        add             v10.2d, v10.2d, v2.2d
        add             v11.2d, v11.2d, v3.2d
 
-       cond_yield      3f, x4
+       cond_yield      3f, x4, x5
        /* handled all input blocks? */
        cbnz            w2, 0b
 
index ca31594..ad9ccc4 100644 (file)
@@ -15,6 +15,8 @@
 #include <asm-generic/export.h>
 
 #include <asm/asm-offsets.h>
+#include <asm/alternative.h>
+#include <asm/asm-bug.h>
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/debug-monitors.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 
+       /*
+        * Provide a wxN alias for each wN register so that we can paste an xN
+        * reference after a 'w' to obtain the 32-bit version.
+        */
+       .irp    n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
+       wx\n    .req    w\n
+       .endr
+
        .macro save_and_disable_daif, flags
        mrs     \flags, daif
        msr     daifset, #0xf
@@ -270,12 +280,24 @@ alternative_endif
  * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
  */
        .macro  read_ctr, reg
+#ifndef __KVM_NVHE_HYPERVISOR__
 alternative_if_not ARM64_MISMATCHED_CACHE_TYPE
        mrs     \reg, ctr_el0                   // read CTR
        nop
 alternative_else
        ldr_l   \reg, arm64_ftr_reg_ctrel0 + ARM64_FTR_SYSVAL
 alternative_endif
+#else
+alternative_if_not ARM64_KVM_PROTECTED_MODE
+       ASM_BUG()
+alternative_else_nop_endif
+alternative_cb kvm_compute_final_ctr_el0
+       movz    \reg, #0
+       movk    \reg, #0, lsl #16
+       movk    \reg, #0, lsl #32
+       movk    \reg, #0, lsl #48
+alternative_cb_end
+#endif
        .endm
 
 
@@ -676,11 +698,11 @@ USER(\label, ic   ivau, \tmp2)                    // invalidate I line PoU
        .endm
 
 /*
- * Set SCTLR_EL1 to the passed value, and invalidate the local icache
+ * Set SCTLR_ELx to the @reg value, and invalidate the local icache
  * in the process. This is called when setting the MMU on.
  */
-.macro set_sctlr_el1, reg
-       msr     sctlr_el1, \reg
+.macro set_sctlr, sreg, reg
+       msr     \sreg, \reg
        isb
        /*
         * Invalidate the local I-cache so that any instructions fetched
@@ -692,90 +714,41 @@ USER(\label, ic   ivau, \tmp2)                    // invalidate I line PoU
        isb
 .endm
 
-/*
- * Check whether to yield to another runnable task from kernel mode NEON code
- * (which runs with preemption disabled).
- *
- * if_will_cond_yield_neon
- *        // pre-yield patchup code
- * do_cond_yield_neon
- *        // post-yield patchup code
- * endif_yield_neon    <label>
- *
- * where <label> is optional, and marks the point where execution will resume
- * after a yield has been performed. If omitted, execution resumes right after
- * the endif_yield_neon invocation. Note that the entire sequence, including
- * the provided patchup code, will be omitted from the image if
- * CONFIG_PREEMPTION is not defined.
- *
- * As a convenience, in the case where no patchup code is required, the above
- * sequence may be abbreviated to
- *
- * cond_yield_neon <label>
- *
- * Note that the patchup code does not support assembler directives that change
- * the output section, any use of such directives is undefined.
- *
- * The yield itself consists of the following:
- * - Check whether the preempt count is exactly 1 and a reschedule is also
- *   needed. If so, calling of preempt_enable() in kernel_neon_end() will
- *   trigger a reschedule. If it is not the case, yielding is pointless.
- * - Disable and re-enable kernel mode NEON, and branch to the yield fixup
- *   code.
- *
- * This macro sequence may clobber all CPU state that is not guaranteed by the
- * AAPCS to be preserved across an ordinary function call.
- */
-
-       .macro          cond_yield_neon, lbl
-       if_will_cond_yield_neon
-       do_cond_yield_neon
-       endif_yield_neon        \lbl
-       .endm
-
-       .macro          if_will_cond_yield_neon
-#ifdef CONFIG_PREEMPTION
-       get_current_task        x0
-       ldr             x0, [x0, #TSK_TI_PREEMPT]
-       sub             x0, x0, #PREEMPT_DISABLE_OFFSET
-       cbz             x0, .Lyield_\@
-       /* fall through to endif_yield_neon */
-       .subsection     1
-.Lyield_\@ :
-#else
-       .section        ".discard.cond_yield_neon", "ax"
-#endif
-       .endm
-
-       .macro          do_cond_yield_neon
-       bl              kernel_neon_end
-       bl              kernel_neon_begin
-       .endm
+.macro set_sctlr_el1, reg
+       set_sctlr sctlr_el1, \reg
+.endm
 
-       .macro          endif_yield_neon, lbl
-       .ifnb           \lbl
-       b               \lbl
-       .else
-       b               .Lyield_out_\@
-       .endif
-       .previous
-.Lyield_out_\@ :
-       .endm
+.macro set_sctlr_el2, reg
+       set_sctlr sctlr_el2, \reg
+.endm
 
        /*
-        * Check whether preempt-disabled code should yield as soon as it
-        * is able. This is the case if re-enabling preemption a single
-        * time results in a preempt count of zero, and the TIF_NEED_RESCHED
-        * flag is set. (Note that the latter is stored negated in the
-        * top word of the thread_info::preempt_count field)
+        * Check whether preempt/bh-disabled asm code should yield as soon as
+        * it is able. This is the case if we are currently running in task
+        * context, and either a softirq is pending, or the TIF_NEED_RESCHED
+        * flag is set and re-enabling preemption a single time would result in
+        * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
+        * stored negated in the top word of the thread_info::preempt_count
+        * field)
         */
-       .macro          cond_yield, lbl:req, tmp:req
-#ifdef CONFIG_PREEMPTION
+       .macro          cond_yield, lbl:req, tmp:req, tmp2:req
        get_current_task \tmp
        ldr             \tmp, [\tmp, #TSK_TI_PREEMPT]
+       /*
+        * If we are serving a softirq, there is no point in yielding: the
+        * softirq will not be preempted no matter what we do, so we should
+        * run to completion as quickly as we can.
+        */
+       tbnz            \tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
+#ifdef CONFIG_PREEMPTION
        sub             \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
        cbz             \tmp, \lbl
 #endif
+       adr_l           \tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
+       this_cpu_offset \tmp2
+       ldr             w\tmp, [\tmp, \tmp2]
+       cbnz            w\tmp, \lbl     // yield on pending softirq in task context
+.Lnoyield_\@:
        .endm
 
 /*
index c3009b0..5a8367a 100644 (file)
@@ -23,6 +23,7 @@
 #define dsb(opt)       asm volatile("dsb " #opt : : : "memory")
 
 #define psb_csync()    asm volatile("hint #17" : : : "memory")
+#define tsb_csync()    asm volatile("hint #18" : : : "memory")
 #define csdb()         asm volatile("hint #20" : : : "memory")
 
 #define spec_bar()     asm volatile(ALTERNATIVE("dsb nsh\nisb\n",              \
index 61177ba..338840c 100644 (file)
@@ -63,6 +63,23 @@ struct arm64_ftr_bits {
        s64             safe_val; /* safe value for FTR_EXACT features */
 };
 
+/*
+ * Describe the early feature override to the core override code:
+ *
+ * @val                        Values that are to be merged into the final
+ *                     sanitised value of the register. Only the bitfields
+ *                     set to 1 in @mask are valid
+ * @mask               Mask of the features that are overridden by @val
+ *
+ * A @mask field set to full-1 indicates that the corresponding field
+ * in @val is a valid override.
+ *
+ * A @mask field set to full-0 with the corresponding @val field set
+ * to full-0 denotes that this field has no override
+ *
+ * A @mask field set to full-0 with the corresponding @val field set
+ * to full-1 denotes that this field has an invalid override.
+ */
 struct arm64_ftr_override {
        u64             val;
        u64             mask;
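
Read concretely, the convention above expresses an override per bitfield; an
illustrative (made-up) instance forcing a single 4-bit field at bits [31:28]
to zero would look like the sketch below, with every other field left alone
because its @mask nibble stays clear::

    /* Illustrative values only: override one 4-bit field to 0. */
    #include <linux/bits.h>
    #include <asm/cpufeature.h>

    static const struct arm64_ftr_override example_override = {
            .val  = 0,                      /* value imposed on the field */
            .mask = GENMASK_ULL(31, 28),    /* only this field is overridden */
    };
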
index d77d358..bda9189 100644 (file)
                                                // use EL1&0 translation.
 
 .Lskip_spe_\@:
+       /* Trace buffer */
+       ubfx    x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4
+       cbz     x0, .Lskip_trace_\@             // Skip if TraceBuffer is not present
+
+       mrs_s   x0, SYS_TRBIDR_EL1
+       and     x0, x0, TRBIDR_PROG
+       cbnz    x0, .Lskip_trace_\@             // If TRBE is available at EL2
+
+       mov     x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+       orr     x2, x2, x0                      // allow the EL1&0 translation
+                                               // to own it.
+
+.Lskip_trace_\@:
        msr     mdcr_el2, x2                    // Configure debug traps
 .endm
 
index bec5f14..ff3879a 100644 (file)
@@ -130,6 +130,15 @@ static inline void sve_user_enable(void)
        sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN);
 }
 
+#define sve_cond_update_zcr_vq(val, reg)               \
+       do {                                            \
+               u64 __zcr = read_sysreg_s((reg));       \
+               u64 __new = __zcr & ~ZCR_ELx_LEN_MASK;  \
+               __new |= (val) & ZCR_ELx_LEN_MASK;      \
+               if (__zcr != __new)                     \
+                       write_sysreg_s(__new, (reg));   \
+       } while (0)
+
 /*
  * Probing and setup functions.
  * Calls to these functions must be serialised with one another.
@@ -159,6 +168,8 @@ static inline int sve_get_current_vl(void)
 static inline void sve_user_disable(void) { BUILD_BUG(); }
 static inline void sve_user_enable(void) { BUILD_BUG(); }
 
+#define sve_cond_update_zcr_vq(val, reg) do { } while (0)
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
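
A hedged sketch of the caller pattern this macro is aimed at: rewrite ZCR
only when the vector length actually changes, before touching guest SVE
state (the hyp code added later in this series does something very similar,
though the exact call site may differ)::

    /* Sketch: set the EL2 vector length for a guest with SVE. */
    #include <asm/fpsimd.h>
    #include <asm/kvm_host.h>
    #include <asm/sysreg.h>

    static inline void example_set_guest_sve_vl(struct kvm_vcpu *vcpu)
    {
            /* ZCR_ELx.LEN holds (vector quadwords - 1) */
            sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
    }
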
index af43367..a256399 100644 (file)
@@ -6,6 +6,8 @@
  * Author: Catalin Marinas <catalin.marinas@arm.com>
  */
 
+#include <asm/assembler.h>
+
 .macro fpsimd_save state, tmpnr
        stp     q0, q1, [\state, #16 * 0]
        stp     q2, q3, [\state, #16 * 2]
                str             w\nxtmp, [\xpfpsr, #4]
 .endm
 
-.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
-               sve_load_vq     \xvqminus1, x\nxtmp, \xtmp2
+.macro __sve_load nxbase, xpfpsr, nxtmp
  _for n, 0, 31,        _sve_ldr_v      \n, \nxbase, \n - 34
                _sve_ldr_p      0, \nxbase
                _sve_wrffr      0
                ldr             w\nxtmp, [\xpfpsr, #4]
                msr             fpcr, x\nxtmp
 .endm
+
+.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
+               sve_load_vq     \xvqminus1, x\nxtmp, \xtmp2
+               __sve_load      \nxbase, \xpfpsr, \nxtmp
+.endm
index 737ded6..b4b3076 100644 (file)
 #define __HYP_CONCAT(a, b)     a ## b
 #define HYP_CONCAT(a, b)       __HYP_CONCAT(a, b)
 
+#ifndef __KVM_NVHE_HYPERVISOR__
 /*
  * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
  * to separate it from the kernel proper.
  */
 #define kvm_nvhe_sym(sym)      __kvm_nvhe_##sym
+#else
+#define kvm_nvhe_sym(sym)      sym
+#endif
 
 #ifdef LINKER_SCRIPT
 
@@ -56,6 +60,9 @@
  */
 #define KVM_NVHE_ALIAS(sym)    kvm_nvhe_sym(sym) = sym;
 
+/* Defines a linker script alias for KVM nVHE hyp symbols */
+#define KVM_NVHE_ALIAS_HYP(first, sec) kvm_nvhe_sym(first) = kvm_nvhe_sym(sec);
+
 #endif /* LINKER_SCRIPT */
 
 #endif /* __ARM64_HYP_IMAGE_H__ */
index f9cc1d0..0ae427f 100644 (file)
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
index 94d4025..692c904 100644 (file)
 #define CPTR_EL2_DEFAULT       CPTR_EL2_RES1
 
 /* Hyp Debug Configuration Register bits */
+#define MDCR_EL2_E2TB_MASK     (UL(0x3))
+#define MDCR_EL2_E2TB_SHIFT    (UL(24))
 #define MDCR_EL2_TTRF          (1 << 19)
 #define MDCR_EL2_TPMS          (1 << 14)
 #define MDCR_EL2_E2PB_MASK     (UL(0x3))
index a7ab84f..cf8df03 100644 (file)
 #define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2               12
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs              13
 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs           14
+#define __KVM_HOST_SMCCC_FUNC___pkvm_init                      15
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings           16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping    17
+#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector            18
+#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize             19
+#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp                  20
 
 #ifndef __ASSEMBLY__
 
@@ -154,6 +160,9 @@ struct kvm_nvhe_init_params {
        unsigned long tpidr_el2;
        unsigned long stack_hyp_va;
        phys_addr_t pgd_pa;
+       unsigned long hcr_el2;
+       unsigned long vttbr;
+       unsigned long vtcr;
 };
 
 /* Translate a kernel address @ptr into its equivalent linear mapping */
index 3a708be..7cd7d5c 100644 (file)
@@ -94,7 +94,7 @@ struct kvm_s2_mmu {
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
 
-       struct kvm *kvm;
+       struct kvm_arch *arch;
 };
 
 struct kvm_arch_memory_slot {
@@ -315,6 +315,8 @@ struct kvm_vcpu_arch {
                struct kvm_guest_debug_arch regs;
                /* Statistical profiling extension */
                u64 pmscr_el1;
+               /* Self-hosted trace */
+               u64 trfcr_el1;
        } host_debug_state;
 
        /* VGIC state */
@@ -372,8 +374,10 @@ struct kvm_vcpu_arch {
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
-#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
-                                     sve_ffr_offset((vcpu)->arch.sve_max_vl)))
+#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +     \
+                            sve_ffr_offset((vcpu)->arch.sve_max_vl))
+
+#define vcpu_sve_max_vq(vcpu)  sve_vq_from_vl((vcpu)->arch.sve_max_vl)
 
 #define vcpu_sve_state_size(vcpu) ({                                   \
        size_t __size_ret;                                              \
@@ -382,7 +386,7 @@ struct kvm_vcpu_arch {
        if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) {          \
                __size_ret = 0;                                         \
        } else {                                                        \
-               __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl);    \
+               __vcpu_vq = vcpu_sve_max_vq(vcpu);                      \
                __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq);              \
        }                                                               \
                                                                        \
@@ -400,6 +404,8 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_GUEST_HAS_PTRAUTH    (1 << 7) /* PTRAUTH exposed to guest */
 #define KVM_ARM64_PENDING_EXCEPTION    (1 << 8) /* Exception pending */
 #define KVM_ARM64_EXCEPT_MASK          (7 << 9) /* Target EL/MODE */
+#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active  */
+#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE        (1 << 13) /* Save TRBE context if active  */
 
 #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
                                 KVM_GUESTDBG_USE_SW_BP | \
@@ -590,6 +596,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
+#ifndef __KVM_NVHE_HYPERVISOR__
 #define kvm_call_hyp_nvhe(f, ...)                                              \
        ({                                                              \
                struct arm_smccc_res res;                               \
@@ -629,9 +636,13 @@ void kvm_arm_resume_guest(struct kvm *kvm);
                                                                        \
                ret;                                                    \
        })
+#else /* __KVM_NVHE_HYPERVISOR__ */
+#define kvm_call_hyp(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
+#endif /* __KVM_NVHE_HYPERVISOR__ */
 
 void force_vm_exit(const cpumask_t *mask);
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
 void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
@@ -691,19 +702,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
        ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
 }
 
-static inline bool kvm_arch_requires_vhe(void)
-{
-       /*
-        * The Arm architecture specifies that implementation of SVE
-        * requires VHE also to be implemented.  The KVM code for arm64
-        * relies on this when SVE is present:
-        */
-       if (system_supports_sve())
-               return true;
-
-       return false;
-}
-
 void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
 
 static inline void kvm_arch_hardware_unsetup(void) {}
@@ -712,6 +710,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
 void kvm_arm_init_debug(void);
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
@@ -733,6 +732,10 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
        return (!has_vhe() && attr->exclude_host);
 }
 
+/* Flags for host debug state */
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);
+
 #ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */
 static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 {
@@ -770,5 +773,12 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
        (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
 
 int kvm_trng_call(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM
+extern phys_addr_t hyp_mem_base;
+extern phys_addr_t hyp_mem_size;
+void __init kvm_hyp_reserve(void);
+#else
+static inline void kvm_hyp_reserve(void) { }
+#endif
 
 #endif /* __ARM64_KVM_HOST_H__ */
index 32ae676..9d60b30 100644 (file)
@@ -90,6 +90,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
 
 void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
 void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+void __sve_save_state(void *sve_pffr, u32 *fpsr);
+void __sve_restore_state(void *sve_pffr, u32 *fpsr);
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
@@ -100,10 +102,20 @@ u64 __guest_enter(struct kvm_vcpu *vcpu);
 
 bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
 
-void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
                               u64 elr, u64 par);
 #endif
 
+#ifdef __KVM_NVHE_HYPERVISOR__
+void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size,
+                           phys_addr_t pgd, void *sp, void *cont_fn);
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+               unsigned long *per_cpu_base, u32 hyp_va_bits);
+void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+#endif
+
+extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
+extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
+
 #endif /* __ARM64_KVM_HYP_H__ */
index 9087385..25ed956 100644 (file)
@@ -121,6 +121,8 @@ void kvm_update_va_mask(struct alt_instr *alt,
 void kvm_compute_layout(void);
 void kvm_apply_hyp_relocations(void);
 
+#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)
+
 static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 {
        asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
@@ -166,7 +168,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
-int kvm_mmu_init(void);
+int kvm_mmu_init(u32 *hyp_va_bits);
+
+static inline void *__kvm_vector_slot2addr(void *base,
+                                          enum arm64_hyp_spectre_vector slot)
+{
+       int idx = slot - (slot != HYP_VECTOR_DIRECT);
+
+       return base + (idx * SZ_2K);
+}
 
 struct kvm;
 
@@ -262,9 +272,9 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
  * Must be called from hyp code running at EL2 with an updated VTTBR
  * and interrupts disabled.
  */
-static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
 {
-       write_sysreg(kern_hyp_va(mmu->kvm)->arch.vtcr, vtcr_el2);
+       write_sysreg(vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
 
        /*
@@ -275,5 +285,14 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
        asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
 }
 
+static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+{
+       __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
+}
+
+static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
+{
+       return container_of(mmu->arch, struct kvm, arch);
+}
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
index 8886d43..c3674c4 100644 (file)
 #include <linux/kvm_host.h>
 #include <linux/types.h>
 
+#define KVM_PGTABLE_MAX_LEVELS         4U
+
+static inline u64 kvm_get_parange(u64 mmfr0)
+{
+       u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
+                               ID_AA64MMFR0_PARANGE_SHIFT);
+       if (parange > ID_AA64MMFR0_PARANGE_MAX)
+               parange = ID_AA64MMFR0_PARANGE_MAX;
+
+       return parange;
+}
+
 typedef u64 kvm_pte_t;
 
 /**
+ * struct kvm_pgtable_mm_ops - Memory management callbacks.
+ * @zalloc_page:       Allocate a single zeroed memory page. The @arg parameter
+ *                     can be used by the walker to pass a memcache. The
+ *                     initial refcount of the page is 1.
+ * @zalloc_pages_exact:        Allocate an exact number of zeroed memory pages. The
+ *                     @size parameter is in bytes, and is rounded-up to the
+ *                     next page boundary. The resulting allocation is
+ *                     physically contiguous.
+ * @free_pages_exact:  Free an exact number of memory pages previously
+ *                     allocated by zalloc_pages_exact.
+ * @get_page:          Increment the refcount on a page.
+ * @put_page:          Decrement the refcount on a page. When the refcount
+ *                     reaches 0 the page is automatically freed.
+ * @page_count:                Return the refcount of a page.
+ * @phys_to_virt:      Convert a physical address into a virtual address mapped
+ *                     in the current context.
+ * @virt_to_phys:      Convert a virtual address mapped in the current context
+ *                     into a physical address.
+ */
+struct kvm_pgtable_mm_ops {
+       void*           (*zalloc_page)(void *arg);
+       void*           (*zalloc_pages_exact)(size_t size);
+       void            (*free_pages_exact)(void *addr, size_t size);
+       void            (*get_page)(void *addr);
+       void            (*put_page)(void *addr);
+       int             (*page_count)(void *addr);
+       void*           (*phys_to_virt)(phys_addr_t phys);
+       phys_addr_t     (*virt_to_phys)(void *addr);
+};
+
+/**
+ * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags.
+ * @KVM_PGTABLE_S2_NOFWB:      Don't enforce Normal-WB even if the CPUs have
+ *                             ARM64_HAS_STAGE2_FWB.
+ * @KVM_PGTABLE_S2_IDMAP:      Only use identity mappings.
+ */
+enum kvm_pgtable_stage2_flags {
+       KVM_PGTABLE_S2_NOFWB                    = BIT(0),
+       KVM_PGTABLE_S2_IDMAP                    = BIT(1),
+};
+
+/**
  * struct kvm_pgtable - KVM page-table.
  * @ia_bits:           Maximum input address size, in bits.
  * @start_level:       Level at which the page-table walk starts.
  * @pgd:               Pointer to the first top-level entry of the page-table.
+ * @mm_ops:            Memory management callbacks.
  * @mmu:               Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
  */
 struct kvm_pgtable {
        u32                                     ia_bits;
        u32                                     start_level;
        kvm_pte_t                               *pgd;
+       struct kvm_pgtable_mm_ops               *mm_ops;
 
        /* Stage-2 only */
        struct kvm_s2_mmu                       *mmu;
+       enum kvm_pgtable_stage2_flags           flags;
 };
 
 /**
@@ -50,6 +107,16 @@ enum kvm_pgtable_prot {
 #define PAGE_HYP_DEVICE                (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
 
 /**
+ * struct kvm_mem_range - Range of Intermediate Physical Addresses
+ * @start:     Start of the range.
+ * @end:       End of the range.
+ */
+struct kvm_mem_range {
+       u64 start;
+       u64 end;
+};
+
+/**
  * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
  * @KVM_PGTABLE_WALK_LEAF:             Visit leaf entries, including invalid
  *                                     entries.
@@ -86,10 +153,12 @@ struct kvm_pgtable_walker {
  * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
  * @va_bits:   Maximum virtual address bits.
+ * @mm_ops:    Memory management callbacks.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+                        struct kvm_pgtable_mm_ops *mm_ops);
 
 /**
  * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
@@ -123,17 +192,41 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                        enum kvm_pgtable_prot prot);
 
 /**
- * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * kvm_get_vtcr() - Helper to construct VTCR_EL2
+ * @mmfr0:     Sanitized value of SYS_ID_AA64MMFR0_EL1 register.
+ * @mmfr1:     Sanitized value of SYS_ID_AA64MMFR1_EL1 register.
+ * @phys_shift:        Value to set in VTCR_EL2.T0SZ.
+ *
+ * The VTCR value is common across all the physical CPUs on the system.
+ * We use system wide sanitised values to fill in different fields,
+ * except for Hardware Management of Access Flags. HA Flag is set
+ * unconditionally on all CPUs, as it is safe to run with or without
+ * the feature and the bit is RES0 on CPUs that don't support it.
+ *
+ * Return: VTCR_EL2 value
+ */
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
+
+/**
+ * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
  * @pgt:       Uninitialised page-table structure to initialise.
- * @kvm:       KVM structure representing the guest virtual machine.
+ * @arch:      Arch-specific KVM structure representing the guest virtual
+ *             machine.
+ * @mm_ops:    Memory management callbacks.
+ * @flags:     Stage-2 configuration flags.
  *
  * Return: 0 on success, negative error code on failure.
  */
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                                 struct kvm_pgtable_mm_ops *mm_ops,
+                                 enum kvm_pgtable_stage2_flags flags);
+
+#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
+       kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
 
 /**
  * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  *
  * The page-table is assumed to be unreachable by any hardware walkers prior
  * to freeing and therefore no TLB invalidation is performed.
@@ -142,13 +235,13 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
 /**
  * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address at which to place the mapping.
  * @size:      Size of the mapping.
  * @phys:      Physical address of the memory to map.
  * @prot:      Permissions and attributes for the mapping.
- * @mc:                Cache of pre-allocated GFP_PGTABLE_USER memory from which to
- *             allocate page-table pages.
+ * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
+ *             page-table pages.
  *
  * The offset of @addr within a page is ignored, @size is rounded-up to
  * the next page boundary and @phys is rounded-down to the previous page
@@ -170,11 +263,31 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  */
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
-                          struct kvm_mmu_memory_cache *mc);
+                          void *mc);
+
+/**
+ * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
+ *                                 track ownership.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:      Base intermediate physical address to annotate.
+ * @size:      Size of the annotated range.
+ * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
+ *             page-table pages.
+ * @owner_id:  Unique identifier for the owner of the page.
+ *
+ * By default, all page-tables are owned by identifier 0. This function can be
+ * used to mark portions of the IPA space as owned by other entities. When a
+ * stage 2 is used with identity-mappings, these annotations allow the
+ * page-table data structure to be used as a simple rmap.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                                void *mc, u8 owner_id);
 
 /**
  * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to remove the mapping.
  * @size:      Size of the mapping.
  *
@@ -194,7 +307,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
 /**
  * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
  *                                  without TLB invalidation.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to write-protect,
  * @size:      Size of the range.
  *
@@ -211,7 +324,7 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
 /**
  * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -225,7 +338,7 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
 
 /**
  * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -244,7 +357,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
 /**
  * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
  *                                   page-table entry.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  * @prot:      Additional permissions to grant for the mapping.
  *
@@ -263,7 +376,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 /**
  * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
  *                                access flag set.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -276,7 +389,7 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
  * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
  *                                   of Coherency for guest stage-2 address
  *                                   range.
- * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
  * @addr:      Intermediate physical address from which to flush.
  * @size:      Size of the range.
  *
@@ -311,4 +424,23 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
                     struct kvm_pgtable_walker *walker);
 
+/**
+ * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
+ *                                  Addresses with compatible permission
+ *                                  attributes.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:      Address that must be covered by the range.
+ * @prot:      Protection attributes that the range must be compatible with.
+ * @range:     Range structure used to limit the search space at call time and
+ *             that will hold the result.
+ *
+ * The offset of @addr within a page is ignored. An IPA is compatible with @prot
+ * iff its corresponding stage-2 page-table entry has default ownership and, if
+ * valid, is mapped with protection attributes identical to @prot.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+                                 enum kvm_pgtable_prot prot,
+                                 struct kvm_mem_range *range);
 #endif /* __ARM64_KVM_PGTABLE_H__ */
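
As a usage illustration only (not part of this merge), a caller wanting to know how much of the space around a given IPA shares compatible stage-2 attributes might drive the new helper as sketched below. The helper name, the seeding of the search window, and the assumption that struct kvm_mem_range carries u64 start/end bounds and that struct kvm_pgtable exposes its input-address size as ia_bits are all assumptions of the sketch:

/* Illustrative sketch, assuming struct kvm_mem_range has u64 start/end. */
static int probe_compatible_range(struct kvm_pgtable *pgt, u64 ipa)
{
        struct kvm_mem_range range = {
                .start  = 0,
                .end    = BIT_ULL(pgt->ia_bits),        /* whole IPA space */
        };
        int ret;

        ret = kvm_pgtable_stage2_find_range(pgt, ipa, KVM_PGTABLE_PROT_R,
                                            &range);
        if (ret)
                return ret;

        /*
         * On return, every IPA in [range.start, range.end) is compatible
         * with the requested read permission and the range covers 'ipa'.
         */
        return 0;
}
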
index 9a65fb5..079f4e9 100644 (file)
@@ -71,10 +71,10 @@ extern bool arm64_use_ng_mappings;
 #define PAGE_KERNEL_EXEC       __pgprot(PROT_NORMAL & ~PTE_PXN)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
 
-#define PAGE_S2_MEMATTR(attr)                                          \
+#define PAGE_S2_MEMATTR(attr, has_fwb)                                 \
        ({                                                              \
                u64 __val;                                              \
-               if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))          \
+               if (has_fwb)                                            \
                        __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr);     \
                else                                                    \
                        __val = PTE_S2_MEMATTR(MT_S2_ ## attr);         \
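
An illustrative sketch (not part of the patch): because the macro no longer queries the CPU capability itself, each caller now decides where the FWB information comes from; host code can keep using cpus_have_const_cap(ARM64_HAS_STAGE2_FWB), while EL2 code can derive it from state it already holds. The helper name below is made up:

static u64 stage2_normal_memattr(bool has_fwb)
{
        /* MT_S2_FWB_NORMAL when FWB is in use, MT_S2_NORMAL otherwise */
        return PAGE_S2_MEMATTR(NORMAL, has_fwb);
}

/*
 * A host-side call site would look roughly like (illustrative):
 *      stage2_normal_memattr(cpus_have_const_cap(ARM64_HAS_STAGE2_FWB));
 */
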
index 2f36b16..e4ad9db 100644 (file)
@@ -13,6 +13,7 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 extern char __hyp_text_start[], __hyp_text_end[];
 extern char __hyp_rodata_start[], __hyp_rodata_end[];
 extern char __hyp_reloc_begin[], __hyp_reloc_end[];
+extern char __hyp_bss_start[], __hyp_bss_end[];
 extern char __idmap_text_start[], __idmap_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __inittext_begin[], __inittext_end[];
index d4a5fca..f6a77f3 100644 (file)
 #define SYS_PMSIRR_EL1_INTERVAL_MASK   0xffffffUL
 
 /* Filtering controls */
+#define SYS_PMSNEVFR_EL1               sys_reg(3, 0, 9, 9, 1)
+
 #define SYS_PMSFCR_EL1                 sys_reg(3, 0, 9, 9, 4)
 #define SYS_PMSFCR_EL1_FE_SHIFT                0
 #define SYS_PMSFCR_EL1_FT_SHIFT                1
 
 /*** End of Statistical Profiling Extension ***/
 
+/*
+ * TRBE Registers
+ */
+#define SYS_TRBLIMITR_EL1              sys_reg(3, 0, 9, 11, 0)
+#define SYS_TRBPTR_EL1                 sys_reg(3, 0, 9, 11, 1)
+#define SYS_TRBBASER_EL1               sys_reg(3, 0, 9, 11, 2)
+#define SYS_TRBSR_EL1                  sys_reg(3, 0, 9, 11, 3)
+#define SYS_TRBMAR_EL1                 sys_reg(3, 0, 9, 11, 4)
+#define SYS_TRBTRG_EL1                 sys_reg(3, 0, 9, 11, 6)
+#define SYS_TRBIDR_EL1                 sys_reg(3, 0, 9, 11, 7)
+
+#define TRBLIMITR_LIMIT_MASK           GENMASK_ULL(51, 0)
+#define TRBLIMITR_LIMIT_SHIFT          12
+#define TRBLIMITR_NVM                  BIT(5)
+#define TRBLIMITR_TRIG_MODE_MASK       GENMASK(1, 0)
+#define TRBLIMITR_TRIG_MODE_SHIFT      3
+#define TRBLIMITR_FILL_MODE_MASK       GENMASK(1, 0)
+#define TRBLIMITR_FILL_MODE_SHIFT      1
+#define TRBLIMITR_ENABLE               BIT(0)
+#define TRBPTR_PTR_MASK                        GENMASK_ULL(63, 0)
+#define TRBPTR_PTR_SHIFT               0
+#define TRBBASER_BASE_MASK             GENMASK_ULL(51, 0)
+#define TRBBASER_BASE_SHIFT            12
+#define TRBSR_EC_MASK                  GENMASK(5, 0)
+#define TRBSR_EC_SHIFT                 26
+#define TRBSR_IRQ                      BIT(22)
+#define TRBSR_TRG                      BIT(21)
+#define TRBSR_WRAP                     BIT(20)
+#define TRBSR_ABORT                    BIT(18)
+#define TRBSR_STOP                     BIT(17)
+#define TRBSR_MSS_MASK                 GENMASK(15, 0)
+#define TRBSR_MSS_SHIFT                        0
+#define TRBSR_BSC_MASK                 GENMASK(5, 0)
+#define TRBSR_BSC_SHIFT                        0
+#define TRBSR_FSC_MASK                 GENMASK(5, 0)
+#define TRBSR_FSC_SHIFT                        0
+#define TRBMAR_SHARE_MASK              GENMASK(1, 0)
+#define TRBMAR_SHARE_SHIFT             8
+#define TRBMAR_OUTER_MASK              GENMASK(3, 0)
+#define TRBMAR_OUTER_SHIFT             4
+#define TRBMAR_INNER_MASK              GENMASK(3, 0)
+#define TRBMAR_INNER_SHIFT             0
+#define TRBTRG_TRG_MASK                        GENMASK(31, 0)
+#define TRBTRG_TRG_SHIFT               0
+#define TRBIDR_FLAG                    BIT(5)
+#define TRBIDR_PROG                    BIT(4)
+#define TRBIDR_ALIGN_MASK              GENMASK(3, 0)
+#define TRBIDR_ALIGN_SHIFT             0
+
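
A sketch of how these encodings are meant to be consumed (the real users are the TRBE driver and the hyp debug code later in this merge): the *_MASK values are field-width masks, applied after shifting the register value right by the matching *_SHIFT. The power-of-two interpretation of the Align field below is an assumption of the sketch, not stated by this patch:

static bool trbe_is_programmable_here(void)
{
        /* TRBIDR_EL1.P set means the buffer is owned by a higher EL */
        return !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG);
}

static unsigned long trbe_min_buffer_align(void)
{
        u64 trbidr = read_sysreg_s(SYS_TRBIDR_EL1);
        u64 align = (trbidr >> TRBIDR_ALIGN_SHIFT) & TRBIDR_ALIGN_MASK;

        return 1UL << align;    /* assumed: Align encodes log2(bytes) */
}
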
 #define SYS_PMINTENSET_EL1             sys_reg(3, 0, 9, 14, 1)
 #define SYS_PMINTENCLR_EL1             sys_reg(3, 0, 9, 14, 2)
 
 #define SCTLR_ELx_A    (BIT(1))
 #define SCTLR_ELx_M    (BIT(0))
 
-#define SCTLR_ELx_FLAGS        (SCTLR_ELx_M  | SCTLR_ELx_A | SCTLR_ELx_C | \
-                        SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
-
 /* SCTLR_EL2 specific flags. */
 #define SCTLR_EL2_RES1 ((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
                         (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
 #define ENDIAN_SET_EL2         0
 #endif
 
+#define INIT_SCTLR_EL2_MMU_ON                                          \
+       (SCTLR_ELx_M  | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I |      \
+        SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
+
 #define INIT_SCTLR_EL2_MMU_OFF \
        (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
 
 #define ID_AA64MMFR2_CNP_SHIFT         0
 
 /* id_aa64dfr0 */
+#define ID_AA64DFR0_TRBE_SHIFT         44
 #define ID_AA64DFR0_TRACE_FILT_SHIFT   40
 #define ID_AA64DFR0_DOUBLELOCK_SHIFT   36
 #define ID_AA64DFR0_PMSVER_SHIFT       32
index a36e2fc..8060e58 100644 (file)
@@ -95,6 +95,8 @@ int main(void)
   DEFINE(DMA_FROM_DEVICE,      DMA_FROM_DEVICE);
   BLANK();
   DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
+  DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
+  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
   BLANK();
   DEFINE(CPU_BOOT_STACK,       offsetof(struct secondary_data, stack));
   DEFINE(CPU_BOOT_TASK,                offsetof(struct secondary_data, task));
@@ -120,6 +122,9 @@ int main(void)
   DEFINE(NVHE_INIT_TPIDR_EL2,  offsetof(struct kvm_nvhe_init_params, tpidr_el2));
   DEFINE(NVHE_INIT_STACK_HYP_VA,       offsetof(struct kvm_nvhe_init_params, stack_hyp_va));
   DEFINE(NVHE_INIT_PGD_PA,     offsetof(struct kvm_nvhe_init_params, pgd_pa));
+  DEFINE(NVHE_INIT_HCR_EL2,    offsetof(struct kvm_nvhe_init_params, hcr_el2));
+  DEFINE(NVHE_INIT_VTTBR,      offsetof(struct kvm_nvhe_init_params, vttbr));
+  DEFINE(NVHE_INIT_VTCR,       offsetof(struct kvm_nvhe_init_params, vtcr));
 #endif
 #ifdef CONFIG_CPU_PM
   DEFINE(CPU_CTX_SP,           offsetof(struct cpu_suspend_ctx, sp));
index 37721eb..d47ff63 100644 (file)
  * flat identity mapping.
  */
 SYM_CODE_START(__cpu_soft_restart)
-       /* Clear sctlr_el1 flags. */
-       mrs     x12, sctlr_el1
-       mov_q   x13, SCTLR_ELx_FLAGS
-       bic     x12, x12, x13
+       mov_q   x12, INIT_SCTLR_EL1_MMU_OFF
        pre_disable_mmu_workaround
        /*
         * either disable EL1&0 translation regime or disable EL2&0 translation
index e5281e1..e3e0dcb 100644 (file)
@@ -808,6 +808,12 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new)
                                        reg->name,
                                        ftrp->shift + ftrp->width - 1,
                                        ftrp->shift, str, tmp);
+               } else if ((ftr_mask & reg->override->val) == ftr_mask) {
+                       reg->override->val &= ~ftr_mask;
+                       pr_warn("%s[%d:%d]: impossible override, ignored\n",
+                               reg->name,
+                               ftrp->shift + ftrp->width - 1,
+                               ftrp->shift);
                }
 
                val = arm64_ftr_set_value(ftrp, val, ftr_new);
@@ -1619,7 +1625,6 @@ int get_cpu_with_amu_feat(void)
 }
 #endif
 
-#ifdef CONFIG_ARM64_VHE
 static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused)
 {
        return is_kernel_in_hyp_mode();
@@ -1638,7 +1643,6 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused)
        if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN))
                write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
 }
-#endif
 
 static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
 {
@@ -1841,7 +1845,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
                .matches = has_no_hw_prefetch,
        },
-#ifdef CONFIG_ARM64_VHE
        {
                .desc = "Virtualization Host Extensions",
                .capability = ARM64_HAS_VIRT_HOST_EXTN,
@@ -1849,7 +1852,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                .matches = runs_at_el2,
                .cpu_enable = cpu_copy_el2regs,
        },
-#endif /* CONFIG_ARM64_VHE */
        {
                .desc = "32-bit EL0 Support",
                .capability = ARM64_HAS_32BIT_EL0,
index 062b21f..823e3a8 100644 (file)
@@ -180,7 +180,7 @@ static void __get_cpu_fpsimd_context(void)
  */
 static void get_cpu_fpsimd_context(void)
 {
-       preempt_disable();
+       local_bh_disable();
        __get_cpu_fpsimd_context();
 }
 
@@ -201,7 +201,7 @@ static void __put_cpu_fpsimd_context(void)
 static void put_cpu_fpsimd_context(void)
 {
        __put_cpu_fpsimd_context();
-       preempt_enable();
+       local_bh_enable();
 }
 
 static bool have_cpu_fpsimd_context(void)
index 840bda1..96873df 100644 (file)
@@ -477,14 +477,13 @@ EXPORT_SYMBOL(kimage_vaddr)
  * booted in EL1 or EL2 respectively.
  */
 SYM_FUNC_START(init_kernel_el)
-       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
-       msr     sctlr_el1, x0
-
        mrs     x0, CurrentEL
        cmp     x0, #CurrentEL_EL2
        b.eq    init_el2
 
 SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
+       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
+       msr     sctlr_el1, x0
        isb
        mov_q   x0, INIT_PSTATE_EL1
        msr     spsr_el1, x0
@@ -504,9 +503,43 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
        msr     vbar_el2, x0
        isb
 
+       /*
+        * Fruity CPUs seem to have HCR_EL2.E2H set to RES1,
+        * making it impossible to start in nVHE mode. Is that
+        * compliant with the architecture? Absolutely not!
+        */
+       mrs     x0, hcr_el2
+       and     x0, x0, #HCR_E2H
+       cbz     x0, 1f
+
+       /* Switching to VHE requires a sane SCTLR_EL1 as a start */
+       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
+       msr_s   SYS_SCTLR_EL12, x0
+
+       /*
+        * Force an eret into a helper "function", and let it return
+        * to our original caller... This makes sure that we have
+        * initialised the basic PSTATE state.
+        */
+       mov     x0, #INIT_PSTATE_EL2
+       msr     spsr_el1, x0
+       adr     x0, __cpu_stick_to_vhe
+       msr     elr_el1, x0
+       eret
+
+1:
+       mov_q   x0, INIT_SCTLR_EL1_MMU_OFF
+       msr     sctlr_el1, x0
+
        msr     elr_el2, lr
        mov     w0, #BOOT_CPU_MODE_EL2
        eret
+
+__cpu_stick_to_vhe:
+       mov     x0, #HVC_VHE_RESTART
+       hvc     #0
+       mov     x0, #BOOT_CPU_MODE_EL2
+       ret
 SYM_FUNC_END(init_kernel_el)
 
 /*
index 5eccbd6..43d2126 100644 (file)
@@ -27,12 +27,12 @@ SYM_CODE_START(__hyp_stub_vectors)
        ventry  el2_fiq_invalid                 // FIQ EL2t
        ventry  el2_error_invalid               // Error EL2t
 
-       ventry  el2_sync_invalid                // Synchronous EL2h
+       ventry  elx_sync                        // Synchronous EL2h
        ventry  el2_irq_invalid                 // IRQ EL2h
        ventry  el2_fiq_invalid                 // FIQ EL2h
        ventry  el2_error_invalid               // Error EL2h
 
-       ventry  el1_sync                        // Synchronous 64-bit EL1
+       ventry  elx_sync                        // Synchronous 64-bit EL1
        ventry  el1_irq_invalid                 // IRQ 64-bit EL1
        ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
        ventry  el1_error_invalid               // Error 64-bit EL1
@@ -45,7 +45,7 @@ SYM_CODE_END(__hyp_stub_vectors)
 
        .align 11
 
-SYM_CODE_START_LOCAL(el1_sync)
+SYM_CODE_START_LOCAL(elx_sync)
        cmp     x0, #HVC_SET_VECTORS
        b.ne    1f
        msr     vbar_el2, x1
@@ -71,7 +71,7 @@ SYM_CODE_START_LOCAL(el1_sync)
 
 9:     mov     x0, xzr
        eret
-SYM_CODE_END(el1_sync)
+SYM_CODE_END(elx_sync)
 
 // nVHE? No way! Give me the real thing!
 SYM_CODE_START_LOCAL(mutate_to_vhe)
@@ -115,9 +115,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
        mrs_s   x0, SYS_VBAR_EL12
        msr     vbar_el1, x0
 
-       // Use EL2 translations for SPE and disable access from EL1
+       // Use EL2 translations for SPE & TRBE and disable access from EL1
        mrs     x0, mdcr_el2
        bic     x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+       bic     x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
        msr     mdcr_el2, x0
 
        // Transfer the MM state from EL1 to EL2
@@ -224,7 +225,6 @@ SYM_FUNC_END(__hyp_reset_vectors)
  * Entry point to switch to VHE if deemed capable
  */
 SYM_FUNC_START(switch_to_vhe)
-#ifdef CONFIG_ARM64_VHE
        // Need to have booted at EL2
        adr_l   x1, __boot_cpu_mode
        ldr     w0, [x1]
@@ -240,6 +240,5 @@ SYM_FUNC_START(switch_to_vhe)
        mov     x0, #HVC_VHE_RESTART
        hvc     #0
 1:
-#endif
        ret
 SYM_FUNC_END(switch_to_vhe)
index 83f1c4b..e628c8c 100644 (file)
@@ -25,14 +25,26 @@ struct ftr_set_desc {
        struct {
                char                    name[FTR_DESC_FIELD_LEN];
                u8                      shift;
+               bool                    (*filter)(u64 val);
        }                               fields[];
 };
 
+static bool __init mmfr1_vh_filter(u64 val)
+{
+       /*
+        * If we ever reach this point while running VHE, we're
+        * guaranteed to be on one of these funky, VHE-stuck CPUs. If
+        * the user was trying to force nVHE on us, proceed with
+        * attitude adjustment.
+        */
+       return !(is_kernel_in_hyp_mode() && val == 0);
+}
+
 static const struct ftr_set_desc mmfr1 __initconst = {
        .name           = "id_aa64mmfr1",
        .override       = &id_aa64mmfr1_override,
        .fields         = {
-               { "vh", ID_AA64MMFR1_VHE_SHIFT },
+               { "vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter },
                {}
        },
 };
@@ -124,6 +136,18 @@ static void __init match_options(const char *cmdline)
                        if (find_field(cmdline, regs[i], f, &v))
                                continue;
 
+                       /*
+                        * If an override gets filtered out, advertise
+                        * it by setting the value to 0xf, but
+                        * clearing the mask... Yes, this is fragile.
+                        */
+                       if (regs[i]->fields[f].filter &&
+                           !regs[i]->fields[f].filter(v)) {
+                               regs[i]->override->val  |= mask;
+                               regs[i]->override->mask &= ~mask;
+                               continue;
+                       }
+
                        regs[i]->override->val  &= ~mask;
                        regs[i]->override->val  |= (v << shift) & mask;
                        regs[i]->override->mask |= mask;
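
To connect this with the init_cpu_ftr_reg() hunk earlier in this merge, here is a worked illustration of a rejected 4-bit override; the concrete field and the shape of the mask are examples for the sketch, not part of the patch:

/*
 * "id_aa64mmfr1.vh=0" rejected by mmfr1_vh_filter() on a VHE-stuck CPU,
 * with 'mask' covering the 4-bit field at ID_AA64MMFR1_VHE_SHIFT:
 *
 *      override->val  |=  mask;        // field reads back as all ones
 *      override->mask &= ~mask;        // but is never claimed as valid
 *
 * The (ftr_mask & reg->override->val) == ftr_mask test added to
 * init_cpu_ftr_reg() then catches exactly this pattern and logs
 * "impossible override, ignored" instead of applying the override.
 */
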
index 5aa9ed1..bcf3c27 100644 (file)
@@ -65,13 +65,13 @@ __efistub__ctype            = _ctype;
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
 KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
+KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0);
 
 /* Global kernel state accessed by nVHE hyp code. */
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
 /* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */
-KVM_NVHE_ALIAS(__hyp_panic_string);
-KVM_NVHE_ALIAS(panic);
+KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
 
 /* Vectors installed by hyp-init on reset HVC. */
 KVM_NVHE_ALIAS(__hyp_stub_vectors);
@@ -104,6 +104,36 @@ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
 /* PMU available static key */
 KVM_NVHE_ALIAS(kvm_arm_pmu_available);
 
+/* Position-independent library routines */
+KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page);
+KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page);
+KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(memset, __pi_memset);
+
+#ifdef CONFIG_KASAN
+KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
+#endif
+
+/* Kernel memory sections */
+KVM_NVHE_ALIAS(__start_rodata);
+KVM_NVHE_ALIAS(__end_rodata);
+KVM_NVHE_ALIAS(__bss_start);
+KVM_NVHE_ALIAS(__bss_stop);
+
+/* Hyp memory sections */
+KVM_NVHE_ALIAS(__hyp_idmap_text_start);
+KVM_NVHE_ALIAS(__hyp_idmap_text_end);
+KVM_NVHE_ALIAS(__hyp_text_start);
+KVM_NVHE_ALIAS(__hyp_text_end);
+KVM_NVHE_ALIAS(__hyp_bss_start);
+KVM_NVHE_ALIAS(__hyp_bss_end);
+KVM_NVHE_ALIAS(__hyp_rodata_start);
+KVM_NVHE_ALIAS(__hyp_rodata_end);
+
+/* pKVM static key */
+KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
+
 #endif /* CONFIG_KVM */
 
 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
index 7eea788..709d2c4 100644 (file)
@@ -5,24 +5,7 @@
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  */
 
-#define RO_EXCEPTION_TABLE_ALIGN       8
-#define RUNTIME_DISCARD_EXIT
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/cache.h>
 #include <asm/hyp_image.h>
-#include <asm/kernel-pgtable.h>
-#include <asm/memory.h>
-#include <asm/page.h>
-
-#include "image.h"
-
-OUTPUT_ARCH(aarch64)
-ENTRY(_text)
-
-jiffies = jiffies_64;
-
-
 #ifdef CONFIG_KVM
 #define HYPERVISOR_EXTABLE                                     \
        . = ALIGN(SZ_8);                                        \
@@ -32,9 +15,11 @@ jiffies = jiffies_64;
 
 #define HYPERVISOR_DATA_SECTIONS                               \
        HYP_SECTION_NAME(.rodata) : {                           \
+               . = ALIGN(PAGE_SIZE);                           \
                __hyp_rodata_start = .;                         \
                *(HYP_SECTION_NAME(.data..ro_after_init))       \
                *(HYP_SECTION_NAME(.rodata))                    \
+               . = ALIGN(PAGE_SIZE);                           \
                __hyp_rodata_end = .;                           \
        }
 
@@ -51,29 +36,52 @@ jiffies = jiffies_64;
                __hyp_reloc_end = .;                            \
        }
 
+#define BSS_FIRST_SECTIONS                                     \
+       __hyp_bss_start = .;                                    \
+       *(HYP_SECTION_NAME(.bss))                               \
+       . = ALIGN(PAGE_SIZE);                                   \
+       __hyp_bss_end = .;
+
+/*
+ * We require that __hyp_bss_start and __bss_start are aligned, and enforce it
+ * with an assertion. But the BSS_SECTION macro places an empty .sbss section
+ * between them, which can in some cases cause the linker to misalign them. To
+ * work around the issue, force a page alignment for __bss_start.
+ */
+#define SBSS_ALIGN                     PAGE_SIZE
 #else /* CONFIG_KVM */
 #define HYPERVISOR_EXTABLE
 #define HYPERVISOR_DATA_SECTIONS
 #define HYPERVISOR_PERCPU_SECTION
 #define HYPERVISOR_RELOC_SECTION
+#define SBSS_ALIGN                     0
 #endif
 
+#define RO_EXCEPTION_TABLE_ALIGN       8
+#define RUNTIME_DISCARD_EXIT
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+
+#include "image.h"
+
+OUTPUT_ARCH(aarch64)
+ENTRY(_text)
+
+jiffies = jiffies_64;
+
 #define HYPERVISOR_TEXT                                        \
-       /*                                              \
-        * Align to 4 KB so that                        \
-        * a) the HYP vector table is at its minimum    \
-        *    alignment of 2048 bytes                   \
-        * b) the HYP init code will not cross a page   \
-        *    boundary if its size does not exceed      \
-        *    4 KB (see related ASSERT() below)         \
-        */                                             \
-       . = ALIGN(SZ_4K);                               \
+       . = ALIGN(PAGE_SIZE);                           \
        __hyp_idmap_text_start = .;                     \
        *(.hyp.idmap.text)                              \
        __hyp_idmap_text_end = .;                       \
        __hyp_text_start = .;                           \
        *(.hyp.text)                                    \
        HYPERVISOR_EXTABLE                              \
+       . = ALIGN(PAGE_SIZE);                           \
        __hyp_text_end = .;
 
 #define IDMAP_TEXT                                     \
@@ -276,7 +284,7 @@ SECTIONS
        __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
        _edata = .;
 
-       BSS_SECTION(0, 0, 0)
+       BSS_SECTION(SBSS_ALIGN, 0, 0)
 
        . = ALIGN(PAGE_SIZE);
        init_pg_dir = .;
@@ -309,11 +317,12 @@ SECTIONS
 #include "image-vars.h"
 
 /*
- * The HYP init code and ID map text can't be longer than a page each,
- * and should not cross a page boundary.
+ * The HYP init code and ID map text can't be longer than a page each. The
+ * former is page-aligned, but the latter may not be with 16K or 64K pages, so
+ * it should also not cross a page boundary.
  */
-ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
-       "HYP init code too big or misaligned")
+ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE,
+       "HYP init code too big")
 ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
        "ID map text too big or misaligned")
 #ifdef CONFIG_HIBERNATION
@@ -324,6 +333,9 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
 ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
        "Entry trampoline text too big")
 #endif
+#ifdef CONFIG_KVM
+ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned")
+#endif
 /*
  * If padding is applied before .head.text, virt<->phys conversions will fail.
  */
index 0d92a4e..1cb39c0 100644 (file)
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ARM_INJECT_EXT_DABT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
+       case KVM_CAP_PTP_KVM:
                r = 1;
                break;
        case KVM_CAP_SET_GUEST_DEBUG2:
@@ -418,10 +419,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (vcpu_has_ptrauth(vcpu))
                vcpu_ptrauth_disable(vcpu);
+       kvm_arch_vcpu_load_debug_state_flags(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+       kvm_arch_vcpu_put_debug_state_flags(vcpu);
        kvm_arch_vcpu_put_fp(vcpu);
        if (has_vhe())
                kvm_vcpu_put_sysregs_vhe(vcpu);
@@ -582,6 +585,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 
        vcpu->arch.has_run_once = true;
 
+       kvm_arm_vcpu_init_debug(vcpu);
+
        if (likely(irqchip_in_kernel(kvm))) {
                /*
                 * Map the VGIC hardware resources before running a vcpu the
@@ -1352,16 +1357,9 @@ static unsigned long nvhe_percpu_order(void)
 /* A lookup table holding the hypervisor VA for each vector slot */
 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
 
-static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot)
-{
-       return slot - (slot != HYP_VECTOR_DIRECT);
-}
-
 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
 {
-       int idx = __kvm_vector_slot2idx(slot);
-
-       hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K);
+       hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
 }
 
 static int kvm_init_vector_slots(void)
@@ -1390,22 +1388,18 @@ static int kvm_init_vector_slots(void)
        return 0;
 }
 
-static void cpu_init_hyp_mode(void)
+static void cpu_prepare_hyp_mode(int cpu)
 {
-       struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params);
-       struct arm_smccc_res res;
+       struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
        unsigned long tcr;
 
-       /* Switch from the HYP stub to our own HYP init vector */
-       __hyp_set_vectors(kvm_get_idmap_vector());
-
        /*
         * Calculate the raw per-cpu offset without a translation from the
         * kernel's mapping to the linear mapping, and store it in tpidr_el2
         * so that we can use adr_l to access per-cpu variables in EL2.
         * Also drop the KASAN tag which gets in the way...
         */
-       params->tpidr_el2 = (unsigned long)kasan_reset_tag(this_cpu_ptr_nvhe_sym(__per_cpu_start)) -
+       params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
                            (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
 
        params->mair_el2 = read_sysreg(mair_el1);
@@ -1429,14 +1423,28 @@ static void cpu_init_hyp_mode(void)
        tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
        params->tcr_el2 = tcr;
 
-       params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE);
+       params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
        params->pgd_pa = kvm_mmu_get_httbr();
+       if (is_protected_kvm_enabled())
+               params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
+       else
+               params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+       params->vttbr = params->vtcr = 0;
 
        /*
         * Flush the init params from the data cache because the struct will
         * be read while the MMU is off.
         */
        kvm_flush_dcache_to_poc(params, sizeof(*params));
+}
+
+static void hyp_install_host_vector(void)
+{
+       struct kvm_nvhe_init_params *params;
+       struct arm_smccc_res res;
+
+       /* Switch from the HYP stub to our own HYP init vector */
+       __hyp_set_vectors(kvm_get_idmap_vector());
 
        /*
         * Call initialization code, and switch to the full blown HYP code.
@@ -1445,8 +1453,14 @@ static void cpu_init_hyp_mode(void)
         * cpus_have_const_cap() wrapper.
         */
        BUG_ON(!system_capabilities_finalized());
+       params = this_cpu_ptr_nvhe_sym(kvm_init_params);
        arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
        WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
+}
+
+static void cpu_init_hyp_mode(void)
+{
+       hyp_install_host_vector();
 
        /*
         * Disabling SSBD on a non-VHE system requires us to enable SSBS
@@ -1489,7 +1503,10 @@ static void cpu_set_hyp_vector(void)
        struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
        void *vector = hyp_spectre_vector_selector[data->slot];
 
-       *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+       if (!is_protected_kvm_enabled())
+               *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+       else
+               kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
 }
 
 static void cpu_hyp_reinit(void)
@@ -1497,13 +1514,14 @@ static void cpu_hyp_reinit(void)
        kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
 
        cpu_hyp_reset();
-       cpu_set_hyp_vector();
 
        if (is_kernel_in_hyp_mode())
                kvm_timer_init_vhe();
        else
                cpu_init_hyp_mode();
 
+       cpu_set_hyp_vector();
+
        kvm_arm_init_debug();
 
        if (vgic_present)
@@ -1699,18 +1717,62 @@ static void teardown_hyp_mode(void)
        }
 }
 
+static int do_pkvm_init(u32 hyp_va_bits)
+{
+       void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
+       int ret;
+
+       preempt_disable();
+       hyp_install_host_vector();
+       ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
+                               num_possible_cpus(), kern_hyp_va(per_cpu_base),
+                               hyp_va_bits);
+       preempt_enable();
+
+       return ret;
+}
+
+static int kvm_hyp_init_protection(u32 hyp_va_bits)
+{
+       void *addr = phys_to_virt(hyp_mem_base);
+       int ret;
+
+       kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+       kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+
+       ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       ret = do_pkvm_init(hyp_va_bits);
+       if (ret)
+               return ret;
+
+       free_hyp_pgds();
+
+       return 0;
+}
+
 /**
  * Inits Hyp-mode on all online CPUs
  */
 static int init_hyp_mode(void)
 {
+       u32 hyp_va_bits;
        int cpu;
-       int err = 0;
+       int err = -ENOMEM;
+
+       /*
+        * The protected Hyp-mode cannot be initialized if the memory pool
+        * allocation has failed.
+        */
+       if (is_protected_kvm_enabled() && !hyp_mem_base)
+               goto out_err;
 
        /*
         * Allocate Hyp PGD and setup Hyp identity mapping
         */
-       err = kvm_mmu_init();
+       err = kvm_mmu_init(&hyp_va_bits);
        if (err)
                goto out_err;
 
@@ -1771,7 +1833,19 @@ static int init_hyp_mode(void)
                goto out_err;
        }
 
-       err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+       /*
+        * .hyp.bss is guaranteed to be placed at the beginning of the .bss
+        * section thanks to an assertion in the linker script. Map it RW and
+        * the rest of .bss RO.
+        */
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
+                                 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
+       if (err) {
+               kvm_err("Cannot map hyp bss section: %d\n", err);
+               goto out_err;
+       }
+
+       err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
                                  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map bss section\n");
@@ -1792,26 +1866,36 @@ static int init_hyp_mode(void)
                }
        }
 
-       /*
-        * Map Hyp percpu pages
-        */
        for_each_possible_cpu(cpu) {
                char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
                char *percpu_end = percpu_begin + nvhe_percpu_size();
 
+               /* Map Hyp percpu pages */
                err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
-
                if (err) {
                        kvm_err("Cannot map hyp percpu region\n");
                        goto out_err;
                }
+
+               /* Prepare the CPU initialization parameters */
+               cpu_prepare_hyp_mode(cpu);
        }
 
        if (is_protected_kvm_enabled()) {
                init_cpu_logical_map();
 
-               if (!init_psci_relay())
+               if (!init_psci_relay()) {
+                       err = -ENODEV;
+                       goto out_err;
+               }
+       }
+
+       if (is_protected_kvm_enabled()) {
+               err = kvm_hyp_init_protection(hyp_va_bits);
+               if (err) {
+                       kvm_err("Failed to init hyp memory protection\n");
                        goto out_err;
+               }
        }
 
        return 0;
@@ -1822,6 +1906,72 @@ out_err:
        return err;
 }
 
+static void _kvm_host_prot_finalize(void *discard)
+{
+       WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
+}
+
+static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+       return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
+}
+
+#define pkvm_mark_hyp_section(__section)               \
+       pkvm_mark_hyp(__pa_symbol(__section##_start),   \
+                       __pa_symbol(__section##_end))
+
+static int finalize_hyp_mode(void)
+{
+       int cpu, ret;
+
+       if (!is_protected_kvm_enabled())
+               return 0;
+
+       ret = pkvm_mark_hyp_section(__hyp_idmap_text);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_text);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_rodata);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp_section(__hyp_bss);
+       if (ret)
+               return ret;
+
+       ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
+       if (ret)
+               return ret;
+
+       for_each_possible_cpu(cpu) {
+               phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
+               phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
+
+               ret = pkvm_mark_hyp(start, end);
+               if (ret)
+                       return ret;
+
+               start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
+               end = start + PAGE_SIZE;
+               ret = pkvm_mark_hyp(start, end);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Flip the static key upfront as that may no longer be possible
+        * once the host stage 2 is installed.
+        */
+       static_branch_enable(&kvm_protected_mode_initialized);
+       on_each_cpu(_kvm_host_prot_finalize, NULL, 1);
+
+       return 0;
+}
+
 static void check_kvm_target_cpu(void *ret)
 {
        *(int *)ret = kvm_target_cpu();
@@ -1896,11 +2046,6 @@ int kvm_arch_init(void *opaque)
 
        in_hyp_mode = is_kernel_in_hyp_mode();
 
-       if (!in_hyp_mode && kvm_arch_requires_vhe()) {
-               kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
-               return -ENODEV;
-       }
-
        if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
            cpus_have_final_cap(ARM64_WORKAROUND_1508412))
                kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
@@ -1938,8 +2083,15 @@ int kvm_arch_init(void *opaque)
        if (err)
                goto out_hyp;
 
+       if (!in_hyp_mode) {
+               err = finalize_hyp_mode();
+               if (err) {
+                       kvm_err("Failed to finalize Hyp protection\n");
+                       goto out_hyp;
+               }
+       }
+
        if (is_protected_kvm_enabled()) {
-               static_branch_enable(&kvm_protected_mode_initialized);
                kvm_info("Protected nVHE mode initialized successfully\n");
        } else if (in_hyp_mode) {
                kvm_info("VHE mode initialized successfully\n");
index dbc8905..d5e79d7 100644 (file)
@@ -69,6 +69,65 @@ void kvm_arm_init_debug(void)
 }
 
 /**
+ * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * This ensures we will trap access to:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
+ *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
+ */
+static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+{
+       /*
+        * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
+        * to disable guest access to the profiling and trace buffers
+        */
+       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+                               MDCR_EL2_TPMS |
+                               MDCR_EL2_TTRF |
+                               MDCR_EL2_TPMCR |
+                               MDCR_EL2_TDRA |
+                               MDCR_EL2_TDOSA);
+
+       /* Is the VM being debugged by userspace? */
+       if (vcpu->guest_debug)
+               /* Route all software debug exceptions to EL2 */
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+       /*
+        * Trap debug register access when one of the following is true:
+        *  - Userspace is using the hardware to debug the guest
+        *  (KVM_GUESTDBG_USE_HW is set).
+        *  - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear).
+        */
+       if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) ||
+           !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+}
+
+/**
+ * kvm_arm_vcpu_init_debug - setup vcpu debug traps
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * Set vcpu initial mdcr_el2 value.
+ */
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       kvm_arm_setup_mdcr_el2(vcpu);
+       preempt_enable();
+}
+
+/**
  * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
  */
 
@@ -83,13 +142,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
  * @vcpu:      the vcpu pointer
  *
  * This is called before each entry into the hypervisor to setup any
- * debug related registers. Currently this just ensures we will trap
- * access to:
- *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
- *  - Debug ROM Address (MDCR_EL2_TDRA)
- *  - OS related registers (MDCR_EL2_TDOSA)
- *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
- *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ * debug related registers.
  *
  * Additionally, KVM only traps guest accesses to the debug registers if
  * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
@@ -101,28 +154,14 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
 
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 {
-       bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
        unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2;
 
        trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
 
-       /*
-        * This also clears MDCR_EL2_E2PB_MASK to disable guest access
-        * to the profiling buffer.
-        */
-       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
-       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
-                               MDCR_EL2_TPMS |
-                               MDCR_EL2_TTRF |
-                               MDCR_EL2_TPMCR |
-                               MDCR_EL2_TDRA |
-                               MDCR_EL2_TDOSA);
+       kvm_arm_setup_mdcr_el2(vcpu);
 
        /* Is Guest debugging in effect? */
        if (vcpu->guest_debug) {
-               /* Route all software debug exceptions to EL2 */
-               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
-
                /* Save guest debug state */
                save_guest_debug_regs(vcpu);
 
@@ -176,7 +215,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
 
                        vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
                        vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
-                       trap_debug = true;
 
                        trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
                                                &vcpu->arch.debug_ptr->dbg_bcr[0],
@@ -191,10 +229,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
        BUG_ON(!vcpu->guest_debug &&
                vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
 
-       /* Trap debug register access */
-       if (trap_debug)
-               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
-
        /* If KDE or MDE are set, perform a full save/restore cycle. */
        if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
                vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
@@ -203,7 +237,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
        if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2)
                write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 
-       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
        trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
 }
 
@@ -231,3 +264,32 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
                }
        }
 }
+
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+       u64 dfr0;
+
+       /* For VHE, there is nothing to do */
+       if (has_vhe())
+               return;
+
+       dfr0 = read_sysreg(id_aa64dfr0_el1);
+       /*
+        * If SPE is present on this CPU and is available at current EL,
+        * we may need to check if the host state needs to be saved.
+        */
+       if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) &&
+           !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT)))
+               vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE;
+
+       /* Check if we have TRBE implemented and available at the host */
+       if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) &&
+           !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG))
+               vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE;
+}
+
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE |
+                             KVM_ARM64_DEBUG_STATE_SAVE_TRBE);
+}
index 3e081d5..5621020 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kvm_host.h>
 #include <asm/fpsimd.h>
 #include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 #include <asm/sysreg.h>
 
@@ -42,6 +43,17 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
        if (ret)
                goto error;
 
+       if (vcpu->arch.sve_state) {
+               void *sve_end;
+
+               sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu);
+
+               ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end,
+                                         PAGE_HYP);
+               if (ret)
+                       goto error;
+       }
+
        vcpu->arch.host_thread_info = kern_hyp_va(ti);
        vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
 error:
@@ -109,11 +121,17 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
        local_irq_save(flags);
 
        if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
-               fpsimd_save_and_flush_cpu_state();
+               if (guest_has_sve) {
+                       __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+
+                       /* Restore the VL that was saved when bound to the CPU */
+                       if (!has_vhe())
+                               sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
+                                                      SYS_ZCR_EL1);
+               }
 
-               if (guest_has_sve)
-                       __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_s(SYS_ZCR_EL12);
-       } else if (host_has_sve) {
+               fpsimd_save_and_flush_cpu_state();
+       } else if (has_vhe() && host_has_sve) {
                /*
                 * The FPSIMD/SVE state in the CPU has not been touched, and we
                 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
index 6cb39ee..5cb4a1c 100644 (file)
@@ -299,7 +299,7 @@ static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 
        memset(vqs, 0, sizeof(vqs));
 
-       max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+       max_vq = vcpu_sve_max_vq(vcpu);
        for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
                if (sve_vq_available(vq))
                        vqs[vq_word(vq)] |= vq_mask(vq);
@@ -427,7 +427,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
                if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                        return -ENOENT;
 
-               vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+               vq = vcpu_sve_max_vq(vcpu);
 
                reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
@@ -437,7 +437,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
                if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                        return -ENOENT;
 
-               vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+               vq = vcpu_sve_max_vq(vcpu);
 
                reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
index cebe39f..6f48336 100644 (file)
@@ -291,3 +291,48 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
        if (exception_index == ARM_EXCEPTION_EL1_SERROR)
                kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
 }
+
+void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
+                                             u64 par, uintptr_t vcpu,
+                                             u64 far, u64 hpfar) {
+       u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
+       u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
+       u64 mode = spsr & PSR_MODE_MASK;
+
+       /*
+        * The nVHE hyp symbols are not included by kallsyms to avoid issues
+        * with aliasing. That means that the symbols cannot be printed with the
+        * "%pS" format specifier, so fall back to the vmlinux address if
+        * there's no better option.
+        */
+       if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
+               kvm_err("Invalid host exception to nVHE hyp!\n");
+       } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
+                  (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+               struct bug_entry *bug = find_bug(elr_in_kimg);
+               const char *file = NULL;
+               unsigned int line = 0;
+
+               /* All hyp bugs, including warnings, are treated as fatal. */
+               if (bug)
+                       bug_get_file_line(bug, &file, &line);
+
+               if (file)
+                       kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
+               else
+                       kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
+       } else {
+               kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
+       }
+
+       /*
+        * Hyp has panicked and we're going to handle that by panicking the
+        * kernel. The kernel offset will be revealed in the panic so we're
+        * also safe to reveal the hyp offset as a debugging aid for translating
+        * hyp VAs to vmlinux addresses.
+        */
+       kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
+
+       panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
+             spsr, elr, esr, far, hpfar, par, vcpu);
+}
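
A usage note, illustrative rather than part of the patch: hyp_offset is computed above so that a faulting hyp VA plus the printed offset yields the link-time vmlinux address, with KASLR already removed. The raw PC from the final "HYP panic:" banner can therefore be resolved as:

        vmlinux_addr = hyp_pc + hyp_offset;     /* both values taken from the panic output */

The result can then be fed to a symboliser such as addr2line -e vmlinux to locate the faulting source line.
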
index 687598e..b726332 100644 (file)
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir)                               \
                    -DDISABLE_BRANCH_PROFILING          \
                    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o
index 01f114a..3c63592 100644 (file)
@@ -19,3 +19,13 @@ SYM_FUNC_START(__fpsimd_restore_state)
        fpsimd_restore  x0, 1
        ret
 SYM_FUNC_END(__fpsimd_restore_state)
+
+SYM_FUNC_START(__sve_restore_state)
+       __sve_load 0, x1, 2
+       ret
+SYM_FUNC_END(__sve_restore_state)
+
+SYM_FUNC_START(__sve_save_state)
+       sve_save 0, x1, 2
+       ret
+SYM_FUNC_END(__sve_save_state)
index 6c1f51f..e4a2f29 100644 (file)
@@ -30,8 +30,6 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
-extern const char __hyp_panic_string[];
-
 extern struct exception_table_entry __start___kvm_ex_table;
 extern struct exception_table_entry __stop___kvm_ex_table;
 
@@ -160,18 +158,10 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
        return true;
 }
 
-static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
 {
-       u8 ec;
-       u64 esr;
        u64 hpfar, far;
 
-       esr = vcpu->arch.fault.esr_el2;
-       ec = ESR_ELx_EC(esr);
-
-       if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
-               return true;
-
        far = read_sysreg_el2(SYS_FAR);
 
        /*
@@ -194,33 +184,59 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
                hpfar = read_sysreg(hpfar_el2);
        }
 
-       vcpu->arch.fault.far_el2 = far;
-       vcpu->arch.fault.hpfar_el2 = hpfar;
+       fault->far_el2 = far;
+       fault->hpfar_el2 = hpfar;
        return true;
 }
 
+static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+{
+       u8 ec;
+       u64 esr;
+
+       esr = vcpu->arch.fault.esr_el2;
+       ec = ESR_ELx_EC(esr);
+
+       if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
+               return true;
+
+       return __get_fault_info(esr, &vcpu->arch.fault);
+}
+
+static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu)
+{
+       struct thread_struct *thread;
+
+       thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct,
+                             uw.fpsimd_state);
+
+       __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr);
+}
+
+static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
+{
+       sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
+       __sve_restore_state(vcpu_sve_pffr(vcpu),
+                           &vcpu->arch.ctxt.fp_regs.fpsr);
+       write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+}
+
 /* Check for an FPSIMD/SVE trap and handle as appropriate */
 static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
 {
-       bool vhe, sve_guest, sve_host;
+       bool sve_guest, sve_host;
        u8 esr_ec;
+       u64 reg;
 
        if (!system_supports_fpsimd())
                return false;
 
-       /*
-        * Currently system_supports_sve() currently implies has_vhe(),
-        * so the check is redundant. However, has_vhe() can be determined
-        * statically and helps the compiler remove dead code.
-        */
-       if (has_vhe() && system_supports_sve()) {
+       if (system_supports_sve()) {
                sve_guest = vcpu_has_sve(vcpu);
                sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
-               vhe = true;
        } else {
                sve_guest = false;
                sve_host = false;
-               vhe = has_vhe();
        }
 
        esr_ec = kvm_vcpu_trap_get_class(vcpu);
@@ -229,53 +245,38 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
                return false;
 
        /* Don't handle SVE traps for non-SVE vcpus here: */
-       if (!sve_guest)
-               if (esr_ec != ESR_ELx_EC_FP_ASIMD)
-                       return false;
+       if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
+               return false;
 
        /* Valid trap.  Switch the context: */
-
-       if (vhe) {
-               u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN;
-
+       if (has_vhe()) {
+               reg = CPACR_EL1_FPEN;
                if (sve_guest)
                        reg |= CPACR_EL1_ZEN;
 
-               write_sysreg(reg, cpacr_el1);
+               sysreg_clear_set(cpacr_el1, 0, reg);
        } else {
-               write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP,
-                            cptr_el2);
-       }
+               reg = CPTR_EL2_TFP;
+               if (sve_guest)
+                       reg |= CPTR_EL2_TZ;
 
+               sysreg_clear_set(cptr_el2, reg, 0);
+       }
        isb();
 
        if (vcpu->arch.flags & KVM_ARM64_FP_HOST) {
-               /*
-                * In the SVE case, VHE is assumed: it is enforced by
-                * Kconfig and kvm_arch_init().
-                */
-               if (sve_host) {
-                       struct thread_struct *thread = container_of(
-                               vcpu->arch.host_fpsimd_state,
-                               struct thread_struct, uw.fpsimd_state);
-
-                       sve_save_state(sve_pffr(thread),
-                                      &vcpu->arch.host_fpsimd_state->fpsr);
-               } else {
+               if (sve_host)
+                       __hyp_sve_save_host(vcpu);
+               else
                        __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
-               }
 
                vcpu->arch.flags &= ~KVM_ARM64_FP_HOST;
        }
 
-       if (sve_guest) {
-               sve_load_state(vcpu_sve_pffr(vcpu),
-                              &vcpu->arch.ctxt.fp_regs.fpsr,
-                              sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1);
-               write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12);
-       } else {
+       if (sve_guest)
+               __hyp_sve_restore_guest(vcpu);
+       else
                __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
-       }
 
        /* Skip restoring fpexc32 for AArch64 guests */
        if (!(read_sysreg(hcr_el2) & HCR_RW))
diff --git a/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h
new file mode 100644 (file)
index 0000000..dc61aaa
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_EARLY_ALLOC_H
+#define __KVM_HYP_EARLY_ALLOC_H
+
+#include <asm/kvm_pgtable.h>
+
+void hyp_early_alloc_init(void *virt, unsigned long size);
+unsigned long hyp_early_alloc_nr_used_pages(void);
+void *hyp_early_alloc_page(void *arg);
+void *hyp_early_alloc_contig(unsigned int nr_pages);
+
+extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+
+#endif /* __KVM_HYP_EARLY_ALLOC_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
new file mode 100644 (file)
index 0000000..18a4494
--- /dev/null
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_GFP_H
+#define __KVM_HYP_GFP_H
+
+#include <linux/list.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_NO_ORDER   UINT_MAX
+
+struct hyp_pool {
+       /*
+        * Spinlock protecting concurrent changes to the memory pool as well as
+        * the struct hyp_page of the pool's pages until we have a proper atomic
+        * API at EL2.
+        */
+       hyp_spinlock_t lock;
+       struct list_head free_area[MAX_ORDER];
+       phys_addr_t range_start;
+       phys_addr_t range_end;
+       unsigned int max_order;
+};
+
+static inline void hyp_page_ref_inc(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&pool->lock);
+       p->refcount++;
+       hyp_spin_unlock(&pool->lock);
+}
+
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+       int ret;
+
+       hyp_spin_lock(&pool->lock);
+       p->refcount--;
+       ret = (p->refcount == 0);
+       hyp_spin_unlock(&pool->lock);
+
+       return ret;
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&pool->lock);
+       if (p->refcount) {
+               hyp_spin_unlock(&pool->lock);
+               BUG();
+       }
+       p->refcount = 1;
+       hyp_spin_unlock(&pool->lock);
+}
+
+/* Allocation */
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
+void hyp_get_page(void *addr);
+void hyp_put_page(void *addr);
+
+/* Used pages cannot be freed */
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+                 unsigned int reserved_pages);
+#endif /* __KVM_HYP_GFP_H */
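
As a usage sketch only (the identifiers below are invented for illustration), EL2 setup code would seed a pool with a physical range, mark the already-consumed head of that range as reserved, and then allocate from the free lists using the declarations above:

static struct hyp_pool example_pool;

static void *example_pool_bringup(phys_addr_t base, unsigned int nr_pages,
                                  unsigned int used_pages)
{
        void *page;

        /* Pages [0, used_pages) are already in use and stay reserved */
        if (hyp_pool_init(&example_pool, hyp_phys_to_pfn(base),
                          nr_pages, used_pages))
                return NULL;

        page = hyp_alloc_pages(&example_pool, 0);       /* one order-0 page */

        /*
         * ... use it; hyp_put_page(page) drops the reference and, once it
         * reaches zero, returns the page to the pool.
         */
        return page;
}
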
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
new file mode 100644 (file)
index 0000000..42d81ec
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#ifndef __KVM_NVHE_MEM_PROTECT__
+#define __KVM_NVHE_MEM_PROTECT__
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/virt.h>
+#include <nvhe/spinlock.h>
+
+struct host_kvm {
+       struct kvm_arch arch;
+       struct kvm_pgtable pgt;
+       struct kvm_pgtable_mm_ops mm_ops;
+       hyp_spinlock_t lock;
+};
+extern struct host_kvm host_kvm;
+
+int __pkvm_prot_finalize(void);
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
+
+static __always_inline void __load_host_stage2(void)
+{
+       if (static_branch_likely(&kvm_protected_mode_initialized))
+               __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+       else
+               write_sysreg(0, vttbr_el2);
+}
+#endif /* __KVM_NVHE_MEM_PROTECT__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
new file mode 100644 (file)
index 0000000..fd78bde
--- /dev/null
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MEMORY_H
+#define __KVM_HYP_MEMORY_H
+
+#include <asm/kvm_mmu.h>
+#include <asm/page.h>
+
+#include <linux/types.h>
+
+struct hyp_pool;
+struct hyp_page {
+       unsigned int refcount;
+       unsigned int order;
+       struct hyp_pool *pool;
+       struct list_head node;
+};
+
+extern u64 __hyp_vmemmap;
+#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap)
+
+#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset))
+
+static inline void *hyp_phys_to_virt(phys_addr_t phys)
+{
+       return __hyp_va(phys);
+}
+
+static inline phys_addr_t hyp_virt_to_phys(void *addr)
+{
+       return __hyp_pa(addr);
+}
+
+#define hyp_phys_to_pfn(phys)  ((phys) >> PAGE_SHIFT)
+#define hyp_pfn_to_phys(pfn)   ((phys_addr_t)((pfn) << PAGE_SHIFT))
+#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)])
+#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt))
+#define hyp_virt_to_pfn(virt)  hyp_phys_to_pfn(__hyp_pa(virt))
+
+#define hyp_page_to_pfn(page)  ((struct hyp_page *)(page) - hyp_vmemmap)
+#define hyp_page_to_phys(page)  hyp_pfn_to_phys((hyp_page_to_pfn(page)))
+#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
+#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
+
+static inline int hyp_page_count(void *addr)
+{
+       struct hyp_page *p = hyp_virt_to_page(addr);
+
+       return p->refcount;
+}
+
+#endif /* __KVM_HYP_MEMORY_H */
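
As a sketch of the conversions above: hyp_vmemmap is a flat array with one struct hyp_page per PFN, so physical address, PFN and metadata pointer convert into each other with shifts and pointer arithmetic. The standalone example below assumes 4 KiB pages; struct page_meta, vmemmap and the helper names are illustrative, not the kernel's.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

struct page_meta {                      /* stand-in for struct hyp_page */
        unsigned int refcount;
        unsigned int order;
};

static struct page_meta *vmemmap;       /* one entry per PFN, like hyp_vmemmap */

static uint64_t phys_to_pfn(uint64_t phys)
{
        return phys >> PAGE_SHIFT;
}

static struct page_meta *phys_to_page(uint64_t phys)
{
        return &vmemmap[phys_to_pfn(phys)];
}

static uint64_t page_to_phys(struct page_meta *p)
{
        return (uint64_t)(p - vmemmap) << PAGE_SHIFT;
}

int main(void)
{
        /* Pretend we manage 16 pages of "physical" memory starting at 0. */
        vmemmap = calloc(16, sizeof(*vmemmap));

        struct page_meta *p = phys_to_page(3 * PAGE_SIZE);

        p->refcount = 1;
        assert(page_to_phys(p) == 3 * PAGE_SIZE);
        printf("pfn=%llu refcount=%u\n",
               (unsigned long long)phys_to_pfn(page_to_phys(p)), p->refcount);

        free(vmemmap);
        return 0;
}
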
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
new file mode 100644 (file)
index 0000000..0095f62
--- /dev/null
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MM_H
+#define __KVM_HYP_MM_H
+
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+#include <linux/memblock.h>
+#include <linux/types.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_MEMBLOCK_REGIONS 128
+extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
+extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
+extern struct kvm_pgtable pkvm_pgtable;
+extern hyp_spinlock_t pkvm_pgd_lock;
+extern struct hyp_pool hpool;
+extern u64 __io_map_base;
+
+int hyp_create_idmap(u32 hyp_va_bits);
+int hyp_map_vectors(void);
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                          unsigned long phys, enum kvm_pgtable_prot prot);
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                           enum kvm_pgtable_prot prot);
+
+static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
+                                    unsigned long *start, unsigned long *end)
+{
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct hyp_page *p = hyp_phys_to_page(phys);
+
+       *start = (unsigned long)p;
+       *end = *start + nr_pages * sizeof(struct hyp_page);
+       *start = ALIGN_DOWN(*start, PAGE_SIZE);
+       *end = ALIGN(*end, PAGE_SIZE);
+}
+
+static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
+{
+       unsigned long total = 0, i;
+
+       /* Provision the worst case scenario */
+       for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) {
+               nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
+               total += nr_pages;
+       }
+
+       return total;
+}
+
+static inline unsigned long __hyp_pgtable_total_pages(void)
+{
+       unsigned long res = 0, i;
+
+       /* Cover all of memory with page-granularity */
+       for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
+               struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i];
+               res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT);
+       }
+
+       return res;
+}
+
+static inline unsigned long hyp_s1_pgtable_pages(void)
+{
+       unsigned long res;
+
+       res = __hyp_pgtable_total_pages();
+
+       /* Allow 1 GiB for private mappings */
+       res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+
+       return res;
+}
+
+static inline unsigned long host_s2_mem_pgtable_pages(void)
+{
+       /*
+        * Include an extra 16 pages to safely upper-bound the worst case of
+        * concatenated pgds.
+        */
+       return __hyp_pgtable_total_pages() + 16;
+}
+
+static inline unsigned long host_s2_dev_pgtable_pages(void)
+{
+       /* Allow 1 GiB for MMIO mappings */
+       return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+}
+
+#endif /* __KVM_HYP_MM_H */
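
The worst-case provisioning in __hyp_pgtable_max_pages() is a sum of one DIV_ROUND_UP per page-table level: each level needs one table page per 512 entries of the level below it. A standalone sketch of the same arithmetic, assuming 4 KiB pages, 512 entries per table and 4 levels (pgtable_max_pages is an illustrative name):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define PTRS_PER_PTE            512UL   /* 4 KiB pages, 8-byte descriptors */
#define MAX_LEVELS              4

/* Worst-case number of page-table pages needed to map nr_pages of memory. */
static unsigned long pgtable_max_pages(unsigned long nr_pages)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < MAX_LEVELS; i++) {
                nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
                total += nr_pages;
        }

        return total;
}

int main(void)
{
        /* 1 GiB of 4 KiB pages = 262144 pages: 512 PTE tables + 1 PMD + 1 PUD + 1 PGD = 515. */
        printf("table pages for 1 GiB: %lu\n", pgtable_max_pages(262144UL));
        return 0;
}
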
diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
new file mode 100644 (file)
index 0000000..76b537f
--- /dev/null
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * A stand-alone ticket spinlock implementation for use by the non-VHE
+ * KVM hypervisor code running at EL2.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ *
+ * Heavily based on the implementation removed by c11090474d70 which was:
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__
+#define __ARM64_KVM_NVHE_SPINLOCK_H__
+
+#include <asm/alternative.h>
+#include <asm/lse.h>
+
+typedef union hyp_spinlock {
+       u32     __val;
+       struct {
+#ifdef __AARCH64EB__
+               u16 next, owner;
+#else
+               u16 owner, next;
+#endif
+       };
+} hyp_spinlock_t;
+
+#define hyp_spin_lock_init(l)                                          \
+do {                                                                   \
+       *(l) = (hyp_spinlock_t){ .__val = 0 };                          \
+} while (0)
+
+static inline void hyp_spin_lock(hyp_spinlock_t *lock)
+{
+       u32 tmp;
+       hyp_spinlock_t lockval, newval;
+
+       asm volatile(
+       /* Atomically increment the next ticket. */
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+"      prfm    pstl1strm, %3\n"
+"1:    ldaxr   %w0, %3\n"
+"      add     %w1, %w0, #(1 << 16)\n"
+"      stxr    %w2, %w1, %3\n"
+"      cbnz    %w2, 1b\n",
+       /* LSE atomics */
+"      mov     %w2, #(1 << 16)\n"
+"      ldadda  %w2, %w0, %3\n"
+       __nops(3))
+
+       /* Did we get the lock? */
+"      eor     %w1, %w0, %w0, ror #16\n"
+"      cbz     %w1, 3f\n"
+       /*
+        * No: spin on the owner. Send a local event to avoid missing an
+        * unlock before the exclusive load.
+        */
+"      sevl\n"
+"2:    wfe\n"
+"      ldaxrh  %w2, %4\n"
+"      eor     %w1, %w2, %w0, lsr #16\n"
+"      cbnz    %w1, 2b\n"
+       /* We got the lock. Critical section starts here. */
+"3:"
+       : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock)
+       : "Q" (lock->owner)
+       : "memory");
+}
+
+static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
+{
+       u64 tmp;
+
+       asm volatile(
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+       "       ldrh    %w1, %0\n"
+       "       add     %w1, %w1, #1\n"
+       "       stlrh   %w1, %0",
+       /* LSE atomics */
+       "       mov     %w1, #1\n"
+       "       staddlh %w1, %0\n"
+       __nops(1))
+       : "=Q" (lock->owner), "=&r" (tmp)
+       :
+       : "memory");
+}
+
+#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
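
A rough user-space model of the ticket lock above, for readers who prefer C to the inline asm: the kernel version packs next/owner into a single 32-bit word and grabs the ticket plus a snapshot of owner in one atomic (and waits with sevl/wfe), whereas this sketch keeps the two halves as separate C11 atomics and simply busy-waits. All names below are illustrative.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* The two halves of the lock word: next (tickets handed out) and owner
 * (the ticket currently allowed to run). */
typedef struct {
        _Atomic uint16_t next;
        _Atomic uint16_t owner;
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
        /* Take a ticket; this mirrors the "add #(1 << 16)" / ldadda step. */
        uint16_t ticket = atomic_fetch_add_explicit(&l->next, 1,
                                                    memory_order_relaxed);

        /* Spin until the owner half reaches our ticket (the wfe loop). */
        while (atomic_load_explicit(&l->owner, memory_order_acquire) != ticket)
                ;
}

static void ticket_unlock(ticket_lock_t *l)
{
        /* Hand the lock to the next ticket, like the stlrh/staddlh epilogue. */
        atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
}

int main(void)
{
        ticket_lock_t l = { 0, 0 };

        ticket_lock(&l);
        puts("in the critical section");
        ticket_unlock(&l);
        return 0;
}
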
index a6707df..f55201a 100644 (file)
@@ -9,10 +9,15 @@ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
 hostprogs := gen-hyprel
 HOST_EXTRACFLAGS += -I$(objtree)/include
 
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+
 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
-        hyp-main.o hyp-smp.o psci-relay.o
+        hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
+        cache.o setup.o mm.o mem_protect.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
-        ../fpsimd.o ../hyp-entry.o ../exception.o
+        ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+obj-y += $(lib-objs)
 
 ##
 ## Build rules for compiling nVHE hyp code
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
new file mode 100644 (file)
index 0000000..36cef69
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Code copied from arch/arm64/mm/cache.S.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/alternative.h>
+
+SYM_FUNC_START_PI(__flush_dcache_area)
+       dcache_by_line_op civac, sy, x0, x1, x2, x3
+       ret
+SYM_FUNC_END_PI(__flush_dcache_area)
index f401724..7d3f258 100644 (file)
@@ -21,17 +21,11 @@ static void __debug_save_spe(u64 *pmscr_el1)
        /* Clear pmscr in case of early return */
        *pmscr_el1 = 0;
 
-       /* SPE present on this CPU? */
-       if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
-                                                 ID_AA64DFR0_PMSVER_SHIFT))
-               return;
-
-       /* Yes; is it owned by EL3? */
-       reg = read_sysreg_s(SYS_PMBIDR_EL1);
-       if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
-               return;
-
-       /* No; is the host actually using the thing? */
+       /*
+        * At this point, we know that SPE is implemented on this
+        * CPU and is available to the host.
+        * Check whether the host is actually using it.
+        */
        reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
        if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))
                return;
@@ -58,10 +52,43 @@ static void __debug_restore_spe(u64 pmscr_el1)
        write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
 }
 
+static void __debug_save_trace(u64 *trfcr_el1)
+{
+       *trfcr_el1 = 0;
+
+       /* Check if the TRBE is enabled */
+       if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE))
+               return;
+       /*
+        * Prohibit trace generation while we are in the guest.
+        * Since access to TRFCR_EL1 is trapped, the guest can't
+        * modify the filtering set by the host.
+        */
+       *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1);
+       write_sysreg_s(0, SYS_TRFCR_EL1);
+       isb();
+       /* Drain the trace buffer to memory */
+       tsb_csync();
+       dsb(nsh);
+}
+
+static void __debug_restore_trace(u64 trfcr_el1)
+{
+       if (!trfcr_el1)
+               return;
+
+       /* Restore trace filter controls */
+       write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1);
+}
+
 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
        /* Disable and flush SPE data generation */
-       __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+               __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+       /* Disable and flush Self-Hosted Trace generation */
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+               __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
 }
 
 void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -71,7 +98,10 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
 
 void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
 {
-       __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+               __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+       if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+               __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
 }
 
 void __debug_switch_to_host(struct kvm_vcpu *vcpu)
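
The hunk above gates the SPE and TRBE paths on per-vcpu flags and stashes the host's TRFCR_EL1 value so trace generation can be shut off across the guest run and restored afterwards. A toy standalone sketch of that flag-gated save/zero/restore pattern, with a plain variable standing in for the trace filter register; every identifier below is purely illustrative.

#include <stdint.h>
#include <stdio.h>

#define DEBUG_SAVE_TRACE        (1u << 0)       /* illustrative flag bit */

struct vcpu {
        unsigned int flags;
        uint64_t saved_trfcr;                   /* host_debug_state-style slot */
};

static uint64_t fake_trfcr = 0x3;               /* stands in for TRFCR_EL1 */

static void save_trace(uint64_t *slot)
{
        *slot = 0;
        if (!fake_trfcr)                        /* tracing disabled: nothing to save */
                return;
        *slot = fake_trfcr;                     /* stash the host's filter controls */
        fake_trfcr = 0;                         /* prohibit tracing while in the guest */
}

static void restore_trace(uint64_t slot)
{
        if (!slot)                              /* nothing was saved on entry */
                return;
        fake_trfcr = slot;
}

static void switch_to_guest(struct vcpu *v)
{
        if (v->flags & DEBUG_SAVE_TRACE)
                save_trace(&v->saved_trfcr);
}

static void switch_to_host(struct vcpu *v)
{
        if (v->flags & DEBUG_SAVE_TRACE)
                restore_trace(v->saved_trfcr);
}

int main(void)
{
        struct vcpu v = { .flags = DEBUG_SAVE_TRACE };

        switch_to_guest(&v);
        printf("in guest: trfcr=%#llx\n", (unsigned long long)fake_trfcr);  /* 0 */
        switch_to_host(&v);
        printf("in host:  trfcr=%#llx\n", (unsigned long long)fake_trfcr);  /* 0x3 */
        return 0;
}
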
diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
new file mode 100644 (file)
index 0000000..1306c43
--- /dev/null
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/memory.h>
+
+struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+s64 __ro_after_init hyp_physvirt_offset;
+
+static unsigned long base;
+static unsigned long end;
+static unsigned long cur;
+
+unsigned long hyp_early_alloc_nr_used_pages(void)
+{
+       return (cur - base) >> PAGE_SHIFT;
+}
+
+void *hyp_early_alloc_contig(unsigned int nr_pages)
+{
+       unsigned long size = (nr_pages << PAGE_SHIFT);
+       void *ret = (void *)cur;
+
+       if (!nr_pages)
+               return NULL;
+
+       if (end - cur < size)
+               return NULL;
+
+       cur += size;
+       memset(ret, 0, size);
+
+       return ret;
+}
+
+void *hyp_early_alloc_page(void *arg)
+{
+       return hyp_early_alloc_contig(1);
+}
+
+void hyp_early_alloc_init(void *virt, unsigned long size)
+{
+       base = cur = (unsigned long)virt;
+       end = base + size;
+
+       hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page;
+       hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt;
+       hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys;
+}
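
The early allocator above is a plain bump allocator: it hands out zeroed pages from a fixed region, never frees, and reports how many pages were consumed so they can later be treated as reserved by the buddy allocator. A standalone sketch under the assumption of 4 KiB pages (all identifiers are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* Bump allocator over a fixed buffer: zeroed pages go out, nothing ever comes back. */
static uintptr_t base, cur, end;

static void early_init(void *buf, unsigned long size)
{
        base = cur = (uintptr_t)buf;
        end = base + size;
}

static void *early_alloc_contig(unsigned int nr_pages)
{
        unsigned long size = (unsigned long)nr_pages << PAGE_SHIFT;
        void *ret = (void *)cur;

        if (!nr_pages || end - cur < size)
                return NULL;

        cur += size;
        memset(ret, 0, size);
        return ret;
}

static unsigned long early_nr_used_pages(void)
{
        return (cur - base) >> PAGE_SHIFT;
}

int main(void)
{
        static unsigned char pool[8 * PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
        void *a, *b;

        early_init(pool, sizeof(pool));
        a = early_alloc_contig(2);
        b = early_alloc_contig(1);
        printf("a=%p b=%p used=%lu pages\n", a, b, early_nr_used_pages());
        printf("oversized request: %p\n", early_alloc_contig(16));      /* NULL */
        return 0;
}
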
index ead02c6..6bc88a7 100644 (file)
 #ifndef R_AARCH64_ABS64
 #define R_AARCH64_ABS64                        257
 #endif
+#ifndef R_AARCH64_PREL64
+#define R_AARCH64_PREL64               260
+#endif
+#ifndef R_AARCH64_PREL32
+#define R_AARCH64_PREL32               261
+#endif
+#ifndef R_AARCH64_PREL16
+#define R_AARCH64_PREL16               262
+#endif
+#ifndef R_AARCH64_PLT32
+#define R_AARCH64_PLT32                        314
+#endif
 #ifndef R_AARCH64_LD_PREL_LO19
 #define R_AARCH64_LD_PREL_LO19         273
 #endif
@@ -371,6 +383,12 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
                case R_AARCH64_ABS64:
                        emit_rela_abs64(rela, sh_orig_name);
                        break;
+               /* Allow position-relative data relocations. */
+               case R_AARCH64_PREL64:
+               case R_AARCH64_PREL32:
+               case R_AARCH64_PREL16:
+               case R_AARCH64_PLT32:
+                       break;
                /* Allow relocations to generate PC-relative addressing. */
                case R_AARCH64_LD_PREL_LO19:
                case R_AARCH64_ADR_PREL_LO21:
index 5d94584..2b23400 100644 (file)
@@ -79,22 +79,18 @@ SYM_FUNC_START(__hyp_do_panic)
        mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
                      PSR_MODE_EL1h)
        msr     spsr_el2, lr
-       ldr     lr, =panic
+       ldr     lr, =nvhe_hyp_panic_handler
        hyp_kimg_va lr, x6
        msr     elr_el2, lr
 
        mov     x29, x0
 
-       /* Load the format string into x0 and arguments into x1-7 */
-       ldr     x0, =__hyp_panic_string
-       hyp_kimg_va x0, x6
-
-       /* Load the format arguments into x1-7. */
-       mov     x6, x3
-       get_vcpu_ptr x7, x3
-       mrs     x3, esr_el2
-       mrs     x4, far_el2
-       mrs     x5, hpfar_el2
+       /* Load the panic arguments into x0-7 */
+       mrs     x0, esr_el2
+       get_vcpu_ptr x4, x5
+       mrs     x5, far_el2
+       mrs     x6, hpfar_el2
+       mov     x7, xzr                 // Unused argument
 
        /* Enter the host, conditionally restoring the host context. */
        cbz     x29, __host_enter_without_restoring
index c631e29..c953fb4 100644 (file)
@@ -83,11 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
  * x0: struct kvm_nvhe_init_params PA
  */
 SYM_CODE_START_LOCAL(___kvm_hyp_init)
-alternative_if ARM64_KVM_PROTECTED_MODE
-       mov_q   x1, HCR_HOST_NVHE_PROTECTED_FLAGS
-       msr     hcr_el2, x1
-alternative_else_nop_endif
-
        ldr     x1, [x0, #NVHE_INIT_TPIDR_EL2]
        msr     tpidr_el2, x1
 
@@ -97,6 +92,15 @@ alternative_else_nop_endif
        ldr     x1, [x0, #NVHE_INIT_MAIR_EL2]
        msr     mair_el2, x1
 
+       ldr     x1, [x0, #NVHE_INIT_HCR_EL2]
+       msr     hcr_el2, x1
+
+       ldr     x1, [x0, #NVHE_INIT_VTTBR]
+       msr     vttbr_el2, x1
+
+       ldr     x1, [x0, #NVHE_INIT_VTCR]
+       msr     vtcr_el2, x1
+
        ldr     x1, [x0, #NVHE_INIT_PGD_PA]
        phys_to_ttbr x2, x1
 alternative_if ARM64_HAS_CNP
@@ -115,15 +119,10 @@ alternative_else_nop_endif
 
        /* Invalidate the stale TLBs from Bootloader */
        tlbi    alle2
+       tlbi    vmalls12e1
        dsb     sy
 
-       /*
-        * Preserve all the RES1 bits while setting the default flags,
-        * as well as the EE bit on BE. Drop the A flag since the compiler
-        * is allowed to generate unaligned accesses.
-        */
-       mov_q   x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE(        orr     x0, x0, #SCTLR_ELx_EE)
+       mov_q   x0, INIT_SCTLR_EL2_MMU_ON
 alternative_if ARM64_HAS_ADDRESS_AUTH
        mov_q   x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
                     SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
@@ -221,9 +220,7 @@ SYM_CODE_START(__kvm_handle_stub_hvc)
        mov     x0, xzr
 reset:
        /* Reset kvm back to the hyp stub. */
-       mrs     x5, sctlr_el2
-       mov_q   x6, SCTLR_ELx_FLAGS
-       bic     x5, x5, x6              // Clear SCTL_M and etc
+       mov_q   x5, INIT_SCTLR_EL2_MMU_OFF
        pre_disable_mmu_workaround
        msr     sctlr_el2, x5
        isb
@@ -244,4 +241,31 @@ alternative_else_nop_endif
 
 SYM_CODE_END(__kvm_handle_stub_hvc)
 
+SYM_FUNC_START(__pkvm_init_switch_pgd)
+       /* Turn the MMU off */
+       pre_disable_mmu_workaround
+       mrs     x2, sctlr_el2
+       bic     x3, x2, #SCTLR_ELx_M
+       msr     sctlr_el2, x3
+       isb
+
+       tlbi    alle2
+
+       /* Install the new pgtables */
+       ldr     x3, [x0, #NVHE_INIT_PGD_PA]
+       phys_to_ttbr x4, x3
+alternative_if ARM64_HAS_CNP
+       orr     x4, x4, #TTBR_CNP_BIT
+alternative_else_nop_endif
+       msr     ttbr0_el2, x4
+
+       /* Set the new stack pointer */
+       ldr     x0, [x0, #NVHE_INIT_STACK_HYP_VA]
+       mov     sp, x0
+
+       /* And turn the MMU back on! */
+       set_sctlr_el2   x2
+       ret     x1
+SYM_FUNC_END(__pkvm_init_switch_pgd)
+
        .popsection
index 9363282..f36420a 100644 (file)
@@ -6,12 +6,15 @@
 
 #include <hyp/switch.h>
 
+#include <asm/pgtable-types.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
 #include <nvhe/trap_handler.h>
 
 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -106,6 +109,61 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
        __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
 }
 
+static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+       DECLARE_REG(unsigned long, size, host_ctxt, 2);
+       DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
+       DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
+       DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+
+       /*
+        * __pkvm_init() will return only if an error occurred; otherwise it
+        * will tail-call into __pkvm_init_finalise(), which will have to deal
+        * with the host context directly.
+        */
+       cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
+                                           hyp_va_bits);
+}
+
+static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1);
+
+       cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
+}
+
+static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(unsigned long, start, host_ctxt, 1);
+       DECLARE_REG(unsigned long, size, host_ctxt, 2);
+       DECLARE_REG(unsigned long, phys, host_ctxt, 3);
+       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+}
+
+static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+       DECLARE_REG(size_t, size, host_ctxt, 2);
+       DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
+}
+
+static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
+{
+       cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
+}
+
+static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
+{
+       DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
+       DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
+
+       cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
+}
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -125,6 +183,12 @@ static const hcall_t host_hcall[] = {
        HANDLE_FUNC(__kvm_get_mdcr_el2),
        HANDLE_FUNC(__vgic_v3_save_aprs),
        HANDLE_FUNC(__vgic_v3_restore_aprs),
+       HANDLE_FUNC(__pkvm_init),
+       HANDLE_FUNC(__pkvm_cpu_set_vector),
+       HANDLE_FUNC(__pkvm_create_mappings),
+       HANDLE_FUNC(__pkvm_create_private_mapping),
+       HANDLE_FUNC(__pkvm_prot_finalize),
+       HANDLE_FUNC(__pkvm_mark_hyp),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -177,7 +241,16 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
        case ESR_ELx_EC_SMC64:
                handle_host_smc(host_ctxt);
                break;
+       case ESR_ELx_EC_SVE:
+               sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+               isb();
+               sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
+               break;
+       case ESR_ELx_EC_IABT_LOW:
+       case ESR_ELx_EC_DABT_LOW:
+               handle_host_mem_abort(host_ctxt);
+               break;
        default:
-               hyp_panic();
+               BUG();
        }
 }
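
The host hypercall path above is an ID-indexed table of handlers, each of which reads its arguments from, and writes its result back into, the saved host context registers. A standalone sketch of that dispatch pattern; struct ctx, DECLARE_ARG and the handler names below are illustrative, not the kernel's API.

#include <stdio.h>

/* Stand-in for kvm_cpu_context: the registers the host passed in. */
struct ctx {
        unsigned long regs[8];
};

#define DECLARE_ARG(type, name, ctx, n) type name = (type)(ctx)->regs[n]

static void handle_add(struct ctx *c)
{
        DECLARE_ARG(unsigned long, a, c, 1);
        DECLARE_ARG(unsigned long, b, c, 2);

        c->regs[1] = a + b;             /* the return value goes back in reg 1 */
}

static void handle_nop(struct ctx *c)
{
        c->regs[1] = 0;
}

typedef void (*hcall_t)(struct ctx *);

enum { HCALL_NOP, HCALL_ADD, NR_HCALLS };

static const hcall_t hcalls[] = {
        [HCALL_NOP] = handle_nop,
        [HCALL_ADD] = handle_add,
};

static void dispatch(struct ctx *c)
{
        unsigned long id = c->regs[0];

        if (id >= NR_HCALLS || !hcalls[id]) {
                c->regs[0] = -1UL;      /* reject unknown call IDs */
                return;
        }

        hcalls[id](c);
        c->regs[0] = 0;                 /* SMCCC-style "success" in reg 0 */
}

int main(void)
{
        struct ctx c = { .regs = { HCALL_ADD, 2, 3 } };

        dispatch(&c);
        printf("status=%lu result=%lu\n", c.regs[0], c.regs[1]);
        return 0;
}
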
index 8795590..9f54833 100644 (file)
@@ -18,8 +18,7 @@ u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID
 
 u64 cpu_logical_map(unsigned int cpu)
 {
-       if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map))
-               hyp_panic();
+       BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map));
 
        return hyp_cpu_logical_map[cpu];
 }
@@ -30,8 +29,7 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
        unsigned long this_cpu_base;
        unsigned long elf_base;
 
-       if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
-               hyp_panic();
+       BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base));
 
        cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
        this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
index cd119d8..f4562f4 100644 (file)
@@ -25,4 +25,5 @@ SECTIONS {
        BEGIN_HYP_SECTION(.data..percpu)
                PERCPU_INPUT(L1_CACHE_BYTES)
        END_HYP_SECTION
+       HYP_SECTION(.bss)
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
new file mode 100644 (file)
index 0000000..e342f7f
--- /dev/null
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/stage2_pgtable.h>
+
+#include <hyp/switch.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+
+#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
+
+extern unsigned long hyp_nr_cpus;
+struct host_kvm host_kvm;
+
+struct hyp_pool host_s2_mem;
+struct hyp_pool host_s2_dev;
+
+/*
+ * Copies of the host's CPU feature registers holding sanitized values.
+ */
+u64 id_aa64mmfr0_el1_sys_val;
+u64 id_aa64mmfr1_el1_sys_val;
+
+static const u8 pkvm_hyp_id = 1;
+
+static void *host_s2_zalloc_pages_exact(size_t size)
+{
+       return hyp_alloc_pages(&host_s2_mem, get_order(size));
+}
+
+static void *host_s2_zalloc_page(void *pool)
+{
+       return hyp_alloc_pages(pool, 0);
+}
+
+static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+       unsigned long nr_pages, pfn;
+       int ret;
+
+       pfn = hyp_virt_to_pfn(mem_pgt_pool);
+       nr_pages = host_s2_mem_pgtable_pages();
+       ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
+       if (ret)
+               return ret;
+
+       pfn = hyp_virt_to_pfn(dev_pgt_pool);
+       nr_pages = host_s2_dev_pgtable_pages();
+       ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+       if (ret)
+               return ret;
+
+       host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
+               .zalloc_pages_exact = host_s2_zalloc_pages_exact,
+               .zalloc_page = host_s2_zalloc_page,
+               .phys_to_virt = hyp_phys_to_virt,
+               .virt_to_phys = hyp_virt_to_phys,
+               .page_count = hyp_page_count,
+               .get_page = hyp_get_page,
+               .put_page = hyp_put_page,
+       };
+
+       return 0;
+}
+
+static void prepare_host_vtcr(void)
+{
+       u32 parange, phys_shift;
+
+       /* The host stage 2 is id-mapped, so use parange for T0SZ */
+       parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
+       phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+       host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+                                         id_aa64mmfr1_el1_sys_val, phys_shift);
+}
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+       struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+       int ret;
+
+       prepare_host_vtcr();
+       hyp_spin_lock_init(&host_kvm.lock);
+
+       ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+       if (ret)
+               return ret;
+
+       ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
+                                           &host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
+       if (ret)
+               return ret;
+
+       mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
+       mmu->arch = &host_kvm.arch;
+       mmu->pgt = &host_kvm.pgt;
+       mmu->vmid.vmid_gen = 0;
+       mmu->vmid.vmid = 0;
+
+       return 0;
+}
+
+int __pkvm_prot_finalize(void)
+{
+       struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+       struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+       params->vttbr = kvm_get_vttbr(mmu);
+       params->vtcr = host_kvm.arch.vtcr;
+       params->hcr_el2 |= HCR_VM;
+       kvm_flush_dcache_to_poc(params, sizeof(*params));
+
+       write_sysreg(params->hcr_el2, hcr_el2);
+       __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+
+       /*
+        * Make sure to have an ISB before the TLB maintenance below but only
+        * when __load_stage2() doesn't include one already.
+        */
+       asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+       /* Invalidate stale HCR bits that may be cached in TLBs */
+       __tlbi(vmalls12e1);
+       dsb(nsh);
+       isb();
+
+       return 0;
+}
+
+static int host_stage2_unmap_dev_all(void)
+{
+       struct kvm_pgtable *pgt = &host_kvm.pgt;
+       struct memblock_region *reg;
+       u64 addr = 0;
+       int i, ret;
+
+       /* Unmap all non-memory regions to recycle the pages */
+       for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
+               reg = &hyp_memory[i];
+               ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
+               if (ret)
+                       return ret;
+       }
+       return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
+}
+
+static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+       int cur, left = 0, right = hyp_memblock_nr;
+       struct memblock_region *reg;
+       phys_addr_t end;
+
+       range->start = 0;
+       range->end = ULONG_MAX;
+
+       /* The list of memblock regions is sorted; binary search it */
+       while (left < right) {
+               cur = (left + right) >> 1;
+               reg = &hyp_memory[cur];
+               end = reg->base + reg->size;
+               if (addr < reg->base) {
+                       right = cur;
+                       range->end = reg->base;
+               } else if (addr >= end) {
+                       left = cur + 1;
+                       range->start = end;
+               } else {
+                       range->start = reg->base;
+                       range->end = end;
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+static bool range_is_memory(u64 start, u64 end)
+{
+       struct kvm_mem_range r1, r2;
+
+       if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2))
+               return false;
+       if (r1.start != r2.start)
+               return false;
+
+       return true;
+}
+
+static inline int __host_stage2_idmap(u64 start, u64 end,
+                                     enum kvm_pgtable_prot prot,
+                                     struct hyp_pool *pool)
+{
+       return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
+                                     prot, pool);
+}
+
+static int host_stage2_idmap(u64 addr)
+{
+       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
+       struct kvm_mem_range range;
+       bool is_memory = find_mem_range(addr, &range);
+       struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
+       int ret;
+
+       if (is_memory)
+               prot |= KVM_PGTABLE_PROT_X;
+
+       hyp_spin_lock(&host_kvm.lock);
+       ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+       if (ret)
+               goto unlock;
+
+       ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+       if (is_memory || ret != -ENOMEM)
+               goto unlock;
+
+       /*
+        * host_s2_mem has been provided with enough pages to cover all of
+        * memory with page granularity, so we should never hit the ENOMEM case.
+        * However, it is difficult to know how much of the MMIO range we will
+        * need to cover upfront, so we may need to 'recycle' the pages if we
+        * run out.
+        */
+       ret = host_stage2_unmap_dev_all();
+       if (ret)
+               goto unlock;
+
+       ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+
+unlock:
+       hyp_spin_unlock(&host_kvm.lock);
+
+       return ret;
+}
+
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+       int ret;
+
+       /*
+        * host_stage2_unmap_dev_all() currently relies on MMIO mappings being
+        * non-persistent, so don't allow changing page ownership in MMIO range.
+        */
+       if (!range_is_memory(start, end))
+               return -EINVAL;
+
+       hyp_spin_lock(&host_kvm.lock);
+       ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
+                                          &host_s2_mem, pkvm_hyp_id);
+       hyp_spin_unlock(&host_kvm.lock);
+
+       return ret != -EAGAIN ? ret : 0;
+}
+
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+       struct kvm_vcpu_fault_info fault;
+       u64 esr, addr;
+       int ret = 0;
+
+       esr = read_sysreg_el2(SYS_ESR);
+       BUG_ON(!__get_fault_info(esr, &fault));
+
+       addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+       ret = host_stage2_idmap(addr);
+       BUG_ON(ret && ret != -EAGAIN);
+}
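
find_mem_range() above binary-searches the sorted memblock list and, on a miss, narrows the returned range to the surrounding hole so the fault handler can map the whole MMIO gap at once. A standalone sketch of that lookup; the region layout and names are made up for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct region { uint64_t base, size; };
struct range  { uint64_t start, end; };

/*
 * Binary-search a sorted, non-overlapping region list. On a hit, return the
 * containing region; on a miss, return the hole between the neighbours.
 */
static bool find_mem_range(const struct region *regs, int nr,
                           uint64_t addr, struct range *range)
{
        int cur, left = 0, right = nr;
        uint64_t end;

        range->start = 0;
        range->end = UINT64_MAX;

        while (left < right) {
                cur = (left + right) / 2;
                end = regs[cur].base + regs[cur].size;
                if (addr < regs[cur].base) {
                        right = cur;
                        range->end = regs[cur].base;
                } else if (addr >= end) {
                        left = cur + 1;
                        range->start = end;
                } else {
                        range->start = regs[cur].base;
                        range->end = end;
                        return true;
                }
        }

        return false;
}

int main(void)
{
        const struct region mem[] = { { 0x1000, 0x1000 }, { 0x4000, 0x2000 } };
        struct range r;

        printf("0x1800 is memory: %d, range [%#lx, %#lx)\n",
               find_mem_range(mem, 2, 0x1800, &r),
               (unsigned long)r.start, (unsigned long)r.end);
        printf("0x3000 is memory: %d, range [%#lx, %#lx)\n",
               find_mem_range(mem, 2, 0x3000, &r),
               (unsigned long)r.start, (unsigned long)r.end);
        return 0;
}
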
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
new file mode 100644 (file)
index 0000000..a8efdf0
--- /dev/null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+#include <nvhe/spinlock.h>
+
+struct kvm_pgtable pkvm_pgtable;
+hyp_spinlock_t pkvm_pgd_lock;
+u64 __io_map_base;
+
+struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
+unsigned int hyp_memblock_nr;
+
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+                         unsigned long phys, enum kvm_pgtable_prot prot)
+{
+       int err;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+       err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot);
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return err;
+}
+
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+                                           enum kvm_pgtable_prot prot)
+{
+       unsigned long addr;
+       int err;
+
+       hyp_spin_lock(&pkvm_pgd_lock);
+
+       size = PAGE_ALIGN(size + offset_in_page(phys));
+       addr = __io_map_base;
+       __io_map_base += size;
+
+       /* Are we overflowing into the vmemmap? */
+       if (__io_map_base > __hyp_vmemmap) {
+               __io_map_base -= size;
+               addr = (unsigned long)ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot);
+       if (err) {
+               addr = (unsigned long)ERR_PTR(err);
+               goto out;
+       }
+
+       addr = addr + offset_in_page(phys);
+out:
+       hyp_spin_unlock(&pkvm_pgd_lock);
+
+       return addr;
+}
+
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+       unsigned long start = (unsigned long)from;
+       unsigned long end = (unsigned long)to;
+       unsigned long virt_addr;
+       phys_addr_t phys;
+
+       start = start & PAGE_MASK;
+       end = PAGE_ALIGN(end);
+
+       for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+               int err;
+
+               phys = hyp_virt_to_phys((void *)virt_addr);
+               err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
+{
+       unsigned long start, end;
+
+       hyp_vmemmap_range(phys, size, &start, &end);
+
+       return __pkvm_create_mappings(start, end - start, back, PAGE_HYP);
+}
+
+static void *__hyp_bp_vect_base;
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot)
+{
+       void *vector;
+
+       switch (slot) {
+       case HYP_VECTOR_DIRECT: {
+               vector = __kvm_hyp_vector;
+               break;
+       }
+       case HYP_VECTOR_SPECTRE_DIRECT: {
+               vector = __bp_harden_hyp_vecs;
+               break;
+       }
+       case HYP_VECTOR_INDIRECT:
+       case HYP_VECTOR_SPECTRE_INDIRECT: {
+               vector = (void *)__hyp_bp_vect_base;
+               break;
+       }
+       default:
+               return -EINVAL;
+       }
+
+       vector = __kvm_vector_slot2addr(vector, slot);
+       *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector;
+
+       return 0;
+}
+
+int hyp_map_vectors(void)
+{
+       phys_addr_t phys;
+       void *bp_base;
+
+       if (!cpus_have_const_cap(ARM64_SPECTRE_V3A))
+               return 0;
+
+       phys = __hyp_pa(__bp_harden_hyp_vecs);
+       bp_base = (void *)__pkvm_create_private_mapping(phys,
+                                                       __BP_HARDEN_HYP_VECS_SZ,
+                                                       PAGE_HYP_EXEC);
+       if (IS_ERR_OR_NULL(bp_base))
+               return PTR_ERR(bp_base);
+
+       __hyp_bp_vect_base = bp_base;
+
+       return 0;
+}
+
+int hyp_create_idmap(u32 hyp_va_bits)
+{
+       unsigned long start, end;
+
+       start = hyp_virt_to_phys((void *)__hyp_idmap_text_start);
+       start = ALIGN_DOWN(start, PAGE_SIZE);
+
+       end = hyp_virt_to_phys((void *)__hyp_idmap_text_end);
+       end = ALIGN(end, PAGE_SIZE);
+
+       /*
+        * One half of the VA space is reserved to linearly map portions of
+        * memory -- see va_layout.c for more details. The other half of the VA
+        * space contains the trampoline page, and needs some care. Split that
+        * second half in two and find the quarter of VA space not conflicting
+        * with the idmap to place the IOs and the vmemmap. IOs use the lower
+        * half of the quarter and the vmemmap the upper half.
+        */
+       __io_map_base = start & BIT(hyp_va_bits - 2);
+       __io_map_base ^= BIT(hyp_va_bits - 2);
+       __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3);
+
+       return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
+}
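
hyp_create_idmap() above keeps only bit (hyp_va_bits - 2) of the idmap address and flips it, yielding a base on the opposite side of the idmap; IO mappings then take the lower half of that region and the vmemmap the upper half (bit hyp_va_bits - 3). A standalone sketch of that bit arithmetic, using a hypothetical 48-bit hyp VA space and idmap address:

#include <stdint.h>
#include <stdio.h>

#define BIT(n)  (1ULL << (n))

/*
 * Mirror of the layout computation: isolate bit (va_bits - 2) of the idmap
 * address, flip it to land away from the idmap, then carve that region into
 * an IO half and a vmemmap half.
 */
static void split_va_space(uint64_t idmap_addr, unsigned int va_bits,
                           uint64_t *io_base, uint64_t *vmemmap_base)
{
        uint64_t base = idmap_addr & BIT(va_bits - 2);

        base ^= BIT(va_bits - 2);
        *io_base = base;
        *vmemmap_base = base | BIT(va_bits - 3);
}

int main(void)
{
        uint64_t io, vmemmap;

        split_va_space(0x40080000000ULL, 48, &io, &vmemmap);
        printf("io_map_base = %#llx\n", (unsigned long long)io);
        printf("vmemmap     = %#llx\n", (unsigned long long)vmemmap);
        return 0;
}
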
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
new file mode 100644 (file)
index 0000000..237e03b
--- /dev/null
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_hyp.h>
+#include <nvhe/gfp.h>
+
+u64 __hyp_vmemmap;
+
+/*
+ * Index the hyp_vmemmap to find a potential buddy page, but make no assumption
+ * about its current state.
+ *
+ * Example buddy-tree for a 4-pages physically contiguous pool:
+ *
+ *                 o : Page 3
+ *                /
+ *               o-o : Page 2
+ *              /
+ *             /   o : Page 1
+ *            /   /
+ *           o---o-o : Page 0
+ *    Order  2   1 0
+ *
+ * Example of requests on this pool:
+ *   __find_buddy_nocheck(pool, page 0, order 0) => page 1
+ *   __find_buddy_nocheck(pool, page 0, order 1) => page 2
+ *   __find_buddy_nocheck(pool, page 1, order 0) => page 0
+ *   __find_buddy_nocheck(pool, page 2, order 0) => page 3
+ */
+static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
+                                            struct hyp_page *p,
+                                            unsigned int order)
+{
+       phys_addr_t addr = hyp_page_to_phys(p);
+
+       addr ^= (PAGE_SIZE << order);
+
+       /*
+        * Don't return a page outside the pool range -- it belongs to
+        * something else and may not be mapped in hyp_vmemmap.
+        */
+       if (addr < pool->range_start || addr >= pool->range_end)
+               return NULL;
+
+       return hyp_phys_to_page(addr);
+}
+
+/* Find a buddy page currently available for allocation */
+static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
+                                          struct hyp_page *p,
+                                          unsigned int order)
+{
+       struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
+
+       if (!buddy || buddy->order != order || list_empty(&buddy->node))
+               return NULL;
+
+       return buddy;
+
+}
+
+static void __hyp_attach_page(struct hyp_pool *pool,
+                             struct hyp_page *p)
+{
+       unsigned int order = p->order;
+       struct hyp_page *buddy;
+
+       memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+
+       /*
+        * Only the first struct hyp_page of a high-order page (otherwise known
+        * as the 'head') should have p->order set. The non-head pages should
+        * have p->order = HYP_NO_ORDER. Here @p may no longer be the head
+        * after coalescing, so make sure to mark it HYP_NO_ORDER proactively.
+        */
+       p->order = HYP_NO_ORDER;
+       for (; (order + 1) < pool->max_order; order++) {
+               buddy = __find_buddy_avail(pool, p, order);
+               if (!buddy)
+                       break;
+
+               /* Take the buddy out of its list, and coalesce with @p */
+               list_del_init(&buddy->node);
+               buddy->order = HYP_NO_ORDER;
+               p = min(p, buddy);
+       }
+
+       /* Mark the new head, and insert it */
+       p->order = order;
+       list_add_tail(&p->node, &pool->free_area[order]);
+}
+
+static void hyp_attach_page(struct hyp_page *p)
+{
+       struct hyp_pool *pool = hyp_page_to_pool(p);
+
+       hyp_spin_lock(&pool->lock);
+       __hyp_attach_page(pool, p);
+       hyp_spin_unlock(&pool->lock);
+}
+
+static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
+                                          struct hyp_page *p,
+                                          unsigned int order)
+{
+       struct hyp_page *buddy;
+
+       list_del_init(&p->node);
+       while (p->order > order) {
+               /*
+                * The buddy of order n - 1 currently has HYP_NO_ORDER as it
+                * is covered by a higher-level page (whose head is @p). Use
+                * __find_buddy_nocheck() to find it and inject it in the
+                * free_list[n - 1], effectively splitting @p in half.
+                */
+               p->order--;
+               buddy = __find_buddy_nocheck(pool, p, p->order);
+               buddy->order = p->order;
+               list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+       }
+
+       return p;
+}
+
+void hyp_put_page(void *addr)
+{
+       struct hyp_page *p = hyp_virt_to_page(addr);
+
+       if (hyp_page_ref_dec_and_test(p))
+               hyp_attach_page(p);
+}
+
+void hyp_get_page(void *addr)
+{
+       struct hyp_page *p = hyp_virt_to_page(addr);
+
+       hyp_page_ref_inc(p);
+}
+
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+{
+       unsigned int i = order;
+       struct hyp_page *p;
+
+       hyp_spin_lock(&pool->lock);
+
+       /* Look for a high-enough-order page */
+       while (i < pool->max_order && list_empty(&pool->free_area[i]))
+               i++;
+       if (i >= pool->max_order) {
+               hyp_spin_unlock(&pool->lock);
+               return NULL;
+       }
+
+       /* Extract it from the tree at the right order */
+       p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
+       p = __hyp_extract_page(pool, p, order);
+
+       hyp_spin_unlock(&pool->lock);
+       hyp_set_page_refcounted(p);
+
+       return hyp_page_to_virt(p);
+}
+
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+                 unsigned int reserved_pages)
+{
+       phys_addr_t phys = hyp_pfn_to_phys(pfn);
+       struct hyp_page *p;
+       int i;
+
+       hyp_spin_lock_init(&pool->lock);
+       pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+       for (i = 0; i < pool->max_order; i++)
+               INIT_LIST_HEAD(&pool->free_area[i]);
+       pool->range_start = phys;
+       pool->range_end = phys + (nr_pages << PAGE_SHIFT);
+
+       /* Init the vmemmap portion */
+       p = hyp_phys_to_page(phys);
+       memset(p, 0, sizeof(*p) * nr_pages);
+       for (i = 0; i < nr_pages; i++) {
+               p[i].pool = pool;
+               INIT_LIST_HEAD(&p[i].node);
+       }
+
+       /* Attach the unused pages to the buddy tree */
+       for (i = reserved_pages; i < nr_pages; i++)
+               __hyp_attach_page(pool, &p[i]);
+
+       return 0;
+}
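
The buddy of a block is found by flipping the single address bit that distinguishes the two halves of the next-higher order, exactly as in the worked example in the __find_buddy_nocheck() comment. A standalone sketch reproducing that example with 4 KiB pages (buddy_of is an illustrative name):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1ULL << PAGE_SHIFT)

/* The buddy of a block at addr and order n lives at addr with bit
 * (PAGE_SHIFT + n) flipped: the other half of the order (n + 1) block. */
static uint64_t buddy_of(uint64_t addr, unsigned int order)
{
        return addr ^ (PAGE_SIZE << order);
}

int main(void)
{
        /* The 4-page pool from the comment above, starting at physical 0. */
        printf("buddy(page 0, order 0) -> page %llu\n",
               (unsigned long long)(buddy_of(0 * PAGE_SIZE, 0) / PAGE_SIZE));
        printf("buddy(page 0, order 1) -> page %llu\n",
               (unsigned long long)(buddy_of(0 * PAGE_SIZE, 1) / PAGE_SIZE));
        printf("buddy(page 1, order 0) -> page %llu\n",
               (unsigned long long)(buddy_of(1 * PAGE_SIZE, 0) / PAGE_SIZE));
        printf("buddy(page 2, order 0) -> page %llu\n",
               (unsigned long long)(buddy_of(2 * PAGE_SIZE, 0) / PAGE_SIZE));
        return 0;
}
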
index 63de71c..0850878 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/kvm_host.h>
 #include <uapi/linux/psci.h>
 
+#include <nvhe/memory.h>
 #include <nvhe/trap_handler.h>
 
 void kvm_hyp_cpu_entry(unsigned long r0);
@@ -20,9 +21,6 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
 
 /* Config options set by the host. */
 struct kvm_host_psci_config __ro_after_init kvm_host_psci_config;
-s64 __ro_after_init hyp_physvirt_offset;
-
-#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)
 
 #define INVALID_CPU_ID UINT_MAX
 
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
new file mode 100644 (file)
index 0000000..7488f53
--- /dev/null
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
+
+struct hyp_pool hpool;
+struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
+unsigned long hyp_nr_cpus;
+
+#define hyp_percpu_size ((unsigned long)__per_cpu_end - \
+                        (unsigned long)__per_cpu_start)
+
+static void *vmemmap_base;
+static void *hyp_pgt_base;
+static void *host_s2_mem_pgt_base;
+static void *host_s2_dev_pgt_base;
+
+static int divide_memory_pool(void *virt, unsigned long size)
+{
+       unsigned long vstart, vend, nr_pages;
+
+       hyp_early_alloc_init(virt, size);
+
+       hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend);
+       nr_pages = (vend - vstart) >> PAGE_SHIFT;
+       vmemmap_base = hyp_early_alloc_contig(nr_pages);
+       if (!vmemmap_base)
+               return -ENOMEM;
+
+       nr_pages = hyp_s1_pgtable_pages();
+       hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
+       if (!hyp_pgt_base)
+               return -ENOMEM;
+
+       nr_pages = host_s2_mem_pgtable_pages();
+       host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages);
+       if (!host_s2_mem_pgt_base)
+               return -ENOMEM;
+
+       nr_pages = host_s2_dev_pgtable_pages();
+       host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages);
+       if (!host_s2_dev_pgt_base)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
+                                unsigned long *per_cpu_base,
+                                u32 hyp_va_bits)
+{
+       void *start, *end, *virt = hyp_phys_to_virt(phys);
+       unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+       int ret, i;
+
+       /* Recreate the hyp page-table using the early page allocator */
+       hyp_early_alloc_init(hyp_pgt_base, pgt_size);
+       ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits,
+                                  &hyp_early_alloc_mm_ops);
+       if (ret)
+               return ret;
+
+       ret = hyp_create_idmap(hyp_va_bits);
+       if (ret)
+               return ret;
+
+       ret = hyp_map_vectors();
+       if (ret)
+               return ret;
+
+       ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base));
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
+       if (ret)
+               return ret;
+
+       ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < hyp_nr_cpus; i++) {
+               start = (void *)kern_hyp_va(per_cpu_base[i]);
+               end = start + PAGE_ALIGN(hyp_percpu_size);
+               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               if (ret)
+                       return ret;
+
+               end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va;
+               start = end - PAGE_SIZE;
+               ret = pkvm_create_mappings(start, end, PAGE_HYP);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static void update_nvhe_init_params(void)
+{
+       struct kvm_nvhe_init_params *params;
+       unsigned long i;
+
+       for (i = 0; i < hyp_nr_cpus; i++) {
+               params = per_cpu_ptr(&kvm_init_params, i);
+               params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
+               __flush_dcache_area(params, sizeof(*params));
+       }
+}
+
+static void *hyp_zalloc_hyp_page(void *arg)
+{
+       return hyp_alloc_pages(&hpool, 0);
+}
+
+void __noreturn __pkvm_init_finalise(void)
+{
+       struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
+       struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+       unsigned long nr_pages, reserved_pages, pfn;
+       int ret;
+
+       /* Now that the vmemmap is backed, install the full-fledged allocator */
+       pfn = hyp_virt_to_pfn(hyp_pgt_base);
+       nr_pages = hyp_s1_pgtable_pages();
+       reserved_pages = hyp_early_alloc_nr_used_pages();
+       ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages);
+       if (ret)
+               goto out;
+
+       ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+       if (ret)
+               goto out;
+
+       pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
+               .zalloc_page = hyp_zalloc_hyp_page,
+               .phys_to_virt = hyp_phys_to_virt,
+               .virt_to_phys = hyp_virt_to_phys,
+               .get_page = hyp_get_page,
+               .put_page = hyp_put_page,
+       };
+       pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
+
+out:
+       /*
+        * We tail-called here from handle___pkvm_init() and will not return,
+        * so make sure to propagate the return value to the host.
+        */
+       cpu_reg(host_ctxt, 1) = ret;
+
+       __host_enter(host_ctxt);
+}
+
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+               unsigned long *per_cpu_base, u32 hyp_va_bits)
+{
+       struct kvm_nvhe_init_params *params;
+       void *virt = hyp_phys_to_virt(phys);
+       void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+       int ret;
+
+       if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+               return -EINVAL;
+
+       hyp_spin_lock_init(&pkvm_pgd_lock);
+       hyp_nr_cpus = nr_cpus;
+
+       ret = divide_memory_pool(virt, size);
+       if (ret)
+               return ret;
+
+       ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits);
+       if (ret)
+               return ret;
+
+       update_nvhe_init_params();
+
+       /* Jump in the idmap page to switch to the new page-tables */
+       params = this_cpu_ptr(&kvm_init_params);
+       fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
+       fn(__hyp_pa(params), __pkvm_init_finalise);
+
+       unreachable();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/stub.c b/arch/arm64/kvm/hyp/nvhe/stub.c
new file mode 100644 (file)
index 0000000..c0aa6bb
--- /dev/null
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stubs for out-of-line function calls caused by re-using kernel
+ * infrastructure at EL2.
+ *
+ * Copyright (C) 2020 - Google LLC
+ */
+
+#include <linux/list.h>
+
+#ifdef CONFIG_DEBUG_LIST
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+                     struct list_head *next)
+{
+               return true;
+}
+
+bool __list_del_entry_valid(struct list_head *entry)
+{
+               return true;
+}
+#endif
index 68ab6b4..e9f6ea7 100644 (file)
@@ -28,6 +28,8 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
+#include <nvhe/mem_protect.h>
+
 /* Non-VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -41,9 +43,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
        __activate_traps_common(vcpu);
 
        val = CPTR_EL2_DEFAULT;
-       val |= CPTR_EL2_TTA | CPTR_EL2_TZ | CPTR_EL2_TAM;
+       val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
        if (!update_fp_enabled(vcpu)) {
-               val |= CPTR_EL2_TFP;
+               val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
                __activate_traps_fpsimd32(vcpu);
        }
 
@@ -68,7 +70,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 static void __deactivate_traps(struct kvm_vcpu *vcpu)
 {
        extern char __kvm_hyp_host_vector[];
-       u64 mdcr_el2;
+       u64 mdcr_el2, cptr;
 
        ___deactivate_traps(vcpu);
 
@@ -95,19 +97,17 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
 
        mdcr_el2 &= MDCR_EL2_HPMN_MASK;
        mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+       mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
 
        write_sysreg(mdcr_el2, mdcr_el2);
-       if (is_protected_kvm_enabled())
-               write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2);
-       else
-               write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
-       write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
-       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
-}
+       write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
 
-static void __load_host_stage2(void)
-{
-       write_sysreg(0, vttbr_el2);
+       cptr = CPTR_EL2_DEFAULT;
+       if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED))
+               cptr |= CPTR_EL2_TZ;
+
+       write_sysreg(cptr, cptr_el2);
+       write_sysreg(__kvm_hyp_host_vector, vbar_el2);
 }
 
 /* Save VGICv3 state on non-VHE systems */
index 229b067..83dc3b2 100644 (file)
@@ -8,6 +8,8 @@
 #include <asm/kvm_mmu.h>
 #include <asm/tlbflush.h>
 
+#include <nvhe/mem_protect.h>
+
 struct tlb_inv_context {
        u64             tcr;
 };
@@ -43,7 +45,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
 
 static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
 {
-       write_sysreg(0, vttbr_el2);
+       __load_host_stage2();
 
        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                /* Ensure write of the host VMID */
index 926fc07..c37c1dc 100644 (file)
@@ -9,8 +9,7 @@
 
 #include <linux/bitfield.h>
 #include <asm/kvm_pgtable.h>
-
-#define KVM_PGTABLE_MAX_LEVELS         4U
+#include <asm/stage2_pgtable.h>
 
 #define KVM_PTE_VALID                  BIT(0)
 
                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
                                         KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
+#define KVM_PTE_LEAF_ATTR_S2_IGNORED   GENMASK(58, 55)
+
+#define KVM_INVALID_PTE_OWNER_MASK     GENMASK(63, 56)
+#define KVM_MAX_OWNER_ID               1
+
 struct kvm_pgtable_walk_data {
        struct kvm_pgtable              *pgt;
        struct kvm_pgtable_walker       *walker;
@@ -68,21 +72,36 @@ static u64 kvm_granule_size(u32 level)
        return BIT(kvm_granule_shift(level));
 }
 
-static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+#define KVM_PHYS_INVALID (-1ULL)
+
+static bool kvm_phys_is_valid(u64 phys)
 {
-       u64 granule = kvm_granule_size(level);
+       return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
+}
 
+static bool kvm_level_supports_block_mapping(u32 level)
+{
        /*
         * Reject invalid block mappings and don't bother with 4TB mappings for
         * 52-bit PAs.
         */
-       if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+       return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+       u64 granule = kvm_granule_size(level);
+
+       if (!kvm_level_supports_block_mapping(level))
                return false;
 
        if (granule > (end - addr))
                return false;
 
-       return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+       if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+               return false;
+
+       return IS_ALIGNED(addr, granule);
 }
 
 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
@@ -152,20 +171,20 @@ static kvm_pte_t kvm_phys_to_pte(u64 pa)
        return pte;
 }
 
-static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
 {
-       return __va(kvm_pte_to_phys(pte));
+       return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
 }
 
-static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+static void kvm_clear_pte(kvm_pte_t *ptep)
 {
-       kvm_pte_t pte = *ptep;
-       WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+       WRITE_ONCE(*ptep, 0);
 }
 
-static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
+                             struct kvm_pgtable_mm_ops *mm_ops)
 {
-       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
 
        pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
        pte |= KVM_PTE_VALID;
@@ -187,6 +206,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
        return pte;
 }
 
+static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
+{
+       return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
+}
+
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
                                  u32 level, kvm_pte_t *ptep,
                                  enum kvm_pgtable_walk_flags flag)
@@ -228,7 +252,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
                goto out;
        }
 
-       childp = kvm_pte_follow(pte);
+       childp = kvm_pte_follow(pte, data->pgt->mm_ops);
        ret = __kvm_pgtable_walk(data, childp, level + 1);
        if (ret)
                goto out;
@@ -303,12 +327,12 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
 }
 
 struct hyp_map_data {
-       u64             phys;
-       kvm_pte_t       attr;
+       u64                             phys;
+       kvm_pte_t                       attr;
+       struct kvm_pgtable_mm_ops       *mm_ops;
 };
 
-static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
-                                struct hyp_map_data *data)
+static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
 {
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
        u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
@@ -333,7 +357,8 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
-       data->attr = attr;
+       *ptep = attr;
+
        return 0;
 }
 
@@ -359,6 +384,8 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                          enum kvm_pgtable_walk_flags flag, void * const arg)
 {
        kvm_pte_t *childp;
+       struct hyp_map_data *data = arg;
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
        if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
                return 0;
@@ -366,11 +393,11 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
                return -EINVAL;
 
-       childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!childp)
                return -ENOMEM;
 
-       kvm_set_table_pte(ptep, childp);
+       kvm_set_table_pte(ptep, childp, mm_ops);
        return 0;
 }
 
@@ -380,6 +407,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
        int ret;
        struct hyp_map_data map_data = {
                .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
+               .mm_ops = pgt->mm_ops,
        };
        struct kvm_pgtable_walker walker = {
                .cb     = hyp_map_walker,
@@ -387,7 +415,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                .arg    = &map_data,
        };
 
-       ret = hyp_map_set_prot_attr(prot, &map_data);
+       ret = hyp_set_prot_attr(prot, &map_data.attr);
        if (ret)
                return ret;
 
@@ -397,16 +425,18 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
        return ret;
 }
 
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+                        struct kvm_pgtable_mm_ops *mm_ops)
 {
        u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
 
-       pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!pgt->pgd)
                return -ENOMEM;
 
        pgt->ia_bits            = va_bits;
        pgt->start_level        = KVM_PGTABLE_MAX_LEVELS - levels;
+       pgt->mm_ops             = mm_ops;
        pgt->mmu                = NULL;
        return 0;
 }
@@ -414,7 +444,9 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
 static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                           enum kvm_pgtable_walk_flags flag, void * const arg)
 {
-       free_page((unsigned long)kvm_pte_follow(*ptep));
+       struct kvm_pgtable_mm_ops *mm_ops = arg;
+
+       mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops));
        return 0;
 }
 
@@ -423,29 +455,75 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
        struct kvm_pgtable_walker walker = {
                .cb     = hyp_free_walker,
                .flags  = KVM_PGTABLE_WALK_TABLE_POST,
+               .arg    = pgt->mm_ops,
        };
 
        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
-       free_page((unsigned long)pgt->pgd);
+       pgt->mm_ops->put_page(pgt->pgd);
        pgt->pgd = NULL;
 }
 
 struct stage2_map_data {
        u64                             phys;
        kvm_pte_t                       attr;
+       u8                              owner_id;
 
        kvm_pte_t                       *anchor;
+       kvm_pte_t                       *childp;
 
        struct kvm_s2_mmu               *mmu;
-       struct kvm_mmu_memory_cache     *memcache;
+       void                            *memcache;
+
+       struct kvm_pgtable_mm_ops       *mm_ops;
 };
 
-static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
-                                   struct stage2_map_data *data)
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
+{
+       u64 vtcr = VTCR_EL2_FLAGS;
+       u8 lvls;
+
+       vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
+       vtcr |= VTCR_EL2_T0SZ(phys_shift);
+       /*
+        * Use a minimum of 2 page-table levels to prevent splitting
+        * host PMD huge pages at stage2.
+        */
+       lvls = stage2_pgtable_levels(phys_shift);
+       if (lvls < 2)
+               lvls = 2;
+       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+       /*
+        * Enable the Hardware Access Flag management, unconditionally
+        * on all CPUs. The feature is RES0 on CPUs without support
+        * and must be ignored by those CPUs.
+        */
+       vtcr |= VTCR_EL2_HA;
+
+       /* Set the vmid bits */
+       vtcr |= (get_vmid_bits(mmfr1) == 16) ?
+               VTCR_EL2_VS_16BIT :
+               VTCR_EL2_VS_8BIT;
+
+       return vtcr;
+}
+
+static bool stage2_has_fwb(struct kvm_pgtable *pgt)
+{
+       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+               return false;
+
+       return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
+}
+
+#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+
+static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
+                               kvm_pte_t *ptep)
 {
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
-       kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
-                           PAGE_S2_MEMATTR(NORMAL);
+       kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
+                           KVM_S2_MEMATTR(pgt, NORMAL);
        u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
 
        if (!(prot & KVM_PGTABLE_PROT_X))
@@ -461,44 +539,78 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
 
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
        attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
-       data->attr = attr;
+       *ptep = attr;
+
        return 0;
 }
 
+static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+       if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
+               return true;
+
+       return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
+}
+
+static bool stage2_pte_is_counted(kvm_pte_t pte)
+{
+       /*
+        * The refcount tracks valid entries as well as invalid entries if
+        * they encode ownership of a page by an entity other than the
+        * page-table owner, whose id is 0.
+        */
+       return !!pte;
+}
+
+static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
+                          u32 level, struct kvm_pgtable_mm_ops *mm_ops)
+{
+       /*
+        * Clear the existing PTE, and perform break-before-make with
+        * TLB maintenance if it was valid.
+        */
+       if (kvm_pte_valid(*ptep)) {
+               kvm_clear_pte(ptep);
+               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+       }
+
+       mm_ops->put_page(ptep);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
 {
        kvm_pte_t new, old = *ptep;
        u64 granule = kvm_granule_size(level), phys = data->phys;
-       struct page *page = virt_to_page(ptep);
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
        if (!kvm_block_mapping_supported(addr, end, phys, level))
                return -E2BIG;
 
-       new = kvm_init_valid_leaf_pte(phys, data->attr, level);
-       if (kvm_pte_valid(old)) {
+       if (kvm_phys_is_valid(phys))
+               new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+       else
+               new = kvm_init_invalid_leaf_owner(data->owner_id);
+
+       if (stage2_pte_is_counted(old)) {
                /*
                 * Skip updating the PTE if we are trying to recreate the exact
                 * same mapping or only change the access permissions. Instead,
                 * the vCPU will exit one more time from guest if still needed
                 * and then go through the path of relaxing permissions.
                 */
-               if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+               if (!stage2_pte_needs_update(old, new))
                        return -EAGAIN;
 
-               /*
-                * There's an existing different valid leaf entry, so perform
-                * break-before-make.
-                */
-               kvm_set_invalid_pte(ptep);
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-               put_page(page);
+               stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
        }
 
        smp_store_release(ptep, new);
-       get_page(page);
-       data->phys += granule;
+       if (stage2_pte_is_counted(new))
+               mm_ops->get_page(ptep);
+       if (kvm_phys_is_valid(phys))
+               data->phys += granule;
        return 0;
 }
 
@@ -512,7 +624,8 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
        if (!kvm_block_mapping_supported(addr, end, data->phys, level))
                return 0;
 
-       kvm_set_invalid_pte(ptep);
+       data->childp = kvm_pte_follow(*ptep, data->mm_ops);
+       kvm_clear_pte(ptep);
 
        /*
         * Invalidate the whole stage-2, as we may have numerous leaf
@@ -527,13 +640,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                                struct stage2_map_data *data)
 {
-       int ret;
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
        kvm_pte_t *childp, pte = *ptep;
-       struct page *page = virt_to_page(ptep);
+       int ret;
 
        if (data->anchor) {
-               if (kvm_pte_valid(pte))
-                       put_page(page);
+               if (stage2_pte_is_counted(pte))
+                       mm_ops->put_page(ptep);
 
                return 0;
        }
@@ -548,7 +661,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        if (!data->memcache)
                return -ENOMEM;
 
-       childp = kvm_mmu_memory_cache_alloc(data->memcache);
+       childp = mm_ops->zalloc_page(data->memcache);
        if (!childp)
                return -ENOMEM;
 
@@ -557,14 +670,11 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         * a table. Accesses beyond 'end' that fall within the new table
         * will be mapped lazily.
         */
-       if (kvm_pte_valid(pte)) {
-               kvm_set_invalid_pte(ptep);
-               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-               put_page(page);
-       }
+       if (stage2_pte_is_counted(pte))
+               stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
 
-       kvm_set_table_pte(ptep, childp);
-       get_page(page);
+       kvm_set_table_pte(ptep, childp, mm_ops);
+       mm_ops->get_page(ptep);
 
        return 0;
 }
@@ -573,19 +683,25 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
                                      kvm_pte_t *ptep,
                                      struct stage2_map_data *data)
 {
+       struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+       kvm_pte_t *childp;
        int ret = 0;
 
        if (!data->anchor)
                return 0;
 
-       free_page((unsigned long)kvm_pte_follow(*ptep));
-       put_page(virt_to_page(ptep));
-
        if (data->anchor == ptep) {
+               childp = data->childp;
                data->anchor = NULL;
+               data->childp = NULL;
                ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+       } else {
+               childp = kvm_pte_follow(*ptep, mm_ops);
        }
 
+       mm_ops->put_page(childp);
+       mm_ops->put_page(ptep);
+
        return ret;
 }
 
@@ -627,13 +743,14 @@ static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 
 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
-                          struct kvm_mmu_memory_cache *mc)
+                          void *mc)
 {
        int ret;
        struct stage2_map_data map_data = {
                .phys           = ALIGN_DOWN(phys, PAGE_SIZE),
                .mmu            = pgt->mmu,
                .memcache       = mc,
+               .mm_ops         = pgt->mm_ops,
        };
        struct kvm_pgtable_walker walker = {
                .cb             = stage2_map_walker,
@@ -643,7 +760,10 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                .arg            = &map_data,
        };
 
-       ret = stage2_map_set_prot_attr(prot, &map_data);
+       if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
+               return -EINVAL;
+
+       ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
        if (ret)
                return ret;
 
@@ -652,38 +772,63 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
        return ret;
 }
 
-static void stage2_flush_dcache(void *addr, u64 size)
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                                void *mc, u8 owner_id)
 {
-       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-               return;
+       int ret;
+       struct stage2_map_data map_data = {
+               .phys           = KVM_PHYS_INVALID,
+               .mmu            = pgt->mmu,
+               .memcache       = mc,
+               .mm_ops         = pgt->mm_ops,
+               .owner_id       = owner_id,
+       };
+       struct kvm_pgtable_walker walker = {
+               .cb             = stage2_map_walker,
+               .flags          = KVM_PGTABLE_WALK_TABLE_PRE |
+                                 KVM_PGTABLE_WALK_LEAF |
+                                 KVM_PGTABLE_WALK_TABLE_POST,
+               .arg            = &map_data,
+       };
+
+       if (owner_id > KVM_MAX_OWNER_ID)
+               return -EINVAL;
 
-       __flush_dcache_area(addr, size);
+       ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+       return ret;
 }
 
-static bool stage2_pte_cacheable(kvm_pte_t pte)
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
 {
        u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-       return memattr == PAGE_S2_MEMATTR(NORMAL);
+       return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
 }
 
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                               enum kvm_pgtable_walk_flags flag,
                               void * const arg)
 {
-       struct kvm_s2_mmu *mmu = arg;
+       struct kvm_pgtable *pgt = arg;
+       struct kvm_s2_mmu *mmu = pgt->mmu;
+       struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep, *childp = NULL;
        bool need_flush = false;
 
-       if (!kvm_pte_valid(pte))
+       if (!kvm_pte_valid(pte)) {
+               if (stage2_pte_is_counted(pte)) {
+                       kvm_clear_pte(ptep);
+                       mm_ops->put_page(ptep);
+               }
                return 0;
+       }
 
        if (kvm_pte_table(pte, level)) {
-               childp = kvm_pte_follow(pte);
+               childp = kvm_pte_follow(pte, mm_ops);
 
-               if (page_count(virt_to_page(childp)) != 1)
+               if (mm_ops->page_count(childp) != 1)
                        return 0;
-       } else if (stage2_pte_cacheable(pte)) {
-               need_flush = true;
+       } else if (stage2_pte_cacheable(pgt, pte)) {
+               need_flush = !stage2_has_fwb(pgt);
        }
 
        /*
@@ -691,17 +836,15 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         * block entry and rely on the remaining portions being faulted
         * back lazily.
         */
-       kvm_set_invalid_pte(ptep);
-       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
-       put_page(virt_to_page(ptep));
+       stage2_put_pte(ptep, mmu, addr, level, mm_ops);
 
        if (need_flush) {
-               stage2_flush_dcache(kvm_pte_follow(pte),
+               __flush_dcache_area(kvm_pte_follow(pte, mm_ops),
                                    kvm_granule_size(level));
        }
 
        if (childp)
-               free_page((unsigned long)childp);
+               mm_ops->put_page(childp);
 
        return 0;
 }
@@ -710,7 +853,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
        struct kvm_pgtable_walker walker = {
                .cb     = stage2_unmap_walker,
-               .arg    = pgt->mmu,
+               .arg    = pgt,
                .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
        };
 
@@ -842,12 +985,14 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                               enum kvm_pgtable_walk_flags flag,
                               void * const arg)
 {
+       struct kvm_pgtable *pgt = arg;
+       struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep;
 
-       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
                return 0;
 
-       stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+       __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level));
        return 0;
 }
 
@@ -856,30 +1001,35 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
        struct kvm_pgtable_walker walker = {
                .cb     = stage2_flush_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF,
+               .arg    = pgt,
        };
 
-       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+       if (stage2_has_fwb(pgt))
                return 0;
 
        return kvm_pgtable_walk(pgt, addr, size, &walker);
 }
 
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+                                 struct kvm_pgtable_mm_ops *mm_ops,
+                                 enum kvm_pgtable_stage2_flags flags)
 {
        size_t pgd_sz;
-       u64 vtcr = kvm->arch.vtcr;
+       u64 vtcr = arch->vtcr;
        u32 ia_bits = VTCR_EL2_IPA(vtcr);
        u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
 
        pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
-       pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
        if (!pgt->pgd)
                return -ENOMEM;
 
        pgt->ia_bits            = ia_bits;
        pgt->start_level        = start_level;
-       pgt->mmu                = &kvm->arch.mmu;
+       pgt->mm_ops             = mm_ops;
+       pgt->mmu                = &arch->mmu;
+       pgt->flags              = flags;
 
        /* Ensure zeroed PGD pages are visible to the hardware walker */
        dsb(ishst);
@@ -890,15 +1040,16 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                              enum kvm_pgtable_walk_flags flag,
                              void * const arg)
 {
+       struct kvm_pgtable_mm_ops *mm_ops = arg;
        kvm_pte_t pte = *ptep;
 
-       if (!kvm_pte_valid(pte))
+       if (!stage2_pte_is_counted(pte))
                return 0;
 
-       put_page(virt_to_page(ptep));
+       mm_ops->put_page(ptep);
 
        if (kvm_pte_table(pte, level))
-               free_page((unsigned long)kvm_pte_follow(pte));
+               mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
 
        return 0;
 }
@@ -910,10 +1061,85 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
                .cb     = stage2_free_walker,
                .flags  = KVM_PGTABLE_WALK_LEAF |
                          KVM_PGTABLE_WALK_TABLE_POST,
+               .arg    = pgt->mm_ops,
        };
 
        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
        pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
-       free_pages_exact(pgt->pgd, pgd_sz);
+       pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
        pgt->pgd = NULL;
 }
+
+#define KVM_PTE_LEAF_S2_COMPAT_MASK    (KVM_PTE_LEAF_ATTR_S2_PERMS | \
+                                        KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
+                                        KVM_PTE_LEAF_ATTR_S2_IGNORED)
+
+static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
+                                         kvm_pte_t *ptep,
+                                         enum kvm_pgtable_walk_flags flag,
+                                         void * const arg)
+{
+       kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
+
+       /*
+        * Compatible mappings are either invalid and owned by the page-table
+        * owner (whose id is 0), or valid with matching permission attributes.
+        */
+       if (kvm_pte_valid(pte)) {
+               old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
+               if (old_attr != *new_attr)
+                       return -EEXIST;
+       } else if (pte) {
+               return -EEXIST;
+       }
+
+       return 0;
+}
+
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+                                 enum kvm_pgtable_prot prot,
+                                 struct kvm_mem_range *range)
+{
+       kvm_pte_t attr;
+       struct kvm_pgtable_walker check_perm_walker = {
+               .cb             = stage2_check_permission_walker,
+               .flags          = KVM_PGTABLE_WALK_LEAF,
+               .arg            = &attr,
+       };
+       u64 granule, start, end;
+       u32 level;
+       int ret;
+
+       ret = stage2_set_prot_attr(pgt, prot, &attr);
+       if (ret)
+               return ret;
+       attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
+
+       for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
+               granule = kvm_granule_size(level);
+               start = ALIGN_DOWN(addr, granule);
+               end = start + granule;
+
+               if (!kvm_level_supports_block_mapping(level))
+                       continue;
+
+               if (start < range->start || range->end < end)
+                       continue;
+
+               /*
+                * Check the presence of existing mappings with incompatible
+                * permissions within the current block range, and try one level
+                * deeper if one is found.
+                */
+               ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
+               if (ret != -EEXIST)
+                       break;
+       }
+
+       if (!ret) {
+               range->start = start;
+               range->end = end;
+       }
+
+       return ret;
+}
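
A minimal user-space sketch of the invalid-PTE owner annotation introduced above (KVM_INVALID_PTE_OWNER_MASK, kvm_init_invalid_leaf_owner() and stage2_pte_is_counted()): an otherwise invalid entry carries an owner id in bits 63:56, and any non-zero entry holds a reference. The DEMO_* and demo_* names are illustrative stand-ins, not the kernel macros.

#include <stdint.h>
#include <stdio.h>

#define DEMO_OWNER_SHIFT   56
#define DEMO_OWNER_MASK    (0xffULL << DEMO_OWNER_SHIFT)   /* bits 63:56 */
#define DEMO_PTE_VALID     (1ULL << 0)

/* Mirrors kvm_init_invalid_leaf_owner(): VALID bit clear, owner id in 63:56 */
static uint64_t demo_init_invalid_leaf_owner(uint8_t owner_id)
{
        return ((uint64_t)owner_id << DEMO_OWNER_SHIFT) & DEMO_OWNER_MASK;
}

/* Mirrors stage2_pte_is_counted(): any non-zero entry holds a reference */
static int demo_pte_is_counted(uint64_t pte)
{
        return pte != 0;
}

int main(void)
{
        uint64_t pte = demo_init_invalid_leaf_owner(1);

        printf("pte=%#llx valid=%d counted=%d owner=%u\n",
               (unsigned long long)pte, (int)(pte & DEMO_PTE_VALID),
               demo_pte_is_counted(pte), (unsigned)(pte >> DEMO_OWNER_SHIFT));
        return 0;
}
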
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
new file mode 100644 (file)
index 0000000..83ca23a
--- /dev/null
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/kvm_host.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+
+static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
+static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
+
+phys_addr_t hyp_mem_base;
+phys_addr_t hyp_mem_size;
+
+static int cmp_hyp_memblock(const void *p1, const void *p2)
+{
+       const struct memblock_region *r1 = p1;
+       const struct memblock_region *r2 = p2;
+
+       return r1->base < r2->base ? -1 : (r1->base > r2->base);
+}
+
+static void __init sort_memblock_regions(void)
+{
+       sort(hyp_memory,
+            *hyp_memblock_nr_ptr,
+            sizeof(struct memblock_region),
+            cmp_hyp_memblock,
+            NULL);
+}
+
+static int __init register_memblock_regions(void)
+{
+       struct memblock_region *reg;
+
+       for_each_mem_region(reg) {
+               if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
+                       return -ENOMEM;
+
+               hyp_memory[*hyp_memblock_nr_ptr] = *reg;
+               (*hyp_memblock_nr_ptr)++;
+       }
+       sort_memblock_regions();
+
+       return 0;
+}
+
+void __init kvm_hyp_reserve(void)
+{
+       u64 nr_pages, prev, hyp_mem_pages = 0;
+       int ret;
+
+       if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
+               return;
+
+       if (kvm_get_mode() != KVM_MODE_PROTECTED)
+               return;
+
+       ret = register_memblock_regions();
+       if (ret) {
+               *hyp_memblock_nr_ptr = 0;
+               kvm_err("Failed to register hyp memblocks: %d\n", ret);
+               return;
+       }
+
+       hyp_mem_pages += hyp_s1_pgtable_pages();
+       hyp_mem_pages += host_s2_mem_pgtable_pages();
+       hyp_mem_pages += host_s2_dev_pgtable_pages();
+
+       /*
+        * The hyp_vmemmap needs to be backed by pages, but these pages
+        * themselves need to be present in the vmemmap, so compute the number
+        * of pages needed by looking for a fixed point.
+        */
+       nr_pages = 0;
+       do {
+               prev = nr_pages;
+               nr_pages = hyp_mem_pages + prev;
+               nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE);
+               nr_pages += __hyp_pgtable_max_pages(nr_pages);
+       } while (nr_pages != prev);
+       hyp_mem_pages += nr_pages;
+
+       /*
+        * Try to allocate a PMD-aligned region to reduce TLB pressure once
+        * this is unmapped from the host stage-2, and fall back to PAGE_SIZE.
+        */
+       hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
+       hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+                                             ALIGN(hyp_mem_size, PMD_SIZE),
+                                             PMD_SIZE);
+       if (!hyp_mem_base)
+               hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+                                                     hyp_mem_size, PAGE_SIZE);
+       else
+               hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
+
+       if (!hyp_mem_base) {
+               kvm_err("Failed to reserve hyp memory\n");
+               return;
+       }
+       memblock_reserve(hyp_mem_base, hyp_mem_size);
+
+       kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
+                hyp_mem_base);
+}
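
The fixed-point loop in kvm_hyp_reserve() above is easier to follow with concrete numbers. Below is a standalone sketch under assumed values: 4 KiB pages, a 16-byte struct hyp_page, and a crude stand-in for __hyp_pgtable_max_pages(); the DEMO_* constants and demo_* helpers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE          4096ULL
#define DEMO_SIZEOF_HYP_PAGE    16ULL   /* assumed sizeof(struct hyp_page) */

/* Crude stand-in: one table page per 512 mapped pages, summed per level */
static uint64_t demo_pgtable_max_pages(uint64_t nr_pages)
{
        uint64_t pages = 0;

        while (nr_pages > 1) {
                nr_pages = (nr_pages + 511) / 512;
                pages += nr_pages;
        }
        return pages + 1;       /* plus a root page */
}

int main(void)
{
        uint64_t hyp_mem_pages = 100000, nr_pages = 0, prev;

        /* Same shape as the loop in kvm_hyp_reserve(): iterate to a fixed point */
        do {
                prev = nr_pages;
                nr_pages = hyp_mem_pages + prev;
                nr_pages = (nr_pages * DEMO_SIZEOF_HYP_PAGE + DEMO_PAGE_SIZE - 1) /
                           DEMO_PAGE_SIZE;
                nr_pages += demo_pgtable_max_pages(nr_pages);
        } while (nr_pages != prev);

        printf("extra pages for vmemmap + page-tables: %llu\n",
               (unsigned long long)nr_pages);
        return 0;
}
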
index af8e940..7b8f7db 100644 (file)
@@ -27,8 +27,6 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
-const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
-
 /* VHE specific context */
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -207,7 +205,7 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
        __deactivate_traps(vcpu);
        sysreg_restore_host_state_vhe(host_ctxt);
 
-       panic(__hyp_panic_string,
+       panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n",
              spsr, elr,
              read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR),
              read_sysreg(hpfar_el2), par, vcpu);
index ead21b9..30da78f 100644 (file)
@@ -9,16 +9,65 @@
 #include <kvm/arm_hypercalls.h>
 #include <kvm/arm_psci.h>
 
+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+       struct system_time_snapshot systime_snapshot;
+       u64 cycles = ~0UL;
+       u32 feature;
+
+       /*
+        * The system time and counter value must be captured at the same
+        * time to keep consistency and precision.
+        */
+       ktime_get_snapshot(&systime_snapshot);
+
+       /*
+        * This is only valid if the current clocksource is the
+        * architected counter, as this is the only one the guest
+        * can see.
+        */
+       if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+               return;
+
+       /*
+        * The guest selects one of the two reference counters
+        * (virtual or physical) with the first argument of the SMCCC
+        * call. In case the identifier is not supported, error out.
+        */
+       feature = smccc_get_arg1(vcpu);
+       switch (feature) {
+       case KVM_PTP_VIRT_COUNTER:
+               cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+               break;
+       case KVM_PTP_PHYS_COUNTER:
+               cycles = systime_snapshot.cycles;
+               break;
+       default:
+               return;
+       }
+
+       /*
+        * This relies on the top bit of val[0] never being set for
+        * valid values of system time, because that is *really* far
+        * in the future (about 292 years from 1970, and at that stage
+        * nobody will give a damn about it).
+        */
+       val[0] = upper_32_bits(systime_snapshot.real);
+       val[1] = lower_32_bits(systime_snapshot.real);
+       val[2] = upper_32_bits(cycles);
+       val[3] = lower_32_bits(cycles);
+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
        u32 func_id = smccc_get_function(vcpu);
-       long val = SMCCC_RET_NOT_SUPPORTED;
+       u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
        u32 feature;
        gpa_t gpa;
 
        switch (func_id) {
        case ARM_SMCCC_VERSION_FUNC_ID:
-               val = ARM_SMCCC_VERSION_1_1;
+               val[0] = ARM_SMCCC_VERSION_1_1;
                break;
        case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
                feature = smccc_get_arg1(vcpu);
@@ -28,10 +77,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                        case SPECTRE_VULNERABLE:
                                break;
                        case SPECTRE_MITIGATED:
-                               val = SMCCC_RET_SUCCESS;
+                               val[0] = SMCCC_RET_SUCCESS;
                                break;
                        case SPECTRE_UNAFFECTED:
-                               val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+                               val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
                                break;
                        }
                        break;
@@ -54,22 +103,35 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                                        break;
                                fallthrough;
                        case SPECTRE_UNAFFECTED:
-                               val = SMCCC_RET_NOT_REQUIRED;
+                               val[0] = SMCCC_RET_NOT_REQUIRED;
                                break;
                        }
                        break;
                case ARM_SMCCC_HV_PV_TIME_FEATURES:
-                       val = SMCCC_RET_SUCCESS;
+                       val[0] = SMCCC_RET_SUCCESS;
                        break;
                }
                break;
        case ARM_SMCCC_HV_PV_TIME_FEATURES:
-               val = kvm_hypercall_pv_features(vcpu);
+               val[0] = kvm_hypercall_pv_features(vcpu);
                break;
        case ARM_SMCCC_HV_PV_TIME_ST:
                gpa = kvm_init_stolen_time(vcpu);
                if (gpa != GPA_INVALID)
-                       val = gpa;
+                       val[0] = gpa;
+               break;
+       case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+               val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
+               val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
+               val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
+               val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+               break;
+       case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+               val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+               val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+               break;
+       case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+               kvm_ptp_get_time(vcpu, val);
                break;
        case ARM_SMCCC_TRNG_VERSION:
        case ARM_SMCCC_TRNG_FEATURES:
@@ -81,6 +143,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
                return kvm_psci_call(vcpu);
        }
 
-       smccc_set_retval(vcpu, val, 0, 0, 0);
+       smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
        return 1;
 }
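
On the guest side, the ptp_kvm driver consumes the four return registers filled in by kvm_ptp_get_time() above: wall-clock time split across the first pair, counter value across the second. A minimal sketch of the reassembly only (the SMCCC invocation itself is elided, and the demo_ names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct demo_smccc_res {
        uint64_t a0, a1, a2, a3;        /* as returned in x0..x3 */
};

/* Rebuild 64-bit values from the upper/lower 32-bit halves */
static void demo_ptp_unpack(const struct demo_smccc_res *res,
                            uint64_t *ns, uint64_t *cycles)
{
        *ns     = ((uint64_t)(uint32_t)res->a0 << 32) | (uint32_t)res->a1;
        *cycles = ((uint64_t)(uint32_t)res->a2 << 32) | (uint32_t)res->a3;
}

int main(void)
{
        /* Example values a hypervisor could have returned */
        struct demo_smccc_res res = {
                .a0 = 0x17, .a1 = 0x12345678,
                .a2 = 0x01, .a3 = 0x9abcdef0,
        };
        uint64_t ns, cycles;

        demo_ptp_unpack(&res, &ns, &cycles);
        printf("time=%#llx ns, cycles=%#llx\n",
               (unsigned long long)ns, (unsigned long long)cycles);
        return 0;
}
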
index 3572823..c5d1f3c 100644 (file)
@@ -88,6 +88,44 @@ static bool kvm_is_device_pfn(unsigned long pfn)
        return !pfn_valid(pfn);
 }
 
+static void *stage2_memcache_zalloc_page(void *arg)
+{
+       struct kvm_mmu_memory_cache *mc = arg;
+
+       /* Allocated with __GFP_ZERO, so no need to zero */
+       return kvm_mmu_memory_cache_alloc(mc);
+}
+
+static void *kvm_host_zalloc_pages_exact(size_t size)
+{
+       return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+}
+
+static void kvm_host_get_page(void *addr)
+{
+       get_page(virt_to_page(addr));
+}
+
+static void kvm_host_put_page(void *addr)
+{
+       put_page(virt_to_page(addr));
+}
+
+static int kvm_host_page_count(void *addr)
+{
+       return page_count(virt_to_page(addr));
+}
+
+static phys_addr_t kvm_host_pa(void *addr)
+{
+       return __pa(addr);
+}
+
+static void *kvm_host_va(phys_addr_t phys)
+{
+       return __va(phys);
+}
+
 /*
  * Unmapping vs dcache management:
  *
@@ -127,7 +165,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
                                 bool may_block)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        phys_addr_t end = start + size;
 
        assert_spin_locked(&kvm->mmu_lock);
@@ -183,15 +221,39 @@ void free_hyp_pgds(void)
        if (hyp_pgtable) {
                kvm_pgtable_hyp_destroy(hyp_pgtable);
                kfree(hyp_pgtable);
+               hyp_pgtable = NULL;
        }
        mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
+static bool kvm_host_owns_hyp_mappings(void)
+{
+       if (static_branch_likely(&kvm_protected_mode_initialized))
+               return false;
+
+       /*
+        * This can happen at boot time when __create_hyp_mappings() is called
+        * after the hyp protection has been enabled, but the static key has
+        * not been flipped yet.
+        */
+       if (!hyp_pgtable && is_protected_kvm_enabled())
+               return false;
+
+       WARN_ON(!hyp_pgtable);
+
+       return true;
+}
+
 static int __create_hyp_mappings(unsigned long start, unsigned long size,
                                 unsigned long phys, enum kvm_pgtable_prot prot)
 {
        int err;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               return kvm_call_hyp_nvhe(__pkvm_create_mappings,
+                                        start, size, phys, prot);
+       }
+
        mutex_lock(&kvm_hyp_pgd_mutex);
        err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
        mutex_unlock(&kvm_hyp_pgd_mutex);
@@ -253,6 +315,16 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
        unsigned long base;
        int ret = 0;
 
+       if (!kvm_host_owns_hyp_mappings()) {
+               base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+                                        phys_addr, size, prot);
+               if (IS_ERR_OR_NULL((void *)base))
+                       return PTR_ERR((void *)base);
+               *haddr = base;
+
+               return 0;
+       }
+
        mutex_lock(&kvm_hyp_pgd_mutex);
 
        /*
@@ -351,6 +423,17 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
        return 0;
 }
 
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
+       .zalloc_page            = stage2_memcache_zalloc_page,
+       .zalloc_pages_exact     = kvm_host_zalloc_pages_exact,
+       .free_pages_exact       = free_pages_exact,
+       .get_page               = kvm_host_get_page,
+       .put_page               = kvm_host_put_page,
+       .page_count             = kvm_host_page_count,
+       .phys_to_virt           = kvm_host_va,
+       .virt_to_phys           = kvm_host_pa,
+};
+
 /**
  * kvm_init_stage2_mmu - Initialise an S2 MMU structure
  * @kvm:       The pointer to the KVM structure
@@ -374,7 +457,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        if (!pgt)
                return -ENOMEM;
 
-       err = kvm_pgtable_stage2_init(pgt, kvm);
+       err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops);
        if (err)
                goto out_free_pgtable;
 
@@ -387,7 +470,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 
-       mmu->kvm = kvm;
+       mmu->arch = &kvm->arch;
        mmu->pgt = pgt;
        mmu->pgd_phys = __pa(pgt->pgd);
        mmu->vmid.vmid_gen = 0;
@@ -421,10 +504,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
         *     +--------------------------------------------+
         */
        do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               struct vm_area_struct *vma;
                hva_t vm_start, vm_end;
 
-               if (!vma || vma->vm_start >= reg_end)
+               vma = find_vma_intersection(current->mm, hva, reg_end);
+               if (!vma)
                        break;
 
                /*
@@ -469,7 +553,7 @@ void stage2_unmap_vm(struct kvm *kvm)
 
 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        struct kvm_pgtable *pgt = NULL;
 
        spin_lock(&kvm->mmu_lock);
@@ -538,7 +622,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  */
 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
-       struct kvm *kvm = mmu->kvm;
+       struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
 }
 
@@ -555,7 +639,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_
  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
  * serializing operations for VM memory regions.
  */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
@@ -842,10 +926,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * unmapped afterwards, the call to kvm_unmap_gfn will take it away
         * from us again properly. This smp_rmb() interacts with the smp_wmb()
         * in kvm_mmu_notifier_invalidate_<page|range_end>.
+        *
+        * Besides, __gfn_to_pfn_memslot() is used instead of gfn_to_pfn_prot()
+        * to avoid the unnecessary overhead of locating the memory slot, which
+        * stays fixed even when @gfn is adjusted for huge pages.
         */
        smp_rmb();
 
-       pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+                                  write_fault, &writable, NULL);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
@@ -911,7 +1000,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        /* Mark the page dirty only if the fault is handled successfully */
        if (writable && !ret) {
                kvm_set_pfn_dirty(pfn);
-               mark_page_dirty(kvm, gfn);
+               mark_page_dirty_in_slot(kvm, memslot, gfn);
        }
 
 out_unlock:
@@ -1152,10 +1241,22 @@ static int kvm_map_idmap_text(void)
        return err;
 }
 
-int kvm_mmu_init(void)
+static void *kvm_hyp_zalloc_page(void *arg)
+{
+       return (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
+       .zalloc_page            = kvm_hyp_zalloc_page,
+       .get_page               = kvm_host_get_page,
+       .put_page               = kvm_host_put_page,
+       .phys_to_virt           = kvm_host_va,
+       .virt_to_phys           = kvm_host_pa,
+};
+
+int kvm_mmu_init(u32 *hyp_va_bits)
 {
        int err;
-       u32 hyp_va_bits;
 
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1169,8 +1270,8 @@ int kvm_mmu_init(void)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
-       kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
+       *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
@@ -1195,7 +1296,7 @@ int kvm_mmu_init(void)
                goto out;
        }
 
-       err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+       err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;
 
@@ -1273,10 +1374,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         *     +--------------------------------------------+
         */
        do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               struct vm_area_struct *vma;
                hva_t vm_start, vm_end;
 
-               if (!vma || vma->vm_start >= reg_end)
+               vma = find_vma_intersection(current->mm, hva, reg_end);
+               if (!vma)
                        break;
 
                /*
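
The kvm_s2_mm_ops and kvm_hyp_mm_ops tables above are what let the shared page-table code run both in the host and, with a different allocator, in the nVHE hypervisor: the walker only ever goes through the callbacks, never through __va()/__pa() or get_page() directly. A small user-space sketch of that indirection (demo_ names only, not the kernel API):

#include <stdint.h>
#include <stdio.h>

struct demo_mm_ops {
        void *(*phys_to_virt)(uint64_t phys);
        uint64_t (*virt_to_phys)(void *virt);
};

/* "Host" backend: identity translation is good enough for the demo */
static void *demo_host_va(uint64_t phys)   { return (void *)(uintptr_t)phys; }
static uint64_t demo_host_pa(void *virt)   { return (uint64_t)(uintptr_t)virt; }

static const struct demo_mm_ops demo_host_ops = {
        .phys_to_virt = demo_host_va,
        .virt_to_phys = demo_host_pa,
};

/* Same shape as kvm_pte_follow(): mask the attributes, ask the backend */
static void *demo_pte_follow(uint64_t pte, const struct demo_mm_ops *ops)
{
        uint64_t pa = pte & 0x0000fffffffff000ULL;      /* PA bits 47:12 */

        return ops->phys_to_virt(pa);
}

int main(void)
{
        uint64_t pte = 0x40001000ULL | 0x3;             /* valid table entry */

        printf("child table at %p\n", demo_pte_follow(pte, &demo_host_ops));
        return 0;
}
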
index 7391643..151c31f 100644 (file)
@@ -50,12 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 int kvm_perf_init(void)
 {
-       /*
-        * Check if HW_PERF_EVENTS are supported by checking the number of
-        * hardware performance counters. This could ensure the presence of
-        * a physical PMU and CONFIG_PERF_EVENT is selected.
-        */
-       if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0)
+       if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled())
                static_branch_enable(&kvm_arm_pmu_available);
 
        return perf_register_guest_info_callbacks(&kvm_guest_cbs);
index e32c6e1..fd167d4 100644 (file)
@@ -739,7 +739,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
        kvm_pmu_create_perf_event(vcpu, select_idx);
 }
 
-static int kvm_pmu_probe_pmuver(void)
+int kvm_pmu_probe_pmuver(void)
 {
        struct perf_event_attr attr = { };
        struct perf_event *event;
index faf32a4..03a6c1f 100644 (file)
@@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
        struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-       if (!ctx || !kvm_pmu_switch_needed(attr))
+       if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr))
                return;
 
        if (!attr->exclude_host)
@@ -49,7 +49,7 @@ void kvm_clr_pmu_events(u32 clr)
 {
        struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-       if (!ctx)
+       if (!kvm_arm_support_pmu_v3() || !ctx)
                return;
 
        ctx->pmu_events.events_host &= ~clr;
@@ -172,7 +172,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
        struct kvm_host_data *host;
        u32 events_guest, events_host;
 
-       if (!has_vhe())
+       if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        preempt_disable();
@@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
        struct kvm_host_data *host;
        u32 events_guest, events_host;
 
-       if (!has_vhe())
+       if (!kvm_arm_support_pmu_v3() || !has_vhe())
                return;
 
        host = this_cpu_ptr_hyp_sym(kvm_host_data);
index bd354cd..956cdc2 100644 (file)
@@ -74,10 +74,6 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
        if (!system_supports_sve())
                return -EINVAL;
 
-       /* Verify that KVM startup enforced this when SVE was detected: */
-       if (WARN_ON(!has_vhe()))
-               return -EINVAL;
-
        vcpu->arch.sve_max_vl = kvm_sve_max_vl;
 
        /*
@@ -242,6 +238,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 
        /* Reset core registers */
        memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
+       memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
+       vcpu->arch.ctxt.spsr_abt = 0;
+       vcpu->arch.ctxt.spsr_und = 0;
+       vcpu->arch.ctxt.spsr_irq = 0;
+       vcpu->arch.ctxt.spsr_fiq = 0;
        vcpu_gp_regs(vcpu)->pstate = pstate;
 
        /* Reset system registers */
@@ -333,19 +334,10 @@ int kvm_set_ipa_limit(void)
        return 0;
 }
 
-/*
- * Configure the VTCR_EL2 for this VM. The VTCR value is common
- * across all the physical CPUs on the system. We use system wide
- * sanitised values to fill in different fields, except for Hardware
- * Management of Access Flags. HA Flag is set unconditionally on
- * all CPUs, as it is safe to run with or without the feature and
- * the bit is RES0 on CPUs that don't support it.
- */
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
 {
-       u64 vtcr = VTCR_EL2_FLAGS, mmfr0;
-       u32 parange, phys_shift;
-       u8 lvls;
+       u64 mmfr0, mmfr1;
+       u32 phys_shift;
 
        if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
                return -EINVAL;
@@ -365,33 +357,8 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
        }
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
-       parange = cpuid_feature_extract_unsigned_field(mmfr0,
-                               ID_AA64MMFR0_PARANGE_SHIFT);
-       if (parange > ID_AA64MMFR0_PARANGE_MAX)
-               parange = ID_AA64MMFR0_PARANGE_MAX;
-       vtcr |= parange << VTCR_EL2_PS_SHIFT;
-
-       vtcr |= VTCR_EL2_T0SZ(phys_shift);
-       /*
-        * Use a minimum 2 level page table to prevent splitting
-        * host PMD huge pages at stage2.
-        */
-       lvls = stage2_pgtable_levels(phys_shift);
-       if (lvls < 2)
-               lvls = 2;
-       vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
-
-       /*
-        * Enable the Hardware Access Flag management, unconditionally
-        * on all CPUs. The features is RES0 on CPUs without the support
-        * and must be ignored by the CPUs.
-        */
-       vtcr |= VTCR_EL2_HA;
+       mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
-       /* Set the vmid bits */
-       vtcr |= (kvm_get_vmid_bits() == 16) ?
-               VTCR_EL2_VS_16BIT :
-               VTCR_EL2_VS_8BIT;
-       kvm->arch.vtcr = vtcr;
        return 0;
 }
index 4f2f1e3..76ea280 100644 (file)
@@ -1063,6 +1063,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
                val = cpuid_feature_cap_perfmon_field(val,
                                                      ID_AA64DFR0_PMUVER_SHIFT,
                                                      kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
+               /* Hide SPE from guests */
+               val &= ~FEATURE(ID_AA64DFR0_PMSVER);
                break;
        case SYS_ID_DFR0_EL1:
                /* Limit guests to PMUv3 for ARMv8.4 */
@@ -1472,6 +1474,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        { SYS_DESC(SYS_GCR_EL1), undef_access },
 
        { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
+       { SYS_DESC(SYS_TRFCR_EL1), undef_access },
        { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
        { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
        { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
@@ -1501,6 +1504,19 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
        { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
 
+       { SYS_DESC(SYS_PMSCR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSICR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSIRR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSFCR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSEVFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSLATFR_EL1), undef_access },
+       { SYS_DESC(SYS_PMSIDR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBPTR_EL1), undef_access },
+       { SYS_DESC(SYS_PMBSR_EL1), undef_access },
+       /* PMBIDR_EL1 is not trapped */
+
        { PMU_SYS_REG(SYS_PMINTENSET_EL1),
          .access = access_pminten, .reg = PMINTENSET_EL1 },
        { PMU_SYS_REG(SYS_PMINTENCLR_EL1),
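
Hiding SPE from guests, as done for ID_AA64DFR0_EL1 above, amounts to clearing one 4-bit ID-register field in the sanitised value before it is returned to the guest. A tiny sketch of that masking (the PMSVer shift used here is an assumption, and the DEMO_* names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define DEMO_PMSVER_SHIFT   32                          /* assumed field position */
#define DEMO_FIELD(shift)   (0xfULL << (shift))         /* 4-bit ID register field */

int main(void)
{
        uint64_t val = 1ULL << DEMO_PMSVER_SHIFT;       /* host advertises SPE */

        val &= ~DEMO_FIELD(DEMO_PMSVER_SHIFT);          /* guest sees PMSVer == 0 */
        printf("guest ID_AA64DFR0_EL1=%#llx\n", (unsigned long long)val);
        return 0;
}
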
index 9783013..acdb7b3 100644 (file)
@@ -288,3 +288,10 @@ void kvm_get_kimage_voffset(struct alt_instr *alt,
 {
        generate_mov_q(kimage_voffset, origptr, updptr, nr_inst);
 }
+
+void kvm_compute_final_ctr_el0(struct alt_instr *alt,
+                              __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+       generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0),
+                      origptr, updptr, nr_inst);
+}
index 052917d..58cbda0 100644 (file)
@@ -335,13 +335,14 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
        kfree(dist->spis);
        dist->spis = NULL;
        dist->nr_spis = 0;
+       dist->vgic_dist_base = VGIC_ADDR_UNDEF;
 
-       if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
-                       list_del(&rdreg->list);
-                       kfree(rdreg);
-               }
+       if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+               list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
+                       vgic_v3_free_redist_region(rdreg);
                INIT_LIST_HEAD(&dist->rd_regions);
+       } else {
+               dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
        }
 
        if (vgic_has_its(kvm))
@@ -362,6 +363,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        vgic_flush_pending_lpis(vcpu);
 
        INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+       vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
 }
 
 /* To be called with kvm->lock held */
index 40cbaca..ec7543a 100644 (file)
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
                /*
                 * If an LPI carries the HW bit, this means that this
                 * interrupt is controlled by GICv4, and we do not
-                * have direct access to that state. Let's simply fail
-                * the save operation...
+                * have direct access to that state without GICv4.1.
+                * Let's simply fail the save operation...
                 */
-               if (ite->irq->hw)
+               if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
                        return -EACCES;
 
                ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
index 4441967..7740995 100644 (file)
@@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
                        r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
                        goto out;
                }
-               rdreg = list_first_entry(&vgic->rd_regions,
-                                        struct vgic_redist_region, list);
+               rdreg = list_first_entry_or_null(&vgic->rd_regions,
+                                                struct vgic_redist_region, list);
                if (!rdreg)
                        addr_ptr = &undef_value;
                else
@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
                u64 addr;
                unsigned long type = (unsigned long)attr->attr;
 
+               if (copy_from_user(&addr, uaddr, sizeof(addr)))
+                       return -EFAULT;
+
                r = kvm_vgic_addr(dev->kvm, type, &addr, false);
                if (r)
                        return (r == -ENODEV) ? -ENXIO : r;
index 15a6c98..03a2537 100644 (file)
@@ -251,30 +251,35 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
                vgic_enable_lpis(vcpu);
 }
 
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                             gpa_t addr, unsigned int len)
+static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu)
 {
-       unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+       struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-       struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
-       int target_vcpu_id = vcpu->vcpu_id;
-       gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
-                       (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
-       u64 value;
+       struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg;
 
-       value = (u64)(mpidr & GENMASK(23, 0)) << 32;
-       value |= ((target_vcpu_id & 0xffff) << 8);
+       if (!rdreg)
+               return false;
 
-       if (addr == last_rdist_typer)
-               value |= GICR_TYPER_LAST;
-       if (vgic_has_its(vcpu->kvm))
-               value |= GICR_TYPER_PLPIS;
+       if (vgic_cpu->rdreg_index < rdreg->free_index - 1) {
+               return false;
+       } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) {
+               struct list_head *rd_regions = &vgic->rd_regions;
+               gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
 
-       return extract_bytes(value, addr & 7, len);
+               /*
+                * The rdist is the last one of its redist region; check
+                * that no other redist region is contiguous with it.
+                */
+               list_for_each_entry(iter, rd_regions, list) {
+                       if (iter->base == end && iter->free_index > 0)
+                               return false;
+               }
+       }
+       return true;
 }
 
-static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
-                                                gpa_t addr, unsigned int len)
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+                                             gpa_t addr, unsigned int len)
 {
        unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
        int target_vcpu_id = vcpu->vcpu_id;
@@ -286,7 +291,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
        if (vgic_has_its(vcpu->kvm))
                value |= GICR_TYPER_PLPIS;
 
-       /* reporting of the Last bit is not supported for userspace */
+       if (vgic_mmio_vcpu_rdist_is_last(vcpu))
+               value |= GICR_TYPER_LAST;
+
        return extract_bytes(value, addr & 7, len);
 }
 
@@ -612,7 +619,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
                vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
-               vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8,
+               NULL, vgic_mmio_uaccess_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
@@ -714,6 +721,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        vgic_cpu->rdreg = rdreg;
+       vgic_cpu->rdreg_index = rdreg->free_index;
 
        rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
 
@@ -768,7 +776,7 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
 }
 
 /**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
+ * vgic_v3_alloc_redist_region - Allocate a new redistributor region
  *
  * Performs various checks before inserting the rdist region in the list.
  * Those tests depend on whether the size of the rdist region is known
@@ -782,8 +790,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
  *
  * Return 0 on success, < 0 otherwise
  */
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
-                                       gpa_t base, uint32_t count)
+static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index,
+                                      gpa_t base, uint32_t count)
 {
        struct vgic_dist *d = &kvm->arch.vgic;
        struct vgic_redist_region *rdreg;
@@ -791,10 +799,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
        size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
        int ret;
 
-       /* single rdist region already set ?*/
-       if (!count && !list_empty(rd_regions))
-               return -EINVAL;
-
        /* cross the end of memory ? */
        if (base + size < base)
                return -EINVAL;
@@ -805,11 +809,15 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
        } else {
                rdreg = list_last_entry(rd_regions,
                                        struct vgic_redist_region, list);
-               if (index != rdreg->index + 1)
+
+               /* Don't mix single region and discrete redist regions */
+               if (!count && rdreg->count)
                        return -EINVAL;
 
-               /* Cannot add an explicitly sized regions after legacy region */
-               if (!rdreg->count)
+               if (!count)
+                       return -EEXIST;
+
+               if (index != rdreg->index + 1)
                        return -EINVAL;
        }
 
@@ -848,11 +856,17 @@ free:
        return ret;
 }
 
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg)
+{
+       list_del(&rdreg->list);
+       kfree(rdreg);
+}
+
 int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
 {
        int ret;
 
-       ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+       ret = vgic_v3_alloc_redist_region(kvm, index, addr, count);
        if (ret)
                return ret;
 
@@ -861,8 +875,13 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
         * afterwards will register the iodevs when needed.
         */
        ret = vgic_register_all_redist_iodevs(kvm);
-       if (ret)
+       if (ret) {
+               struct vgic_redist_region *rdreg;
+
+               rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+               vgic_v3_free_redist_region(rdreg);
                return ret;
+       }
 
        return 0;
 }
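
The new error path in vgic_v3_set_redist_base() pairs the renamed vgic_v3_alloc_redist_region() with vgic_v3_free_redist_region(), so a region whose iodev registration fails is unlinked again rather than left dangling on rd_regions. The stand-alone sketch below only illustrates that allocate/register/roll-back shape; the list handling and the register_iodevs() callback are made-up stand-ins, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct region { struct region *next; unsigned int index; };
static struct region *regions;                 /* stand-in for vgic->rd_regions */

static struct region *alloc_region(unsigned int index)
{
	struct region *r = calloc(1, sizeof(*r));

	if (r) {
		r->index = index;
		r->next = regions;             /* link into the list */
		regions = r;
	}
	return r;
}

static void free_region(struct region *r)      /* mirrors vgic_v3_free_redist_region() */
{
	struct region **p;

	for (p = &regions; *p; p = &(*p)->next) {
		if (*p == r) {
			*p = r->next;          /* unlink, then free */
			free(r);
			return;
		}
	}
}

static int set_redist_base(unsigned int index, int (*register_iodevs)(void))
{
	struct region *r = alloc_region(index);
	int ret;

	if (!r)
		return -1;
	ret = register_iodevs();
	if (ret)
		free_region(r);                /* undo the allocation on failure */
	return ret;
}

static int fail_registration(void) { return -1; }

int main(void)
{
	printf("set_redist_base: %d, list empty: %d\n",
	       set_redist_base(0, fail_registration), regions == NULL);
	return 0;
}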
index b2d73fc..48c6067 100644 (file)
@@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
        return region;
 }
 
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                             gpa_t addr, u32 *val)
 {
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        struct kvm_vcpu *r_vcpu;
 
@@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
        return 0;
 }
 
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                              gpa_t addr, const u32 *val)
 {
-       struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        struct kvm_vcpu *r_vcpu;
 
@@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
                 bool is_write, int offset, u32 *val)
 {
        if (is_write)
-               return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+               return vgic_uaccess_write(vcpu, dev, offset, val);
        else
-               return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+               return vgic_uaccess_read(vcpu, dev, offset, val);
 }
 
 static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
index 6f53092..41ecf21 100644 (file)
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <kvm/arm_vgic.h>
@@ -356,6 +358,32 @@ retry:
        return 0;
 }
 
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+               desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+               irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+       }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+               desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+               irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+       }
+}
+
 /**
  * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
  * kvm lock and all vcpu lock must be held
@@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *irq;
        gpa_t last_ptr = ~(gpa_t)0;
-       int ret;
+       bool vlpi_avail = false;
+       int ret = 0;
        u8 val;
 
+       if (unlikely(!vgic_initialized(kvm)))
+               return -ENXIO;
+
+       /*
+        * Prepare for reading any VLPI state.
+        * The vgic-initialized check above also ensures that the doorbells
+        * have already been allocated and enabled.
+        */
+       if (kvm_vgic_global_state.has_gicv4_1) {
+               unmap_all_vpes(dist);
+               vlpi_avail = true;
+       }
+
        list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
                int byte_offset, bit_nr;
                struct kvm_vcpu *vcpu;
                gpa_t pendbase, ptr;
+               bool is_pending;
                bool stored;
 
                vcpu = irq->target_vcpu;
@@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
                if (ptr != last_ptr) {
                        ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
                        if (ret)
-                               return ret;
+                               goto out;
                        last_ptr = ptr;
                }
 
                stored = val & (1U << bit_nr);
-               if (stored == irq->pending_latch)
+
+               is_pending = irq->pending_latch;
+
+               if (irq->hw && vlpi_avail)
+                       vgic_v4_get_vlpi_state(irq, &is_pending);
+
+               if (stored == is_pending)
                        continue;
 
-               if (irq->pending_latch)
+               if (is_pending)
                        val |= 1 << bit_nr;
                else
                        val &= ~(1 << bit_nr);
 
                ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
                if (ret)
-                       return ret;
+                       goto out;
        }
-       return 0;
+
+out:
+       if (vlpi_avail)
+               map_all_vpes(dist);
+
+       return ret;
 }
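
With GICv4.1 available, the save path above merges the hardware view of an LPI into the byte it writes back to the guest's pending table: for a hardware-forwarded LPI, the bit read from the VPT replaces the software pending_latch. Below is a small self-contained model of that per-LPI read/compare/write-back step; a local array stands in for the guest table and for kvm_read_guest_lock()/kvm_write_guest_lock(), and the indexing is simplified.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t pending_table[32];       /* stand-in for the guest's pending table */

static void save_one_lpi(unsigned int intid, bool pending_latch,
			 bool hw_forwarded, bool vlpi_pending)
{
	unsigned int byte_offset = intid / 8;          /* intid / BITS_PER_BYTE */
	unsigned int bit_nr = intid % 8;
	bool stored = pending_table[byte_offset] & (1U << bit_nr);
	/* For a hardware-forwarded LPI, the VPT bit overrides the software latch. */
	bool is_pending = hw_forwarded ? vlpi_pending : pending_latch;

	if (stored == is_pending)
		return;                                /* nothing to write back */

	if (is_pending)
		pending_table[byte_offset] |= 1U << bit_nr;
	else
		pending_table[byte_offset] &= ~(1U << bit_nr);
}

int main(void)
{
	save_one_lpi(13, false, true, true);           /* pending only in the VPT */
	printf("byte 1 = %#x\n", pending_table[1]);    /* bit 5 of byte 1 is now set */
	return 0;
}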
 
 /**
index 66508b0..c1845d8 100644 (file)
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
        kvm_arm_resume_guest(kvm);
 }
 
+/*
+ * Must be called with GICv4.1 and with the vPE unmapped, which
+ * guarantees that any VPT caches associated with the vPE have
+ * been invalidated, so the VLPI state can be read by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+       struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+       int mask = BIT(irq->intid % BITS_PER_BYTE);
+       void *va;
+       u8 *ptr;
+
+       va = page_address(vpe->vpt_page);
+       ptr = va + irq->intid / BITS_PER_BYTE;
+
+       *val = !!(*ptr & mask);
+}
+
 /**
  * vgic_v4_init - Initialize the GICv4 data structures
  * @kvm:       Pointer to the VM being initialized
@@ -385,6 +404,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
        struct vgic_its *its;
        struct vgic_irq *irq;
        struct its_vlpi_map map;
+       unsigned long flags;
        int ret;
 
        if (!vgic_supports_direct_msis(kvm))
@@ -430,6 +450,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
        irq->host_irq   = virq;
        atomic_inc(&map.vpe->vlpi_count);
 
+       /* Transfer pending state */
+       raw_spin_lock_irqsave(&irq->irq_lock, flags);
+       if (irq->pending_latch) {
+               ret = irq_set_irqchip_state(irq->host_irq,
+                                           IRQCHIP_STATE_PENDING,
+                                           irq->pending_latch);
+               WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+               /*
+                * Clear pending_latch and communicate this state
+                * change via vgic_queue_irq_unlock.
+                */
+               irq->pending_latch = false;
+               vgic_queue_irq_unlock(kvm, irq, flags);
+       } else {
+               raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+       }
+
 out:
        mutex_unlock(&its->its_lock);
        return ret;
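
vgic_v4_get_vlpi_state() treats the vPE's Virtual Pending Table as a byte-addressed bitmap indexed by INTID. The snippet below only illustrates that indexing; a local buffer replaces page_address(vpe->vpt_page) and the buffer size is arbitrary.

#include <limits.h>     /* CHAR_BIT */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool vpt_test_bit(const uint8_t *vpt, unsigned int intid)
{
	uint8_t mask = 1U << (intid % CHAR_BIT);       /* BIT(intid % BITS_PER_BYTE) */

	return !!(vpt[intid / CHAR_BIT] & mask);       /* ptr = va + intid / BITS_PER_BYTE */
}

int main(void)
{
	static uint8_t vpt[2048];                      /* stand-in for the VPT page(s) */

	vpt[8192 / CHAR_BIT] |= 1U << (8192 % CHAR_BIT);   /* mark LPI 8192 as pending */
	printf("LPI 8192 pending: %d\n", vpt_test_bit(vpt, 8192));
	printf("LPI 8193 pending: %d\n", vpt_test_bit(vpt, 8193));
	return 0;
}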
index 64fcd75..dc1f3d1 100644 (file)
@@ -293,6 +293,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
 
 struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
                                                           u32 index);
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg);
 
 bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
 
@@ -317,5 +318,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
 int vgic_v4_init(struct kvm *kvm);
 void vgic_v4_teardown(struct kvm *kvm);
 void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
 
 #endif
index 073acbf..b84b179 100644 (file)
@@ -14,7 +14,7 @@
  * Parameters:
  *     x0 - dest
  */
-SYM_FUNC_START(clear_page)
+SYM_FUNC_START_PI(clear_page)
        mrs     x1, dczid_el0
        and     w1, w1, #0xf
        mov     x2, #4
@@ -25,5 +25,5 @@ SYM_FUNC_START(clear_page)
        tst     x0, #(PAGE_SIZE - 1)
        b.ne    1b
        ret
-SYM_FUNC_END(clear_page)
+SYM_FUNC_END_PI(clear_page)
 EXPORT_SYMBOL(clear_page)
index e7a7939..29144f4 100644 (file)
@@ -17,7 +17,7 @@
  *     x0 - dest
  *     x1 - src
  */
-SYM_FUNC_START(copy_page)
+SYM_FUNC_START_PI(copy_page)
 alternative_if ARM64_HAS_NO_HW_PREFETCH
        // Prefetch three cache lines ahead.
        prfm    pldl1strm, [x1, #128]
@@ -75,5 +75,5 @@ alternative_else_nop_endif
        stnp    x16, x17, [x0, #112 - 256]
 
        ret
-SYM_FUNC_END(copy_page)
+SYM_FUNC_END_PI(copy_page)
 EXPORT_SYMBOL(copy_page)
index 3685e12..6cb22da 100644 (file)
@@ -35,6 +35,7 @@
 #include <asm/fixmap.h>
 #include <asm/kasan.h>
 #include <asm/kernel-pgtable.h>
+#include <asm/kvm_host.h>
 #include <asm/memory.h>
 #include <asm/numa.h>
 #include <asm/sections.h>
@@ -452,6 +453,8 @@ void __init bootmem_init(void)
 
        dma_pernuma_cma_reserve();
 
+       kvm_hyp_reserve();
+
        /*
         * sparse_init() tries to allocate memory from memblock, so must be
         * done after the fixed reservations
index 1e75cc9..ea7729b 100644 (file)
 #include <asm/sysinfo.h>
 #include <asm/unwind.h>
 
-const char *perf_pmu_name(void)
-{
-       if (cpum_cf_avail() || cpum_sf_avail())
-               return "CPU-Measurement Facilities (CPU-MF)";
-       return "pmu";
-}
-EXPORT_SYMBOL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       int num = 0;
-
-       if (cpum_cf_avail())
-               num += PERF_CPUM_CF_MAX_CTR;
-       if (cpum_sf_avail())
-               num += PERF_CPUM_SF_MAX_CTR;
-
-       return num;
-}
-EXPORT_SYMBOL(perf_num_counters);
-
 static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
 {
        struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
index 445e3ec..1d2507f 100644 (file)
@@ -57,24 +57,6 @@ static inline int sh_pmu_initialized(void)
        return !!sh_pmu;
 }
 
-const char *perf_pmu_name(void)
-{
-       if (!sh_pmu)
-               return NULL;
-
-       return sh_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       if (!sh_pmu)
-               return 0;
-
-       return sh_pmu->num_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
 /*
  * Release the PMU if this is the last perf_event.
  */
index d017782..e0f167e 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/cpu_pm.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
+#include <linux/clocksource_ids.h>
 #include <linux/interrupt.h>
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
@@ -24,6 +25,8 @@
 #include <linux/sched/clock.h>
 #include <linux/sched_clock.h>
 #include <linux/acpi.h>
+#include <linux/arm-smccc.h>
+#include <linux/ptp_kvm.h>
 
 #include <asm/arch_timer.h>
 #include <asm/virt.h>
@@ -191,6 +194,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter *cc)
 
 static struct clocksource clocksource_counter = {
        .name   = "arch_sys_counter",
+       .id     = CSID_ARM_ARCH_COUNTER,
        .rating = 400,
        .read   = arch_counter_read,
        .mask   = CLOCKSOURCE_MASK(56),
@@ -1657,3 +1661,35 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 }
 TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
 #endif
+
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts,
+                                struct clocksource **cs)
+{
+       struct arm_smccc_res hvc_res;
+       u32 ptp_counter;
+       ktime_t ktime;
+
+       if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY))
+               return -EOPNOTSUPP;
+
+       if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
+               ptp_counter = KVM_PTP_VIRT_COUNTER;
+       else
+               ptp_counter = KVM_PTP_PHYS_COUNTER;
+
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+                            ptp_counter, &hvc_res);
+
+       if ((int)(hvc_res.a0) < 0)
+               return -EOPNOTSUPP;
+
+       ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1;
+       *ts = ktime_to_timespec64(ktime);
+       if (cycle)
+               *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3;
+       if (cs)
+               *cs = &clocksource_counter;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);
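
kvm_arch_ptp_get_crosststamp() rebuilds two 64-bit quantities, the host wall-clock time and the counter value, from the high/low register pairs returned by the PTP hypercall (a0/a1 and a2/a3). A minimal sketch of that packing, using made-up register values:

#include <stdint.h>
#include <stdio.h>

/* Combine a high/low 32-bit register pair into one 64-bit value. */
static uint64_t pair_to_u64(uint64_t hi, uint64_t lo)
{
	return hi << 32 | lo;          /* matches (u64)res.a0 << 32 | res.a1 */
}

int main(void)
{
	/* Hypothetical SMCCC results: a0/a1 carry the time, a2/a3 the counter. */
	uint64_t a0 = 0x17c3f2a1, a1 = 0x89abcdef, a2 = 0x1, a3 = 0x2345;

	printf("ktime  = %#llx\n", (unsigned long long)pair_to_u64(a0, a1));
	printf("cycles = %#llx\n", (unsigned long long)pair_to_u64(a2, a3));
	return 0;
}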
index f5fc429..69e296f 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <asm/cpuidle.h>
 #include <asm/cputype.h>
+#include <asm/hypervisor.h>
 #include <asm/system_misc.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
@@ -498,6 +499,7 @@ static int __init psci_probe(void)
                psci_init_cpu_suspend();
                psci_init_system_suspend();
                psci_init_system_reset2();
+               kvm_init_hyp_services();
        }
 
        return 0;
index 72ab840..40d1914 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o
+obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o kvm_guest.o
 obj-$(CONFIG_ARM_SMCCC_SOC_ID) += soc_id.o
diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c
new file mode 100644 (file)
index 0000000..2d3e866
--- /dev/null
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "smccc: KVM: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/hypervisor.h>
+
+static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { };
+
+void __init kvm_init_hyp_services(void)
+{
+       struct arm_smccc_res res;
+       u32 val[4];
+
+       if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC)
+               return;
+
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
+       if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
+           res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
+           res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
+           res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
+               return;
+
+       memset(&res, 0, sizeof(res));
+       arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
+
+       val[0] = lower_32_bits(res.a0);
+       val[1] = lower_32_bits(res.a1);
+       val[2] = lower_32_bits(res.a2);
+       val[3] = lower_32_bits(res.a3);
+
+       bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS);
+
+       pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n",
+                res.a3, res.a2, res.a1, res.a0);
+}
+
+bool kvm_arm_hyp_service_available(u32 func_id)
+{
+       if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS)
+               return false;
+
+       return test_bit(func_id, __kvm_arm_hyp_services);
+}
+EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available);
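
kvm_init_hyp_services() packs the four 32-bit feature words returned by the KVM_FEATURES hypercall into one bitmap, which kvm_arm_hyp_service_available() then queries per function ID. A rough user-space approximation of that bookkeeping follows; plain arrays replace DECLARE_BITMAP()/bitmap_from_arr32(), and the size is illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_FUNCS 128                          /* illustrative; stands in for ARM_SMCCC_KVM_NUM_FUNCS */

static uint32_t hyp_services[NUM_FUNCS / 32];

static void init_hyp_services(const uint32_t val[4])
{
	for (int i = 0; i < 4; i++)
		hyp_services[i] = val[i];      /* bitmap_from_arr32() equivalent */
}

static bool hyp_service_available(uint32_t func_id)
{
	if (func_id >= NUM_FUNCS)
		return false;
	return hyp_services[func_id / 32] & (1U << (func_id % 32));
}

int main(void)
{
	/* Pretend the hypervisor advertised function IDs 0 and 1. */
	uint32_t val[4] = { 0x3, 0, 0, 0 };

	init_hyp_services(val);
	printf("func 1 available: %d\n", hyp_service_available(1));
	printf("func 5 available: %d\n", hyp_service_available(5));
	return 0;
}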
index d52bfc5..028f81d 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/arm-smccc.h>
+#include <linux/kernel.h>
 #include <asm/archrandom.h>
 
 static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
index 7b44ba2..84530fd 100644 (file)
@@ -97,15 +97,15 @@ config CORESIGHT_SOURCE_ETM3X
          module will be called coresight-etm3x.
 
 config CORESIGHT_SOURCE_ETM4X
-       tristate "CoreSight Embedded Trace Macrocell 4.x driver"
+       tristate "CoreSight ETMv4.x / ETE driver"
        depends on ARM64
        select CORESIGHT_LINKS_AND_SINKS
        select PID_IN_CONTEXTIDR
        help
-         This driver provides support for the ETM4.x tracer module, tracing the
-         instructions that a processor is executing. This is primarily useful
-         for instruction level tracing. Depending on the implemented version
-         data tracing may also be available.
+         This driver provides support for the CoreSight Embedded Trace Macrocell
+         version 4.x and the Embedded Trace Extensions (ETE). Both are CPU tracer
+         modules, tracing the instructions that a processor is executing. This is
+         primarily useful for instruction level tracing.
 
          To compile this driver as a module, choose M here: the
          module will be called coresight-etm4x.
@@ -173,4 +173,18 @@ config CORESIGHT_CTI_INTEGRATION_REGS
          CTI trigger connections between this and other devices.These
          registers are not used in normal operation and can leave devices in
          an inconsistent state.
+
+config CORESIGHT_TRBE
+       tristate "Trace Buffer Extension (TRBE) driver"
+       depends on ARM64 && CORESIGHT_SOURCE_ETM4X
+       help
+         This driver provides support for the per-CPU Trace Buffer Extension (TRBE).
+         TRBE always needs to be used along with its corresponding per-CPU ETE
+         component. ETE generates trace data which is then captured with TRBE.
+         Unlike traditional sink devices, TRBE is a CPU feature accessible via
+         system registers. But its explicit dependency on the trace unit (ETE)
+         requires it to be plugged in as a coresight sink device.
+
+         To compile this driver as a module, choose M here: the module will be
+         called coresight-trbe.
 endif
index f20e357..d608165 100644 (file)
@@ -21,5 +21,6 @@ obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o
 obj-$(CONFIG_CORESIGHT_CPU_DEBUG) += coresight-cpu-debug.o
 obj-$(CONFIG_CORESIGHT_CATU) += coresight-catu.o
 obj-$(CONFIG_CORESIGHT_CTI) += coresight-cti.o
+obj-$(CONFIG_CORESIGHT_TRBE) += coresight-trbe.o
 coresight-cti-y := coresight-cti-core.o        coresight-cti-platform.o \
                   coresight-cti-sysfs.o
index 0062c89..ca75b0b 100644 (file)
@@ -23,6 +23,7 @@
 #include "coresight-priv.h"
 
 static DEFINE_MUTEX(coresight_mutex);
+static DEFINE_PER_CPU(struct coresight_device *, csdev_sink);
 
 /**
  * struct coresight_node - elements of a path, from source to sink
@@ -70,6 +71,18 @@ void coresight_remove_cti_ops(void)
 }
 EXPORT_SYMBOL_GPL(coresight_remove_cti_ops);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev)
+{
+       per_cpu(csdev_sink, cpu) = csdev;
+}
+EXPORT_SYMBOL_GPL(coresight_set_percpu_sink);
+
+struct coresight_device *coresight_get_percpu_sink(int cpu)
+{
+       return per_cpu(csdev_sink, cpu);
+}
+EXPORT_SYMBOL_GPL(coresight_get_percpu_sink);
+
 static int coresight_id_match(struct device *dev, void *data)
 {
        int trace_id, i_trace_id;
@@ -784,6 +797,14 @@ static int _coresight_build_path(struct coresight_device *csdev,
        if (csdev == sink)
                goto out;
 
+       if (coresight_is_percpu_source(csdev) && coresight_is_percpu_sink(sink) &&
+           sink == per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev))) {
+               if (_coresight_build_path(sink, sink, path) == 0) {
+                       found = true;
+                       goto out;
+               }
+       }
+
        /* Not a sink - recursively explore each port found on this element */
        for (i = 0; i < csdev->pdata->nr_outport; i++) {
                struct coresight_device *child_dev;
@@ -999,8 +1020,12 @@ coresight_find_default_sink(struct coresight_device *csdev)
        int depth = 0;
 
        /* look for a default sink if we have not found for this device */
-       if (!csdev->def_sink)
-               csdev->def_sink = coresight_find_sink(csdev, &depth);
+       if (!csdev->def_sink) {
+               if (coresight_is_percpu_source(csdev))
+                       csdev->def_sink = per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev));
+               if (!csdev->def_sink)
+                       csdev->def_sink = coresight_find_sink(csdev, &depth);
+       }
        return csdev->def_sink;
 }
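
coresight_set_percpu_sink()/coresight_get_percpu_sink() are thin accessors around a per-CPU pointer, and coresight_find_default_sink() now tries that per-CPU sink for per-CPU sources before falling back to the usual topology walk. A condensed sketch of that lookup order; the array, the structs and find_topology_sink() are illustrative stand-ins only.

#include <stddef.h>
#include <stdio.h>

#define NR_CPUS 4

struct sink { const char *name; };
struct source { int cpu; struct sink *def_sink; int is_percpu; };

static struct sink *percpu_sink[NR_CPUS];            /* csdev_sink equivalent */

static struct sink *find_topology_sink(struct source *src)
{
	(void)src;
	return NULL;                                  /* stand-in for coresight_find_sink() */
}

static struct sink *find_default_sink(struct source *src)
{
	if (!src->def_sink) {
		if (src->is_percpu)                   /* try the per-CPU sink first */
			src->def_sink = percpu_sink[src->cpu];
		if (!src->def_sink)                   /* otherwise walk the topology */
			src->def_sink = find_topology_sink(src);
	}
	return src->def_sink;
}

int main(void)
{
	static struct sink trbe1 = { "trbe1" };
	struct source ete1 = { .cpu = 1, .is_percpu = 1 };

	percpu_sink[1] = &trbe1;                      /* coresight_set_percpu_sink(1, ...) */
	printf("default sink: %s\n", find_default_sink(&ete1)->name);
	return 0;
}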
 
index 0f603b4..f123c26 100644 (file)
 static struct pmu etm_pmu;
 static bool etm_perf_up;
 
-static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
+/*
+ * An ETM context for a running event includes the perf aux handle
+ * and aux_data. For ETM, the aux_data (etm_event_data) consists of
+ * the trace path and the sink configuration. The event data is accessible
+ * via perf_get_aux(handle). However, a sink could "end" a perf output
+ * handle via the IRQ handler. If the sink then fails to "begin"
+ * another session (e.g. due to lack of space in the buffer),
+ * the handle will be cleared. Thus, the event_data may not be accessible
+ * from the handle when we get to the etm_event_stop(), which is required
+ * for stopping the trace path. The event_data is guaranteed to stay alive
+ * until "free_aux()", which cannot happen as long as the event is active on
+ * the ETM. Thus the event_data for the session must be part of the ETM context
+ * to make sure we can disable the trace path.
+ */
+struct etm_ctxt {
+       struct perf_output_handle handle;
+       struct etm_event_data *event_data;
+};
+
+static DEFINE_PER_CPU(struct etm_ctxt, etm_ctxt);
 static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
 
 /*
@@ -232,6 +251,25 @@ static void etm_free_aux(void *data)
        schedule_work(&event_data->work);
 }
 
+/*
+ * Check if two given sinks are compatible with each other,
+ * so that they can use the same sink buffers, when an event
+ * moves around.
+ */
+static bool sinks_compatible(struct coresight_device *a,
+                            struct coresight_device *b)
+{
+       if (!a || !b)
+               return false;
+       /*
+        * If the sinks are of the same subtype and driven
+        * by the same driver, we can use the same buffer
+        * on these sinks.
+        */
+       return (a->subtype.sink_subtype == b->subtype.sink_subtype) &&
+              (sink_ops(a) == sink_ops(b));
+}
+
 static void *etm_setup_aux(struct perf_event *event, void **pages,
                           int nr_pages, bool overwrite)
 {
@@ -239,6 +277,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        int cpu = event->cpu;
        cpumask_t *mask;
        struct coresight_device *sink = NULL;
+       struct coresight_device *user_sink = NULL, *last_sink = NULL;
        struct etm_event_data *event_data = NULL;
 
        event_data = alloc_event_data(cpu);
@@ -249,7 +288,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        /* First get the selected sink from user space. */
        if (event->attr.config2) {
                id = (u32)event->attr.config2;
-               sink = coresight_get_sink_by_id(id);
+               sink = user_sink = coresight_get_sink_by_id(id);
        }
 
        mask = &event_data->mask;
@@ -277,14 +316,33 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
                }
 
                /*
-                * No sink provided - look for a default sink for one of the
-                * devices. At present we only support topology where all CPUs
-                * use the same sink [N:1], so only need to find one sink. The
-                * coresight_build_path later will remove any CPU that does not
-                * attach to the sink, or if we have not found a sink.
+                * No sink provided - look for a default sink for all the ETMs
+                * where this event can be scheduled.
+                * We allocate the sink specific buffers only once for this
+                * event. If the ETMs have different default sink devices, we
+                * can only use a single "type" of sink as the event can carry
+                * only one sink specific buffer. Thus we have to make sure
+                * that the sinks are of the same type and driven by the same
+                * driver, as the one we allocate the buffer for. As such
+                * we choose the first sink and check if the remaining ETMs
+                * have a compatible default sink. We don't trace on a CPU
+                * if the sink is not compatible.
                 */
-               if (!sink)
+               if (!user_sink) {
+                       /* Find the default sink for this ETM */
                        sink = coresight_find_default_sink(csdev);
+                       if (!sink) {
+                               cpumask_clear_cpu(cpu, mask);
+                               continue;
+                       }
+
+                       /* Check if this sink is compatible with the last sink */
+                       if (last_sink && !sinks_compatible(last_sink, sink)) {
+                               cpumask_clear_cpu(cpu, mask);
+                               continue;
+                       }
+                       last_sink = sink;
+               }
 
                /*
                 * Building a path doesn't enable it, it simply builds a
@@ -312,7 +370,12 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
        if (!sink_ops(sink)->alloc_buffer || !sink_ops(sink)->free_buffer)
                goto err;
 
-       /* Allocate the sink buffer for this session */
+       /*
+        * Allocate the sink buffer for this session. All the sinks
+        * on which this event can be scheduled are guaranteed to be of
+        * the same type, so they all share the same sink configuration.
+        */
        event_data->snk_config =
                        sink_ops(sink)->alloc_buffer(sink, event, pages,
                                                     nr_pages, overwrite);
@@ -332,13 +395,18 @@ static void etm_event_start(struct perf_event *event, int flags)
 {
        int cpu = smp_processor_id();
        struct etm_event_data *event_data;
-       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
+       struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt);
+       struct perf_output_handle *handle = &ctxt->handle;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
        struct list_head *path;
 
        if (!csdev)
                goto fail;
 
+       /* Have we messed up our tracking ? */
+       if (WARN_ON(ctxt->event_data))
+               goto fail;
+
        /*
         * Deal with the ring buffer API and get a handle on the
         * session's information.
@@ -374,6 +442,8 @@ static void etm_event_start(struct perf_event *event, int flags)
        if (source_ops(csdev)->enable(csdev, event, CS_MODE_PERF))
                goto fail_disable_path;
 
+       /* Save the event_data for this ETM */
+       ctxt->event_data = event_data;
 out:
        return;
 
@@ -392,13 +462,30 @@ static void etm_event_stop(struct perf_event *event, int mode)
        int cpu = smp_processor_id();
        unsigned long size;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
-       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
-       struct etm_event_data *event_data = perf_get_aux(handle);
+       struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt);
+       struct perf_output_handle *handle = &ctxt->handle;
+       struct etm_event_data *event_data;
        struct list_head *path;
 
+       /*
+        * If we still have access to the event_data via handle,
+        * confirm that we haven't messed up the tracking.
+        */
+       if (handle->event &&
+           WARN_ON(perf_get_aux(handle) != ctxt->event_data))
+               return;
+
+       event_data = ctxt->event_data;
+       /* Clear the event_data as this ETM is stopping the trace. */
+       ctxt->event_data = NULL;
+
        if (event->hw.state == PERF_HES_STOPPED)
                return;
 
+       /* We must have a valid event_data for a running event */
+       if (WARN_ON(!event_data))
+               return;
+
        if (!csdev)
                return;
 
@@ -416,7 +503,13 @@ static void etm_event_stop(struct perf_event *event, int mode)
        /* tell the core */
        event->hw.state = PERF_HES_STOPPED;
 
-       if (mode & PERF_EF_UPDATE) {
+       /*
+        * If the handle is not bound to an event anymore
+        * (e.g., the sink driver was unable to restart the
+        * handle due to lack of buffer space), we don't
+        * have to do anything here.
+        */
+       if (handle->event && (mode & PERF_EF_UPDATE)) {
                if (WARN_ON_ONCE(handle->event != event))
                        return;
 
index 15016f7..efb84ce 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/property.h>
 
+#include <asm/barrier.h>
 #include <asm/sections.h>
 #include <asm/sysreg.h>
 #include <asm/local.h>
@@ -114,30 +115,91 @@ void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit)
        }
 }
 
-static void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata, struct csdev_access *csa)
+static u64 ete_sysreg_read(u32 offset, bool _relaxed, bool _64bit)
 {
-       /* Writing 0 to TRCOSLAR unlocks the trace registers */
-       etm4x_relaxed_write32(csa, 0x0, TRCOSLAR);
-       drvdata->os_unlock = true;
+       u64 res = 0;
+
+       switch (offset) {
+       ETE_READ_CASES(res)
+       default:
+               pr_warn_ratelimited("ete: trying to read unsupported register @%x\n",
+                                   offset);
+       }
+
+       if (!_relaxed)
+               __iormb(res);   /* Imitate the !relaxed I/O helpers */
+
+       return res;
+}
+
+static void ete_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit)
+{
+       if (!_relaxed)
+               __iowmb();      /* Imitate the !relaxed I/O helpers */
+       if (!_64bit)
+               val &= GENMASK(31, 0);
+
+       switch (offset) {
+       ETE_WRITE_CASES(val)
+       default:
+               pr_warn_ratelimited("ete: trying to write to unsupported register @%x\n",
+                                   offset);
+       }
+}
+
+static void etm_detect_os_lock(struct etmv4_drvdata *drvdata,
+                              struct csdev_access *csa)
+{
+       u32 oslsr = etm4x_relaxed_read32(csa, TRCOSLSR);
+
+       drvdata->os_lock_model = ETM_OSLSR_OSLM(oslsr);
+}
+
+static void etm_write_os_lock(struct etmv4_drvdata *drvdata,
+                             struct csdev_access *csa, u32 val)
+{
+       val = !!val;
+
+       switch (drvdata->os_lock_model) {
+       case ETM_OSLOCK_PRESENT:
+               etm4x_relaxed_write32(csa, val, TRCOSLAR);
+               break;
+       case ETM_OSLOCK_PE:
+               write_sysreg_s(val, SYS_OSLAR_EL1);
+               break;
+       default:
+               pr_warn_once("CPU%d: Unsupported Trace OSLock model: %x\n",
+                            smp_processor_id(), drvdata->os_lock_model);
+               fallthrough;
+       case ETM_OSLOCK_NI:
+               return;
+       }
        isb();
 }
 
+static inline void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata,
+                                     struct csdev_access *csa)
+{
+       WARN_ON(drvdata->cpu != smp_processor_id());
+
+       /* Writing 0 to OS Lock unlocks the trace unit registers */
+       etm_write_os_lock(drvdata, csa, 0x0);
+       drvdata->os_unlock = true;
+}
+
 static void etm4_os_unlock(struct etmv4_drvdata *drvdata)
 {
        if (!WARN_ON(!drvdata->csdev))
                etm4_os_unlock_csa(drvdata, &drvdata->csdev->access);
-
 }
 
 static void etm4_os_lock(struct etmv4_drvdata *drvdata)
 {
        if (WARN_ON(!drvdata->csdev))
                return;
-
-       /* Writing 0x1 to TRCOSLAR locks the trace registers */
-       etm4x_relaxed_write32(&drvdata->csdev->access, 0x1, TRCOSLAR);
+       /* Writing 0x1 to OS Lock locks the trace registers */
+       etm_write_os_lock(drvdata, &drvdata->csdev->access, 0x1);
        drvdata->os_unlock = false;
-       isb();
 }
 
 static void etm4_cs_lock(struct etmv4_drvdata *drvdata,
@@ -371,6 +433,13 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata)
                etm4x_relaxed_write32(csa, trcpdcr | TRCPDCR_PU, TRCPDCR);
        }
 
+       /*
+        * ETE mandates that the TRCRSR is written to before
+        * enabling it.
+        */
+       if (etm4x_is_ete(drvdata))
+               etm4x_relaxed_write32(csa, TRCRSR_TA, TRCRSR);
+
        /* Enable the trace unit */
        etm4x_relaxed_write32(csa, 1, TRCPRGCTLR);
 
@@ -654,6 +723,7 @@ static int etm4_enable(struct coresight_device *csdev,
 static void etm4_disable_hw(void *info)
 {
        u32 control;
+       u64 trfcr;
        struct etmv4_drvdata *drvdata = info;
        struct etmv4_config *config = &drvdata->config;
        struct coresight_device *csdev = drvdata->csdev;
@@ -677,18 +747,32 @@ static void etm4_disable_hw(void *info)
        control &= ~0x1;
 
        /*
+        * If the CPU supports the v8.4 Trace Filter Controls,
+        * set the ETM to a trace-prohibited region.
+        */
+       if (drvdata->trfc) {
+               trfcr = read_sysreg_s(SYS_TRFCR_EL1);
+               write_sysreg_s(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE),
+                              SYS_TRFCR_EL1);
+               isb();
+       }
+       /*
         * Make sure everything completes before disabling, as recommended
         * by section 7.3.77 ("TRCVICTLR, ViewInst Main Control Register,
         * SSTATUS") of ARM IHI 0064D
         */
        dsb(sy);
        isb();
+       /* Trace synchronization barrier, is a nop if not supported */
+       tsb_csync();
        etm4x_relaxed_write32(csa, control, TRCPRGCTLR);
 
        /* wait for TRCSTATR.PMSTABLE to go to '1' */
        if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1))
                dev_err(etm_dev,
                        "timeout while waiting for PM stable Trace Status\n");
+       if (drvdata->trfc)
+               write_sysreg_s(trfcr, SYS_TRFCR_EL1);
 
        /* read the status of the single shot comparators */
        for (i = 0; i < drvdata->nr_ss_cmp; i++) {
@@ -817,13 +901,24 @@ static bool etm4_init_sysreg_access(struct etmv4_drvdata *drvdata,
         * ETMs implementing sysreg access must implement TRCDEVARCH.
         */
        devarch = read_etm4x_sysreg_const_offset(TRCDEVARCH);
-       if ((devarch & ETM_DEVARCH_ID_MASK) != ETM_DEVARCH_ETMv4x_ARCH)
+       switch (devarch & ETM_DEVARCH_ID_MASK) {
+       case ETM_DEVARCH_ETMv4x_ARCH:
+               *csa = (struct csdev_access) {
+                       .io_mem = false,
+                       .read   = etm4x_sysreg_read,
+                       .write  = etm4x_sysreg_write,
+               };
+               break;
+       case ETM_DEVARCH_ETE_ARCH:
+               *csa = (struct csdev_access) {
+                       .io_mem = false,
+                       .read   = ete_sysreg_read,
+                       .write  = ete_sysreg_write,
+               };
+               break;
+       default:
                return false;
-       *csa = (struct csdev_access) {
-               .io_mem = false,
-               .read   = etm4x_sysreg_read,
-               .write  = etm4x_sysreg_write,
-       };
+       }
 
        drvdata->arch = etm_devarch_to_arch(devarch);
        return true;
@@ -873,7 +968,7 @@ static bool etm4_init_csdev_access(struct etmv4_drvdata *drvdata,
        return false;
 }
 
-static void cpu_enable_tracing(void)
+static void cpu_enable_tracing(struct etmv4_drvdata *drvdata)
 {
        u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
        u64 trfcr;
@@ -881,6 +976,7 @@ static void cpu_enable_tracing(void)
        if (!cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRACE_FILT_SHIFT))
                return;
 
+       drvdata->trfc = true;
        /*
         * If the CPU supports v8.4 SelfHosted Tracing, enable
         * tracing at the kernel EL and EL0, forcing to use the
@@ -920,6 +1016,9 @@ static void etm4_init_arch_data(void *info)
        if (!etm4_init_csdev_access(drvdata, csa))
                return;
 
+       /* Detect the support for OS Lock before we actually use it */
+       etm_detect_os_lock(drvdata, csa);
+
        /* Make sure all registers are accessible */
        etm4_os_unlock_csa(drvdata, csa);
        etm4_cs_unlock(drvdata, csa);
@@ -1082,7 +1181,7 @@ static void etm4_init_arch_data(void *info)
        /* NUMCNTR, bits[30:28] number of counters available for tracing */
        drvdata->nr_cntr = BMVAL(etmidr5, 28, 30);
        etm4_cs_lock(drvdata, csa);
-       cpu_enable_tracing();
+       cpu_enable_tracing(drvdata);
 }
 
 static inline u32 etm4_get_victlr_access_type(struct etmv4_config *config)
@@ -1760,6 +1859,8 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
        struct etmv4_drvdata *drvdata;
        struct coresight_desc desc = { 0 };
        struct etm4_init_arg init_arg = { 0 };
+       u8 major, minor;
+       char *type_name;
 
        drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
        if (!drvdata)
@@ -1786,10 +1887,6 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
        if (drvdata->cpu < 0)
                return drvdata->cpu;
 
-       desc.name = devm_kasprintf(dev, GFP_KERNEL, "etm%d", drvdata->cpu);
-       if (!desc.name)
-               return -ENOMEM;
-
        init_arg.drvdata = drvdata;
        init_arg.csa = &desc.access;
        init_arg.pid = etm_pid;
@@ -1806,6 +1903,22 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
            fwnode_property_present(dev_fwnode(dev), "qcom,skip-power-up"))
                drvdata->skip_power_up = true;
 
+       major = ETM_ARCH_MAJOR_VERSION(drvdata->arch);
+       minor = ETM_ARCH_MINOR_VERSION(drvdata->arch);
+
+       if (etm4x_is_ete(drvdata)) {
+               type_name = "ete";
+               /* ETE v1 has major version == 0b101. Adjust this for logging. */
+               major -= 4;
+       } else {
+               type_name = "etm";
+       }
+
+       desc.name = devm_kasprintf(dev, GFP_KERNEL,
+                                  "%s%d", type_name, drvdata->cpu);
+       if (!desc.name)
+               return -ENOMEM;
+
        etm4_init_trace_id(drvdata);
        etm4_set_default(&drvdata->config);
 
@@ -1833,9 +1946,8 @@ static int etm4_probe(struct device *dev, void __iomem *base, u32 etm_pid)
 
        etmdrvdata[drvdata->cpu] = drvdata;
 
-       dev_info(&drvdata->csdev->dev, "CPU%d: ETM v%d.%d initialized\n",
-                drvdata->cpu, ETM_ARCH_MAJOR_VERSION(drvdata->arch),
-                ETM_ARCH_MINOR_VERSION(drvdata->arch));
+       dev_info(&drvdata->csdev->dev, "CPU%d: %s v%d.%d initialized\n",
+                drvdata->cpu, type_name, major, minor);
 
        if (boot_enable) {
                coresight_enable(drvdata->csdev);
@@ -1978,6 +2090,7 @@ static struct amba_driver etm4x_amba_driver = {
 
 static const struct of_device_id etm4_sysreg_match[] = {
        { .compatible   = "arm,coresight-etm4x-sysreg" },
+       { .compatible   = "arm,embedded-trace-extension" },
        {}
 };
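
The probe path now derives both the printable name and the logged version from the architecture field: anything at or above the ETE architecture version is reported as "ete" with the major version reduced by 4, everything else stays "etm". The sketch below assumes a simple major/minor nibble split along the lines of the ETM_ARCH_MAJOR/MINOR_VERSION macros; the encoding here is illustrative, not the driver's exact one.

#include <stdint.h>
#include <stdio.h>

#define ARCH_MAJOR(arch)  (((arch) >> 4) & 0xfU)   /* assumed split: major in [7:4] */
#define ARCH_MINOR(arch)  ((arch) & 0xfU)          /* minor in [3:0] */
#define ARCH_ETE          ((5 << 4) | 0)           /* ETE is "v5.0" internally */

static void describe(uint8_t arch)
{
	uint8_t major = ARCH_MAJOR(arch), minor = ARCH_MINOR(arch);
	const char *type_name = "etm";

	if (arch >= ARCH_ETE) {
		type_name = "ete";
		major -= 4;            /* ETE v1 has major version 0b101; adjust for logging */
	}
	printf("%s v%u.%u\n", type_name, major, minor);
}

int main(void)
{
	describe((4 << 4) | 2);        /* etm v4.2 */
	describe((5 << 4) | 0);        /* ete v1.0 */
	return 0;
}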
 
index 0995a10..007bad9 100644 (file)
@@ -2374,12 +2374,20 @@ static inline bool
 etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset)
 {
        switch (offset) {
-       ETM4x_SYSREG_LIST_CASES
+       ETM_COMMON_SYSREG_LIST_CASES
                /*
-                * Registers accessible via system instructions are always
-                * implemented.
+                * Registers common to ETE & ETM4x and accessible via system
+                * instructions are always implemented.
                 */
                return true;
+
+       ETM4x_ONLY_SYSREG_LIST_CASES
+               /*
+                * We only support etm4x and ete. So if the device is not
+                * ETE, it must be ETMv4x.
+                */
+               return !etm4x_is_ete(drvdata);
+
        ETM4x_MMAP_LIST_CASES
                /*
                 * Registers accessible only via memory-mapped registers
@@ -2389,8 +2397,13 @@ etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset)
                 * coresight_register() and the csdev is not initialized
                 * until that is done. So rely on the drvdata->base to
                 * detect if we have a memory mapped access.
+                * Also ETE doesn't implement memory mapped access, thus
+                * it is sufficient to check that we are using mmio.
                 */
                return !!drvdata->base;
+
+       ETE_ONLY_SYSREG_LIST_CASES
+               return etm4x_is_ete(drvdata);
        }
 
        return false;
index 0af6057..e5b79bd 100644 (file)
@@ -29,6 +29,7 @@
 #define TRCAUXCTLR                     0x018
 #define TRCEVENTCTL0R                  0x020
 #define TRCEVENTCTL1R                  0x024
+#define TRCRSR                         0x028
 #define TRCSTALLCTLR                   0x02C
 #define TRCTSCTLR                      0x030
 #define TRCSYNCPR                      0x034
@@ -49,6 +50,7 @@
 #define TRCSEQRSTEVR                   0x118
 #define TRCSEQSTR                      0x11C
 #define TRCEXTINSELR                   0x120
+#define TRCEXTINSELRn(n)               (0x120 + (n * 4)) /* n = 0-3 */
 #define TRCCNTRLDVRn(n)                        (0x140 + (n * 4)) /* n = 0-3 */
 #define TRCCNTCTLRn(n)                 (0x150 + (n * 4)) /* n = 0-3 */
 #define TRCCNTVRn(n)                   (0x160 + (n * 4)) /* n = 0-3 */
 #define TRCCIDR2                       0xFF8
 #define TRCCIDR3                       0xFFC
 
+#define TRCRSR_TA                      BIT(12)
+
 /*
  * System instructions to access ETM registers.
  * See ETMv4.4 spec ARM IHI0064F section 4.3.6 System instructions
 #define CASE_NOP(__unused, x)                                  \
        case (x):       /* fall through */
 
+#define ETE_ONLY_SYSREG_LIST(op, val)          \
+       CASE_##op((val), TRCRSR)                \
+       CASE_##op((val), TRCEXTINSELRn(1))      \
+       CASE_##op((val), TRCEXTINSELRn(2))      \
+       CASE_##op((val), TRCEXTINSELRn(3))
+
 /* List of registers accessible via System instructions */
-#define ETM_SYSREG_LIST(op, val)               \
-       CASE_##op((val), TRCPRGCTLR)            \
+#define ETM4x_ONLY_SYSREG_LIST(op, val)                \
        CASE_##op((val), TRCPROCSELR)           \
+       CASE_##op((val), TRCVDCTLR)             \
+       CASE_##op((val), TRCVDSACCTLR)          \
+       CASE_##op((val), TRCVDARCCTLR)          \
+       CASE_##op((val), TRCOSLAR)
+
+#define ETM_COMMON_SYSREG_LIST(op, val)                \
+       CASE_##op((val), TRCPRGCTLR)            \
        CASE_##op((val), TRCSTATR)              \
        CASE_##op((val), TRCCONFIGR)            \
        CASE_##op((val), TRCAUXCTLR)            \
        CASE_##op((val), TRCVIIECTLR)           \
        CASE_##op((val), TRCVISSCTLR)           \
        CASE_##op((val), TRCVIPCSSCTLR)         \
-       CASE_##op((val), TRCVDCTLR)             \
-       CASE_##op((val), TRCVDSACCTLR)          \
-       CASE_##op((val), TRCVDARCCTLR)          \
        CASE_##op((val), TRCSEQEVRn(0))         \
        CASE_##op((val), TRCSEQEVRn(1))         \
        CASE_##op((val), TRCSEQEVRn(2))         \
        CASE_##op((val), TRCSSPCICRn(5))        \
        CASE_##op((val), TRCSSPCICRn(6))        \
        CASE_##op((val), TRCSSPCICRn(7))        \
-       CASE_##op((val), TRCOSLAR)              \
        CASE_##op((val), TRCOSLSR)              \
        CASE_##op((val), TRCACVRn(0))           \
        CASE_##op((val), TRCACVRn(1))           \
        CASE_##op((val), TRCPIDR2)              \
        CASE_##op((val), TRCPIDR3)
 
-#define ETM4x_READ_SYSREG_CASES(res)   ETM_SYSREG_LIST(READ, (res))
-#define ETM4x_WRITE_SYSREG_CASES(val)  ETM_SYSREG_LIST(WRITE, (val))
+#define ETM4x_READ_SYSREG_CASES(res)           \
+       ETM_COMMON_SYSREG_LIST(READ, (res))     \
+       ETM4x_ONLY_SYSREG_LIST(READ, (res))
+
+#define ETM4x_WRITE_SYSREG_CASES(val)          \
+       ETM_COMMON_SYSREG_LIST(WRITE, (val))    \
+       ETM4x_ONLY_SYSREG_LIST(WRITE, (val))
+
+#define ETM_COMMON_SYSREG_LIST_CASES           \
+       ETM_COMMON_SYSREG_LIST(NOP, __unused)
+
+#define ETM4x_ONLY_SYSREG_LIST_CASES           \
+       ETM4x_ONLY_SYSREG_LIST(NOP, __unused)
+
+#define ETM4x_SYSREG_LIST_CASES                        \
+       ETM_COMMON_SYSREG_LIST_CASES            \
+       ETM4x_ONLY_SYSREG_LIST(NOP, __unused)
 
-#define ETM4x_SYSREG_LIST_CASES                ETM_SYSREG_LIST(NOP, __unused)
 #define ETM4x_MMAP_LIST_CASES          ETM_MMAP_LIST(NOP, __unused)
 
+/* ETE only supports system register access */
+#define ETE_READ_CASES(res)                    \
+       ETM_COMMON_SYSREG_LIST(READ, (res))     \
+       ETE_ONLY_SYSREG_LIST(READ, (res))
+
+#define ETE_WRITE_CASES(val)                   \
+       ETM_COMMON_SYSREG_LIST(WRITE, (val))    \
+       ETE_ONLY_SYSREG_LIST(WRITE, (val))
+
+#define ETE_ONLY_SYSREG_LIST_CASES             \
+       ETE_ONLY_SYSREG_LIST(NOP, __unused)
+
 #define read_etm4x_sysreg_offset(offset, _64bit)                               \
        ({                                                                      \
                u64 __val;                                                      \
                                         ETM_MODE_EXCL_USER)
 
 /*
+ * TRCOSLSR.OSLM advertises the OS Lock model.
+ * OSLM[2:0] = TRCOSLSR[4:3,0]
+ *
+ *     0b000 - Trace OS Lock is not implemented.
+ *     0b010 - Trace OS Lock is implemented.
+ *     0b100 - Trace OS Lock is not implemented, unit is controlled by PE OS Lock.
+ */
+#define ETM_OSLOCK_NI          0b000
+#define ETM_OSLOCK_PRESENT     0b010
+#define ETM_OSLOCK_PE          0b100
+
+#define ETM_OSLSR_OSLM(oslsr)  ((((oslsr) & GENMASK(4, 3)) >> 2) | (oslsr & 0x1))
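
ETM_OSLSR_OSLM() stitches the OSLM field back together from TRCOSLSR bits [4:3] and bit [0]. A stand-alone check of the three documented encodings; GENMASK32() is a local 32-bit re-implementation for the example.

#include <assert.h>
#include <stdio.h>

#define GENMASK32(h, l)   (((~0U) << (l)) & (~0U >> (31 - (h))))
#define OSLSR_OSLM(oslsr) ((((oslsr) & GENMASK32(4, 3)) >> 2) | ((oslsr) & 0x1))

int main(void)
{
	assert(OSLSR_OSLM(0x00) == 0x0);   /* 0b000: Trace OS Lock not implemented */
	assert(OSLSR_OSLM(0x08) == 0x2);   /* bit 3 set -> 0b010: OS Lock implemented */
	assert(OSLSR_OSLM(0x10) == 0x4);   /* bit 4 set -> 0b100: controlled by PE OS Lock */
	printf("OSLM decoding checks passed\n");
	return 0;
}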
+
+/*
  * TRCDEVARCH Bit field definitions
  * Bits[31:21] - ARCHITECT = Always Arm Ltd.
  *                * Bits[31:28] = 0x4
        ((ETM_DEVARCH_MAKE_ARCHID_ARCH_VER(major)) | ETM_DEVARCH_ARCHID_ARCH_PART(0xA13))
 
 #define ETM_DEVARCH_ARCHID_ETMv4x              ETM_DEVARCH_MAKE_ARCHID(0x4)
+#define ETM_DEVARCH_ARCHID_ETE                 ETM_DEVARCH_MAKE_ARCHID(0x5)
 
 #define ETM_DEVARCH_ID_MASK                                            \
        (ETM_DEVARCH_ARCHITECT_MASK | ETM_DEVARCH_ARCHID_MASK | ETM_DEVARCH_PRESENT)
 #define ETM_DEVARCH_ETMv4x_ARCH                                                \
        (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETMv4x | ETM_DEVARCH_PRESENT)
+#define ETM_DEVARCH_ETE_ARCH                                           \
+       (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETE | ETM_DEVARCH_PRESENT)
 
 #define TRCSTATR_IDLE_BIT              0
 #define TRCSTATR_PMSTABLE_BIT          1
 #define ETM_ARCH_MINOR_VERSION(arch)   ((arch) & 0xfU)
 
 #define ETM_ARCH_V4    ETM_ARCH_VERSION(4, 0)
+#define ETM_ARCH_ETE   ETM_ARCH_VERSION(5, 0)
+
 /* Interpretation of resource numbers change at ETM v4.3 architecture */
 #define ETM_ARCH_V4_3  ETM_ARCH_VERSION(4, 3)
 
@@ -862,6 +919,7 @@ struct etmv4_save_state {
  * @nooverflow:        Indicate if overflow prevention is supported.
  * @atbtrig:   If the implementation can support ATB triggers
  * @lpoverride:        If the implementation can support low-power state over.
+ * @trfc:      If the implementation supports Arm v8.4 trace filter controls.
  * @config:    structure holding configuration parameters.
  * @save_state:        State to be preserved across power loss
  * @state_needs_restore: True when there is context to restore after PM exit
@@ -897,6 +955,7 @@ struct etmv4_drvdata {
        u8                              s_ex_level;
        u8                              ns_ex_level;
        u8                              q_support;
+       u8                              os_lock_model;
        bool                            sticky_enable;
        bool                            boot_enable;
        bool                            os_unlock;
@@ -912,6 +971,7 @@ struct etmv4_drvdata {
        bool                            nooverflow;
        bool                            atbtrig;
        bool                            lpoverride;
+       bool                            trfc;
        struct etmv4_config             config;
        struct etmv4_save_state         *save_state;
        bool                            state_needs_restore;
@@ -940,4 +1000,9 @@ void etm4_config_trace_mode(struct etmv4_config *config);
 
 u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit);
 void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit);
+
+static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata)
+{
+       return drvdata->arch >= ETM_ARCH_ETE;
+}
 #endif
index 3629b78..c594f45 100644 (file)
@@ -90,6 +90,12 @@ static void of_coresight_get_ports_legacy(const struct device_node *node,
        struct of_endpoint endpoint;
        int in = 0, out = 0;
 
+       /*
+        * Avoid warnings in of_graph_get_next_endpoint()
+        * if the device doesn't have any graph connections
+        */
+       if (!of_graph_is_present(node))
+               return;
        do {
                ep = of_graph_get_next_endpoint(node, ep);
                if (!ep)
index f5f654e..ff1dd20 100644 (file)
@@ -232,4 +232,7 @@ coresight_find_csdev_by_fwnode(struct fwnode_handle *r_fwnode);
 void coresight_set_assoc_ectdev_mutex(struct coresight_device *csdev,
                                      struct coresight_device *ect_csdev);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev);
+struct coresight_device *coresight_get_percpu_sink(int cpu);
+
 #endif
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
new file mode 100644 (file)
index 0000000..1768684
--- /dev/null
@@ -0,0 +1,1157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This driver enables the Trace Buffer Extension (TRBE) as a per-cpu coresight
+ * sink device, which then pairs with an appropriate per-cpu coresight source
+ * device (ETE) to capture the generated trace data. Trace can be enabled
+ * via the perf framework.
+ *
+ * The AUX buffer handling is inspired from Arm SPE PMU driver.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define DRVNAME "arm_trbe"
+
+#define pr_fmt(fmt) DRVNAME ": " fmt
+
+#include <asm/barrier.h>
+#include "coresight-trbe.h"
+
+#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
+
+/*
+ * A padding packet that helps the user space tools skip
+ * the sections of the captured trace data which could not
+ * be decoded. Unlike the legacy CoreSight sinks, TRBE doesn't
+ * format the trace data, so we use ETE trace packets to pad
+ * the affected sections of the buffer.
+ */
+#define ETE_IGNORE_PACKET              0x70
+
+/*
+ * The minimum amount of meaningful trace will contain:
+ * A-Sync, Trace Info, Trace On, Address, Atom.
+ * This is about 44 bytes of ETE trace. To be on
+ * the safe side, we assume 64 bytes is the minimum
+ * space required for a meaningful session, before
+ * we hit a "WRAP" event.
+ */
+#define TRBE_TRACE_MIN_BUF_SIZE                64
+
+enum trbe_fault_action {
+       TRBE_FAULT_ACT_WRAP,
+       TRBE_FAULT_ACT_SPURIOUS,
+       TRBE_FAULT_ACT_FATAL,
+};
+
+struct trbe_buf {
+       /*
+        * Even though trbe_base represents the start address of the
+        * vmap()'ed trace buffer, it is kept as an unsigned long for
+        * various arithmetic and comparison operations and also to be
+        * consistent with its trbe_write and trbe_limit sibling
+        * pointers.
+        */
+       unsigned long trbe_base;
+       unsigned long trbe_limit;
+       unsigned long trbe_write;
+       int nr_pages;
+       void **pages;
+       bool snapshot;
+       struct trbe_cpudata *cpudata;
+};
+
+struct trbe_cpudata {
+       bool trbe_flag;
+       u64 trbe_align;
+       int cpu;
+       enum cs_mode mode;
+       struct trbe_buf *buf;
+       struct trbe_drvdata *drvdata;
+};
+
+struct trbe_drvdata {
+       struct trbe_cpudata __percpu *cpudata;
+       struct perf_output_handle * __percpu *handle;
+       struct hlist_node hotplug_node;
+       int irq;
+       cpumask_t supported_cpus;
+       enum cpuhp_state trbe_online;
+       struct platform_device *pdev;
+};
+
+static int trbe_alloc_node(struct perf_event *event)
+{
+       if (event->cpu == -1)
+               return NUMA_NO_NODE;
+       return cpu_to_node(event->cpu);
+}
+
+static void trbe_drain_buffer(void)
+{
+       tsb_csync();
+       dsb(nsh);
+}
+
+static void trbe_drain_and_disable_local(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       trbe_drain_buffer();
+
+       /*
+        * Disable the TRBE without clearing LIMITPTR which
+        * might be required for fetching the buffer limits.
+        */
+       trblimitr &= ~TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+       isb();
+}
+
+static void trbe_reset_local(void)
+{
+       trbe_drain_and_disable_local();
+       write_sysreg_s(0, SYS_TRBLIMITR_EL1);
+       write_sysreg_s(0, SYS_TRBPTR_EL1);
+       write_sysreg_s(0, SYS_TRBBASER_EL1);
+       write_sysreg_s(0, SYS_TRBSR_EL1);
+}
+
+static void trbe_stop_and_truncate_event(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       /*
+        * We cannot proceed with the buffer collection and we
+        * do not have any data for the current session. The
+        * etm_perf driver expects to close out the aux_buffer
+        * at event_stop(). So disable the TRBE here and leave
+        * the update_buffer() to return a 0 size.
+        */
+       trbe_drain_and_disable_local();
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL;
+}
+
+/*
+ * TRBE Buffer Management
+ *
+ * The TRBE buffer spans from the base pointer till the limit pointer. When enabled,
+ * it starts writing trace data from the write pointer onward till the limit pointer.
+ * When the write pointer reaches the address just before the limit pointer, it gets
+ * wrapped around again to the base pointer. This is called a TRBE wrap event, which
+ * generates a maintenance interrupt when operated in WRAP or FILL mode. This driver
+ * uses FILL mode, where the TRBE stops the trace collection at wrap event. The IRQ
+ * handler updates the AUX buffer and re-enables the TRBE with updated WRITE and
+ * LIMIT pointers.
+ *
+ *     Wrap around with an IRQ
+ *     ------ < ------ < ------- < ----- < -----
+ *     |                                       |
+ *     ------ > ------ > ------- > ----- > -----
+ *
+ *     +---------------+-----------------------+
+ *     |               |                       |
+ *     +---------------+-----------------------+
+ *     Base Pointer    Write Pointer           Limit Pointer
+ *
+ * The base and limit pointers always need to be PAGE_SIZE aligned. But the write
+ * pointer can be aligned to the implementation-defined TRBE trace buffer alignment
+ * as captured in trbe_cpudata->trbe_align.
+ *
+ *
+ *             head            tail            wakeup
+ *     +---------------------------------------+----- ~ ~ ------
+ *     |$$$$$$$|################|$$$$$$$$$$$$$$|               |
+ *     +---------------------------------------+----- ~ ~ ------
+ *     Base Pointer    Write Pointer           Limit Pointer
+ *
+ * The perf_output_handle indices (head, tail, wakeup) are monotonically increasing
+ * values which track all the driver writes and user reads from the perf auxiliary
+ * buffer. Generally [head..tail] is the area where the driver can write into, unless
+ * the wakeup is behind the tail. The enabled TRBE buffer span needs to be adjusted
+ * and configured depending on the perf_output_handle indices, so that the driver
+ * does not overwrite areas of the perf auxiliary buffer which are being, or are yet
+ * to be, consumed by user space. The enabled TRBE buffer area is a moving subset of
+ * the allocated perf auxiliary buffer.
+ */
+static void trbe_pad_buf(struct perf_output_handle *handle, int len)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       u64 head = PERF_IDX2OFF(handle->head, buf);
+
+       memset((void *)buf->trbe_base + head, ETE_IGNORE_PACKET, len);
+       if (!buf->snapshot)
+               perf_aux_output_skip(handle, len);
+}
+
+static unsigned long trbe_snapshot_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       /*
+        * The ETE trace has alignment synchronization packets allowing
+        * the decoder to reset in case of an overflow or corruption.
+        * So we can use the entire buffer for the snapshot mode.
+        */
+       return buf->nr_pages * PAGE_SIZE;
+}
+
+/*
+ * TRBE Limit Calculation
+ *
+ * The following markers are used to illustrate various TRBE buffer situations.
+ *
+ * $$$$ - Data area, unconsumed captured trace data, not to be overwritten
+ * #### - Free area, enabled, trace will be written
+ * %%%% - Free area, disabled, trace will not be written
+ * ==== - Free area, padded with ETE_IGNORE_PACKET, trace will be skipped
+ */
+static unsigned long __trbe_normal_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       struct trbe_cpudata *cpudata = buf->cpudata;
+       const u64 bufsize = buf->nr_pages * PAGE_SIZE;
+       u64 limit = bufsize;
+       u64 head, tail, wakeup;
+
+       head = PERF_IDX2OFF(handle->head, buf);
+
+       /*
+        *              head
+        *      ------->|
+        *      |
+        *      head    TRBE align      tail
+        * +----|-------|---------------|-------+
+        * |$$$$|=======|###############|$$$$$$$|
+        * +----|-------|---------------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * The perf aux buffer output head position can be misaligned depending on
+        * various factors, including user space reads. If misaligned, the head
+        * needs to be aligned before the TRBE can be configured. Pad the alignment
+        * gap with ETE_IGNORE_PACKET bytes, which user space tools will ignore,
+        * thus skipping this section and advancing the head.
+        */
+       if (!IS_ALIGNED(head, cpudata->trbe_align)) {
+               unsigned long delta = roundup(head, cpudata->trbe_align) - head;
+
+               delta = min(delta, handle->size);
+               trbe_pad_buf(handle, delta);
+               head = PERF_IDX2OFF(handle->head, buf);
+       }
+
+       /*
+        *      head = tail (size = 0)
+        * +----|-------------------------------+
+        * |$$$$|$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ |
+        * +----|-------------------------------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * Perf aux buffer does not have any space for the driver to write into.
+        * Just communicate trace truncation event to the user space by marking
+        * it with PERF_AUX_FLAG_TRUNCATED.
+        */
+       if (!handle->size) {
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+               return 0;
+       }
+
+       /* Compute the tail and wakeup indices now that we've aligned head */
+       tail = PERF_IDX2OFF(handle->head + handle->size, buf);
+       wakeup = PERF_IDX2OFF(handle->wakeup, buf);
+
+       /*
+        * Let's calculate the buffer area which the TRBE could write into. There
+        * are three possible scenarios here. Limit needs to be aligned with
+        * PAGE_SIZE per the TRBE requirement. Always avoid clobbering the
+        * unconsumed data.
+        *
+        * 1) head < tail
+        *
+        *      head                    tail
+        * +----|-----------------------|-------+
+        * |$$$$|#######################|$$$$$$$|
+        * +----|-----------------------|-------+
+        * trbe_base                    limit   trbe_base + nr_pages
+        *
+        * The TRBE could write into the [head..tail] area. Unless the tail is right at
+        * the end of the buffer, neither a wrap around nor an IRQ is expected
+        * while it is enabled.
+        *
+        * 2) head == tail
+        *
+        *      head = tail (size > 0)
+        * +----|-------------------------------+
+        * |%%%%|###############################|
+        * +----|-------------------------------+
+        * trbe_base                            limit = trbe_base + nr_pages
+        *
+        * The TRBE should just write into the [head..base + nr_pages] area even though
+        * the entire buffer is empty. The reason is that when the trace reaches the
+        * end of the buffer, it will just wrap around with an IRQ, giving an
+        * opportunity to reconfigure the buffer.
+        *
+        * 3) tail < head
+        *
+        *      tail                    head
+        * +----|-----------------------|-------+
+        * |%%%%|$$$$$$$$$$$$$$$$$$$$$$$|#######|
+        * +----|-----------------------|-------+
+        * trbe_base                            limit = trbe_base + nr_pages
+        *
+        * The TRBE should just write into the [head..base + nr_pages] area even though
+        * [trbe_base..tail] is also empty. The reason is that when the trace
+        * reaches the end of the buffer, it will just wrap around with an IRQ,
+        * giving an opportunity to reconfigure the buffer.
+        */
+       if (head < tail)
+               limit = round_down(tail, PAGE_SIZE);
+
+       /*
+        * Wakeup may be arbitrarily far into the future. If it's not in the
+        * current generation, either we'll wrap before hitting it, or it's
+        * in the past and has been handled already.
+        *
+        * If there's a wakeup before we wrap, arrange to be woken up by the
+        * page boundary following it. Keep the tail boundary if that's lower.
+        *
+        *      head            wakeup  tail
+        * +----|---------------|-------|-------+
+        * |$$$$|###############|%%%%%%%|$$$$$$$|
+        * +----|---------------|-------|-------+
+        * trbe_base            limit           trbe_base + nr_pages
+        */
+       if (handle->wakeup < (handle->head + handle->size) && head <= wakeup)
+               limit = min(limit, round_up(wakeup, PAGE_SIZE));
+
+       /*
+        * There are two situations when this can happen, i.e. the limit is before
+        * the head and hence the TRBE cannot be configured.
+        *
+        * 1) head < tail (aligned down to PAGE_SIZE) and they are both
+        * within the same PAGE_SIZE range.
+        *
+        *                      PAGE_SIZE
+        *              |----------------------|
+        *
+        *              limit   head    tail
+        * +------------|------|--------|-------+
+        * |$$$$$$$$$$$$$$$$$$$|========|$$$$$$$|
+        * +------------|------|--------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        *
+        * 2) head < wakeup (aligned up to PAGE_SIZE) < tail and both
+        * head and wakeup are within the same PAGE_SIZE range.
+        *
+        *              PAGE_SIZE
+        *      |----------------------|
+        *
+        *      limit   head    wakeup  tail
+        * +----|------|-------|--------|-------+
+        * |$$$$$$$$$$$|=======|========|$$$$$$$|
+        * +----|------|-------|--------|-------+
+        * trbe_base                            trbe_base + nr_pages
+        */
+       if (limit > head)
+               return limit;
+
+       trbe_pad_buf(handle, handle->size);
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       return 0;
+}
+
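
The decision tree above can be replayed with concrete numbers. A small stand-alone sketch (not from the patch), assuming a PAGE_SIZE of 4096, a TRBE write alignment of 64 bytes, a 4-page buffer, a head of 100, 8000 bytes of free handle space and a wakeup far in the future; all values are made up:

#include <stdio.h>

#define PAGE_SZ		4096UL
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned long trbe_align = 64, bufsize = 4 * PAGE_SZ;
	unsigned long head = 100, size = 8000;

	/* Step 1: pad the head up to the TRBE write-pointer alignment (100 -> 128) */
	unsigned long delta = ALIGN_UP(head, trbe_align) - head;
	head += delta;
	size -= delta;

	/* Step 2: head < tail, so the limit is the tail rounded down to a page */
	unsigned long tail = (head + size) % bufsize;		/* 8100 */
	unsigned long limit = ALIGN_DOWN(tail, PAGE_SZ);	/* 4096 */

	/* limit (4096) > head (128), so the TRBE may trace into [128..4096) */
	printf("head=%lu tail=%lu limit=%lu\n", head, tail, limit);
	return 0;
}
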
+static unsigned long trbe_normal_offset(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = perf_get_aux(handle);
+       u64 limit = __trbe_normal_offset(handle);
+       u64 head = PERF_IDX2OFF(handle->head, buf);
+
+       /*
+        * If the head is too close to the limit and we don't
+        * have space for a meaningful run, we'd rather pad it
+        * and start fresh.
+        */
+       if (limit && (limit - head < TRBE_TRACE_MIN_BUF_SIZE)) {
+               trbe_pad_buf(handle, limit - head);
+               limit = __trbe_normal_offset(handle);
+       }
+       return limit;
+}
+
+static unsigned long compute_trbe_buffer_limit(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long offset;
+
+       if (buf->snapshot)
+               offset = trbe_snapshot_offset(handle);
+       else
+               offset = trbe_normal_offset(handle);
+       return buf->trbe_base + offset;
+}
+
+static void clr_trbe_status(void)
+{
+       u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1);
+
+       WARN_ON(is_trbe_enabled());
+       trbsr &= ~TRBSR_IRQ;
+       trbsr &= ~TRBSR_TRG;
+       trbsr &= ~TRBSR_WRAP;
+       trbsr &= ~(TRBSR_EC_MASK << TRBSR_EC_SHIFT);
+       trbsr &= ~(TRBSR_BSC_MASK << TRBSR_BSC_SHIFT);
+       trbsr &= ~TRBSR_STOP;
+       write_sysreg_s(trbsr, SYS_TRBSR_EL1);
+}
+
+static void set_trbe_limit_pointer_enabled(unsigned long addr)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       WARN_ON(!IS_ALIGNED(addr, (1UL << TRBLIMITR_LIMIT_SHIFT)));
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+
+       trblimitr &= ~TRBLIMITR_NVM;
+       trblimitr &= ~(TRBLIMITR_FILL_MODE_MASK << TRBLIMITR_FILL_MODE_SHIFT);
+       trblimitr &= ~(TRBLIMITR_TRIG_MODE_MASK << TRBLIMITR_TRIG_MODE_SHIFT);
+       trblimitr &= ~(TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT);
+
+       /*
+        * Fill trace buffer mode is used here while configuring the
+        * TRBE for trace capture. In this particular mode, the trace
+        * collection is stopped and a maintenance interrupt is raised
+        * when the current write pointer wraps. This pause in trace
+        * collection gives the software an opportunity to capture the
+        * trace data in the interrupt handler, before reconfiguring
+        * the TRBE.
+        */
+       trblimitr |= (TRBE_FILL_MODE_FILL & TRBLIMITR_FILL_MODE_MASK) << TRBLIMITR_FILL_MODE_SHIFT;
+
+       /*
+        * Trigger mode is not used here while configuring the TRBE for
+        * the trace capture. Hence just keep this in the ignore mode.
+        */
+       trblimitr |= (TRBE_TRIG_MODE_IGNORE & TRBLIMITR_TRIG_MODE_MASK) <<
+                     TRBLIMITR_TRIG_MODE_SHIFT;
+       trblimitr |= (addr & PAGE_MASK);
+
+       trblimitr |= TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+
+       /* Synchronize the TRBE enable event */
+       isb();
+}
+
+static void trbe_enable_hw(struct trbe_buf *buf)
+{
+       WARN_ON(buf->trbe_write < buf->trbe_base);
+       WARN_ON(buf->trbe_write >= buf->trbe_limit);
+       set_trbe_disabled();
+       isb();
+       clr_trbe_status();
+       set_trbe_base_pointer(buf->trbe_base);
+       set_trbe_write_pointer(buf->trbe_write);
+
+       /*
+        * Synchronize all the register updates
+        * till now before enabling the TRBE.
+        */
+       isb();
+       set_trbe_limit_pointer_enabled(buf->trbe_limit);
+}
+
+static enum trbe_fault_action trbe_get_fault_act(u64 trbsr)
+{
+       int ec = get_trbe_ec(trbsr);
+       int bsc = get_trbe_bsc(trbsr);
+
+       WARN_ON(is_trbe_running(trbsr));
+       if (is_trbe_trg(trbsr) || is_trbe_abort(trbsr))
+               return TRBE_FAULT_ACT_FATAL;
+
+       if ((ec == TRBE_EC_STAGE1_ABORT) || (ec == TRBE_EC_STAGE2_ABORT))
+               return TRBE_FAULT_ACT_FATAL;
+
+       if (is_trbe_wrap(trbsr) && (ec == TRBE_EC_OTHERS) && (bsc == TRBE_BSC_FILLED)) {
+               if (get_trbe_write_pointer() == get_trbe_base_pointer())
+                       return TRBE_FAULT_ACT_WRAP;
+       }
+       return TRBE_FAULT_ACT_SPURIOUS;
+}
+
+static void *arm_trbe_alloc_buffer(struct coresight_device *csdev,
+                                  struct perf_event *event, void **pages,
+                                  int nr_pages, bool snapshot)
+{
+       struct trbe_buf *buf;
+       struct page **pglist;
+       int i;
+
+       /*
+        * The TRBE LIMIT and TRBE WRITE pointers must be page aligned. But with
+        * just a single page, there would not be any room left to write into a
+        * partially filled TRBE buffer after the page size alignment.
+        * Hence restrict the minimum buffer size to two pages.
+        */
+       if (nr_pages < 2)
+               return NULL;
+
+       buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, trbe_alloc_node(event));
+       if (!buf)
+               return ERR_PTR(-ENOMEM);
+
+       pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
+       if (!pglist) {
+               kfree(buf);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       for (i = 0; i < nr_pages; i++)
+               pglist[i] = virt_to_page(pages[i]);
+
+       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
+       if (!buf->trbe_base) {
+               kfree(pglist);
+               kfree(buf);
+               return ERR_PTR(-ENOMEM);
+       }
+       buf->trbe_limit = buf->trbe_base + nr_pages * PAGE_SIZE;
+       buf->trbe_write = buf->trbe_base;
+       buf->snapshot = snapshot;
+       buf->nr_pages = nr_pages;
+       buf->pages = pages;
+       kfree(pglist);
+       return buf;
+}
+
+static void arm_trbe_free_buffer(void *config)
+{
+       struct trbe_buf *buf = config;
+
+       vunmap((void *)buf->trbe_base);
+       kfree(buf);
+}
+
+static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev,
+                                           struct perf_output_handle *handle,
+                                           void *config)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct trbe_buf *buf = config;
+       enum trbe_fault_action act;
+       unsigned long size, offset;
+       unsigned long write, base, status;
+       unsigned long flags;
+
+       WARN_ON(buf->cpudata != cpudata);
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (cpudata->mode != CS_MODE_PERF)
+               return 0;
+
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW);
+
+       /*
+        * We are about to disable the TRBE, and this could in turn
+        * fill up the buffer, triggering an IRQ. This could be consumed
+        * by the PE asynchronously, causing a race here against
+        * the IRQ handler closing out the handle. So, let us
+        * make sure the IRQ can't trigger while we are collecting
+        * the buffer. We also make sure that a WRAP event is handled
+        * accordingly.
+        */
+       local_irq_save(flags);
+
+       /*
+        * If the TRBE was disabled due to lack of space in the AUX buffer or a
+        * spurious fault, the driver leaves it disabled, truncating the buffer.
+        * Since the etm_perf driver expects to close out the AUX buffer, the
+        * driver skips it. Thus, just pass in 0 size here to indicate that the
+        * buffer was truncated.
+        */
+       if (!is_trbe_enabled()) {
+               size = 0;
+               goto done;
+       }
+       /*
+        * The perf handle structure needs to be shared with the TRBE IRQ handler for
+        * capturing trace data and restarting the handle. There is a possibility
+        * of a crash via a dangling reference when the etm event is being stopped
+        * while a TRBE IRQ is also being processed. This happens due to the release
+        * of the perf handle via perf_aux_output_end() in etm_event_stop(). Stopping
+        * the TRBE here ensures that no IRQ can be generated when the perf
+        * handle gets freed in etm_event_stop().
+        */
+       trbe_drain_and_disable_local();
+       write = get_trbe_write_pointer();
+       base = get_trbe_base_pointer();
+
+       /* Check if there is a pending interrupt and handle it here */
+       status = read_sysreg_s(SYS_TRBSR_EL1);
+       if (is_trbe_irq(status)) {
+
+               /*
+                * Now that we are handling the IRQ here, clear the IRQ
+                * from the status, to let the irq handler know that it
+                * is taken care of.
+                */
+               clr_trbe_irq();
+               isb();
+
+               act = trbe_get_fault_act(status);
+               /*
+                * If this was not due to a WRAP event, we have some
+                * errors and as such the buffer is empty.
+                */
+               if (act != TRBE_FAULT_ACT_WRAP) {
+                       size = 0;
+                       goto done;
+               }
+
+               /*
+                * Otherwise, the buffer is full and the write pointer
+                * has reached base. Adjust this back to the Limit pointer
+                * for correct size. Also, mark the buffer truncated.
+                */
+               write = get_trbe_limit_pointer();
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       }
+
+       offset = write - base;
+       if (WARN_ON_ONCE(offset < PERF_IDX2OFF(handle->head, buf)))
+               size = 0;
+       else
+               size = offset - PERF_IDX2OFF(handle->head, buf);
+
+done:
+       local_irq_restore(flags);
+
+       if (buf->snapshot)
+               handle->head += size;
+       return size;
+}
+
+static int arm_trbe_enable(struct coresight_device *csdev, u32 mode, void *data)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct perf_output_handle *handle = data;
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (mode != CS_MODE_PERF)
+               return -EINVAL;
+
+       *this_cpu_ptr(drvdata->handle) = handle;
+       cpudata->buf = buf;
+       cpudata->mode = mode;
+       buf->cpudata = cpudata;
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_stop_and_truncate_event(handle);
+               return 0;
+       }
+       trbe_enable_hw(buf);
+       return 0;
+}
+
+static int arm_trbe_disable(struct coresight_device *csdev)
+{
+       struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev);
+       struct trbe_buf *buf = cpudata->buf;
+
+       WARN_ON(buf->cpudata != cpudata);
+       WARN_ON(cpudata->cpu != smp_processor_id());
+       WARN_ON(cpudata->drvdata != drvdata);
+       if (cpudata->mode != CS_MODE_PERF)
+               return -EINVAL;
+
+       trbe_drain_and_disable_local();
+       buf->cpudata = NULL;
+       cpudata->buf = NULL;
+       cpudata->mode = CS_MODE_DISABLED;
+       return 0;
+}
+
+static void trbe_handle_spurious(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_drain_and_disable_local();
+               return;
+       }
+       trbe_enable_hw(buf);
+}
+
+static void trbe_handle_overflow(struct perf_output_handle *handle)
+{
+       struct perf_event *event = handle->event;
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       unsigned long offset, size;
+       struct etm_event_data *event_data;
+
+       offset = get_trbe_limit_pointer() - get_trbe_base_pointer();
+       size = offset - PERF_IDX2OFF(handle->head, buf);
+       if (buf->snapshot)
+               handle->head += size;
+
+       /*
+        * Mark the buffer as truncated, as we have stopped the trace
+        * collection upon the WRAP event, without stopping the source.
+        */
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW |
+                                    PERF_AUX_FLAG_TRUNCATED);
+       perf_aux_output_end(handle, size);
+       event_data = perf_aux_output_begin(handle, event);
+       if (!event_data) {
+               /*
+                * We are unable to restart the trace collection,
+                * thus leave the TRBE disabled. The etm-perf driver
+                * is able to detect this with a disconnected handle
+                * (handle->event = NULL).
+                */
+               trbe_drain_and_disable_local();
+               *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL;
+               return;
+       }
+       buf->trbe_limit = compute_trbe_buffer_limit(handle);
+       buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf);
+       if (buf->trbe_limit == buf->trbe_base) {
+               trbe_stop_and_truncate_event(handle);
+               return;
+       }
+       *this_cpu_ptr(buf->cpudata->drvdata->handle) = handle;
+       trbe_enable_hw(buf);
+}
+
+static bool is_perf_trbe(struct perf_output_handle *handle)
+{
+       struct trbe_buf *buf = etm_perf_sink_config(handle);
+       struct trbe_cpudata *cpudata = buf->cpudata;
+       struct trbe_drvdata *drvdata = cpudata->drvdata;
+       int cpu = smp_processor_id();
+
+       WARN_ON(buf->trbe_base != get_trbe_base_pointer());
+       WARN_ON(buf->trbe_limit != get_trbe_limit_pointer());
+
+       if (cpudata->mode != CS_MODE_PERF)
+               return false;
+
+       if (cpudata->cpu != cpu)
+               return false;
+
+       if (!cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+               return false;
+
+       return true;
+}
+
+static irqreturn_t arm_trbe_irq_handler(int irq, void *dev)
+{
+       struct perf_output_handle **handle_ptr = dev;
+       struct perf_output_handle *handle = *handle_ptr;
+       enum trbe_fault_action act;
+       u64 status;
+
+       /*
+        * Ensure the trace is visible to the CPUs and
+        * any external aborts have been resolved.
+        */
+       trbe_drain_and_disable_local();
+
+       status = read_sysreg_s(SYS_TRBSR_EL1);
+       /*
+        * If the pending IRQ was handled by update_buffer callback
+        * we have nothing to do here.
+        */
+       if (!is_trbe_irq(status))
+               return IRQ_NONE;
+
+       clr_trbe_irq();
+       isb();
+
+       if (WARN_ON_ONCE(!handle) || !perf_get_aux(handle))
+               return IRQ_NONE;
+
+       if (!is_perf_trbe(handle))
+               return IRQ_NONE;
+
+       /*
+        * Ensure perf callbacks have completed, which may disable
+        * the trace buffer in response to a TRUNCATION flag.
+        */
+       irq_work_run();
+
+       act = trbe_get_fault_act(status);
+       switch (act) {
+       case TRBE_FAULT_ACT_WRAP:
+               trbe_handle_overflow(handle);
+               break;
+       case TRBE_FAULT_ACT_SPURIOUS:
+               trbe_handle_spurious(handle);
+               break;
+       case TRBE_FAULT_ACT_FATAL:
+               trbe_stop_and_truncate_event(handle);
+               break;
+       }
+       return IRQ_HANDLED;
+}
+
+static const struct coresight_ops_sink arm_trbe_sink_ops = {
+       .enable         = arm_trbe_enable,
+       .disable        = arm_trbe_disable,
+       .alloc_buffer   = arm_trbe_alloc_buffer,
+       .free_buffer    = arm_trbe_free_buffer,
+       .update_buffer  = arm_trbe_update_buffer,
+};
+
+static const struct coresight_ops arm_trbe_cs_ops = {
+       .sink_ops       = &arm_trbe_sink_ops,
+};
+
+static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct trbe_cpudata *cpudata = dev_get_drvdata(dev);
+
+       return sprintf(buf, "%llx\n", cpudata->trbe_align);
+}
+static DEVICE_ATTR_RO(align);
+
+static ssize_t flag_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct trbe_cpudata *cpudata = dev_get_drvdata(dev);
+
+       return sprintf(buf, "%d\n", cpudata->trbe_flag);
+}
+static DEVICE_ATTR_RO(flag);
+
+static struct attribute *arm_trbe_attrs[] = {
+       &dev_attr_align.attr,
+       &dev_attr_flag.attr,
+       NULL,
+};
+
+static const struct attribute_group arm_trbe_group = {
+       .attrs = arm_trbe_attrs,
+};
+
+static const struct attribute_group *arm_trbe_groups[] = {
+       &arm_trbe_group,
+       NULL,
+};
+
+static void arm_trbe_enable_cpu(void *info)
+{
+       struct trbe_drvdata *drvdata = info;
+
+       trbe_reset_local();
+       enable_percpu_irq(drvdata->irq, IRQ_TYPE_NONE);
+}
+
+static void arm_trbe_register_coresight_cpu(struct trbe_drvdata *drvdata, int cpu)
+{
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       struct coresight_device *trbe_csdev = coresight_get_percpu_sink(cpu);
+       struct coresight_desc desc = { 0 };
+       struct device *dev;
+
+       if (WARN_ON(trbe_csdev))
+               return;
+
+       dev = &cpudata->drvdata->pdev->dev;
+       desc.name = devm_kasprintf(dev, GFP_KERNEL, "trbe%d", cpu);
+       if (!desc.name)
+               goto cpu_clear;
+
+       desc.type = CORESIGHT_DEV_TYPE_SINK;
+       desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM;
+       desc.ops = &arm_trbe_cs_ops;
+       desc.pdata = dev_get_platdata(dev);
+       desc.groups = arm_trbe_groups;
+       desc.dev = dev;
+       trbe_csdev = coresight_register(&desc);
+       if (IS_ERR(trbe_csdev))
+               goto cpu_clear;
+
+       dev_set_drvdata(&trbe_csdev->dev, cpudata);
+       coresight_set_percpu_sink(cpu, trbe_csdev);
+       return;
+cpu_clear:
+       cpumask_clear_cpu(cpu, &drvdata->supported_cpus);
+}
+
+static void arm_trbe_probe_cpu(void *info)
+{
+       struct trbe_drvdata *drvdata = info;
+       int cpu = smp_processor_id();
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       u64 trbidr;
+
+       if (WARN_ON(!cpudata))
+               goto cpu_clear;
+
+       if (!is_trbe_available()) {
+               pr_err("TRBE is not implemented on cpu %d\n", cpu);
+               goto cpu_clear;
+       }
+
+       trbidr = read_sysreg_s(SYS_TRBIDR_EL1);
+       if (!is_trbe_programmable(trbidr)) {
+               pr_err("TRBE is owned in higher exception level on cpu %d\n", cpu);
+               goto cpu_clear;
+       }
+
+       cpudata->trbe_align = 1ULL << get_trbe_address_align(trbidr);
+       if (cpudata->trbe_align > SZ_2K) {
+               pr_err("Unsupported alignment on cpu %d\n", cpu);
+               goto cpu_clear;
+       }
+       cpudata->trbe_flag = get_trbe_flag_update(trbidr);
+       cpudata->cpu = cpu;
+       cpudata->drvdata = drvdata;
+       return;
+cpu_clear:
+       cpumask_clear_cpu(cpu, &drvdata->supported_cpus);
+}
+
+static void arm_trbe_remove_coresight_cpu(void *info)
+{
+       int cpu = smp_processor_id();
+       struct trbe_drvdata *drvdata = info;
+       struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu);
+       struct coresight_device *trbe_csdev = coresight_get_percpu_sink(cpu);
+
+       disable_percpu_irq(drvdata->irq);
+       trbe_reset_local();
+       if (trbe_csdev) {
+               coresight_unregister(trbe_csdev);
+               cpudata->drvdata = NULL;
+               coresight_set_percpu_sink(cpu, NULL);
+       }
+}
+
+static int arm_trbe_probe_coresight(struct trbe_drvdata *drvdata)
+{
+       int cpu;
+
+       drvdata->cpudata = alloc_percpu(typeof(*drvdata->cpudata));
+       if (!drvdata->cpudata)
+               return -ENOMEM;
+
+       for_each_cpu(cpu, &drvdata->supported_cpus) {
+               smp_call_function_single(cpu, arm_trbe_probe_cpu, drvdata, 1);
+               if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                       arm_trbe_register_coresight_cpu(drvdata, cpu);
+               if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                       smp_call_function_single(cpu, arm_trbe_enable_cpu, drvdata, 1);
+       }
+       return 0;
+}
+
+static int arm_trbe_remove_coresight(struct trbe_drvdata *drvdata)
+{
+       int cpu;
+
+       for_each_cpu(cpu, &drvdata->supported_cpus)
+               smp_call_function_single(cpu, arm_trbe_remove_coresight_cpu, drvdata, 1);
+       free_percpu(drvdata->cpudata);
+       return 0;
+}
+
+static int arm_trbe_cpu_startup(unsigned int cpu, struct hlist_node *node)
+{
+       struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node);
+
+       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) {
+
+               /*
+                * If this CPU was not probed for TRBE,
+                * initialize it now.
+                */
+               if (!coresight_get_percpu_sink(cpu)) {
+                       arm_trbe_probe_cpu(drvdata);
+                       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                               arm_trbe_register_coresight_cpu(drvdata, cpu);
+                       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus))
+                               arm_trbe_enable_cpu(drvdata);
+               } else {
+                       arm_trbe_enable_cpu(drvdata);
+               }
+       }
+       return 0;
+}
+
+static int arm_trbe_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+       struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node);
+
+       if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) {
+               disable_percpu_irq(drvdata->irq);
+               trbe_reset_local();
+       }
+       return 0;
+}
+
+static int arm_trbe_probe_cpuhp(struct trbe_drvdata *drvdata)
+{
+       enum cpuhp_state trbe_online;
+       int ret;
+
+       trbe_online = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME,
+                                             arm_trbe_cpu_startup, arm_trbe_cpu_teardown);
+       if (trbe_online < 0)
+               return trbe_online;
+
+       ret = cpuhp_state_add_instance(trbe_online, &drvdata->hotplug_node);
+       if (ret) {
+               cpuhp_remove_multi_state(trbe_online);
+               return ret;
+       }
+       drvdata->trbe_online = trbe_online;
+       return 0;
+}
+
+static void arm_trbe_remove_cpuhp(struct trbe_drvdata *drvdata)
+{
+       cpuhp_remove_multi_state(drvdata->trbe_online);
+}
+
+static int arm_trbe_probe_irq(struct platform_device *pdev,
+                             struct trbe_drvdata *drvdata)
+{
+       int ret;
+
+       drvdata->irq = platform_get_irq(pdev, 0);
+       if (drvdata->irq < 0) {
+               pr_err("IRQ not found for the platform device\n");
+               return drvdata->irq;
+       }
+
+       if (!irq_is_percpu(drvdata->irq)) {
+               pr_err("IRQ is not a PPI\n");
+               return -EINVAL;
+       }
+
+       if (irq_get_percpu_devid_partition(drvdata->irq, &drvdata->supported_cpus))
+               return -EINVAL;
+
+       drvdata->handle = alloc_percpu(struct perf_output_handle *);
+       if (!drvdata->handle)
+               return -ENOMEM;
+
+       ret = request_percpu_irq(drvdata->irq, arm_trbe_irq_handler, DRVNAME, drvdata->handle);
+       if (ret) {
+               free_percpu(drvdata->handle);
+               return ret;
+       }
+       return 0;
+}
+
+static void arm_trbe_remove_irq(struct trbe_drvdata *drvdata)
+{
+       free_percpu_irq(drvdata->irq, drvdata->handle);
+       free_percpu(drvdata->handle);
+}
+
+static int arm_trbe_device_probe(struct platform_device *pdev)
+{
+       struct coresight_platform_data *pdata;
+       struct trbe_drvdata *drvdata;
+       struct device *dev = &pdev->dev;
+       int ret;
+
+       drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
+       if (!drvdata)
+               return -ENOMEM;
+
+       pdata = coresight_get_platform_data(dev);
+       if (IS_ERR(pdata))
+               return PTR_ERR(pdata);
+
+       dev_set_drvdata(dev, drvdata);
+       dev->platform_data = pdata;
+       drvdata->pdev = pdev;
+       ret = arm_trbe_probe_irq(pdev, drvdata);
+       if (ret)
+               return ret;
+
+       ret = arm_trbe_probe_coresight(drvdata);
+       if (ret)
+               goto probe_failed;
+
+       ret = arm_trbe_probe_cpuhp(drvdata);
+       if (ret)
+               goto cpuhp_failed;
+
+       return 0;
+cpuhp_failed:
+       arm_trbe_remove_coresight(drvdata);
+probe_failed:
+       arm_trbe_remove_irq(drvdata);
+       return ret;
+}
+
+static int arm_trbe_device_remove(struct platform_device *pdev)
+{
+       struct trbe_drvdata *drvdata = platform_get_drvdata(pdev);
+
+       arm_trbe_remove_cpuhp(drvdata);
+       arm_trbe_remove_coresight(drvdata);
+       arm_trbe_remove_irq(drvdata);
+       return 0;
+}
+
+static const struct of_device_id arm_trbe_of_match[] = {
+       { .compatible = "arm,trace-buffer-extension"},
+       {},
+};
+MODULE_DEVICE_TABLE(of, arm_trbe_of_match);
+
+static struct platform_driver arm_trbe_driver = {
+       .driver = {
+               .name = DRVNAME,
+               .of_match_table = of_match_ptr(arm_trbe_of_match),
+               .suppress_bind_attrs = true,
+       },
+       .probe  = arm_trbe_device_probe,
+       .remove = arm_trbe_device_remove,
+};
+
+static int __init arm_trbe_init(void)
+{
+       int ret;
+
+       if (arm64_kernel_unmapped_at_el0()) {
+               pr_err("TRBE wouldn't work if kernel gets unmapped at EL0\n");
+               return -EOPNOTSUPP;
+       }
+
+       ret = platform_driver_register(&arm_trbe_driver);
+       if (!ret)
+               return 0;
+
+       pr_err("Error registering %s platform driver\n", DRVNAME);
+       return ret;
+}
+
+static void __exit arm_trbe_exit(void)
+{
+       platform_driver_unregister(&arm_trbe_driver);
+}
+module_init(arm_trbe_init);
+module_exit(arm_trbe_exit);
+
+MODULE_AUTHOR("Anshuman Khandual <anshuman.khandual@arm.com>");
+MODULE_DESCRIPTION("Arm Trace Buffer Extension (TRBE) driver");
+MODULE_LICENSE("GPL v2");
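
With the sink registered as "trbe<cpu>" (see arm_trbe_register_coresight_cpu() above), trace collection is driven entirely from perf. Assuming the CPU 0 instance comes up as trbe0, an invocation along these lines selects it explicitly:

perf record -e cs_etm/@trbe0/u -C 0 -- sleep 1

The per-CPU sink subtype advertised above is what lets the etm-perf layer pair each ETE source with its local TRBE instead of a shared system-memory sink.
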
diff --git a/drivers/hwtracing/coresight/coresight-trbe.h b/drivers/hwtracing/coresight/coresight-trbe.h
new file mode 100644 (file)
index 0000000..abf3e36
--- /dev/null
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This contains all the required hardware-related helper functions for
+ * the Trace Buffer Extension (TRBE) driver in the coresight framework.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include <linux/coresight.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+
+#include "coresight-etm-perf.h"
+
+static inline bool is_trbe_available(void)
+{
+       u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
+       unsigned int trbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_TRBE_SHIFT);
+
+       return trbe >= 0b0001;
+}
+
+static inline bool is_trbe_enabled(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       return trblimitr & TRBLIMITR_ENABLE;
+}
+
+#define TRBE_EC_OTHERS         0
+#define TRBE_EC_STAGE1_ABORT   36
+#define TRBE_EC_STAGE2_ABORT   37
+
+static inline int get_trbe_ec(u64 trbsr)
+{
+       return (trbsr >> TRBSR_EC_SHIFT) & TRBSR_EC_MASK;
+}
+
+#define TRBE_BSC_NOT_STOPPED 0
+#define TRBE_BSC_FILLED      1
+#define TRBE_BSC_TRIGGERED   2
+
+static inline int get_trbe_bsc(u64 trbsr)
+{
+       return (trbsr >> TRBSR_BSC_SHIFT) & TRBSR_BSC_MASK;
+}
+
+static inline void clr_trbe_irq(void)
+{
+       u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1);
+
+       trbsr &= ~TRBSR_IRQ;
+       write_sysreg_s(trbsr, SYS_TRBSR_EL1);
+}
+
+static inline bool is_trbe_irq(u64 trbsr)
+{
+       return trbsr & TRBSR_IRQ;
+}
+
+static inline bool is_trbe_trg(u64 trbsr)
+{
+       return trbsr & TRBSR_TRG;
+}
+
+static inline bool is_trbe_wrap(u64 trbsr)
+{
+       return trbsr & TRBSR_WRAP;
+}
+
+static inline bool is_trbe_abort(u64 trbsr)
+{
+       return trbsr & TRBSR_ABORT;
+}
+
+static inline bool is_trbe_running(u64 trbsr)
+{
+       return !(trbsr & TRBSR_STOP);
+}
+
+#define TRBE_TRIG_MODE_STOP            0
+#define TRBE_TRIG_MODE_IRQ             1
+#define TRBE_TRIG_MODE_IGNORE          3
+
+#define TRBE_FILL_MODE_FILL            0
+#define TRBE_FILL_MODE_WRAP            1
+#define TRBE_FILL_MODE_CIRCULAR_BUFFER 3
+
+static inline void set_trbe_disabled(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+
+       trblimitr &= ~TRBLIMITR_ENABLE;
+       write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1);
+}
+
+static inline bool get_trbe_flag_update(u64 trbidr)
+{
+       return trbidr & TRBIDR_FLAG;
+}
+
+static inline bool is_trbe_programmable(u64 trbidr)
+{
+       return !(trbidr & TRBIDR_PROG);
+}
+
+static inline int get_trbe_address_align(u64 trbidr)
+{
+       return (trbidr >> TRBIDR_ALIGN_SHIFT) & TRBIDR_ALIGN_MASK;
+}
+
+static inline unsigned long get_trbe_write_pointer(void)
+{
+       return read_sysreg_s(SYS_TRBPTR_EL1);
+}
+
+static inline void set_trbe_write_pointer(unsigned long addr)
+{
+       WARN_ON(is_trbe_enabled());
+       write_sysreg_s(addr, SYS_TRBPTR_EL1);
+}
+
+static inline unsigned long get_trbe_limit_pointer(void)
+{
+       u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1);
+       unsigned long addr = trblimitr & (TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT);
+
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+       return addr;
+}
+
+static inline unsigned long get_trbe_base_pointer(void)
+{
+       u64 trbbaser = read_sysreg_s(SYS_TRBBASER_EL1);
+       unsigned long addr = trbbaser & (TRBBASER_BASE_MASK << TRBBASER_BASE_SHIFT);
+
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+       return addr;
+}
+
+static inline void set_trbe_base_pointer(unsigned long addr)
+{
+       WARN_ON(is_trbe_enabled());
+       WARN_ON(!IS_ALIGNED(addr, (1UL << TRBBASER_BASE_SHIFT)));
+       WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+       write_sysreg_s(addr, SYS_TRBBASER_EL1);
+}
index ed46e60..d205faf 100644 (file)
@@ -794,8 +794,13 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its,
 
        its_encode_alloc(cmd, alloc);
 
-       /* We can only signal PTZ when alloc==1. Why do we have two bits? */
-       its_encode_ptz(cmd, alloc);
+       /*
+        * GICv4.1 provides a way to get the VLPI state, which needs the vPE
+        * to be unmapped first, and in this case, we may remap the vPE
+        * back while the VPT is not empty. So we can't assume that the
+        * VPT is empty on map. This is why we never advertise PTZ.
+        */
+       its_encode_ptz(cmd, false);
        its_encode_vconf_addr(cmd, vconf_addr);
        its_encode_vmapp_default_db(cmd, desc->its_vmapp_cmd.vpe->vpe_db_lpi);
 
@@ -4554,6 +4559,15 @@ static void its_vpe_irq_domain_deactivate(struct irq_domain *domain,
 
                its_send_vmapp(its, vpe, false);
        }
+
+       /*
+        * There may be a direct read to the VPT after unmapping the
+        * vPE; to guarantee the validity of this read, we make the VPT
+        * memory coherent with the CPU caches here.
+        */
+       if (find_4_1_its() && !atomic_read(&vpe->vmapp_count))
+               gic_flush_dcache_to_poc(page_address(vpe->vpt_page),
+                                       LPI_PENDBASE_SZ);
 }
 
 static const struct irq_domain_ops its_vpe_domain_ops = {
index 2d10d84..d4f7f1f 100644 (file)
@@ -581,33 +581,6 @@ static const struct attribute_group armpmu_common_attr_group = {
        .attrs = armpmu_common_attrs,
 };
 
-/* Set at runtime when we know what CPU type we are. */
-static struct arm_pmu *__oprofile_cpu_pmu;
-
-/*
- * Despite the names, these two functions are CPU-specific and are used
- * by the OProfile/perf code.
- */
-const char *perf_pmu_name(void)
-{
-       if (!__oprofile_cpu_pmu)
-               return NULL;
-
-       return __oprofile_cpu_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
-       int max_events = 0;
-
-       if (__oprofile_cpu_pmu != NULL)
-               max_events = __oprofile_cpu_pmu->num_events;
-
-       return max_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
 static int armpmu_count_irq_users(const int irq)
 {
        int cpu, count = 0;
@@ -979,9 +952,6 @@ int armpmu_register(struct arm_pmu *pmu)
        if (ret)
                goto out_destroy;
 
-       if (!__oprofile_cpu_pmu)
-               __oprofile_cpu_pmu = pmu;
-
        pr_info("enabled with %s PMU driver, %d counters available%s\n",
                pmu->name, pmu->num_events,
                has_nmi ? ", using NMIs" : "");
index f2edef0..8c20e52 100644 (file)
@@ -108,7 +108,7 @@ config PTP_1588_CLOCK_PCH
 config PTP_1588_CLOCK_KVM
        tristate "KVM virtual PTP clock"
        depends on PTP_1588_CLOCK
-       depends on KVM_GUEST && X86
+       depends on (KVM_GUEST && X86) || (HAVE_ARM_SMCCC_DISCOVERY && ARM_ARCH_TIMER)
        default y
        help
          This driver adds support for using kvm infrastructure as a PTP
index db5aef3..8673d17 100644 (file)
@@ -4,6 +4,8 @@
 #
 
 ptp-y                                  := ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-$(CONFIG_X86)                  := ptp_kvm_x86.o ptp_kvm_common.o
+ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC)       := ptp_kvm_arm.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)           += ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)       += ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)      += ptp_ines.o
diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c
new file mode 100644 (file)
index 0000000..b7d28c8
--- /dev/null
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/ptp_kvm.h>
+
+#include <asm/arch_timer.h>
+#include <asm/hypervisor.h>
+
+int kvm_arch_ptp_init(void)
+{
+       int ret;
+
+       ret = kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP);
+       if (ret <= 0)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+       return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL);
+}
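
The crosststamp helper used above is provided by the arch timer code elsewhere in this series. A rough sketch of the guest side of the ABI, assuming the host wall clock in nanoseconds is returned in a0/a1 and the counter value in a2/a3 (see the SMCCC definitions added later in this patch); this is an illustration, not the shipped implementation:

#include <linux/arm-smccc.h>
#include <linux/ktime.h>

/* Illustrative only: the real version lives with the arch timer driver. */
static int example_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts)
{
	struct arm_smccc_res res;

	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
			     KVM_PTP_VIRT_COUNTER, &res);
	if ((long)res.a0 < 0)
		return -EOPNOTSUPP;

	/* a0/a1 carry the host wall clock (ns), a2/a3 the counter snapshot */
	*ts = ktime_to_timespec64((u64)res.a0 << 32 | res.a1);
	if (cycle)
		*cycle = (u64)res.a2 << 32 | res.a3;
	return 0;
}
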
similarity index 60%
rename from drivers/ptp/ptp_kvm.c
rename to drivers/ptp/ptp_kvm_common.c
index 658d33f..fcae32f 100644 (file)
@@ -8,11 +8,11 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/ptp_kvm.h>
 #include <uapi/linux/kvm_para.h>
 #include <asm/kvm_para.h>
-#include <asm/pvclock.h>
-#include <asm/kvmclock.h>
 #include <uapi/asm/kvm_para.h>
 
 #include <linux/ptp_clock_kernel.h>
@@ -24,56 +24,29 @@ struct kvm_ptp_clock {
 
 static DEFINE_SPINLOCK(kvm_ptp_lock);
 
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
 static int ptp_kvm_get_time_fn(ktime_t *device_time,
                               struct system_counterval_t *system_counter,
                               void *ctx)
 {
-       unsigned long ret;
+       long ret;
+       u64 cycle;
        struct timespec64 tspec;
-       unsigned version;
-       int cpu;
-       struct pvclock_vcpu_time_info *src;
+       struct clocksource *cs;
 
        spin_lock(&kvm_ptp_lock);
 
        preempt_disable_notrace();
-       cpu = smp_processor_id();
-       src = &hv_clock[cpu].pvti;
-
-       do {
-               /*
-                * We are using a TSC value read in the hosts
-                * kvm_hc_clock_pairing handling.
-                * So any changes to tsc_to_system_mul
-                * and tsc_shift or any other pvclock
-                * data invalidate that measurement.
-                */
-               version = pvclock_read_begin(src);
-
-               ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-                                    clock_pair_gpa,
-                                    KVM_CLOCK_PAIRING_WALLCLOCK);
-               if (ret != 0) {
-                       pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
-                       spin_unlock(&kvm_ptp_lock);
-                       preempt_enable_notrace();
-                       return -EOPNOTSUPP;
-               }
-
-               tspec.tv_sec = clock_pair.sec;
-               tspec.tv_nsec = clock_pair.nsec;
-               ret = __pvclock_read_cycles(src, clock_pair.tsc);
-       } while (pvclock_read_retry(src, version));
+       ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs);
+       if (ret) {
+               spin_unlock(&kvm_ptp_lock);
+               preempt_enable_notrace();
+               return ret;
+       }
 
        preempt_enable_notrace();
 
-       system_counter->cycles = ret;
-       system_counter->cs = &kvm_clock;
+       system_counter->cycles = cycle;
+       system_counter->cs = cs;
 
        *device_time = timespec64_to_ktime(tspec);
 
@@ -111,22 +84,17 @@ static int ptp_kvm_settime(struct ptp_clock_info *ptp,
 
 static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 {
-       unsigned long ret;
+       long ret;
        struct timespec64 tspec;
 
        spin_lock(&kvm_ptp_lock);
 
-       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-                            clock_pair_gpa,
-                            KVM_CLOCK_PAIRING_WALLCLOCK);
-       if (ret != 0) {
-               pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
+       ret = kvm_arch_ptp_get_clock(&tspec);
+       if (ret) {
                spin_unlock(&kvm_ptp_lock);
-               return -EOPNOTSUPP;
+               return ret;
        }
 
-       tspec.tv_sec = clock_pair.sec;
-       tspec.tv_nsec = clock_pair.nsec;
        spin_unlock(&kvm_ptp_lock);
 
        memcpy(ts, &tspec, sizeof(struct timespec64));
@@ -168,19 +136,12 @@ static int __init ptp_kvm_init(void)
 {
        long ret;
 
-       if (!kvm_para_available())
-               return -ENODEV;
-
-       clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-       hv_clock = pvclock_get_pvti_cpu0_va();
-
-       if (!hv_clock)
-               return -ENODEV;
-
-       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
-                       KVM_CLOCK_PAIRING_WALLCLOCK);
-       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
-               return -ENODEV;
+       ret = kvm_arch_ptp_init();
+       if (ret) {
+               if (ret != -EOPNOTSUPP)
+                       pr_err("failed to initialize ptp_kvm\n");
+               return ret;
+       }
 
        kvm_ptp_clock.caps = ptp_kvm_caps;
 
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
new file mode 100644 (file)
index 0000000..3dd519d
--- /dev/null
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <linux/module.h>
+#include <uapi/asm/kvm_para.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/ptp_kvm.h>
+
+struct pvclock_vsyscall_time_info *hv_clock;
+
+static phys_addr_t clock_pair_gpa;
+static struct kvm_clock_pairing clock_pair;
+
+int kvm_arch_ptp_init(void)
+{
+       long ret;
+
+       if (!kvm_para_available())
+               return -ENODEV;
+
+       clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+       hv_clock = pvclock_get_pvti_cpu0_va();
+       if (!hv_clock)
+               return -ENODEV;
+
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+                            KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+               return -ENODEV;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+       long ret;
+
+       ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+                            clock_pair_gpa,
+                            KVM_CLOCK_PAIRING_WALLCLOCK);
+       if (ret != 0) {
+               pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
+               return -EOPNOTSUPP;
+       }
+
+       ts->tv_sec = clock_pair.sec;
+       ts->tv_nsec = clock_pair.nsec;
+
+       return 0;
+}
+
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
+                             struct clocksource **cs)
+{
+       struct pvclock_vcpu_time_info *src;
+       unsigned int version;
+       long ret;
+       int cpu;
+
+       cpu = smp_processor_id();
+       src = &hv_clock[cpu].pvti;
+
+       do {
+               /*
+                * We are using a TSC value read in the host's
+                * kvm_hc_clock_pairing handling.
+                * So any changes to tsc_to_system_mul
+                * and tsc_shift or any other pvclock
+                * data invalidate that measurement.
+                */
+               version = pvclock_read_begin(src);
+
+               ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+                                    clock_pair_gpa,
+                                    KVM_CLOCK_PAIRING_WALLCLOCK);
+               if (ret != 0) {
+                       pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+                       return -EOPNOTSUPP;
+               }
+               tspec->tv_sec = clock_pair.sec;
+               tspec->tv_nsec = clock_pair.nsec;
+               *cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+       } while (pvclock_read_retry(src, version));
+
+       *cs = &kvm_clock;
+
+       return 0;
+}
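
Whichever backend is built, the result is a regular PTP clock device that user space can consume directly. Assuming it registers as /dev/ptp0 (the node name is an assumption here), a chrony refclock line such as the following keeps a guest synchronized to its host:

refclock PHC /dev/ptp0 poll 3 dpoll -2 offset 0
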
index 6fd3cda..864b999 100644 (file)
@@ -61,6 +61,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
+int kvm_pmu_probe_pmuver(void);
 #else
 struct kvm_pmu {
 };
@@ -116,6 +117,9 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
 {
        return 0;
 }
+
+static inline int kvm_pmu_probe_pmuver(void) { return 0xf; }
+
 #endif
 
 #endif
index 3d74f10..ec62118 100644 (file)
@@ -322,6 +322,7 @@ struct vgic_cpu {
         */
        struct vgic_io_device   rd_iodev;
        struct vgic_redist_region *rdreg;
+       u32 rdreg_index;
 
        /* Contains the attributes and gpa of the LPI pending tables. */
        u64 pendbaser;
index 62c5423..6861489 100644 (file)
@@ -55,6 +55,8 @@
 #define ARM_SMCCC_OWNER_TRUSTED_OS     50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
 
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
 #define ARM_SMCCC_QUIRK_NONE           0
 #define ARM_SMCCC_QUIRK_QCOM_A6                1 /* Save/restore register a6 */
 
                           ARM_SMCCC_SMC_32,                            \
                           0, 0x7fff)
 
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID                          \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0     0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1     0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2     0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3     0x743a004dU
+
+/* KVM "vendor specific" services */
+#define ARM_SMCCC_KVM_FUNC_FEATURES            0
+#define ARM_SMCCC_KVM_FUNC_PTP                 1
+#define ARM_SMCCC_KVM_FUNC_FEATURES_2          127
+#define ARM_SMCCC_KVM_NUM_FUNCS                        128
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID                      \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_FEATURES)
+
 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED   1
 
+/*
+ * ptp_kvm is a feature used for time synchronization between the VM and
+ * the host. The ptp_kvm module in the guest kernel gets this service from
+ * the host using this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID                           \
+       ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
+                          ARM_SMCCC_SMC_32,                            \
+                          ARM_SMCCC_OWNER_VENDOR_HYP,                  \
+                          ARM_SMCCC_KVM_FUNC_PTP)
+
+/* ptp_kvm counter type ID */
+#define KVM_PTP_VIRT_COUNTER                   0
+#define KVM_PTP_PHYS_COUNTER                   1
+
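
For context, a guest decides whether these services exist by comparing the UID words above and then reading the feature bitmap. A condensed sketch of that probe, modelled loosely on the SMCCC guest support added elsewhere in this merge (the function name is hypothetical):

#include <linux/arm-smccc.h>
#include <linux/bits.h>

static bool example_kvm_hyp_ptp_supported(void)
{
	struct arm_smccc_res res;

	/* Ask the hypervisor for its UID and match it against KVM's */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
	if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
	    res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
	    res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
	    res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
		return false;

	/* Functions 0-31 are reported as a bitmap in the first result word */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
	return res.a0 & BIT(ARM_SMCCC_KVM_FUNC_PTP);
}
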
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES                          \
        ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                 \
index f639bd0..348acf2 100644 (file)
@@ -36,6 +36,9 @@ static inline int is_warning_bug(const struct bug_entry *bug)
        return bug->flags & BUGFLAG_WARNING;
 }
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+                      unsigned int *line);
+
 struct bug_entry *find_bug(unsigned long bugaddr);
 
 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
@@ -58,6 +61,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
        return BUG_TRAP_TYPE_BUG;
 }
 
+struct bug_entry;
+static inline void bug_get_file_line(struct bug_entry *bug, const char **file,
+                                    unsigned int *line)
+{
+       *file = NULL;
+       *line = 0;
+}
 
 static inline void generic_bug_clear_once(void) {}
 
index 86d143d..1290d0d 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/of.h>
+#include <linux/clocksource_ids.h>
 #include <asm/div64.h>
 #include <asm/io.h>
 
@@ -62,6 +63,10 @@ struct module;
  *                     400-499: Perfect
  *                             The ideal clocksource. A must-use where
  *                             available.
+ * @id:                        Defaults to CSID_GENERIC. The id value is captured
+ *                     in certain snapshot functions to allow callers to
+ *                     validate the clocksource from which the snapshot was
+ *                     taken.
  * @flags:             Flags describing special properties
  * @enable:            Optional function to enable the clocksource
  * @disable:           Optional function to disable the clocksource
@@ -100,6 +105,7 @@ struct clocksource {
        const char              *name;
        struct list_head        list;
        int                     rating;
+       enum clocksource_ids    id;
        enum vdso_clock_mode    vdso_clock_mode;
        unsigned long           flags;
 
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
new file mode 100644 (file)
index 0000000..16775d7
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOCKSOURCE_IDS_H
+#define _LINUX_CLOCKSOURCE_IDS_H
+
+/* Enum to give clocksources a unique identifier */
+enum clocksource_ids {
+       CSID_GENERIC            = 0,
+       CSID_ARM_ARCH_COUNTER,
+       CSID_MAX,
+};
+
+#endif
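A clocksource opts into identification simply by setting the new field; drivers that do nothing keep the zero-initialized CSID_GENERIC default. A hedged sketch follows; the example_* names and the placeholder counter read are illustrative only.

#include <linux/clocksource.h>
#include <linux/clocksource_ids.h>

static u64 example_counter_read(struct clocksource *cs)
{
	return 0;	/* a real driver would read its hardware counter here */
}

/* Minimal sketch: a driver tagging its clocksource so that snapshot
 * consumers can later verify where a timestamp came from. */
static struct clocksource example_counter_cs = {
	.name	= "example_arch_counter",
	.rating	= 400,
	.read	= example_counter_read,
	.mask	= CLOCKSOURCE_MASK(56),
	.id	= CSID_ARM_ARCH_COUNTER,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};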
index 976ec26..85008a6 100644 (file)
@@ -50,6 +50,7 @@ enum coresight_dev_subtype_sink {
        CORESIGHT_DEV_SUBTYPE_SINK_PORT,
        CORESIGHT_DEV_SUBTYPE_SINK_BUFFER,
        CORESIGHT_DEV_SUBTYPE_SINK_SYSMEM,
+       CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM,
 };
 
 enum coresight_dev_subtype_link {
@@ -455,6 +456,18 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o
 }
 #endif /* CONFIG_64BIT */
 
+static inline bool coresight_is_percpu_source(struct coresight_device *csdev)
+{
+       return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) &&
+              (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC);
+}
+
+static inline bool coresight_is_percpu_sink(struct coresight_device *csdev)
+{
+       return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SINK) &&
+              (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM);
+}
+
 extern struct coresight_device *
 coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
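The two helpers make the per-CPU pairing decision explicit for callers. A minimal, hedged sketch of how a caller might use them; the function name is illustrative.

#include <linux/coresight.h>
#include <linux/types.h>

/* Only pair a per-CPU source (ETM/ETE) with a per-CPU system-memory sink
 * such as TRBE; any other combination falls back to the usual sink
 * selection logic. */
static bool example_use_percpu_path(struct coresight_device *source,
				    struct coresight_device *sink)
{
	return coresight_is_percpu_source(source) &&
	       coresight_is_percpu_sink(sink);
}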
index 3f7f89e..51154ed 100644 (file)
@@ -951,8 +951,6 @@ extern void perf_event_itrace_started(struct perf_event *event);
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
-extern int perf_num_counters(void);
-extern const char *perf_pmu_name(void);
 extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
new file mode 100644 (file)
index 0000000..f960a71
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#ifndef _PTP_KVM_H_
+#define _PTP_KVM_H_
+
+struct timespec64;
+struct clocksource;
+
+int kvm_arch_ptp_init(void);
+int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+int kvm_arch_ptp_get_crosststamp(u64 *cycle,
+               struct timespec64 *tspec, struct clocksource **cs);
+
+#endif /* _PTP_KVM_H_ */
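These three hooks are what an architecture provides to the generic ptp_kvm driver. Below is a hedged sketch of what the arm64 clock hook could look like on top of the hypercall IDs added earlier; the a0/a1 reply split is an assumption for illustration (the authoritative layout is in Documentation/virt/kvm/arm/ptp_kvm.rst), and the example_ prefix marks the function as illustrative.

#include <linux/arm-smccc.h>
#include <linux/errno.h>
#include <linux/ptp_kvm.h>
#include <linux/time64.h>

int example_kvm_arch_ptp_get_clock(struct timespec64 *ts)
{
	struct arm_smccc_res res;

	/* Ask the host for its wall-clock time, based on the virtual counter */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
			     KVM_PTP_VIRT_COUNTER, &res);
	if ((long)res.a0 < 0)
		return -EOPNOTSUPP;

	/* assumed reply layout: a0 = seconds, a1 = nanoseconds */
	ts->tv_sec = res.a0;
	ts->tv_nsec = res.a1;
	return 0;
}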
index c6792cf..78a98bd 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_TIMEKEEPING_H
 
 #include <linux/errno.h>
+#include <linux/clocksource_ids.h>
 
 /* Included from linux/ktime.h */
 
@@ -243,11 +244,12 @@ struct ktime_timestamps {
  * @cs_was_changed_seq:        The sequence number of clocksource change events
  */
 struct system_time_snapshot {
-       u64             cycles;
-       ktime_t         real;
-       ktime_t         raw;
-       unsigned int    clock_was_set_seq;
-       u8              cs_was_changed_seq;
+       u64                     cycles;
+       ktime_t                 real;
+       ktime_t                 raw;
+       enum clocksource_ids    cs_id;
+       unsigned int            clock_was_set_seq;
+       u8                      cs_was_changed_seq;
 };
 
 /**
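With the new cs_id field, consumers of ktime_get_snapshot() can verify which clocksource a snapshot was derived from instead of guessing. A minimal sketch; the function name is illustrative.

#include <linux/clocksource_ids.h>
#include <linux/timekeeping.h>
#include <linux/types.h>

/* Return true only if the snapshot came from the Arm architected counter,
 * which is what counter-based cross-timestamping relies on. */
static bool example_snapshot_uses_arch_counter(void)
{
	struct system_time_snapshot snap;

	ktime_get_snapshot(&snap);
	return snap.cs_id == CSID_ARM_ARCH_COUNTER;
}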
index d765334..3fd9a7e 100644 (file)
@@ -1081,6 +1081,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SET_GUEST_DEBUG2 195
 #define KVM_CAP_SGX_ATTRIBUTE 196
 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
+#define KVM_CAP_PTP_KVM 198
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index ad15e40..63971ea 100644 (file)
@@ -1156,10 +1156,15 @@ enum perf_callchain_context {
 /**
  * PERF_RECORD_AUX::flags bits
  */
-#define PERF_AUX_FLAG_TRUNCATED                0x01    /* record was truncated to fit */
-#define PERF_AUX_FLAG_OVERWRITE                0x02    /* snapshot from overwrite mode */
-#define PERF_AUX_FLAG_PARTIAL          0x04    /* record contains gaps */
-#define PERF_AUX_FLAG_COLLISION                0x08    /* sample collided with another */
+#define PERF_AUX_FLAG_TRUNCATED                        0x01    /* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE                        0x02    /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL                  0x04    /* record contains gaps */
+#define PERF_AUX_FLAG_COLLISION                        0x08    /* sample collided with another */
+#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK     0xff00  /* PMU specific trace format type */
+
+/* CoreSight PMU AUX buffer formats */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT       0x0000 /* Default for backward compatibility */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW             0x0100 /* Raw format of the source */
 
 #define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
 #define PERF_FLAG_FD_OUTPUT            (1UL << 1)
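The new PMU-format bits are meant to be set by the trace sink driver on the AUX record it just produced. A minimal sketch using the existing perf_aux_output_flag() helper; the wrapper name is illustrative.

#include <linux/perf_event.h>

/* Tag the AUX data just written as raw CoreSight trace so the perf tool
 * selects the matching decoder. */
static void example_mark_aux_raw(struct perf_output_handle *handle)
{
	perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW);
}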
index 03db40f..88cb0ba 100644 (file)
@@ -580,11 +580,6 @@ static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)       { }
 
-extern __weak const char *perf_pmu_name(void)
-{
-       return "pmu";
-}
-
 static inline u64 perf_clock(void)
 {
        return local_clock();
index cce484a..4fe1df8 100644 (file)
@@ -920,6 +920,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 
        clocksource_arch_init(cs);
 
+       if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+               cs->id = CSID_GENERIC;
        if (cs->vdso_clock_mode < 0 ||
            cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
                pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
index 6aee576..06f55f9 100644 (file)
@@ -1048,6 +1048,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                now = tk_clock_read(&tk->tkr_mono);
+               systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                base_real = ktime_add(tk->tkr_mono.base,
index 8f9d537..45a0584 100644 (file)
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -127,6 +127,22 @@ static inline struct bug_entry *module_find_bug(unsigned long bugaddr)
 }
 #endif
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+                      unsigned int *line)
+{
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+       *file = bug->file;
+#else
+       *file = (const char *)bug + bug->file_disp;
+#endif
+       *line = bug->line;
+#else
+       *file = NULL;
+       *line = 0;
+#endif
+}
+
 struct bug_entry *find_bug(unsigned long bugaddr)
 {
        struct bug_entry *bug;
@@ -153,32 +169,20 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
        disable_trace_on_warning();
 
-       file = NULL;
-       line = 0;
-       warning = 0;
+       bug_get_file_line(bug, &file, &line);
 
-       if (bug) {
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-               file = bug->file;
-#else
-               file = (const char *)bug + bug->file_disp;
-#endif
-               line = bug->line;
-#endif
-               warning = (bug->flags & BUGFLAG_WARNING) != 0;
-               once = (bug->flags & BUGFLAG_ONCE) != 0;
-               done = (bug->flags & BUGFLAG_DONE) != 0;
-
-               if (warning && once) {
-                       if (done)
-                               return BUG_TRAP_TYPE_WARN;
-
-                       /*
-                        * Since this is the only store, concurrency is not an issue.
-                        */
-                       bug->flags |= BUGFLAG_DONE;
-               }
+       warning = (bug->flags & BUGFLAG_WARNING) != 0;
+       once = (bug->flags & BUGFLAG_ONCE) != 0;
+       done = (bug->flags & BUGFLAG_DONE) != 0;
+
+       if (warning && once) {
+               if (done)
+                       return BUG_TRAP_TYPE_WARN;
+
+               /*
+                * Since this is the only store, concurrency is not an issue.
+                */
+               bug->flags |= BUGFLAG_DONE;
        }
 
        /*
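Factoring the file/line extraction into bug_get_file_line() lets other BUG() consumers (such as the nVHE hypervisor's exception path) reuse it. A minimal, hedged sketch of such a caller; the function name is illustrative.

#include <linux/bug.h>
#include <linux/printk.h>

/* Resolve a trapped PC to its bug_entry and print where the BUG() lives.
 * Degrades gracefully when BUGVERBOSE data is not available. */
static void example_print_bug_location(unsigned long bugaddr)
{
	struct bug_entry *bug = find_bug(bugaddr);
	const char *file = NULL;
	unsigned int line = 0;

	if (bug)
		bug_get_file_line(bug, &file, &line);

	pr_err("BUG at %s:%u\n", file ? file : "<unknown>", line);
}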
index 34414e8..bd83158 100644 (file)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /aarch64/get-reg-list
 /aarch64/get-reg-list-sve
+/aarch64/vgic_init
 /s390x/memop
 /s390x/resets
 /s390x/sync_regs_test
index 6b0a9e7..ea5c428 100644 (file)
@@ -79,6 +79,7 @@ TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c
new file mode 100644 (file)
index 0000000..623f31a
--- /dev/null
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic init sequence tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS               4
+
+#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) << 52) | \
+       ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | (index))
+#define REG_OFFSET(vcpu, offset) (((uint64_t)(vcpu) << 32) | (offset))
+
+#define GICR_TYPER 0x8
+
+struct vm_gic {
+       struct kvm_vm *vm;
+       int gic_fd;
+};
+
+static int max_ipa_bits;
+
+/* helper to access a redistributor register */
+static int access_redist_reg(int gicv3_fd, int vcpu, int offset,
+                            uint32_t *val, bool write)
+{
+       uint64_t attr = REG_OFFSET(vcpu, offset);
+
+       return _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
+                                 attr, val, write);
+}
+
+/* dummy guest code */
+static void guest_code(void)
+{
+       GUEST_SYNC(0);
+       GUEST_SYNC(1);
+       GUEST_SYNC(2);
+       GUEST_DONE();
+}
+
+/* we don't want to assert on run execution, hence this helper */
+static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       ucall_init(vm, NULL);
+       int ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+       if (ret)
+               return -errno;
+       return 0;
+}
+
+static struct vm_gic vm_gic_create(void)
+{
+       struct vm_gic v;
+
+       v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       return v;
+}
+
+static void vm_gic_destroy(struct vm_gic *v)
+{
+       close(v->gic_fd);
+       kvm_vm_free(v->vm);
+}
+
+/**
+ * Helper routine that performs KVM device tests in general, and
+ * ARM_VGIC_V3 ones in particular. On return, the ARM_VGIC_V3 device
+ * has a legacy RDIST region set at 0x0 and a DIST region at 0x60000.
+ */
+static void subtest_dist_rdist(struct vm_gic *v)
+{
+       int ret;
+       uint64_t addr;
+
+       /* Check existing group/attributes */
+       kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                             KVM_VGIC_V3_ADDR_TYPE_DIST);
+
+       kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                             KVM_VGIC_V3_ADDR_TYPE_REDIST);
+
+       /* check a non-existent attribute */
+       ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0);
+       TEST_ASSERT(ret && errno == ENXIO, "attribute not supported");
+
+       /* misaligned DIST and REDIST address settings */
+       addr = 0x1000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "GICv3 dist base not 64kB aligned");
+
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "GICv3 redist base not 64kB aligned");
+
+       /* out of range address */
+       if (max_ipa_bits) {
+               addr = 1ULL << max_ipa_bits;
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit");
+
+               ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                        KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+               TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit");
+       }
+
+       /* set REDIST base address @0x0 */
+       addr = 0x00000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+
+       /* Attempt to create a second legacy redistributor region */
+       addr = 0xE0000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EEXIST, "GICv3 redist base set again");
+
+       /* Attempt to mix legacy and new redistributor regions */
+       addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION");
+
+       /*
+        * Set overlapping DIST / REDIST, cannot be detected here. Will be detected
+        * on first vcpu run instead.
+        */
+       addr = 3 * 2 * 0x10000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST,
+                         &addr, true);
+}
+
+/* Test the new REDIST region API */
+static void subtest_redist_regions(struct vm_gic *v)
+{
+       uint64_t addr, expected_addr;
+       int ret;
+
+       ret = kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                    KVM_VGIC_V3_ADDR_TYPE_REDIST);
+       TEST_ASSERT(!ret, "Multiple redist regions advertised");
+
+       addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count == 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "attempt to register the first rdist region with index != 0");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address");
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "register an rdist region overlapping with another one");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1);
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 1ULL << max_ipa_bits, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == E2BIG,
+                   "register redist region with base address beyond IPA range");
+
+       addr = 0x260000;
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL,
+                   "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION");
+
+       /*
+        * Now there are 2 redist regions:
+        * region 0 @ 0x200000 2 redists
+        * region 1 @ 0x240000 1 redist
+        * Attempt to read their characteristics
+        */
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0);
+       expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1);
+       expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1");
+
+       addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false);
+       TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non-existent region");
+
+       addr = 0x260000;
+       kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2);
+       ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist");
+}
+
+/*
+ * The VGIC KVM device is created and initialized before the secondary
+ * VCPUs get created
+ */
+static void test_vgic_then_vcpus(void)
+{
+       struct vm_gic v;
+       int ret, i;
+
+       v.vm = vm_create_default(0, 0, guest_code);
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       subtest_dist_rdist(&v);
+
+       /* Add the rest of the VCPUs */
+       for (i = 1; i < NR_VCPUS; ++i)
+               vm_vcpu_add_default(v.vm, i, guest_code);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+/* All the VCPUs are created before the VGIC KVM device gets initialized */
+static void test_vcpus_then_vgic(void)
+{
+       struct vm_gic v;
+       int ret;
+
+       v = vm_gic_create();
+
+       subtest_dist_rdist(&v);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+static void test_new_redist_regions(void)
+{
+       void *dummy = NULL;
+       struct vm_gic v;
+       uint64_t addr;
+       int ret;
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists");
+       vm_gic_destroy(&v);
+
+       /* step2 */
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init");
+
+       vm_gic_destroy(&v);
+
+       /* step 3 */
+
+       v = vm_gic_create();
+       subtest_redist_regions(&v);
+
+       ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true);
+       TEST_ASSERT(ret && errno == EFAULT,
+                   "register a region from a faulty (NULL) userspace address");
+
+       addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       ret = run_vcpu(v.vm, 3);
+       TEST_ASSERT(!ret, "vcpu run");
+
+       vm_gic_destroy(&v);
+}
+
+static void test_typer_accesses(void)
+{
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret, i;
+
+       v.vm = vm_create_default(0, 0, guest_code);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       vm_vcpu_add_default(v.vm, 3, guest_code);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu");
+
+       vm_vcpu_add_default(v.vm, 1, guest_code);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized");
+
+       vm_vcpu_add_default(v.vm, 2, guest_code);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       for (i = 0; i < NR_VCPUS ; i++) {
+               ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+               TEST_ASSERT(!ret && !val, "read GICR_TYPER before rdist region setting");
+       }
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       /* The first 2 rdists should be put there (vcpu 0 and 3) */
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && !val, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1");
+
+       addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1);
+       ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                                KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+       TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100,
+                   "no redist region attached to vcpu #1 yet, last cannot be returned");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x200,
+                   "no redist region attached to vcpu #2, last cannot be returned");
+
+       addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x210,
+                   "read typer of rdist #1, last properly returned");
+
+       vm_gic_destroy(&v);
+}
+
+/**
+ * Test GICR_TYPER last bit with new redist regions
+ * rdist regions #1 and #2 are contiguous
+ * rdist region #0 @0x100000 2 rdist capacity
+ *     rdists: 0, 3 (Last)
+ * rdist region #1 @0x240000 2 rdist capacity
+ *     rdists:  5, 4 (Last)
+ * rdist region #2 @0x200000 2 rdist capacity
+ *     rdists: 1, 2
+ */
+static void test_last_bit_redist_regions(void)
+{
+       uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 };
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret;
+
+       v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2);
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3");
+
+       ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5");
+
+       ret = access_redist_reg(v.gic_fd, 4, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4");
+
+       vm_gic_destroy(&v);
+}
+
+/* Test last bit with legacy region */
+static void test_last_bit_single_rdist(void)
+{
+       uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 };
+       struct vm_gic v;
+       uint64_t addr;
+       uint32_t val;
+       int ret;
+
+       v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids);
+
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+                         KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true);
+
+       addr = 0x10000;
+       kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+                         KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+
+       ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
+
+       ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #3");
+
+       ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5");
+
+       ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1");
+
+       ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
+       TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #2");
+
+       vm_gic_destroy(&v);
+}
+
+void test_kvm_device(void)
+{
+       struct vm_gic v;
+       int ret, fd;
+
+       v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
+
+       /* try to create a non-existent KVM device */
+       ret = _kvm_create_device(v.vm, 0, true, &fd);
+       TEST_ASSERT(ret && errno == ENODEV, "unsupported device");
+
+       /* trial mode with VGIC_V3 device */
+       ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true, &fd);
+       if (ret) {
+               print_skip("GICv3 not supported");
+               exit(KSFT_SKIP);
+       }
+       v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+
+       ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false, &fd);
+       TEST_ASSERT(ret && errno == EEXIST, "create GICv3 device twice");
+
+       kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true);
+
+       if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true, &fd)) {
+               ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false, &fd);
+               TEST_ASSERT(ret && errno == EINVAL, "create GICv2 while v3 exists");
+       }
+
+       vm_gic_destroy(&v);
+}
+
+int main(int ac, char **av)
+{
+       max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+       test_kvm_device();
+       test_vcpus_then_vgic();
+       test_vgic_then_vcpus();
+       test_new_redist_regions();
+       test_typer_accesses();
+       test_last_bit_redist_regions();
+       test_last_bit_single_rdist();
+
+       return 0;
+}
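For reference, the REDIST_REGION_ATTR_ADDR() values used throughout this test follow the KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attribute layout documented in Documentation/virt/kvm/devices/arm-vgic-v3.rst; the breakdown below is a worked example of that encoding, not new code in this series.

/*
 * REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0) expands to an attribute where:
 *   bits [63:52] = 2         - the region can hold two redistributors
 *   bits [51:16]             - bits [51:16] of the 64kB-aligned guest
 *                              physical base address (here 0x200000)
 *   bits [15:12] = 0         - flags, must currently be zero
 *   bits [11:0]  = 0         - index of the redistributor region
 */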
index 0e6cc25..a8f0227 100644 (file)
@@ -223,6 +223,15 @@ int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
 #endif
 void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
 
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd);
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test);
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                      void *val, bool write);
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write);
+
 const char *exit_reason_str(unsigned int exit_reason);
 
 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
index 35247db..8926f91 100644 (file)
@@ -1731,6 +1731,81 @@ int _kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
 }
 
 /*
+ * Device Ioctl
+ */
+
+int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
+{
+       struct kvm_device_attr attribute = {
+               .group = group,
+               .attr = attr,
+               .flags = 0,
+       };
+
+       return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
+}
+
+int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
+{
+       int ret = _kvm_device_check_attr(dev_fd, group, attr);
+
+       TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno);
+       return ret;
+}
+
+int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd)
+{
+       struct kvm_create_device create_dev;
+       int ret;
+
+       create_dev.type = type;
+       create_dev.fd = -1;
+       create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
+       ret = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &create_dev);
+       *fd = create_dev.fd;
+       return ret;
+}
+
+int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test)
+{
+       int fd, ret;
+
+       ret = _kvm_create_device(vm, type, test, &fd);
+
+       if (!test) {
+               TEST_ASSERT(ret >= 0,
+                           "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno);
+               return fd;
+       }
+       return ret;
+}
+
+int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write)
+{
+       struct kvm_device_attr kvmattr = {
+               .group = group,
+               .attr = attr,
+               .flags = 0,
+               .addr = (uintptr_t)val,
+       };
+       int ret;
+
+       ret = ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
+                   &kvmattr);
+       return ret;
+}
+
+int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
+                     void *val, bool write)
+{
+       int ret = _kvm_device_access(dev_fd, group, attr, val, write);
+
+       TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno);
+       return ret;
+}
+
+/*
  * VM Dump
  *
  * Input Args: