Merge tag 'kvmarm-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmar...
author: Paolo Bonzini <pbonzini@redhat.com>
Fri, 23 Apr 2021 11:41:17 +0000 (07:41 -0400)
committer: Paolo Bonzini <pbonzini@redhat.com>
Fri, 23 Apr 2021 11:41:17 +0000 (07:41 -0400)
KVM/arm64 updates for Linux 5.13

New features:

- Stage-2 isolation for the host kernel when running in protected mode
- Guest SVE support when running in nVHE mode
- Force W^X hypervisor mappings in nVHE mode
- ITS save/restore for guests using direct injection with GICv4.1
- nVHE panics now produce readable backtraces
- Guest support for PTP using the ptp_kvm driver
- Performance improvements in the S2 fault handler
- Alexandru is now a reviewer (not really a new feature...)

Fixes:
- Proper emulation of the GICR_TYPER register
- Handle the complete set of relocations in the nVHE EL2 object
- Get rid of the oprofile dependency in the PMU code (and of the
  oprofile body parts at the same time)
- Debug and SPE fixes
- Fix vcpu reset

982 files changed:
.mailmap
Documentation/ABI/testing/sysfs-fs-xfs
Documentation/admin-guide/cgroup-v1/index.rst
Documentation/admin-guide/cgroup-v1/misc.rst [new file with mode: 0644]
Documentation/admin-guide/cgroup-v2.rst
Documentation/arm64/acpi_object_usage.rst
Documentation/arm64/silicon-errata.rst
Documentation/devicetree/bindings/sound/fsl,spdif.yaml
Documentation/networking/device_drivers/ethernet/amazon/ena.rst
Documentation/networking/devlink/devlink-dpipe.rst
Documentation/networking/devlink/devlink-port.rst
Documentation/networking/xfrm_device.rst
Documentation/virt/kvm/amd-memory-encryption.rst
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/locking.rst
Documentation/virt/kvm/s390-diag.rst
Documentation/x86/sgx.rst
MAINTAINERS
Makefile
arch/arm/boot/dts/am33xx.dtsi
arch/arm/boot/dts/at91-sam9x60ek.dts
arch/arm/boot/dts/at91-sama5d27_som1.dtsi
arch/arm/boot/dts/imx6ul-14x14-evk.dtsi
arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts
arch/arm/boot/dts/sam9x60.dtsi
arch/arm/mach-imx/avic.c
arch/arm/mach-imx/common.h
arch/arm/mach-imx/mach-imx1.c
arch/arm/mach-imx/mach-imx25.c
arch/arm/mach-imx/mach-imx27.c
arch/arm/mach-imx/mach-imx31.c
arch/arm/mach-imx/mach-imx35.c
arch/arm/mach-imx/mm-imx3.c
arch/arm/mach-omap2/sr_device.c
arch/arm64/Kconfig
arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi
arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts
arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi
arch/arm64/include/asm/checksum.h
arch/arm64/include/asm/cpucaps.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/processor.h
arch/arm64/include/asm/thread_info.h
arch/arm64/kernel/cpu_errata.c
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/cpuinfo.c
arch/arm64/kernel/crash_dump.c
arch/arm64/kernel/process.c
arch/arm64/kernel/stacktrace.c
arch/arm64/kvm/arm.c
arch/arm64/kvm/guest.c
arch/arm64/kvm/hyp/vgic-v3-sr.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/trace_arm.h
arch/arm64/mm/mmu.c
arch/csky/kernel/probes/ftrace.c
arch/ia64/kernel/err_inject.c
arch/ia64/kernel/mca.c
arch/mips/include/asm/kvm_host.h
arch/mips/kernel/setup.c
arch/mips/kernel/vmlinux.lds.S
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c
arch/mips/kvm/trap_emul.c
arch/mips/kvm/vz.c
arch/powerpc/include/asm/cpu_has_feature.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kernel/vdso32/gettimeofday.S
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/kvm/trace_booke.h
arch/powerpc/platforms/pseries/lpar.c
arch/powerpc/platforms/pseries/mobility.c
arch/riscv/Kconfig
arch/riscv/Kconfig.socs
arch/riscv/include/asm/asm-prototypes.h
arch/riscv/include/asm/irq.h
arch/riscv/include/asm/processor.h
arch/riscv/include/asm/ptrace.h
arch/riscv/include/asm/sbi.h
arch/riscv/include/asm/timex.h
arch/riscv/include/asm/uaccess.h
arch/riscv/kernel/Makefile
arch/riscv/kernel/entry.S
arch/riscv/kernel/probes/ftrace.c
arch/riscv/kernel/probes/kprobes.c
arch/riscv/kernel/process.c
arch/riscv/kernel/sbi.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/stacktrace.c
arch/riscv/kernel/time.c
arch/riscv/kernel/traps.c
arch/riscv/mm/kasan_init.c
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/pci.h
arch/s390/include/asm/smp.h
arch/s390/include/asm/vdso/data.h
arch/s390/kernel/perf_cpum_cf_diag.c
arch/s390/kernel/smp.c
arch/s390/kernel/time.c
arch/s390/kernel/vtime.c
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/vsie.c
arch/s390/pci/pci.c
arch/s390/pci/pci_event.c
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/events/intel/core.c
arch/x86/events/intel/ds.c
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/sgx.h [moved from arch/x86/kernel/cpu/sgx/arch.h with 89% similarity]
arch/x86/include/asm/smp.h
arch/x86/include/asm/svm.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/vmx.h
arch/x86/include/asm/xen/page.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/cpu/cpuid-deps.c
arch/x86/kernel/cpu/feat_ctl.c
arch/x86/kernel/cpu/scattered.c
arch/x86/kernel/cpu/sgx/Makefile
arch/x86/kernel/cpu/sgx/driver.c
arch/x86/kernel/cpu/sgx/encl.c
arch/x86/kernel/cpu/sgx/encl.h
arch/x86/kernel/cpu/sgx/encls.h
arch/x86/kernel/cpu/sgx/ioctl.c
arch/x86/kernel/cpu/sgx/main.c
arch/x86/kernel/cpu/sgx/sgx.h
arch/x86/kernel/cpu/sgx/virt.c [new file with mode: 0644]
arch/x86/kernel/kprobes/ftrace.c
arch/x86/kernel/kvm.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal.c
arch/x86/kernel/smpboot.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/hyperv.c
arch/x86/kvm/hyperv.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_audit.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_iter.c
arch/x86/kvm/mmu/tdp_iter.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/pmu.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/vmenter.S
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/nested.h
arch/x86/kvm/vmx/sgx.c [new file with mode: 0644]
arch/x86/kvm/vmx/sgx.h [new file with mode: 0644]
arch/x86/kvm/vmx/vmcs12.c
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/vmx/vmx_ops.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/mm/mem_encrypt.c
arch/x86/net/bpf_jit_comp.c
arch/x86/platform/iris/iris.c
arch/x86/xen/p2m.c
arch/x86/xen/setup.c
arch/xtensa/kernel/coprocessor.S
arch/xtensa/mm/fault.c
block/bio.c
block/blk-merge.c
block/blk-mq-debugfs.c
block/partitions/core.c
drivers/acpi/acpica/nsaccess.c
drivers/acpi/internal.h
drivers/acpi/processor_idle.c
drivers/acpi/scan.c
drivers/acpi/tables.c
drivers/acpi/video_detect.c
drivers/atm/fore200e.c
drivers/auxdisplay/charlcd.c
drivers/base/dd.c
drivers/base/power/runtime.c
drivers/block/floppy.c
drivers/block/null_blk/main.c
drivers/block/null_blk/null_blk.h
drivers/block/xen-blkback/blkback.c
drivers/bluetooth/btrsi.c
drivers/bus/omap_l3_noc.c
drivers/bus/ti-sysc.c
drivers/char/applicom.c
drivers/char/toshiba.c
drivers/clk/qcom/clk-rcg2.c
drivers/clk/qcom/clk-rpmh.c
drivers/clk/qcom/gcc-sc7180.c
drivers/counter/stm32-timer-cnt.c
drivers/cpufreq/freq_table.c
drivers/crypto/ccp/sev-dev.c
drivers/crypto/ccp/sev-dev.h
drivers/extcon/extcon.c
drivers/firmware/efi/efi.c
drivers/firmware/efi/vars.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
drivers/gpu/drm/amd/amdgpu/amdgpu_display.h
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
drivers/gpu/drm/amd/amdgpu/dce_virtual.c
drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_link_encoder.c
drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
drivers/gpu/drm/amd/display/dc/dcn30/dcn30_cm_common.c
drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega10_hwmgr.c
drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega12_hwmgr.c
drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_hwmgr.c
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
drivers/gpu/drm/etnaviv/etnaviv_gem.c
drivers/gpu/drm/exynos/exynos5433_drm_decon.c
drivers/gpu/drm/i915/display/intel_atomic_plane.c
drivers/gpu/drm/i915/display/intel_dp.c
drivers/gpu/drm/i915/display/intel_dp_aux.c
drivers/gpu/drm/i915/display/intel_dp_link_training.c
drivers/gpu/drm/i915/display/intel_dp_link_training.h
drivers/gpu/drm/i915/display/intel_vdsc.c
drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c
drivers/gpu/drm/i915/i915_perf.c
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/i915/intel_runtime_pm.c
drivers/gpu/drm/i915/intel_runtime_pm.h
drivers/gpu/drm/imx/imx-drm-core.c
drivers/gpu/drm/imx/imx-ldb.c
drivers/gpu/drm/msm/adreno/a5xx_power.c
drivers/gpu/drm/msm/adreno/a6xx_gmu.c
drivers/gpu/drm/msm/adreno/a6xx_gpu.c
drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
drivers/gpu/drm/msm/dp/dp_aux.c
drivers/gpu/drm/msm/dsi/pll/dsi_pll.c
drivers/gpu/drm/msm/dsi/pll/dsi_pll.h
drivers/gpu/drm/msm/dsi/pll/dsi_pll_7nm.c
drivers/gpu/drm/msm/msm_atomic.c
drivers/gpu/drm/msm/msm_drv.c
drivers/gpu/drm/msm/msm_fence.c
drivers/gpu/drm/msm/msm_kms.h
drivers/gpu/drm/nouveau/dispnv50/disp.c
drivers/gpu/drm/nouveau/nouveau_bo.c
drivers/gpu/drm/omapdrm/dss/dsi.c
drivers/gpu/drm/rcar-du/rcar_du_encoder.c
drivers/gpu/drm/tegra/dc.c
drivers/gpu/drm/tegra/sor.c
drivers/gpu/host1x/bus.c
drivers/iio/adc/Kconfig
drivers/iio/adc/ab8500-gpadc.c
drivers/iio/adc/ad7949.c
drivers/iio/adc/qcom-spmi-vadc.c
drivers/iio/gyro/mpu3050-core.c
drivers/iio/humidity/hid-sensor-humidity.c
drivers/iio/imu/adis16400.c
drivers/iio/light/hid-sensor-prox.c
drivers/iio/temperature/hid-sensor-temperature.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
drivers/infiniband/hw/mlx5/devx.c
drivers/infiniband/hw/mlx5/qp.c
drivers/input/joydev.c
drivers/interconnect/bulk.c
drivers/interconnect/core.c
drivers/interconnect/qcom/msm8939.c
drivers/iommu/amd/init.c
drivers/iommu/tegra-smmu.c
drivers/isdn/capi/kcapi.c
drivers/isdn/hardware/mISDN/mISDNipac.c
drivers/md/dm-ioctl.c
drivers/md/dm-table.c
drivers/md/dm-verity-target.c
drivers/md/dm-zoned-target.c
drivers/md/dm.c
drivers/media/firewire/firedtv-fw.c
drivers/media/pci/cx18/cx18-alsa-main.c
drivers/media/pci/cx18/cx18-driver.c
drivers/media/pci/cx25821/cx25821-alsa.c
drivers/media/pci/cx88/cx88-alsa.c
drivers/media/pci/ivtv/ivtv-alsa-main.c
drivers/media/pci/ivtv/ivtv-driver.c
drivers/media/pci/sta2x11/sta2x11_vip.c
drivers/media/platform/atmel/atmel-isi.c
drivers/media/platform/atmel/atmel-sama5d2-isc.c
drivers/media/platform/marvell-ccic/cafe-driver.c
drivers/media/platform/stm32/stm32-dcmi.c
drivers/media/usb/cpia2/cpia2_v4l.c
drivers/media/usb/tm6000/tm6000-alsa.c
drivers/media/usb/tm6000/tm6000-dvb.c
drivers/mfd/intel_quark_i2c_gpio.c
drivers/misc/mei/client.c
drivers/mtd/maps/sun_uflash.c
drivers/net/arcnet/com20020-pci.c
drivers/net/bonding/bond_main.c
drivers/net/can/c_can/c_can.c
drivers/net/can/c_can/c_can_pci.c
drivers/net/can/c_can/c_can_platform.c
drivers/net/can/dev/netlink.c
drivers/net/can/flexcan.c
drivers/net/can/kvaser_pciefd.c
drivers/net/can/m_can/m_can.c
drivers/net/can/peak_canfd/peak_pciefd_main.c
drivers/net/can/sja1000/ems_pci.c
drivers/net/can/sja1000/ems_pcmcia.c
drivers/net/can/sja1000/kvaser_pci.c
drivers/net/can/sja1000/peak_pci.c
drivers/net/can/sja1000/peak_pcmcia.c
drivers/net/can/sja1000/plx_pci.c
drivers/net/can/usb/Kconfig
drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
drivers/net/can/usb/peak_usb/pcan_usb.c
drivers/net/can/usb/peak_usb/pcan_usb_fd.c
drivers/net/can/usb/peak_usb/pcan_usb_pro.c
drivers/net/dsa/b53/b53_common.c
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/mt7530.c
drivers/net/ethernet/broadcom/Kconfig
drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
drivers/net/ethernet/faraday/ftgmac100.c
drivers/net/ethernet/intel/e1000e/82571.c
drivers/net/ethernet/intel/e1000e/hw.h
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/ice/ice_base.c
drivers/net/ethernet/intel/ice/ice_txrx.c
drivers/net/ethernet/intel/ice/ice_xsk.c
drivers/net/ethernet/intel/igb/e1000_hw.h
drivers/net/ethernet/intel/igb/igb.h
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/igb/igb_ptp.c
drivers/net/ethernet/intel/igc/igc.h
drivers/net/ethernet/intel/igc/igc_ethtool.c
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/intel/igc/igc_ptp.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/marvell/Kconfig
drivers/net/ethernet/marvell/mv643xx_eth.c
drivers/net/ethernet/marvell/octeontx2/af/npc_profile.h
drivers/net/ethernet/marvell/octeontx2/af/rvu.c
drivers/net/ethernet/marvell/octeontx2/af/rvu.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
drivers/net/ethernet/marvell/pxa168_eth.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c
drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h
drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c
drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
drivers/net/ethernet/netronome/nfp/flower/metadata.c
drivers/net/ethernet/netronome/nfp/flower/offload.c
drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
drivers/net/ethernet/pensando/ionic/ionic_txrx.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ethernet/socionext/netsec.c
drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
drivers/net/ethernet/xilinx/xilinx_axienet_main.c
drivers/net/hamradio/scc.c
drivers/net/ipa/ipa_cmd.c
drivers/net/ipa/ipa_qmi.c
drivers/net/phy/broadcom.c
drivers/net/phy/phylink.c
drivers/net/usb/cdc-phonet.c
drivers/net/usb/r8152.c
drivers/net/veth.c
drivers/net/wan/hdlc_x25.c
drivers/net/wireless/admtek/adm8211.c
drivers/net/wireless/ath/ath5k/base.c
drivers/net/wireless/ath/ath9k/hw.c
drivers/net/wireless/ath/ath9k/init.c
drivers/net/wireless/atmel/atmel.c
drivers/net/wireless/atmel/atmel_cs.c
drivers/net/wireless/atmel/atmel_pci.c
drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
drivers/net/wireless/broadcom/brcm80211/brcmutil/utils.c
drivers/net/wireless/cisco/airo.c
drivers/net/wireless/cisco/airo_cs.c
drivers/net/wireless/intersil/hostap/hostap_cs.c
drivers/net/wireless/intersil/hostap/hostap_pci.c
drivers/net/wireless/intersil/hostap/hostap_plx.c
drivers/net/wireless/ralink/rt2x00/rt2400pci.c
drivers/net/wireless/ralink/rt2x00/rt2500pci.c
drivers/net/wireless/ralink/rt2x00/rt2500usb.c
drivers/net/wireless/ralink/rt2x00/rt2800pci.c
drivers/net/wireless/ralink/rt2x00/rt2800usb.c
drivers/net/wireless/ralink/rt2x00/rt61pci.c
drivers/net/wireless/ralink/rt2x00/rt73usb.c
drivers/net/wireless/rsi/rsi_91x_main.c
drivers/net/wireless/rsi/rsi_91x_sdio.c
drivers/net/wireless/rsi/rsi_91x_usb.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/target/core.c
drivers/nvme/target/loop.c
drivers/nvme/target/tcp.c
drivers/parport/parport_amiga.c
drivers/parport/parport_atari.c
drivers/parport/parport_gsc.c
drivers/parport/parport_mfc3.c
drivers/parport/parport_sunbpp.c
drivers/pci/hotplug/rpadlpar_sysfs.c
drivers/pci/hotplug/s390_pci_hpc.c
drivers/pinctrl/intel/pinctrl-intel.c
drivers/pinctrl/pinctrl-microchip-sgpio.c
drivers/pinctrl/pinctrl-rockchip.c
drivers/pinctrl/qcom/pinctrl-lpass-lpi.c
drivers/pinctrl/qcom/pinctrl-sc7280.c
drivers/pinctrl/qcom/pinctrl-sdx55.c
drivers/platform/x86/Kconfig
drivers/platform/x86/dell/dell-wmi-sysman/enum-attributes.c
drivers/platform/x86/dell/dell-wmi-sysman/int-attributes.c
drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c
drivers/platform/x86/dell/dell-wmi-sysman/string-attributes.c
drivers/platform/x86/dell/dell-wmi-sysman/sysman.c
drivers/platform/x86/intel-hid.c
drivers/platform/x86/intel-vbtn.c
drivers/platform/x86/intel_pmc_core.c
drivers/platform/x86/intel_pmt_class.c
drivers/platform/x86/intel_pmt_crashlog.c
drivers/platform/x86/thinkpad_acpi.c
drivers/ptp/ptp_qoriq.c
drivers/s390/block/dasd.c
drivers/sbus/char/display7seg.c
drivers/scsi/hpsa.c
drivers/scsi/ibmvscsi/ibmvfc.c
drivers/scsi/lpfc/lpfc_debugfs.c
drivers/scsi/mpt3sas/mpt3sas_base.c
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/myrs.c
drivers/scsi/pcmcia/nsp_cs.c
drivers/scsi/qedi/qedi_main.c
drivers/scsi/qla2xxx/qla_target.c
drivers/scsi/qla2xxx/qla_target.h
drivers/scsi/qla2xxx/tcm_qla2xxx.c
drivers/scsi/scsi_transport_iscsi.c
drivers/scsi/sd_zbc.c
drivers/scsi/smartpqi/smartpqi_init.c
drivers/scsi/st.c
drivers/scsi/ufs/ufs-mediatek.c
drivers/sh/maple/maple.c
drivers/soc/litex/litex_soc_ctrl.c
drivers/soc/qcom/qcom-geni-se.c
drivers/soc/ti/omap_prm.c
drivers/spi/spi-cadence-quadspi.c
drivers/staging/comedi/drivers/cb_pcidas.c
drivers/staging/comedi/drivers/cb_pcidas64.c
drivers/staging/comedi/drivers/vmk80xx.c
drivers/staging/rtl8192e/rtllib.h
drivers/staging/rtl8192e/rtllib_rx.c
drivers/staging/vt6655/rxtx.h
drivers/target/target_core_pscsi.c
drivers/tee/optee/core.c
drivers/thermal/thermal_sysfs.c
drivers/thunderbolt/switch.c
drivers/thunderbolt/tb.c
drivers/tty/serial/icom.c
drivers/tty/serial/jsm/jsm_driver.c
drivers/tty/serial/qcom_geni_serial.c
drivers/usb/cdns3/cdnsp-ring.c
drivers/usb/class/cdc-acm.c
drivers/usb/core/quirks.c
drivers/usb/dwc2/hcd.c
drivers/usb/dwc3/dwc3-pci.c
drivers/usb/dwc3/dwc3-qcom.c
drivers/usb/dwc3/gadget.c
drivers/usb/gadget/configfs.c
drivers/usb/gadget/udc/amd5536udc_pci.c
drivers/usb/host/xhci-mtk.c
drivers/usb/misc/ldusb.c
drivers/usb/musb/musb_core.c
drivers/usb/storage/transport.c
drivers/usb/storage/unusual_devs.h
drivers/usb/typec/tcpm/tcpm.c
drivers/usb/typec/tps6598x.c
drivers/usb/usbip/vhci_hcd.c
drivers/usb/usbip/vudc_sysfs.c
drivers/vdpa/ifcvf/ifcvf_main.c
drivers/vdpa/mlx5/net/mlx5_vnet.c
drivers/vdpa/vdpa.c
drivers/vdpa/vdpa_sim/vdpa_sim.c
drivers/vdpa/vdpa_sim/vdpa_sim_net.c
drivers/vfio/Kconfig
drivers/vfio/pci/Kconfig
drivers/vfio/platform/Kconfig
drivers/vfio/vfio_iommu_type1.c
drivers/vhost/vdpa.c
drivers/vhost/vhost.c
drivers/video/fbdev/core/fbcon.c
drivers/video/fbdev/hyperv_fb.c
drivers/virtio/virtio.c
drivers/virtio/virtio_mmio.c
drivers/watchdog/cpu5wdt.c
drivers/watchdog/cpwd.c
drivers/watchdog/riowd.c
drivers/xen/Kconfig
fs/afs/dir.c
fs/afs/file.c
fs/afs/fs_operation.c
fs/afs/inode.c
fs/afs/internal.h
fs/afs/mntpt.c
fs/afs/write.c
fs/afs/xattr.c
fs/block_dev.c
fs/btrfs/Makefile
fs/btrfs/ctree.c
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/qgroup.c
fs/btrfs/reada.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/cachefiles/bind.c
fs/cachefiles/rdwr.c
fs/cifs/cifs_swn.c
fs/cifs/cifsacl.c
fs/cifs/cifsglob.h
fs/cifs/cifspdu.h
fs/cifs/file.c
fs/cifs/fs_context.c
fs/cifs/inode.c
fs/cifs/smb2glob.h
fs/cifs/smb2misc.c
fs/cifs/smb2ops.c
fs/cifs/smb2transport.c
fs/cifs/transport.c
fs/ext4/balloc.c
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/fast_commit.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/namei.c
fs/ext4/super.c
fs/ext4/sysfs.c
fs/ext4/verity.c
fs/ext4/xattr.c
fs/fuse/dev.c
fs/fuse/fuse_i.h
fs/fuse/virtio_fs.c
fs/gfs2/super.c
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
fs/iomap/swapfile.c
fs/locks.c
fs/nfsd/Kconfig
fs/nfsd/filecache.c
fs/nfsd/nfs4callback.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4state.c
fs/reiserfs/xattr.h
fs/select.c
fs/squashfs/export.c
fs/squashfs/id.c
fs/squashfs/squashfs_fs.h
fs/squashfs/xattr_id.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_symlink.c
fs/zonefs/super.c
include/acpi/acpi_bus.h
include/drm/ttm/ttm_bo_api.h
include/linux/acpi.h
include/linux/amba/bus.h
include/linux/blkdev.h
include/linux/bpf.h
include/linux/cgroup_subsys.h
include/linux/device-mapper.h
include/linux/efi.h
include/linux/extcon.h
include/linux/firmware/intel/stratix10-svc-client.h
include/linux/host1x.h
include/linux/hugetlb_cgroup.h
include/linux/if_macvlan.h
include/linux/io_uring.h
include/linux/kvm_host.h
include/linux/memblock.h
include/linux/misc_cgroup.h [new file with mode: 0644]
include/linux/mlx5/qp.h
include/linux/mm.h
include/linux/mmu_notifier.h
include/linux/module.h
include/linux/mutex.h
include/linux/netdevice.h
include/linux/netfilter/x_tables.h
include/linux/pagemap.h
include/linux/psp-sev.h
include/linux/qcom-geni-se.h
include/linux/restart_block.h
include/linux/skbuff.h
include/linux/sunrpc/svc_rdma.h
include/linux/thread_info.h
include/linux/usb_usual.h
include/linux/usermode_driver.h
include/linux/vdpa.h
include/linux/virtio.h
include/linux/ww_mutex.h
include/linux/xarray.h
include/net/dst.h
include/net/inet_connection_sock.h
include/net/netfilter/nf_tables.h
include/net/nexthop.h
include/net/red.h
include/net/rtnetlink.h
include/net/sock.h
include/scsi/scsi_transport_iscsi.h
include/trace/events/kvm.h
include/trace/events/workqueue.h
include/uapi/linux/blkpg.h
include/uapi/linux/bpf.h
include/uapi/linux/fuse.h
include/uapi/linux/kvm.h
include/uapi/linux/psample.h
init/Kconfig
kernel/bpf/bpf_inode_storage.c
kernel/bpf/bpf_struct_ops.c
kernel/bpf/core.c
kernel/bpf/preload/bpf_preload_kern.c
kernel/bpf/syscall.c
kernel/bpf/trampoline.c
kernel/bpf/verifier.c
kernel/cgroup/Makefile
kernel/cgroup/misc.c [new file with mode: 0644]
kernel/fork.c
kernel/futex.c
kernel/gcov/clang.c
kernel/irq/irq_sim.c
kernel/irq/manage.c
kernel/jump_label.c
kernel/locking/mutex.c
kernel/power/energy_model.c
kernel/ptrace.c
kernel/reboot.c
kernel/signal.c
kernel/static_call.c
kernel/time/alarmtimer.c
kernel/time/hrtimer.c
kernel/time/posix-cpu-timers.c
kernel/trace/ftrace.c
kernel/trace/trace.c
kernel/usermode_driver.c
lib/math/div64.c
lib/test_xarray.c
lib/xarray.c
mm/highmem.c
mm/hugetlb.c
mm/hugetlb_cgroup.c
mm/kfence/core.c
mm/kmemleak.c
mm/memory.c
mm/mmu_notifier.c
mm/page-writeback.c
mm/z3fold.c
net/batman-adv/main.c
net/bridge/br_switchdev.c
net/can/isotp.c
net/core/dev.c
net/core/drop_monitor.c
net/core/dst.c
net/core/filter.c
net/core/flow_dissector.c
net/core/sock.c
net/dccp/ipv6.c
net/dsa/dsa2.c
net/ipv4/inet_connection_sock.c
net/ipv4/ipconfig.c
net/ipv4/netfilter/arp_tables.c
net/ipv4/netfilter/ip_tables.c
net/ipv4/route.c
net/ipv4/tcp_minisocks.c
net/ipv6/ip6_fib.c
net/ipv6/ip6_input.c
net/ipv6/netfilter/ip6_tables.c
net/ipv6/route.c
net/ipv6/tcp_ipv6.c
net/mac80211/aead_api.c
net/mac80211/aes_gmac.c
net/mac80211/cfg.c
net/mac80211/ibss.c
net/mac80211/main.c
net/mac80211/mlme.c
net/mac80211/rc80211_minstrel_ht.c
net/mac80211/util.c
net/mptcp/options.c
net/mptcp/protocol.c
net/mptcp/subflow.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_proto_gre.c
net/netfilter/nf_flow_table_core.c
net/netfilter/nf_tables_api.c
net/netfilter/x_tables.c
net/openvswitch/conntrack.c
net/openvswitch/conntrack.h
net/openvswitch/flow.c
net/qrtr/qrtr.c
net/sched/act_ct.c
net/sched/cls_api.c
net/sched/cls_flower.c
net/sched/sch_choke.c
net/sched/sch_gred.c
net/sched/sch_htb.c
net/sched/sch_red.c
net/sched/sch_sfq.c
net/sctp/output.c
net/sctp/outqueue.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/svc.c
net/sunrpc/svc_xprt.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/tipc/node.c
net/vmw_vsock/af_vsock.c
net/wireless/nl80211.c
scripts/module.lds.S
security/integrity/iint.c
security/selinux/include/security.h
security/selinux/selinuxfs.c
security/selinux/ss/services.c
security/tomoyo/network.c
sound/drivers/aloop.c
sound/drivers/dummy.c
sound/drivers/mtpav.c
sound/drivers/mts64.c
sound/drivers/pcsp/pcsp.c
sound/drivers/portman2x4.c
sound/drivers/serial-u16550.c
sound/drivers/virmidi.c
sound/firewire/dice/dice-stream.c
sound/isa/ad1816a/ad1816a.c
sound/isa/ad1848/ad1848.c
sound/isa/als100.c
sound/isa/azt2320.c
sound/isa/cmi8330.c
sound/isa/cs423x/cs4231.c
sound/isa/cs423x/cs4236.c
sound/isa/es1688/es1688.c
sound/isa/es18xx.c
sound/isa/gus/gusclassic.c
sound/isa/gus/gusextreme.c
sound/isa/gus/gusmax.c
sound/isa/gus/interwave.c
sound/isa/opl3sa2.c
sound/isa/opti9xx/miro.c
sound/isa/opti9xx/opti92x-ad1848.c
sound/isa/sb/jazz16.c
sound/isa/sb/sb16.c
sound/isa/sb/sb8.c
sound/isa/sc6000.c
sound/isa/wavefront/wavefront.c
sound/mips/sgio2audio.c
sound/pci/ad1889.c
sound/pci/ali5451/ali5451.c
sound/pci/als300.c
sound/pci/als4000.c
sound/pci/atiixp.c
sound/pci/atiixp_modem.c
sound/pci/au88x0/au88x0.c
sound/pci/azt3328.c
sound/pci/bt87x.c
sound/pci/ca0106/ca0106_main.c
sound/pci/cmipci.c
sound/pci/cs4281.c
sound/pci/cs46xx/cs46xx.c
sound/pci/cs5535audio/cs5535audio.c
sound/pci/ctxfi/xfi.c
sound/pci/echoaudio/echoaudio.c
sound/pci/emu10k1/emu10k1.c
sound/pci/emu10k1/emu10k1x.c
sound/pci/ens1370.c
sound/pci/es1938.c
sound/pci/es1968.c
sound/pci/fm801.c
sound/pci/hda/hda_generic.c
sound/pci/hda/hda_intel.c
sound/pci/hda/patch_realtek.c
sound/pci/ice1712/ice1712.c
sound/pci/ice1712/ice1724.c
sound/pci/intel8x0.c
sound/pci/intel8x0m.c
sound/pci/korg1212/korg1212.c
sound/pci/lola/lola.c
sound/pci/lx6464es/lx6464es.c
sound/pci/maestro3.c
sound/pci/mixart/mixart.c
sound/pci/nm256/nm256.c
sound/pci/oxygen/oxygen.c
sound/pci/oxygen/se6x.c
sound/pci/oxygen/virtuoso.c
sound/pci/pcxhr/pcxhr.c
sound/pci/riptide/riptide.c
sound/pci/rme32.c
sound/pci/rme96.c
sound/pci/rme9652/hdsp.c
sound/pci/rme9652/hdspm.c
sound/pci/rme9652/rme9652.c
sound/pci/sis7019.c
sound/pci/sonicvibes.c
sound/pci/trident/trident.c
sound/pci/via82xx.c
sound/pci/via82xx_modem.c
sound/pci/vx222/vx222.c
sound/pci/ymfpci/ymfpci.c
sound/pcmcia/pdaudiocf/pdaudiocf.c
sound/pcmcia/vx/vxpocket.c
sound/ppc/powermac.c
sound/sh/aica.c
sound/sh/sh_dac_audio.c
sound/soc/codecs/Kconfig
sound/soc/codecs/ak4458.c
sound/soc/codecs/ak5558.c
sound/soc/codecs/cs42l42.c
sound/soc/codecs/cs42l42.h
sound/soc/codecs/es8316.c
sound/soc/codecs/lpass-rx-macro.c
sound/soc/codecs/lpass-va-macro.c
sound/soc/codecs/lpass-wsa-macro.c
sound/soc/codecs/rt1015.c
sound/soc/codecs/rt5640.c
sound/soc/codecs/rt5651.c
sound/soc/codecs/rt5659.c
sound/soc/codecs/rt5670.c
sound/soc/codecs/rt5670.h
sound/soc/codecs/rt711.c
sound/soc/codecs/sgtl5000.c
sound/soc/codecs/sirf-audio-codec.h [deleted file]
sound/soc/codecs/wcd934x.c
sound/soc/fsl/fsl_ssi.c
sound/soc/generic/simple-card-utils.c
sound/soc/intel/boards/bytcr_rt5640.c
sound/soc/mediatek/mt8192/mt8192-dai-tdm.c
sound/soc/mediatek/mt8192/mt8192-reg.h
sound/soc/qcom/lpass-cpu.c
sound/soc/qcom/sdm845.c
sound/soc/soc-core.c
sound/soc/sof/intel/hda-dsp.c
sound/soc/sof/intel/hda.c
sound/sparc/amd7930.c
sound/sparc/cs4231.c
sound/sparc/dbri.c
sound/usb/6fire/chip.c
sound/usb/caiaq/device.c
sound/usb/card.c
sound/usb/hiface/chip.c
sound/usb/misc/ua101.c
sound/usb/mixer_quirks.c
sound/usb/quirks.c
sound/usb/usx2y/usbusx2y.c
sound/x86/intel_hdmi_audio.c
sound/xen/xen_snd_front.c
tools/include/asm-generic/hugetlb_encode.h
tools/include/uapi/linux/kvm.h
tools/kvm/kvm_stat/kvm_stat.service
tools/lib/bpf/Makefile
tools/lib/bpf/btf_dump.c
tools/lib/bpf/libbpf.c
tools/lib/bpf/netlink.c
tools/perf/builtin-daemon.c
tools/perf/tests/bpf.c
tools/perf/tests/shell/daemon.sh
tools/perf/util/auxtrace.c
tools/perf/util/bpf-event.c
tools/perf/util/parse-events.c
tools/perf/util/pmu.c
tools/perf/util/pmu.h
tools/perf/util/synthetic-events.c
tools/perf/util/vdso.c
tools/testing/kunit/configs/broken_on_uml.config
tools/testing/kunit/kunit_config.py
tools/testing/radix-tree/idr-test.c
tools/testing/radix-tree/linux/compiler_types.h [deleted file]
tools/testing/radix-tree/multiorder.c
tools/testing/radix-tree/xarray.c
tools/testing/selftests/arm64/fp/sve-test.S
tools/testing/selftests/bpf/prog_tests/check_mtu.c
tools/testing/selftests/bpf/prog_tests/fexit_sleep.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
tools/testing/selftests/bpf/progs/fexit_sleep.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_check_mtu.c
tools/testing/selftests/bpf/progs/test_tunnel_kern.c
tools/testing/selftests/bpf/verifier/bounds_deduction.c
tools/testing/selftests/bpf/verifier/map_ptr.c
tools/testing/selftests/bpf/verifier/unpriv.c
tools/testing/selftests/bpf/verifier/value_ptr_arith.c
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/hardware_disable_test.c
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/test_util.h
tools/testing/selftests/kvm/kvm_page_table_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/lib/assert.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/kvm_util_internal.h
tools/testing/selftests/kvm/lib/test_util.c
tools/testing/selftests/kvm/x86_64/get_msr_index_features.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/hyperv_clock.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
tools/testing/selftests/net/mptcp/mptcp_join.sh
tools/testing/selftests/net/reuseaddr_ports_exhausted.c
tools/testing/selftests/sgx/defines.h
tools/testing/selftests/sgx/load.c
tools/testing/selftests/sgx/main.c
tools/testing/selftests/vm/Makefile
virt/kvm/coalesced_mmio.c
virt/kvm/kvm_main.c

index 85b93cd..541635d 100644 (file)
--- a/.mailmap
+++ b/.mailmap
@@ -36,6 +36,7 @@ Andrew Morton <akpm@linux-foundation.org>
 Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk>
 Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com>
 Andrew Vasquez <andrew.vasquez@qlogic.com>
+Andrey Konovalov <andreyknvl@gmail.com> <andreyknvl@google.com>
 Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
 Andrey Ryabinin <ryabinin.a.a@gmail.com> <aryabinin@virtuozzo.com>
 Andy Adamson <andros@citi.umich.edu>
@@ -65,6 +66,8 @@ Changbin Du <changbin.du@intel.com> <changbin.du@gmail.com>
 Changbin Du <changbin.du@intel.com> <changbin.du@intel.com>
 Chao Yu <chao@kernel.org> <chao2.yu@samsung.com>
 Chao Yu <chao@kernel.org> <yuchao0@huawei.com>
+Chris Chiu <chris.chiu@canonical.com> <chiu@endlessm.com>
+Chris Chiu <chris.chiu@canonical.com> <chiu@endlessos.org>
 Christophe Ricard <christophe.ricard@gmail.com>
 Christoph Hellwig <hch@lst.de>
 Corey Minyard <minyard@acm.org>
index ea0cc8c..f704925 100644 (file)
@@ -33,7 +33,7 @@ Contact:      xfs@oss.sgi.com
 Description:
                The current state of the log write grant head. It
                represents the total log reservation of all currently
-               oustanding transactions, including regrants due to
+               outstanding transactions, including regrants due to
                rolling transactions. The grant head is exported in
                "cycle:bytes" format.
 Users:         xfstests
index 226f644..99fbc8a 100644 (file)
@@ -17,6 +17,7 @@ Control Groups version 1
     hugetlb
     memcg_test
     memory
+    misc
     net_cls
     net_prio
     pids
diff --git a/Documentation/admin-guide/cgroup-v1/misc.rst b/Documentation/admin-guide/cgroup-v1/misc.rst
new file mode 100644 (file)
index 0000000..661614c
--- /dev/null
@@ -0,0 +1,4 @@
+===============
+Misc controller
+===============
+Please refer to the "Misc" documentation in Documentation/admin-guide/cgroup-v2.rst
index 64c62b9..b1e81aa 100644 (file)
@@ -65,8 +65,11 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
        5-7-1. RDMA Interface Files
      5-8. HugeTLB
        5.8-1. HugeTLB Interface Files
-     5-8. Misc
-       5-8-1. perf_event
+     5-9. Misc
+       5.9-1 Miscellaneous cgroup Interface Files
+       5.9-2 Migration and Ownership
+     5-10. Others
+       5-10-1. perf_event
      5-N. Non-normative information
        5-N-1. CPU controller root cgroup process behaviour
        5-N-2. IO controller root cgroup process behaviour
@@ -2171,6 +2174,72 @@ HugeTLB Interface Files
 Misc
 ----
 
+The Miscellaneous cgroup provides the resource limiting and tracking
+mechanism for the scalar resources which cannot be abstracted like the other
+cgroup resources. Controller is enabled by the CONFIG_CGROUP_MISC config
+option.
+
+A resource can be added to the controller via enum misc_res_type{} in the
+include/linux/misc_cgroup.h file and the corresponding name via misc_res_name[]
+in the kernel/cgroup/misc.c file. Provider of the resource must set its
+capacity prior to using the resource by calling misc_cg_set_capacity().
+
+Once a capacity is set then the resource usage can be updated using charge and
+uncharge APIs. All of the APIs to interact with misc controller are in
+include/linux/misc_cgroup.h.
+
+Misc Interface Files
+~~~~~~~~~~~~~~~~~~~~
+
+Miscellaneous controller provides 3 interface files. If two misc resources (res_a and res_b) are registered then:
+
+  misc.capacity
+        A read-only flat-keyed file shown only in the root cgroup.  It shows
+        miscellaneous scalar resources available on the platform along with
+        their quantities::
+
+         $ cat misc.capacity
+         res_a 50
+         res_b 10
+
+  misc.current
+        A read-only flat-keyed file shown in the non-root cgroups.  It shows
+        the current usage of the resources in the cgroup and its children.::
+
+         $ cat misc.current
+         res_a 3
+         res_b 0
+
+  misc.max
+        A read-write flat-keyed file shown in the non-root cgroups. Allowed
+        maximum usage of the resources in the cgroup and its children.::
+
+         $ cat misc.max
+         res_a max
+         res_b 4
+
+       Limit can be set by::
+
+         # echo res_a 1 > misc.max
+
+       Limit can be set to max by::
+
+         # echo res_a max > misc.max
+
+        Limits can be set higher than the capacity value in the misc.capacity
+        file.
+
+Migration and Ownership
+~~~~~~~~~~~~~~~~~~~~~~~
+
+A miscellaneous scalar resource is charged to the cgroup in which it is used
+first, and stays charged to that cgroup until that resource is freed. Migrating
+a process to a different cgroup does not move the charge to the destination
+cgroup where the process has moved.
+
+Others
+------
+
 perf_event
 ~~~~~~~~~~
 
index 377e9d2..0609da7 100644 (file)
@@ -17,12 +17,12 @@ For ACPI on arm64, tables also fall into the following categories:
 
        -  Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT
 
-       -  Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT,
-          MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT, STAO,
-         TCPA, TPM2, UEFI, XENV
+       -  Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IBFT,
+          IORT, MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT,
+          STAO, TCPA, TPM2, UEFI, XENV
 
-       -  Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT,
-          MSDM, OEMx, PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT
+       -  Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IVRS, LPIT, MSDM, OEMx,
+          PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT
 
 ====== ========================================================================
 Table  Usage for ARMv8 Linux
index 7195102..d410a47 100644 (file)
@@ -130,6 +130,9 @@ stable kernels.
 | Marvell        | ARM-MMU-500     | #582743         | N/A                         |
 +----------------+-----------------+-----------------+-----------------------------+
 +----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA         | Carmel Core     | N/A             | NVIDIA_CARMEL_CNP_ERRATUM   |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
 | Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
 +----------------+-----------------+-----------------+-----------------------------+
 +----------------+-----------------+-----------------+-----------------------------+
index 50449b6..4454aca 100644 (file)
@@ -21,6 +21,10 @@ properties:
       - fsl,vf610-spdif
       - fsl,imx6sx-spdif
       - fsl,imx8qm-spdif
+      - fsl,imx8qxp-spdif
+      - fsl,imx8mq-spdif
+      - fsl,imx8mm-spdif
+      - fsl,imx8mn-spdif
 
   reg:
     maxItems: 1
index 3561a8a..f8c6469 100644 (file)
@@ -267,7 +267,7 @@ DATA PATH
 Tx
 --
 
-end_start_xmit() is called by the stack. This function does the following:
+ena_start_xmit() is called by the stack. This function does the following:
 
 - Maps data buffers (skb->data and frags).
 - Populates ena_buf for the push buffer (if the driver and device are
index 468fe10..af37f25 100644 (file)
@@ -52,7 +52,7 @@ purposes as a standard complementary tool. The system's view from
 ``devlink-dpipe`` should change according to the changes done by the
 standard configuration tools.
 
-For example, it’s quiet common to  implement Access Control Lists (ACL)
+For example, it’s quite common to  implement Access Control Lists (ACL)
 using Ternary Content Addressable Memory (TCAM). The TCAM memory can be
 divided into TCAM regions. Complex TC filters can have multiple rules with
 different priorities and different lookup keys. On the other hand hardware
index e99b415..ab790e7 100644 (file)
@@ -151,7 +151,7 @@ representor netdevice.
 -------------
 A subfunction devlink port is created but it is not active yet. That means the
 entities are created on devlink side, the e-switch port representor is created,
-but the subfunction device itself it not created. A user might use e-switch port
+but the subfunction device itself is not created. A user might use e-switch port
 representor to do settings, putting it into bridge, adding TC rules, etc. A user
 might as well configure the hardware address (such as MAC address) of the
 subfunction while subfunction is inactive.
@@ -173,7 +173,7 @@ Terms and Definitions
    * - Term
      - Definitions
    * - ``PCI device``
-     - A physical PCI device having one or more PCI bus consists of one or
+     - A physical PCI device having one or more PCI buses consists of one or
        more PCI controllers.
    * - ``PCI controller``
      -  A controller consists of potentially multiple physical functions,
index da1073a..01391df 100644 (file)
@@ -50,7 +50,7 @@ Callbacks to implement
 
 The NIC driver offering ipsec offload will need to implement these
 callbacks to make the offload available to the network stack's
-XFRM subsytem.  Additionally, the feature bits NETIF_F_HW_ESP and
+XFRM subsystem.  Additionally, the feature bits NETIF_F_HW_ESP and
 NETIF_F_HW_ESP_TX_CSUM will signal the availability of the offload.
 
 
index 469a630..907adfe 100644 (file)
@@ -148,6 +148,9 @@ measurement. Since the guest owner knows the initial contents of the guest at
 boot, the measurement can be verified by comparing it to what the guest owner
 expects.
 
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
 Parameters (in): struct  kvm_sev_launch_measure
 
 Returns: 0 on success, -negative on error
@@ -271,6 +274,9 @@ report containing the SHA-256 digest of the guest memory and VMSA passed through
 commands and signed with the PEK. The digest returned by the command should match the digest
 used by the guest owner with the KVM_SEV_LAUNCH_MEASURE.
 
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
 Parameters (in): struct kvm_sev_attestation
 
 Returns: 0 on success, -negative on error
@@ -284,6 +290,142 @@ Returns: 0 on success, -negative on error
                 __u32 len;
         };
 
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+        struct kvm_sev_send_start {
+                __u32 policy;                 /* guest policy */
+
+                __u64 pdh_cert_uaddr;         /* platform Diffie-Hellman certificate */
+                __u32 pdh_cert_len;
+
+                __u64 plat_certs_uaddr;        /* platform certificate chain */
+                __u32 plat_certs_len;
+
+                __u64 amd_certs_uaddr;        /* AMD certificate */
+                __u32 amd_certs_len;
+
+                __u64 session_uaddr;          /* Guest session information */
+                __u32 session_len;
+        };
+
+12. KVM_SEV_SEND_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_SEND_UPDATE_DATA command can be used by the hypervisor to encrypt the
+outgoing guest memory region with the encryption context created using
+KVM_SEV_SEND_START.
+
+If hdr_len or trans_len are zero on entry, the length of the packet header and
+transport region are written to hdr_len and trans_len respectively, and all
+other fields are not used.
+
+Parameters (in): struct kvm_sev_send_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_send_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the source memory region to be encrypted */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the destination memory region  */
+                __u32 trans_len;
+        };
+
+13. KVM_SEV_SEND_FINISH
+------------------------
+
+After completion of the migration flow, the KVM_SEV_SEND_FINISH command can be
+issued by the hypervisor to delete the encryption context.
+
+Returns: 0 on success, -negative on error
+
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
+15. KVM_SEV_RECEIVE_START
+-------------------------
+
+The KVM_SEV_RECEIVE_START command is used for creating the memory encryption
+context for an incoming SEV guest. To create the encryption context, the user must
+provide a guest policy, the platform public Diffie-Hellman (PDH) key and session
+information.
+
+Parameters: struct  kvm_sev_receive_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 pdh_uaddr;        /* userspace address pointing to the PDH key */
+                __u32 pdh_len;
+
+                __u64 session_uaddr;    /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.12.
+
+16. KVM_SEV_RECEIVE_UPDATE_DATA
+-------------------------------
+
+The KVM_SEV_RECEIVE_UPDATE_DATA command can be used by the hypervisor to copy
+the incoming buffers into the guest memory region with encryption context
+created during the KVM_SEV_RECEIVE_START.
+
+Parameters (in): struct kvm_sev_receive_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the destination guest memory region */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the incoming buffer memory region  */
+                __u32 trans_len;
+        };
+
+17. KVM_SEV_RECEIVE_FINISH
+--------------------------
+
+After completion of the migration flow, the KVM_SEV_RECEIVE_FINISH command can be
+issued by the hypervisor to make the guest ready for execution.
+
+Returns: 0 on success, -negative on error
+
 References
 ==========
 
index 3f21095..94804c2 100644 (file)
@@ -204,7 +204,7 @@ Errors:
 
   ======     ============================================================
   EFAULT     the msr index list cannot be read from or written to
-  E2BIG      the msr index list is to be to fit in the array specified by
+  E2BIG      the msr index list is too big to fit in the array specified by
              the user.
   ======     ============================================================
 
@@ -1495,7 +1495,8 @@ Fails if any VCPU has already been created.
 
 Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
 as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
-is vcpu 0.
+is vcpu 0. This ioctl has to be called before vcpu creation,
+otherwise it will return EBUSY error.
 
 
 4.42 KVM_GET_XSAVE
@@ -3370,6 +3371,9 @@ indicating the number of supported registers.
 For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
 the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
 
+Also when supported, KVM_CAP_SET_GUEST_DEBUG2 capability indicates the
+supported KVM_GUESTDBG_* bits in the control field.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
@@ -3702,31 +3706,105 @@ which is the maximum number of possibly pending cpu-local interrupts.
 
 Queues an SMI on the thread's vcpu.
 
-4.97 KVM_CAP_PPC_MULTITCE
--------------------------
+4.97 KVM_X86_SET_MSR_FILTER
+----------------------------
 
-:Capability: KVM_CAP_PPC_MULTITCE
-:Architectures: ppc
-:Type: vm
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
 
-This capability means the kernel is capable of handling hypercalls
-H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
-space. This significantly accelerates DMA operations for PPC KVM guests.
-User space should expect that its handlers for these hypercalls
-are not going to be called if user space previously registered LIOBN
-in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+::
 
-In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
-user space might have to advertise it for the guest. For example,
-IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
-present in the "ibm,hypertas-functions" device-tree property.
+  struct kvm_msr_filter_range {
+  #define KVM_MSR_FILTER_READ  (1 << 0)
+  #define KVM_MSR_FILTER_WRITE (1 << 1)
+       __u32 flags;
+       __u32 nmsrs; /* number of msrs in bitmap */
+       __u32 base;  /* MSR index the bitmap starts at */
+       __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+  };
 
-The hypercalls mentioned above may or may not be processed successfully
-in the kernel based fast path. If they can not be handled by the kernel,
-they will get passed on to user space. So user space still has to have
-an implementation for these despite the in kernel acceleration.
+  #define KVM_MSR_FILTER_MAX_RANGES 16
+  struct kvm_msr_filter {
+  #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+  #define KVM_MSR_FILTER_DEFAULT_DENY  (1 << 0)
+       __u32 flags;
+       struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+  };
 
-This capability is always enabled.
+flags values for ``struct kvm_msr_filter_range``:
+
+``KVM_MSR_FILTER_READ``
+
+  Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a read should immediately fail, while a 1 indicates that
+  a read for a particular MSR should be handled regardless of the default
+  filter action.
+
+``KVM_MSR_FILTER_WRITE``
+
+  Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+  indicates that a write should immediately fail, while a 1 indicates that
+  a write for a particular MSR should be handled regardless of the default
+  filter action.
+
+``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
+
+  Filter both read and write accesses to MSRs using the given bitmap. A 0
+  in the bitmap indicates that both reads and writes should immediately fail,
+  while a 1 indicates that reads and writes for a particular MSR are not
+  filtered by this range.
+
+flags values for ``struct kvm_msr_filter``:
+
+``KVM_MSR_FILTER_DEFAULT_ALLOW``
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to allowing access to the MSR.
+
+``KVM_MSR_FILTER_DEFAULT_DENY``
+
+  If no filter range matches an MSR index that is getting accessed, KVM will
+  fall back to rejecting access to the MSR. In this mode, all MSRs that should
+  be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be explicitly filtered for or not.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+default KVM in-kernel emulation behavior is fully preserved.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
+an error.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
+x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
+and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
+register.
+
+If a bit is within one of the defined ranges, read and write accesses are
+guarded by the bitmap's value for the MSR index if the kind of access
+is included in the ``struct kvm_msr_filter_range`` flags.  If no range
+covers this particular access, the behavior is determined by the flags
+field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
+and ``KVM_MSR_FILTER_DEFAULT_DENY``.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range goes from MSR index [base .. base+nmsrs]. The flags field
+indicates whether reads, writes or both reads and writes are filtered
+by setting a 1 bit in the bitmap for the corresponding MSR index.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
+allows user space to deflect and potentially handle various MSR accesses
+into user space.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
 
 4.98 KVM_CREATE_SPAPR_TCE_64
 ----------------------------
@@ -4819,8 +4897,10 @@ If an MSR access is not permitted through the filtering, it generates a
 allows user space to deflect and potentially handle various MSR accesses
 into user space.
 
-If a vCPU is in running state while this ioctl is invoked, the vCPU may
-experience inconsistent filtering behavior on MSR accesses.
+Note, invoking this ioctl while a vCPU is running is inherently racy.  However,
+KVM does guarantee that vCPUs will see either the previous filter or the new
+filter, e.g. MSRs with identical settings in both the old and new filter will
+have deterministic behavior.
 
 4.127 KVM_XEN_HVM_SET_ATTR
 --------------------------
@@ -4865,7 +4945,7 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
 KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
   Sets the exception vector used to deliver Xen event channel upcalls.
 
-4.128 KVM_XEN_HVM_GET_ATTR
+4.127 KVM_XEN_HVM_GET_ATTR
 --------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4877,7 +4957,7 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
 Allows Xen VM attributes to be read. For the structure and types,
 see KVM_XEN_HVM_SET_ATTR above.
 
-4.129 KVM_XEN_VCPU_SET_ATTR
+4.128 KVM_XEN_VCPU_SET_ATTR
 ---------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4939,7 +5019,7 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
   or RUNSTATE_offline) to set the current accounted state as of the
   adjusted state_entry_time.
 
-4.130 KVM_XEN_VCPU_GET_ATTR
+4.129 KVM_XEN_VCPU_GET_ATTR
 ---------------------------
 
 :Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -6243,6 +6323,45 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
 This capability can be used to check / enable 2nd DAWR feature provided
 by POWER10 processor.
 
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+:Architectures: x86 SEV enabled
+:Type: vm
+:Parameters: args[0] is the fd of the source vm
+:Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
+7.25 KVM_CAP_SGX_ATTRIBUTE
+--------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+          attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more privileged enclave attributes.  args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint.  To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx.rst for more details.
+
 8. Other capabilities.
 ======================
 
@@ -6738,7 +6857,33 @@ The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
 
-8.31 KVM_CAP_PTP_KVM
+8.31 KVM_CAP_PPC_MULTITCE
+-------------------------
+
+:Capability: KVM_CAP_PPC_MULTITCE
+:Architectures: ppc
+:Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
+8.32 KVM_CAP_PTP_KVM
 --------------------
 
 :Architectures: arm64
@@ -6746,4 +6891,3 @@ supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
 This capability indicates that the KVM virtual PTP service is
 supported in the host. A VMM can check whether the service is
 available to the guest on migration.
-
index 0aa4817..1fc860c 100644 (file)
@@ -38,25 +38,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the
 following two cases:
 
 1. Access Tracking: The SPTE is not present, but it is marked for access
-   tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
-   restore the saved R/X bits. This is described in more detail later below.
+   tracking. That means we need to restore the saved R/X bits. This is
+   described in more detail later below.
 
-2. Write-Protection: The SPTE is present and the fault is
-   caused by write-protect. That means we just need to change the W bit of
-   the spte.
+2. Write-Protection: The SPTE is present and the fault is caused by
+   write-protect. That means we just need to change the W bit of the spte.
 
-What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
-SPTE_MMU_WRITEABLE bit on the spte:
+What we use to avoid all the races is the Host-writable bit and MMU-writable bit
+on the spte:
 
-- SPTE_HOST_WRITEABLE means the gfn is writable on host.
-- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
-  the gfn is writable on guest mmu and it is not write-protected by shadow
-  page write-protection.
+- Host-writable means the gfn is writable in the host kernel page tables and in
+  its KVM memslot.
+- MMU-writable means the gfn is writable in the guest's mmu and it is not
+  write-protected by shadow page write-protection.
 
 On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
-restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
-is safe because whenever changing these bits can be detected by cmpxchg.
+bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, or to restore the
+saved R/X bits for an access-tracked spte, or both. This is safe because any
+change to these bits can be detected by cmpxchg.
 
 But we need carefully check these cases:
 
@@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 Lockless Access Tracking:
 
 This is used for Intel CPUs that are using EPT but do not support the EPT A/D
-bits. In this case, when the KVM MMU notifier is called to track accesses to a
-page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
-by clearing the RWX bits in the PTE and storing the original R & X bits in
-some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
-PTE (using the ignored bit 62). When the VM tries to access the page later on,
-a fault is generated and the fast page fault mechanism described above is used
-to atomically restore the PTE to a Present state. The W bit is not saved when
-the PTE is marked for access tracking and during restoration to the Present
-state, the W bit is set depending on whether or not it was a write access. If
-it wasn't, then the W bit will remain clear until a write access happens, at
-which time it will be set using the Dirty tracking mechanism described above.
+bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and
+when the KVM MMU notifier is called to track accesses to a page (via
+kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware
+by clearing the RWX bits in the PTE and storing the original R & X bits in more
+unused/ignored bits. When the VM tries to access the page later on, a fault is
+generated and the fast page fault mechanism described above is used to
+atomically restore the PTE to a Present state. The W bit is not saved when the
+PTE is marked for access tracking and during restoration to the Present state,
+the W bit is set depending on whether or not it was a write access. If it
+wasn't, then the W bit will remain clear until a write access happens, at which
+time it will be set using the Dirty tracking mechanism described above.
 
 3. Reference
 ------------
index eaac486..ca85f03 100644 (file)
@@ -84,3 +84,36 @@ If the function code specifies 0x501, breakpoint functions may be performed.
 This function code is handled by userspace.
 
 This diagnose function code has no subfunctions and uses no parameters.
+
+
+DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield
+---------------------------------------------------------
+
+General register 1 contains the target CPU address.
+
+In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs,
+DIAGNOSE with function code 0x9c may improve system performance by
+yielding the host CPU on which the guest CPU is running to be assigned
+to another guest CPU, preferably the logical CPU containing the specified
+target CPU.
+
+
+DIAG 'X'9C forwarding
++++++++++++++++++++++
+
+The guest may send a DIAGNOSE 0x9c in order to yield to a certain
+other vcpu. An example is a Linux guest that tries to yield to the vcpu
+that is currently holding a spinlock, but not running.
+
+However, on the host the real cpu backing the vcpu may itself not be
+running.
+Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to
+the backing cpu will hopefully cause that cpu, and thus subsequently
+the guest's vcpu, to be scheduled.
+
+
+diag9c_forwarding_hz
+    KVM kernel parameter specifying the maximum number of DIAGNOSE 0x9c
+    forwardings per second, in order to avoid a DIAGNOSE 0x9c forwarding
+    storm.
+    A value of 0 turns the forwarding off.
index eaee136..dd0ac96 100644 (file)
@@ -209,3 +209,44 @@ An application may be loaded into a container enclave which is specially
 configured with a library OS and run-time which permits the application to run.
 The enclave run-time and library OS work together to execute the application
 when a thread enters the enclave.
+
+Impact of Potential Kernel SGX Bugs
+===================================
+
+EPC leaks
+---------
+
+When EPC page leaks happen, a WARNING like this is shown in dmesg:
+
+"EREMOVE returned ... and an EPC page was leaked.  SGX may become unusable..."
+
+This is effectively a kernel use-after-free of an EPC page, and due
+to the way SGX works, the bug is detected at freeing. Rather than
+adding the page back to the pool of available EPC pages, the kernel
+intentionally leaks the page to avoid additional errors in the future.
+
+When this happens, the kernel will likely soon leak more EPC pages, and
+SGX will likely become unusable because the memory available to SGX is
+limited. However, while this may be fatal to SGX, the rest of the kernel
+is unlikely to be impacted and should continue to work.
+
+As a result, when this happens, users should stop running any new
+SGX workloads (or just any new workloads), and migrate all valuable
+workloads. Although a machine reboot can recover all EPC memory, the bug
+should be reported to Linux developers.
+
+
+Virtual EPC
+===========
+
+The implementation also has a virtual EPC driver to support SGX enclaves
+in guests. Unlike the SGX driver, an EPC page allocated by the virtual
+EPC driver doesn't have a specific enclave associated with it. This is
+because KVM doesn't track how a guest uses EPC pages.
+
+As a result, the SGX core page reclaimer doesn't support reclaiming EPC
+pages allocated to KVM guests through the virtual EPC driver. If the
+user wants to deploy SGX applications both on the host and in guests
+on the same machine, the user should reserve enough EPC (by taking out
+total virtual EPC size of all SGX VMs from the physical EPC size) for
+host SGX applications so they can run with acceptable performance.
index aa6a335..1dd8fb4 100644 (file)
@@ -1181,7 +1181,7 @@ M:        Joel Fernandes <joel@joelfernandes.org>
 M:     Christian Brauner <christian@brauner.io>
 M:     Hridya Valsaraju <hridya@google.com>
 M:     Suren Baghdasaryan <surenb@google.com>
-L:     devel@driverdev.osuosl.org
+L:     linux-kernel@vger.kernel.org
 S:     Supported
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
 F:     drivers/android/
@@ -2491,7 +2491,7 @@ N:        sc27xx
 N:     sc2731
 
 ARM/STI ARCHITECTURE
-M:     Patrice Chotard <patrice.chotard@st.com>
+M:     Patrice Chotard <patrice.chotard@foss.st.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 W:     http://www.stlinux.com
@@ -2524,7 +2524,7 @@ F:        include/linux/remoteproc/st_slim_rproc.h
 
 ARM/STM32 ARCHITECTURE
 M:     Maxime Coquelin <mcoquelin.stm32@gmail.com>
-M:     Alexandre Torgue <alexandre.torgue@st.com>
+M:     Alexandre Torgue <alexandre.torgue@foss.st.com>
 L:     linux-stm32@st-md-mailman.stormreply.com (moderated for non-subscribers)
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
@@ -3117,7 +3117,7 @@ C:        irc://irc.oftc.net/bcache
 F:     drivers/md/bcache/
 
 BDISP ST MEDIA DRIVER
-M:     Fabien Dessenne <fabien.dessenne@st.com>
+M:     Fabien Dessenne <fabien.dessenne@foss.st.com>
 L:     linux-media@vger.kernel.org
 S:     Supported
 W:     https://linuxtv.org
@@ -3677,7 +3677,7 @@ M:        bcm-kernel-feedback-list@broadcom.com
 L:     linux-pm@vger.kernel.org
 S:     Maintained
 T:     git git://github.com/broadcom/stblinux.git
-F:     drivers/soc/bcm/bcm-pmb.c
+F:     drivers/soc/bcm/bcm63xx/bcm-pmb.c
 F:     include/dt-bindings/soc/bcm-pmb.h
 
 BROADCOM SPECIFIC AMBA DRIVER (BCMA)
@@ -5082,7 +5082,7 @@ S:        Maintained
 F:     drivers/platform/x86/dell/dell-wmi.c
 
 DELTA ST MEDIA DRIVER
-M:     Hugues Fruchet <hugues.fruchet@st.com>
+M:     Hugues Fruchet <hugues.fruchet@foss.st.com>
 L:     linux-media@vger.kernel.org
 S:     Supported
 W:     https://linuxtv.org
@@ -6008,7 +6008,6 @@ F:        drivers/gpu/drm/rockchip/
 
 DRM DRIVERS FOR STI
 M:     Benjamin Gaignard <benjamin.gaignard@linaro.org>
-M:     Vincent Abriou <vincent.abriou@st.com>
 L:     dri-devel@lists.freedesktop.org
 S:     Maintained
 T:     git git://anongit.freedesktop.org/drm/drm-misc
@@ -6016,10 +6015,9 @@ F:       Documentation/devicetree/bindings/display/st,stih4xx.txt
 F:     drivers/gpu/drm/sti
 
 DRM DRIVERS FOR STM
-M:     Yannick Fertre <yannick.fertre@st.com>
-M:     Philippe Cornu <philippe.cornu@st.com>
+M:     Yannick Fertre <yannick.fertre@foss.st.com>
+M:     Philippe Cornu <philippe.cornu@foss.st.com>
 M:     Benjamin Gaignard <benjamin.gaignard@linaro.org>
-M:     Vincent Abriou <vincent.abriou@st.com>
 L:     dri-devel@lists.freedesktop.org
 S:     Maintained
 T:     git git://anongit.freedesktop.org/drm/drm-misc
@@ -7478,8 +7476,9 @@ F:        include/uapi/asm-generic/
 GENERIC PHY FRAMEWORK
 M:     Kishon Vijay Abraham I <kishon@ti.com>
 M:     Vinod Koul <vkoul@kernel.org>
-L:     linux-kernel@vger.kernel.org
+L:     linux-phy@lists.infradead.org
 S:     Supported
+Q:     https://patchwork.kernel.org/project/linux-phy/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git
 F:     Documentation/devicetree/bindings/phy/
 F:     drivers/phy/
@@ -8118,7 +8117,6 @@ F:        drivers/crypto/hisilicon/sec2/sec_main.c
 
 HISILICON STAGING DRIVERS FOR HIKEY 960/970
 M:     Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
-L:     devel@driverdev.osuosl.org
 S:     Maintained
 F:     drivers/staging/hikey9xx/
 
@@ -8233,7 +8231,7 @@ F:        include/linux/hugetlb.h
 F:     mm/hugetlb.c
 
 HVA ST MEDIA DRIVER
-M:     Jean-Christophe Trotin <jean-christophe.trotin@st.com>
+M:     Jean-Christophe Trotin <jean-christophe.trotin@foss.st.com>
 L:     linux-media@vger.kernel.org
 S:     Supported
 W:     https://linuxtv.org
@@ -8523,6 +8521,7 @@ IBM Power SRIOV Virtual NIC Device Driver
 M:     Dany Madden <drt@linux.ibm.com>
 M:     Lijun Pan <ljp@linux.ibm.com>
 M:     Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+R:     Thomas Falcon <tlfalcon@linux.ibm.com>
 L:     netdev@vger.kernel.org
 S:     Supported
 F:     drivers/net/ethernet/ibm/ibmvnic.*
@@ -9276,6 +9275,7 @@ Q:        https://patchwork.kernel.org/project/intel-sgx/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/sgx
 F:     Documentation/x86/sgx.rst
 F:     arch/x86/entry/vdso/vsgx.S
+F:     arch/x86/include/asm/sgx.h
 F:     arch/x86/include/uapi/asm/sgx.h
 F:     arch/x86/kernel/cpu/sgx/*
 F:     tools/testing/selftests/sgx/*
@@ -10032,7 +10032,6 @@ F:      scripts/leaking_addresses.pl
 
 LED SUBSYSTEM
 M:     Pavel Machek <pavel@ucw.cz>
-R:     Dan Murphy <dmurphy@ti.com>
 L:     linux-leds@vger.kernel.org
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/pavel/linux-leds.git
@@ -10908,7 +10907,6 @@ T:      git git://linuxtv.org/media_tree.git
 F:     drivers/media/radio/radio-maxiradio*
 
 MCAN MMIO DEVICE DRIVER
-M:     Dan Murphy <dmurphy@ti.com>
 M:     Pankaj Sharma <pankj.sharma@samsung.com>
 L:     linux-can@vger.kernel.org
 S:     Maintained
@@ -11169,7 +11167,7 @@ T:      git git://linuxtv.org/media_tree.git
 F:     drivers/media/dvb-frontends/stv6111*
 
 MEDIA DRIVERS FOR STM32 - DCMI
-M:     Hugues Fruchet <hugues.fruchet@st.com>
+M:     Hugues Fruchet <hugues.fruchet@foss.st.com>
 L:     linux-media@vger.kernel.org
 S:     Supported
 T:     git git://linuxtv.org/media_tree.git
@@ -12540,7 +12538,7 @@ NETWORKING [MPTCP]
 M:     Mat Martineau <mathew.j.martineau@linux.intel.com>
 M:     Matthieu Baerts <matthieu.baerts@tessares.net>
 L:     netdev@vger.kernel.org
-L:     mptcp@lists.01.org
+L:     mptcp@lists.linux.dev
 S:     Maintained
 W:     https://github.com/multipath-tcp/mptcp_net-next/wiki
 B:     https://github.com/multipath-tcp/mptcp_net-next/issues
@@ -14711,15 +14709,11 @@ F:    drivers/net/ethernet/qlogic/qlcnic/
 QLOGIC QLGE 10Gb ETHERNET DRIVER
 M:     Manish Chopra <manishc@marvell.com>
 M:     GR-Linux-NIC-Dev@marvell.com
-L:     netdev@vger.kernel.org
-S:     Supported
-F:     drivers/staging/qlge/
-
-QLOGIC QLGE 10Gb ETHERNET DRIVER
 M:     Coiby Xu <coiby.xu@gmail.com>
 L:     netdev@vger.kernel.org
-S:     Maintained
+S:     Supported
 F:     Documentation/networking/device_drivers/qlogic/qlge.rst
+F:     drivers/staging/qlge/
 
 QM1D1B0004 MEDIA DRIVER
 M:     Akihiro Tsukada <tskd08@gmail.com>
@@ -15637,8 +15631,8 @@ F:      Documentation/s390/pci.rst
 
 S390 VFIO AP DRIVER
 M:     Tony Krowiak <akrowiak@linux.ibm.com>
-M:     Pierre Morel <pmorel@linux.ibm.com>
 M:     Halil Pasic <pasic@linux.ibm.com>
+M:     Jason Herne <jjherne@linux.ibm.com>
 L:     linux-s390@vger.kernel.org
 S:     Supported
 W:     http://www.ibm.com/developerworks/linux/linux390/
@@ -15650,6 +15644,7 @@ F:      drivers/s390/crypto/vfio_ap_private.h
 S390 VFIO-CCW DRIVER
 M:     Cornelia Huck <cohuck@redhat.com>
 M:     Eric Farman <farman@linux.ibm.com>
+M:     Matthew Rosato <mjrosato@linux.ibm.com>
 R:     Halil Pasic <pasic@linux.ibm.com>
 L:     linux-s390@vger.kernel.org
 L:     kvm@vger.kernel.org
@@ -15660,6 +15655,7 @@ F:      include/uapi/linux/vfio_ccw.h
 
 S390 VFIO-PCI DRIVER
 M:     Matthew Rosato <mjrosato@linux.ibm.com>
+M:     Eric Farman <farman@linux.ibm.com>
 L:     linux-s390@vger.kernel.org
 L:     kvm@vger.kernel.org
 S:     Supported
@@ -16889,8 +16885,10 @@ F:     tools/spi/
 
 SPIDERNET NETWORK DRIVER for CELL
 M:     Ishizaki Kou <kou.ishizaki@toshiba.co.jp>
+M:     Geoff Levand <geoff@infradead.org>
 L:     netdev@vger.kernel.org
-S:     Supported
+L:     linuxppc-dev@lists.ozlabs.org
+S:     Maintained
 F:     Documentation/networking/device_drivers/ethernet/toshiba/spider_net.rst
 F:     drivers/net/ethernet/toshiba/spider_net*
 
@@ -16944,7 +16942,8 @@ F:      Documentation/devicetree/bindings/media/i2c/st,st-mipid02.txt
 F:     drivers/media/i2c/st-mipid02.c
 
 ST STM32 I2C/SMBUS DRIVER
-M:     Pierre-Yves MORDRET <pierre-yves.mordret@st.com>
+M:     Pierre-Yves MORDRET <pierre-yves.mordret@foss.st.com>
+M:     Alain Volmat <alain.volmat@foss.st.com>
 L:     linux-i2c@vger.kernel.org
 S:     Maintained
 F:     drivers/i2c/busses/i2c-stm32*
@@ -17042,7 +17041,7 @@ F:      drivers/staging/vt665?/
 
 STAGING SUBSYSTEM
 M:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-L:     devel@driverdev.osuosl.org
+L:     linux-staging@lists.linux.dev
 S:     Supported
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
 F:     drivers/staging/
@@ -17069,7 +17068,7 @@ F:      kernel/jump_label.c
 F:     kernel/static_call.c
 
 STI AUDIO (ASoC) DRIVERS
-M:     Arnaud Pouliquen <arnaud.pouliquen@st.com>
+M:     Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
 L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:     Maintained
 F:     Documentation/devicetree/bindings/sound/st,sti-asoc-card.txt
@@ -17089,15 +17088,15 @@ T:    git git://linuxtv.org/media_tree.git
 F:     drivers/media/usb/stk1160/
 
 STM32 AUDIO (ASoC) DRIVERS
-M:     Olivier Moysan <olivier.moysan@st.com>
-M:     Arnaud Pouliquen <arnaud.pouliquen@st.com>
+M:     Olivier Moysan <olivier.moysan@foss.st.com>
+M:     Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
 L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:     Maintained
 F:     Documentation/devicetree/bindings/iio/adc/st,stm32-*.yaml
 F:     sound/soc/stm/
 
 STM32 TIMER/LPTIMER DRIVERS
-M:     Fabrice Gasnier <fabrice.gasnier@st.com>
+M:     Fabrice Gasnier <fabrice.gasnier@foss.st.com>
 S:     Maintained
 F:     Documentation/ABI/testing/*timer-stm32
 F:     Documentation/devicetree/bindings/*/*stm32-*timer*
@@ -17107,7 +17106,7 @@ F:      include/linux/*/stm32-*tim*
 
 STMMAC ETHERNET DRIVER
 M:     Giuseppe Cavallaro <peppe.cavallaro@st.com>
-M:     Alexandre Torgue <alexandre.torgue@st.com>
+M:     Alexandre Torgue <alexandre.torgue@foss.st.com>
 M:     Jose Abreu <joabreu@synopsys.com>
 L:     netdev@vger.kernel.org
 S:     Supported
@@ -17849,7 +17848,6 @@ S:      Maintained
 F:     drivers/thermal/ti-soc-thermal/
 
 TI BQ27XXX POWER SUPPLY DRIVER
-R:     Dan Murphy <dmurphy@ti.com>
 F:     drivers/power/supply/bq27xxx_battery.c
 F:     drivers/power/supply/bq27xxx_battery_i2c.c
 F:     include/linux/power/bq27xxx_battery.h
@@ -17984,7 +17982,6 @@ S:      Odd Fixes
 F:     sound/soc/codecs/tas571x*
 
 TI TCAN4X5X DEVICE DRIVER
-M:     Dan Murphy <dmurphy@ti.com>
 L:     linux-can@vger.kernel.org
 S:     Maintained
 F:     Documentation/devicetree/bindings/net/can/tcan4x5x.txt
@@ -19137,7 +19134,7 @@ VME SUBSYSTEM
 M:     Martyn Welch <martyn@welchs.me.uk>
 M:     Manohar Vanga <manohar.vanga@gmail.com>
 M:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-L:     devel@driverdev.osuosl.org
+L:     linux-kernel@vger.kernel.org
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
 F:     Documentation/driver-api/vme.rst
index a28bb37..73add16 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 5
 PATCHLEVEL = 12
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc5
 NAME = Frozen Wasteland
 
 # *DOCUMENTATION*
index 5b213a1..5e33d0e 100644 (file)
@@ -40,6 +40,9 @@
                ethernet1 = &cpsw_emac1;
                spi0 = &spi0;
                spi1 = &spi1;
+               mmc0 = &mmc1;
+               mmc1 = &mmc2;
+               mmc2 = &mmc3;
        };
 
        cpus {
index 73b6b1f..775ceb3 100644 (file)
 };
 
 &pinctrl {
-       atmel,mux-mask = <
-                        /*     A       B       C       */
-                        0xFFFFFE7F 0xC0E0397F 0xEF00019D       /* pioA */
-                        0x03FFFFFF 0x02FC7E68 0x00780000       /* pioB */
-                        0xffffffff 0xF83FFFFF 0xB800F3FC       /* pioC */
-                        0x003FFFFF 0x003F8000 0x00000000       /* pioD */
-                        >;
-
        adc {
                pinctrl_adc_default: adc_default {
                        atmel,pins = <AT91_PIOB 15 AT91_PERIPH_A AT91_PINCTRL_NONE>;
index 1b11638..e3251f3 100644 (file)
@@ -84,8 +84,8 @@
                                pinctrl-0 = <&pinctrl_macb0_default>;
                                phy-mode = "rmii";
 
-                               ethernet-phy@0 {
-                                       reg = <0x0>;
+                               ethernet-phy@7 {
+                                       reg = <0x7>;
                                        interrupt-parent = <&pioA>;
                                        interrupts = <PIN_PD31 IRQ_TYPE_LEVEL_LOW>;
                                        pinctrl-names = "default";
index c593597..5a1e10d 100644 (file)
                        micrel,led-mode = <1>;
                        clocks = <&clks IMX6UL_CLK_ENET_REF>;
                        clock-names = "rmii-ref";
-                       reset-gpios = <&gpio_spi 1 GPIO_ACTIVE_LOW>;
-                       reset-assert-us = <10000>;
-                       reset-deassert-us = <100>;
 
                };
 
                        micrel,led-mode = <1>;
                        clocks = <&clks IMX6UL_CLK_ENET2_REF>;
                        clock-names = "rmii-ref";
-                       reset-gpios = <&gpio_spi 2 GPIO_ACTIVE_LOW>;
-                       reset-assert-us = <10000>;
-                       reset-deassert-us = <100>;
                };
        };
 };
        status = "okay";
 };
 
+&gpio_spi {
+       eth0-phy-hog {
+               gpio-hog;
+               gpios = <1 GPIO_ACTIVE_HIGH>;
+               output-high;
+               line-name = "eth0-phy";
+       };
+
+       eth1-phy-hog {
+               gpio-hog;
+               gpios = <2 GPIO_ACTIVE_HIGH>;
+               output-high;
+               line-name = "eth1-phy";
+       };
+};
+
 &i2c1 {
        clock-frequency = <100000>;
        pinctrl-names = "default";
index ecbb2cc..79cc457 100644 (file)
@@ -14,5 +14,6 @@
 };
 
 &gpmi {
+       fsl,use-minimum-ecc;
        status = "okay";
 };
index 84066c1..ec45ced 100644 (file)
                                compatible = "microchip,sam9x60-pinctrl", "atmel,at91sam9x5-pinctrl", "atmel,at91rm9200-pinctrl", "simple-bus";
                                ranges = <0xfffff400 0xfffff400 0x800>;
 
+                               /* mux-mask corresponding to sam9x60 SoC in TFBGA228L package */
+                               atmel,mux-mask = <
+                                                /*     A       B       C       */
+                                                0xffffffff 0xffe03fff 0xef00019d       /* pioA */
+                                                0x03ffffff 0x02fc7e7f 0x00780000       /* pioB */
+                                                0xffffffff 0xffffffff 0xf83fffff       /* pioC */
+                                                0x003fffff 0x003f8000 0x00000000       /* pioD */
+                                                >;
+
                                pioA: gpio@fffff400 {
                                        compatible = "microchip,sam9x60-gpio", "atmel,at91sam9x5-gpio", "atmel,at91rm9200-gpio";
                                        reg = <0xfffff400 0x200>;
index 322caa2..21bce40 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
+#include <linux/irqchip.h>
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
@@ -162,7 +163,7 @@ static void __exception_irq_entry avic_handle_irq(struct pt_regs *regs)
  * interrupts. It registers the interrupt enable and disable functions
  * to the kernel for each interrupt source.
  */
-void __init mxc_init_irq(void __iomem *irqbase)
+static void __init mxc_init_irq(void __iomem *irqbase)
 {
        struct device_node *np;
        int irq_base;
@@ -220,3 +221,16 @@ void __init mxc_init_irq(void __iomem *irqbase)
 
        printk(KERN_INFO "MXC IRQ initialized\n");
 }
+
+static int __init imx_avic_init(struct device_node *node,
+                              struct device_node *parent)
+{
+       void __iomem *avic_base;
+
+       avic_base = of_iomap(node, 0);
+       BUG_ON(!avic_base);
+       mxc_init_irq(avic_base);
+       return 0;
+}
+
+IRQCHIP_DECLARE(imx_avic, "fsl,avic", imx_avic_init);
index 2b004cc..474dedb 100644 (file)
@@ -22,7 +22,6 @@ void mx35_map_io(void);
 void imx21_init_early(void);
 void imx31_init_early(void);
 void imx35_init_early(void);
-void mxc_init_irq(void __iomem *);
 void mx31_init_irq(void);
 void mx35_init_irq(void);
 void mxc_set_cpu_type(unsigned int type);
index 32df3b8..8eca92d 100644 (file)
@@ -17,16 +17,6 @@ static void __init imx1_init_early(void)
        mxc_set_cpu_type(MXC_CPU_MX1);
 }
 
-static void __init imx1_init_irq(void)
-{
-       void __iomem *avic_addr;
-
-       avic_addr = ioremap(MX1_AVIC_ADDR, SZ_4K);
-       WARN_ON(!avic_addr);
-
-       mxc_init_irq(avic_addr);
-}
-
 static const char * const imx1_dt_board_compat[] __initconst = {
        "fsl,imx1",
        NULL
@@ -34,7 +24,6 @@ static const char * const imx1_dt_board_compat[] __initconst = {
 
 DT_MACHINE_START(IMX1_DT, "Freescale i.MX1 (Device Tree Support)")
        .init_early     = imx1_init_early,
-       .init_irq       = imx1_init_irq,
        .dt_compat      = imx1_dt_board_compat,
        .restart        = mxc_restart,
 MACHINE_END
index 95de48a..51927bd 100644 (file)
@@ -22,17 +22,6 @@ static void __init imx25_dt_init(void)
        imx_aips_allow_unprivileged_access("fsl,imx25-aips");
 }
 
-static void __init mx25_init_irq(void)
-{
-       struct device_node *np;
-       void __iomem *avic_base;
-
-       np = of_find_compatible_node(NULL, NULL, "fsl,avic");
-       avic_base = of_iomap(np, 0);
-       BUG_ON(!avic_base);
-       mxc_init_irq(avic_base);
-}
-
 static const char * const imx25_dt_board_compat[] __initconst = {
        "fsl,imx25",
        NULL
@@ -42,6 +31,5 @@ DT_MACHINE_START(IMX25_DT, "Freescale i.MX25 (Device Tree Support)")
        .init_early     = imx25_init_early,
        .init_machine   = imx25_dt_init,
        .init_late      = imx25_pm_init,
-       .init_irq       = mx25_init_irq,
        .dt_compat      = imx25_dt_board_compat,
 MACHINE_END
index 262422a..e325c94 100644 (file)
@@ -56,17 +56,6 @@ static void __init imx27_init_early(void)
        mxc_set_cpu_type(MXC_CPU_MX27);
 }
 
-static void __init mx27_init_irq(void)
-{
-       void __iomem *avic_base;
-       struct device_node *np;
-
-       np = of_find_compatible_node(NULL, NULL, "fsl,avic");
-       avic_base = of_iomap(np, 0);
-       BUG_ON(!avic_base);
-       mxc_init_irq(avic_base);
-}
-
 static const char * const imx27_dt_board_compat[] __initconst = {
        "fsl,imx27",
        NULL
@@ -75,7 +64,6 @@ static const char * const imx27_dt_board_compat[] __initconst = {
 DT_MACHINE_START(IMX27_DT, "Freescale i.MX27 (Device Tree Support)")
        .map_io         = mx27_map_io,
        .init_early     = imx27_init_early,
-       .init_irq       = mx27_init_irq,
        .init_late      = imx27_pm_init,
        .dt_compat      = imx27_dt_board_compat,
 MACHINE_END
index dc69dfe..e9a1092 100644 (file)
@@ -14,6 +14,5 @@ static const char * const imx31_dt_board_compat[] __initconst = {
 DT_MACHINE_START(IMX31_DT, "Freescale i.MX31 (Device Tree Support)")
        .map_io         = mx31_map_io,
        .init_early     = imx31_init_early,
-       .init_irq       = mx31_init_irq,
        .dt_compat      = imx31_dt_board_compat,
 MACHINE_END
index ec5c306..0fc0821 100644 (file)
@@ -27,6 +27,5 @@ DT_MACHINE_START(IMX35_DT, "Freescale i.MX35 (Device Tree Support)")
        .l2c_aux_mask   = ~0,
        .map_io         = mx35_map_io,
        .init_early     = imx35_init_early,
-       .init_irq       = mx35_init_irq,
        .dt_compat      = imx35_dt_board_compat,
 MACHINE_END
index 5056438..28db972 100644 (file)
@@ -109,18 +109,6 @@ void __init imx31_init_early(void)
        mx3_ccm_base = of_iomap(np, 0);
        BUG_ON(!mx3_ccm_base);
 }
-
-void __init mx31_init_irq(void)
-{
-       void __iomem *avic_base;
-       struct device_node *np;
-
-       np = of_find_compatible_node(NULL, NULL, "fsl,imx31-avic");
-       avic_base = of_iomap(np, 0);
-       BUG_ON(!avic_base);
-
-       mxc_init_irq(avic_base);
-}
 #endif /* ifdef CONFIG_SOC_IMX31 */
 
 #ifdef CONFIG_SOC_IMX35
@@ -158,16 +146,4 @@ void __init imx35_init_early(void)
        mx3_ccm_base = of_iomap(np, 0);
        BUG_ON(!mx3_ccm_base);
 }
-
-void __init mx35_init_irq(void)
-{
-       void __iomem *avic_base;
-       struct device_node *np;
-
-       np = of_find_compatible_node(NULL, NULL, "fsl,imx35-avic");
-       avic_base = of_iomap(np, 0);
-       BUG_ON(!avic_base);
-
-       mxc_init_irq(avic_base);
-}
 #endif /* ifdef CONFIG_SOC_IMX35 */
index 62df666..17b66f0 100644 (file)
@@ -88,34 +88,26 @@ static void __init sr_set_nvalues(struct omap_volt_data *volt_data,
 
 extern struct omap_sr_data omap_sr_pdata[];
 
-static int __init sr_dev_init(struct omap_hwmod *oh, void *user)
+static int __init sr_init_by_name(const char *name, const char *voltdm)
 {
        struct omap_sr_data *sr_data = NULL;
        struct omap_volt_data *volt_data;
-       struct omap_smartreflex_dev_attr *sr_dev_attr;
        static int i;
 
-       if (!strncmp(oh->name, "smartreflex_mpu_iva", 20) ||
-           !strncmp(oh->name, "smartreflex_mpu", 16))
+       if (!strncmp(name, "smartreflex_mpu_iva", 20) ||
+           !strncmp(name, "smartreflex_mpu", 16))
                sr_data = &omap_sr_pdata[OMAP_SR_MPU];
-       else if (!strncmp(oh->name, "smartreflex_core", 17))
+       else if (!strncmp(name, "smartreflex_core", 17))
                sr_data = &omap_sr_pdata[OMAP_SR_CORE];
-       else if (!strncmp(oh->name, "smartreflex_iva", 16))
+       else if (!strncmp(name, "smartreflex_iva", 16))
                sr_data = &omap_sr_pdata[OMAP_SR_IVA];
 
        if (!sr_data) {
-               pr_err("%s: Unknown instance %s\n", __func__, oh->name);
+               pr_err("%s: Unknown instance %s\n", __func__, name);
                return -EINVAL;
        }
 
-       sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr;
-       if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) {
-               pr_err("%s: No voltage domain specified for %s. Cannot initialize\n",
-                      __func__, oh->name);
-               goto exit;
-       }
-
-       sr_data->name = oh->name;
+       sr_data->name = name;
        if (cpu_is_omap343x())
                sr_data->ip_type = 1;
        else
@@ -136,10 +128,10 @@ static int __init sr_dev_init(struct omap_hwmod *oh, void *user)
                }
        }
 
-       sr_data->voltdm = voltdm_lookup(sr_dev_attr->sensor_voltdm_name);
+       sr_data->voltdm = voltdm_lookup(voltdm);
        if (!sr_data->voltdm) {
                pr_err("%s: Unable to get voltage domain pointer for VDD %s\n",
-                       __func__, sr_dev_attr->sensor_voltdm_name);
+                       __func__, voltdm);
                goto exit;
        }
 
@@ -160,6 +152,20 @@ exit:
        return 0;
 }
 
+static int __init sr_dev_init(struct omap_hwmod *oh, void *user)
+{
+       struct omap_smartreflex_dev_attr *sr_dev_attr;
+
+       sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr;
+       if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) {
+               pr_err("%s: No voltage domain specified for %s. Cannot initialize\n",
+                      __func__, oh->name);
+               return 0;
+       }
+
+       return sr_init_by_name(oh->name, sr_dev_attr->sensor_voltdm_name);
+}
+
 /*
  * API to be called from board files to enable smartreflex
  * autocompensation at init.
@@ -169,7 +175,42 @@ void __init omap_enable_smartreflex_on_init(void)
        sr_enable_on_init = true;
 }
 
+static const char * const omap4_sr_instances[] = {
+       "mpu",
+       "iva",
+       "core",
+};
+
+static const char * const dra7_sr_instances[] = {
+       "mpu",
+       "core",
+};
+
 int __init omap_devinit_smartreflex(void)
 {
+       const char * const *sr_inst;
+       int i, nr_sr = 0;
+
+       if (soc_is_omap44xx()) {
+               sr_inst = omap4_sr_instances;
+               nr_sr = ARRAY_SIZE(omap4_sr_instances);
+
+       } else if (soc_is_dra7xx()) {
+               sr_inst = dra7_sr_instances;
+               nr_sr = ARRAY_SIZE(dra7_sr_instances);
+       }
+
+       if (nr_sr) {
+               const char *name, *voltdm;
+
+               for (i = 0; i < nr_sr; i++) {
+                       name = kasprintf(GFP_KERNEL, "smartreflex_%s", sr_inst[i]);
+                       voltdm = sr_inst[i];
+                       sr_init_by_name(name, voltdm);
+               }
+
+               return 0;
+       }
+
        return omap_hwmod_for_each_by_class("smartreflex", sr_dev_init, NULL);
 }
index e9fbb0b..9ec09f9 100644 (file)
@@ -810,6 +810,16 @@ config QCOM_FALKOR_ERRATUM_E1041
 
          If unsure, say Y.
 
+config NVIDIA_CARMEL_CNP_ERRATUM
+       bool "NVIDIA Carmel CNP: CNP on Carmel semantically different than ARM cores"
+       default y
+       help
+         If CNP is enabled on Carmel cores, non-sharable TLBIs on a core will not
+         invalidate shared TLB entries installed by a different core, as it would
+         on standard ARM cores.
+
+         If unsure, say Y.
+
 config SOCIONEXT_SYNQUACER_PREITS
        bool "Socionext Synquacer: Workaround for GICv3 pre-ITS"
        default y
index 7de6b37..9058cfa 100644 (file)
                        ranges = <0x0 0x00 0x1700000 0x100000>;
                        reg = <0x00 0x1700000 0x0 0x100000>;
                        interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>;
+                       dma-coherent;
 
                        sec_jr0: jr@10000 {
                                compatible = "fsl,sec-v5.4-job-ring",
index 5a8a1dc..28c51e5 100644 (file)
                        ranges = <0x0 0x00 0x1700000 0x100000>;
                        reg = <0x00 0x1700000 0x0 0x100000>;
                        interrupts = <0 75 0x4>;
+                       dma-coherent;
 
                        sec_jr0: jr@10000 {
                                compatible = "fsl,sec-v5.4-job-ring",
index 1d6dfd1..3945830 100644 (file)
                        ranges = <0x0 0x00 0x1700000 0x100000>;
                        reg = <0x00 0x1700000 0x0 0x100000>;
                        interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>;
+                       dma-coherent;
 
                        sec_jr0: jr@10000 {
                                compatible = "fsl,sec-v5.4-job-ring",
index 0e1a6d9..122c95d 100644 (file)
@@ -35,7 +35,7 @@
 
 &i2c2 {
        clock-frequency = <400000>;
-       pinctrl-names = "default";
+       pinctrl-names = "default", "gpio";
        pinctrl-0 = <&pinctrl_i2c2>;
        pinctrl-1 = <&pinctrl_i2c2_gpio>;
        sda-gpios = <&gpio5 17 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>;
index 44a8c23..f3965ec 100644 (file)
@@ -67,7 +67,7 @@
 
 &i2c1 {
        clock-frequency = <400000>;
-       pinctrl-names = "default";
+       pinctrl-names = "default", "gpio";
        pinctrl-0 = <&pinctrl_i2c1>;
        pinctrl-1 = <&pinctrl_i2c1_gpio>;
        sda-gpios = <&gpio5 15 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>;
index 93a161b..dc52b73 100644 (file)
@@ -37,7 +37,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
        } while (--n > 0);
 
        sum += ((sum >> 32) | (sum << 32));
-       return csum_fold((__force u32)(sum >> 32));
+       return csum_fold((__force __wsum)(sum >> 32));
 }
 #define ip_fast_csum ip_fast_csum
 
index b77d997..c40f249 100644 (file)
@@ -66,7 +66,8 @@
 #define ARM64_WORKAROUND_1508412               58
 #define ARM64_HAS_LDAPR                                59
 #define ARM64_KVM_PROTECTED_MODE               60
+#define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP     61
 
-#define ARM64_NCAPS                            61
+#define ARM64_NCAPS                            62
 
 #endif /* __ASM_CPUCAPS_H */
index 0b926f0..7cd7d5c 100644 (file)
@@ -407,6 +407,10 @@ struct kvm_vcpu_arch {
 #define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active  */
 #define KVM_ARM64_DEBUG_STATE_SAVE_TRBE        (1 << 13) /* Save TRBE context if active  */
 
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
+                                KVM_GUESTDBG_USE_SW_BP | \
+                                KVM_GUESTDBG_USE_HW | \
+                                KVM_GUESTDBG_SINGLESTEP)
 /*
  * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
  * take the following values:
@@ -588,11 +592,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
index ca2cd75..efc10e9 100644 (file)
@@ -251,6 +251,8 @@ unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_switch_to(struct task_struct *prev,
                                         struct task_struct *next);
 
+asmlinkage void arm64_preempt_schedule_irq(void);
+
 #define task_pt_regs(p) \
        ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1)
 
index 9f4e3b2..6623c99 100644 (file)
@@ -55,6 +55,8 @@ void arch_setup_new_exec(void);
 #define arch_setup_new_exec     arch_setup_new_exec
 
 void arch_release_task_struct(struct task_struct *tsk);
+int arch_dup_task_struct(struct task_struct *dst,
+                               struct task_struct *src);
 
 #endif
 
index 506a1cd..e2c20c0 100644 (file)
@@ -526,6 +526,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
                                  1, 0),
        },
 #endif
+#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM
+       {
+               /* NVIDIA Carmel */
+               .desc = "NVIDIA Carmel CNP erratum",
+               .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP,
+               ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL),
+       },
+#endif
        {
        }
 };
index 3423ae3..e3e0dcb 100644 (file)
@@ -1326,7 +1326,10 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope)
         * may share TLB entries with a CPU stuck in the crashed
         * kernel.
         */
-        if (is_kdump_kernel())
+       if (is_kdump_kernel())
+               return false;
+
+       if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP))
                return false;
 
        return has_cpuid_feature(entry, scope);
index 77605ae..51fcf99 100644 (file)
@@ -353,7 +353,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
         * with the CLIDR_EL1 fields to avoid triggering false warnings
         * when there is a mismatch across the CPUs. Keep track of the
         * effective value of the CTR_EL0 in our internal records for
-        * acurate sanity check and feature enablement.
+        * accurate sanity check and feature enablement.
         */
        info->reg_ctr = read_cpuid_effective_cachetype();
        info->reg_dczid = read_cpuid(DCZID_EL0);
index e6e2842..58303a9 100644 (file)
@@ -64,5 +64,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
        memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count);
+       *ppos += count;
+
        return count;
 }
index 325c83b..6e60aa3 100644 (file)
@@ -57,6 +57,8 @@
 #include <asm/processor.h>
 #include <asm/pointer_auth.h>
 #include <asm/stacktrace.h>
+#include <asm/switch_to.h>
+#include <asm/system_misc.h>
 
 #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
 #include <linux/stackprotector.h>
index ad20981..d55bdfb 100644 (file)
@@ -194,8 +194,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
 
 #ifdef CONFIG_STACKTRACE
 
-void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
-                    struct task_struct *task, struct pt_regs *regs)
+noinline void arch_stack_walk(stack_trace_consume_fn consume_entry,
+                             void *cookie, struct task_struct *task,
+                             struct pt_regs *regs)
 {
        struct stackframe frame;
 
@@ -203,8 +204,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
                start_backtrace(&frame, regs->regs[29], regs->pc);
        else if (task == current)
                start_backtrace(&frame,
-                               (unsigned long)__builtin_frame_address(0),
-                               (unsigned long)arch_stack_walk);
+                               (unsigned long)__builtin_frame_address(1),
+                               (unsigned long)__builtin_return_address(0));
        else
                start_backtrace(&frame, thread_saved_fp(task),
                                thread_saved_pc(task));
index 4808aca..1cb39c0 100644 (file)
@@ -209,6 +209,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_PTP_KVM:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               return KVM_GUESTDBG_VALID_MASK;
        case KVM_CAP_ARM_SET_DEVICE_ADDR:
                r = 1;
                break;
@@ -1273,7 +1275,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
        kvm_flush_remote_tlbs(kvm);
 }
index c763808..5cb4a1c 100644 (file)
@@ -888,11 +888,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return -EINVAL;
 }
 
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
-                           KVM_GUESTDBG_USE_SW_BP | \
-                           KVM_GUESTDBG_USE_HW | \
-                           KVM_GUESTDBG_SINGLESTEP)
-
 /**
  * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
  * @kvm:       pointer to the KVM struct
index ee3682b..39f8f7f 100644 (file)
@@ -429,6 +429,13 @@ u64 __vgic_v3_get_gic_config(void)
        if (has_vhe())
                flags = local_daif_save();
 
+       /*
+        * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates
+        * that to be able to set ICC_SRE_EL1.SRE to 0, all the
+        * interrupt overrides must be set. You've got to love this.
+        */
+       sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
+       isb();
        write_gicreg(0, ICC_SRE_EL1);
        isb();
 
@@ -436,6 +443,8 @@ u64 __vgic_v3_get_gic_config(void)
 
        write_gicreg(sre, ICC_SRE_EL1);
        isb();
+       sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
+       isb();
 
        if (has_vhe())
                local_daif_restore(flags);
index cd4d51a..c5d1f3c 100644 (file)
@@ -923,7 +923,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
         * the page we just got a reference to gets unmapped before we have a
         * chance to grab the mmu_lock, which ensure that if the page gets
-        * unmapped afterwards, the call to kvm_unmap_hva will take it away
+        * unmapped afterwards, the call to kvm_unmap_gfn will take it away
         * from us again properly. This smp_rmb() interacts with the smp_wmb()
         * in kvm_mmu_notifier_invalidate_<page|range_end>.
         *
@@ -1153,126 +1153,70 @@ out_unlock:
        return ret;
 }
 
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm,
-                                           gpa_t gpa, u64 size,
-                                           void *data),
-                            void *data)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gpa;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-               ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-       }
-
-       return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       unsigned flags = *(unsigned *)data;
-       bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
-
-       __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
-       return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        if (!kvm->arch.mmu.pgt)
                return 0;
 
-       trace_kvm_unmap_hva_range(start, end);
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       kvm_pfn_t *pfn = (kvm_pfn_t *)data;
-
-       WARN_ON(size != PAGE_SIZE);
+       __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
+                            (range->end - range->start) << PAGE_SHIFT,
+                            range->may_block);
 
-       /*
-        * The MMU notifiers will have unmapped a huge PMD before calling
-        * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-        * therefore we never need to clear out a huge PMD through this
-        * calling path and a memcache is not required.
-        */
-       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
-                              __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
        return 0;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       unsigned long end = hva + PAGE_SIZE;
-       kvm_pfn_t pfn = pte_pfn(pte);
+       kvm_pfn_t pfn = pte_pfn(range->pte);
 
        if (!kvm->arch.mmu.pgt)
                return 0;
 
-       trace_kvm_set_spte_hva(hva);
+       WARN_ON(range->end - range->start != 1);
 
        /*
         * We've moved a page around, probably through CoW, so let's treat it
         * just like a translation fault and clean the cache to the PoC.
         */
        clean_dcache_guest_page(pfn, PAGE_SIZE);
-       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
+
+       /*
+        * The MMU notifiers will have unmapped a huge PMD before calling
+        * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
+        * therefore we never need to clear out a huge PMD through this
+        * calling path and a memcache is not required.
+        */
+       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
+                              PAGE_SIZE, __pfn_to_phys(pfn),
+                              KVM_PGTABLE_PROT_R, NULL);
+
        return 0;
 }
 
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       pte_t pte;
+       u64 size = (range->end - range->start) << PAGE_SHIFT;
        kvm_pte_t kpte;
+       pte_t pte;
+
+       if (!kvm->arch.mmu.pgt)
+               return 0;
 
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+
+       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
+                                       range->start << PAGE_SHIFT);
        pte = __pte(kpte);
        return pte_valid(pte) && pte_young(pte);
 }
 
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        if (!kvm->arch.mmu.pgt)
                return 0;
-       trace_kvm_age_hva(start, end);
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       if (!kvm->arch.mmu.pgt)
-               return 0;
-       trace_kvm_test_age_hva(hva);
-       return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
-                                kvm_test_age_hva_handler, NULL);
+       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
+                                          range->start << PAGE_SHIFT);
 }
 
 phys_addr_t kvm_mmu_get_httbr(void)
index ff04443..33e4e7d 100644 (file)
@@ -135,72 +135,6 @@ TRACE_EVENT(kvm_mmio_emulate,
                  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
 );
 
-TRACE_EVENT(kvm_unmap_hva_range,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-       TP_PROTO(unsigned long start, unsigned long end),
-       TP_ARGS(start, end),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  start           )
-               __field(        unsigned long,  end             )
-       ),
-
-       TP_fast_assign(
-               __entry->start          = start;
-               __entry->end            = end;
-       ),
-
-       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
-                 __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
 TRACE_EVENT(kvm_set_way_flush,
            TP_PROTO(unsigned long vcpu_pc, bool cache),
            TP_ARGS(vcpu_pc, cache),
index 7484ea4..5d9550f 100644 (file)
@@ -1448,6 +1448,22 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
 struct range arch_get_mappable_range(void)
 {
        struct range mhp_range;
+       u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
+       u64 end_linear_pa = __pa(PAGE_END - 1);
+
+       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+               /*
+                * Check for a wrap, it is possible because of randomized linear
+                * mapping the start physical address is actually bigger than
+                * the end physical address. In this case set start to zero
+                * because [0, end_linear_pa] range must still be able to cover
+                * all addressable physical addresses.
+                */
+               if (start_linear_pa > end_linear_pa)
+                       start_linear_pa = 0;
+       }
+
+       WARN_ON(start_linear_pa > end_linear_pa);
 
        /*
         * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
@@ -1455,8 +1471,9 @@ struct range arch_get_mappable_range(void)
         * range which can be mapped inside this linear mapping range, must
         * also be derived from its end points.
         */
-       mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual));
-       mhp_range.end =  __pa(PAGE_END - 1);
+       mhp_range.start = start_linear_pa;
+       mhp_range.end =  end_linear_pa;
+
        return mhp_range;
 }
 
index ae2b1c7..ef2bb9b 100644 (file)
@@ -9,7 +9,7 @@ int arch_check_ftrace_location(struct kprobe *p)
        return 0;
 }
 
-/* Ftrace callback handler for kprobes -- called under preepmt disabed */
+/* Ftrace callback handler for kprobes -- called under preepmt disabled */
 void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
                           struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
index 8b5b8e6..dd5bfed 100644 (file)
@@ -59,7 +59,7 @@ show_##name(struct device *dev, struct device_attribute *attr,        \
                char *buf)                                              \
 {                                                                      \
        u32 cpu=dev->id;                                                \
-       return sprintf(buf, "%lx\n", name[cpu]);                        \
+       return sprintf(buf, "%llx\n", name[cpu]);                       \
 }
 
 #define store(name)                                                    \
@@ -86,9 +86,9 @@ store_call_start(struct device *dev, struct device_attribute *attr,
 
 #ifdef ERR_INJ_DEBUG
        printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu);
-       printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]);
-       printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]);
-       printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n",
+       printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]);
+       printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]);
+       printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n",
                          err_data_buffer[cpu].data1,
                          err_data_buffer[cpu].data2,
                          err_data_buffer[cpu].data3);
@@ -117,8 +117,8 @@ store_call_start(struct device *dev, struct device_attribute *attr,
 
 #ifdef ERR_INJ_DEBUG
        printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]);
-       printk(KERN_DEBUG "capabilities=%lx,\n", capabilities[cpu]);
-       printk(KERN_DEBUG "resources=%lx\n", resources[cpu]);
+       printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]);
+       printk(KERN_DEBUG "resources=%llx\n", resources[cpu]);
 #endif
        return size;
 }
@@ -131,7 +131,7 @@ show_virtual_to_phys(struct device *dev, struct device_attribute *attr,
                        char *buf)
 {
        unsigned int cpu=dev->id;
-       return sprintf(buf, "%lx\n", phys_addr[cpu]);
+       return sprintf(buf, "%llx\n", phys_addr[cpu]);
 }
 
 static ssize_t
@@ -145,7 +145,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
        ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL);
        if (ret<=0) {
 #ifdef ERR_INJ_DEBUG
-               printk("Virtual address %lx is not existing.\n",virt_addr);
+               printk("Virtual address %llx is not existing.\n", virt_addr);
 #endif
                return -EINVAL;
        }
@@ -163,7 +163,7 @@ show_err_data_buffer(struct device *dev,
 {
        unsigned int cpu=dev->id;
 
-       return sprintf(buf, "%lx, %lx, %lx\n",
+       return sprintf(buf, "%llx, %llx, %llx\n",
                        err_data_buffer[cpu].data1,
                        err_data_buffer[cpu].data2,
                        err_data_buffer[cpu].data3);
@@ -178,13 +178,13 @@ store_err_data_buffer(struct device *dev,
        int ret;
 
 #ifdef ERR_INJ_DEBUG
-       printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n",
+       printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n",
                 err_data_buffer[cpu].data1,
                 err_data_buffer[cpu].data2,
                 err_data_buffer[cpu].data3,
                 cpu);
 #endif
-       ret=sscanf(buf, "%lx, %lx, %lx",
+       ret = sscanf(buf, "%llx, %llx, %llx",
                        &err_data_buffer[cpu].data1,
                        &err_data_buffer[cpu].data2,
                        &err_data_buffer[cpu].data3);
index d4cae2f..adf6521 100644 (file)
@@ -1824,7 +1824,7 @@ ia64_mca_cpu_init(void *cpu_data)
                        data = mca_bootmem();
                        first_time = 0;
                } else
-                       data = (void *)__get_free_pages(GFP_KERNEL,
+                       data = (void *)__get_free_pages(GFP_ATOMIC,
                                                        get_order(sz));
                if (!data)
                        panic("Could not allocate MCA memory for cpu %d\n",
index 3a5612e..d0944a7 100644 (file)
@@ -815,14 +815,7 @@ struct kvm_mips_callbacks {
        int (*vcpu_init)(struct kvm_vcpu *vcpu);
        void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
        int (*vcpu_setup)(struct kvm_vcpu *vcpu);
-       void (*flush_shadow_all)(struct kvm *kvm);
-       /*
-        * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
-        * VZ root TLB, or T&E GVA page tables and corresponding root TLB
-        * mappings).
-        */
-       void (*flush_shadow_memslot)(struct kvm *kvm,
-                                    const struct kvm_memory_slot *slot);
+       void (*prepare_flush_shadow)(struct kvm *kvm);
        gpa_t (*gva_to_gpa)(gva_t gva);
        void (*queue_timer_int)(struct kvm_vcpu *vcpu);
        void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -967,11 +960,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
                                                   bool write);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-                       unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 /* Emulation */
 int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
@@ -1154,4 +1142,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+
 #endif /* __MIPS_KVM_HOST_H__ */
index 279be01..23a1403 100644 (file)
@@ -43,7 +43,7 @@
 #include <asm/prom.h>
 
 #ifdef CONFIG_MIPS_ELF_APPENDED_DTB
-const char __section(".appended_dtb") __appended_dtb[0x100000];
+char __section(".appended_dtb") __appended_dtb[0x100000];
 #endif /* CONFIG_MIPS_ELF_APPENDED_DTB */
 
 struct cpuinfo_mips cpu_data[NR_CPUS] __read_mostly;
index 1234834..1f98947 100644 (file)
@@ -176,7 +176,7 @@ SECTIONS
        .fill : {
                FILL(0);
                BYTE(0);
-               . = ALIGN(8);
+               STRUCT_ALIGN();
        }
        __appended_dtb = .;
        /* leave space for appended DTB */
index 58a8812..4a22ba7 100644 (file)
@@ -204,9 +204,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        /* Flush whole GPA */
        kvm_mips_flush_gpa_pt(kvm, 0, ~0);
-
-       /* Let implementation do the rest */
-       kvm_mips_callbacks->flush_shadow_all(kvm);
+       kvm_flush_remote_tlbs(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -221,8 +219,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
        /* Flush slot from GPA */
        kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
                              slot->base_gfn + slot->npages - 1);
-       /* Let implementation do the rest */
-       kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+       kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
        spin_unlock(&kvm->mmu_lock);
 }
 
@@ -262,9 +259,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                /* Write protect GPA page table entries */
                needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
                                        new->base_gfn + new->npages - 1);
-               /* Let implementation do the rest */
                if (needs_flush)
-                       kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+                       kvm_arch_flush_remote_tlbs_memslot(kvm, new);
                spin_unlock(&kvm->mmu_lock);
        }
 }
@@ -996,11 +992,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 }
 
+int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+       kvm_mips_callbacks->prepare_flush_shadow(kvm);
+       return 1;
+}
+
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
-       /* Let implementation handle TLB/GVA invalidation */
-       kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+       kvm_flush_remote_tlbs(kvm);
 }
 
 long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
index 3dabeda..8af002b 100644 (file)
@@ -439,85 +439,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
                                  end_gfn << PAGE_SHIFT);
 }
 
-static int handle_hva_to_gpa(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            int (*handler)(struct kvm *kvm, gfn_t gfn,
-                                           gpa_t gfn_end,
-                                           struct kvm_memory_slot *memslot,
-                                           void *data),
-                            void *data)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int ret = 0;
-
-       slots = kvm_memslots(kvm);
-
-       /* we only care about the pages that the guest sees */
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-               ret |= handler(kvm, gfn, gfn_end, memslot, data);
-       }
-
-       return ret;
-}
-
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                                struct kvm_memory_slot *memslot, void *data)
-{
-       kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+       kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
        return 1;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
-{
-       handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-
-       kvm_mips_callbacks->flush_shadow_all(kvm);
-       return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                               struct kvm_memory_slot *memslot, void *data)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       gpa_t gpa = gfn << PAGE_SHIFT;
-       pte_t hva_pte = *(pte_t *)data;
+       gpa_t gpa = range->start << PAGE_SHIFT;
+       pte_t hva_pte = range->pte;
        pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
        pte_t old_pte;
 
        if (!gpa_pte)
-               return 0;
+               return false;
 
        /* Mapping may need adjusting depending on memslot flags */
        old_pte = *gpa_pte;
-       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+       if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
                hva_pte = pte_mkclean(hva_pte);
-       else if (memslot->flags & KVM_MEM_READONLY)
+       else if (range->slot->flags & KVM_MEM_READONLY)
                hva_pte = pte_wrprotect(hva_pte);
 
        set_pte(gpa_pte, hva_pte);
 
        /* Replacing an absent or old page doesn't need flushes */
        if (!pte_present(old_pte) || !pte_young(old_pte))
-               return 0;
+               return false;
 
        /* Pages swapped, aged, moved, or cleaned require flushes */
        return !pte_present(hva_pte) ||
@@ -526,27 +475,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
               (pte_dirty(old_pte) && !pte_dirty(hva_pte));
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
-       unsigned long end = hva + PAGE_SIZE;
-       int ret;
-
-       ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
-       if (ret)
-               kvm_mips_callbacks->flush_shadow_all(kvm);
-       return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                              struct kvm_memory_slot *memslot, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
+       return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
 }
 
-static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
-                                   struct kvm_memory_slot *memslot, void *data)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       gpa_t gpa = gfn << PAGE_SHIFT;
+       gpa_t gpa = range->start << PAGE_SHIFT;
        pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
 
        if (!gpa_pte)
@@ -554,16 +490,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
        return pte_young(*gpa_pte);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-       return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
 /**
  * _kvm_mips_map_page_fast() - Fast path GPA fault handler.
  * @vcpu:              VCPU pointer.
index 0788c00..5f2df49 100644 (file)
@@ -687,16 +687,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
+static void kvm_trap_emul_prepare_flush_shadow(struct kvm *kvm)
 {
-       /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
-       kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
-                                       const struct kvm_memory_slot *slot)
-{
-       kvm_trap_emul_flush_shadow_all(kvm);
 }
 
 static u64 kvm_trap_emul_get_one_regs[] = {
@@ -1280,8 +1272,7 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .vcpu_init = kvm_trap_emul_vcpu_init,
        .vcpu_uninit = kvm_trap_emul_vcpu_uninit,
        .vcpu_setup = kvm_trap_emul_vcpu_setup,
-       .flush_shadow_all = kvm_trap_emul_flush_shadow_all,
-       .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
+       .prepare_flush_shadow = kvm_trap_emul_prepare_flush_shadow,
        .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
        .queue_timer_int = kvm_mips_queue_timer_int_cb,
        .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
index 2ffbe92..2c75571 100644 (file)
@@ -3211,32 +3211,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+static void kvm_vz_prepare_flush_shadow(struct kvm *kvm)
 {
-       if (cpu_has_guestid) {
-               /* Flush GuestID for each VCPU individually */
-               kvm_flush_remote_tlbs(kvm);
-       } else {
+       if (!cpu_has_guestid) {
                /*
                 * For each CPU there is a single GPA ASID used by all VCPUs in
                 * the VM, so it doesn't make sense for the VCPUs to handle
                 * invalidation of these ASIDs individually.
                 *
                 * Instead mark all CPUs as needing ASID invalidation in
-                * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+                * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will
                 * kick any running VCPUs so they check asid_flush_mask.
                 */
                cpumask_setall(&kvm->arch.asid_flush_mask);
-               kvm_flush_remote_tlbs(kvm);
        }
 }
 
-static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
-                                       const struct kvm_memory_slot *slot)
-{
-       kvm_vz_flush_shadow_all(kvm);
-}
-
 static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
 {
        int cpu = smp_processor_id();
@@ -3292,8 +3282,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
        .vcpu_init = kvm_vz_vcpu_init,
        .vcpu_uninit = kvm_vz_vcpu_uninit,
        .vcpu_setup = kvm_vz_vcpu_setup,
-       .flush_shadow_all = kvm_vz_flush_shadow_all,
-       .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+       .prepare_flush_shadow = kvm_vz_prepare_flush_shadow,
        .gva_to_gpa = kvm_vz_gva_to_gpa_cb,
        .queue_timer_int = kvm_vz_queue_timer_int_cb,
        .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
index 7897d16..727d4b3 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/bug.h>
 #include <asm/cputable.h>
 
-static inline bool early_cpu_has_feature(unsigned long feature)
+static __always_inline bool early_cpu_has_feature(unsigned long feature)
 {
        return !!((CPU_FTRS_ALWAYS & feature) ||
                  (CPU_FTRS_POSSIBLE & cur_cpu_spec->cpu_features & feature));
@@ -46,7 +46,7 @@ static __always_inline bool cpu_has_feature(unsigned long feature)
        return static_branch_likely(&cpu_feature_keys[i]);
 }
 #else
-static inline bool cpu_has_feature(unsigned long feature)
+static __always_inline bool cpu_has_feature(unsigned long feature)
 {
        return early_cpu_has_feature(feature);
 }
index 2f5f919..2d03f29 100644 (file)
@@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
                                      unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
-extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
-extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
-extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                       unsigned long gfn);
+extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                           unsigned long gfn);
+extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                         unsigned long gfn);
+extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                              unsigned long gfn);
 extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map);
 extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
index 05fb00d..1e83359 100644 (file)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 
-extern int kvm_unmap_hva_range(struct kvm *kvm,
-                              unsigned long start, unsigned long end,
-                              unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
 #define HPTEG_CACHE_NUM                        (1 << 15)
 #define HPTEG_HASH_BITS_PTE            13
 #define HPTEG_HASH_BITS_PTE_LONG       12
index 8aacd76..21ab033 100644 (file)
@@ -281,11 +281,10 @@ struct kvmppc_ops {
                                     const struct kvm_memory_slot *old,
                                     const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change);
-       int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
-                          unsigned long end);
-       int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
-       int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
-       void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+       bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+       bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
        void (*free_memslot)(struct kvm_memory_slot *slot);
        int (*init_vm)(struct kvm *kvm);
        void (*destroy_vm)(struct kvm *kvm);
index a6e29f8..d21d081 100644 (file)
@@ -65,3 +65,14 @@ V_FUNCTION_END(__kernel_clock_getres)
 V_FUNCTION_BEGIN(__kernel_time)
        cvdso_call_time __c_kernel_time
 V_FUNCTION_END(__kernel_time)
+
+/* Routines for restoring integer registers, called by the compiler.  */
+/* Called with r11 pointing to the stack header word of the caller of the */
+/* function, just beyond the end of the integer restore area.  */
+_GLOBAL(_restgpr_31_x)
+_GLOBAL(_rest32gpr_31_x)
+       lwz     r0,4(r11)
+       lwz     r31,-4(r11)
+       mtlr    r0
+       mr      r1,r11
+       blr
index 44bf567..2b691f4 100644 (file)
@@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
        kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+       return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+       return kvm->arch.kvm_ops->age_gfn(kvm, range);
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+       return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
-       return 0;
+       return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
index 9b6323e..740e51d 100644 (file)
@@ -9,12 +9,10 @@
 
 extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
                                         struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
-                                 unsigned long end);
-extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
-                         unsigned long end);
-extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 
 extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
index bb67735..b7bd9ca 100644 (file)
@@ -752,51 +752,6 @@ void kvmppc_rmap_reset(struct kvm *kvm)
        srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
-typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                             unsigned long gfn);
-
-static int kvm_handle_hva_range(struct kvm *kvm,
-                               unsigned long start,
-                               unsigned long end,
-                               hva_handler_fn handler)
-{
-       int ret;
-       int retval = 0;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
-
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn, gfn+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-               for (; gfn < gfn_end; ++gfn) {
-                       ret = handler(kvm, memslot, gfn);
-                       retval |= ret;
-               }
-       }
-
-       return retval;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         hva_handler_fn handler)
-{
-       return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
-}
-
 /* Must be called with both HPTE and rmap locked */
 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
                              struct kvm_memory_slot *memslot,
@@ -840,8 +795,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
        }
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                          unsigned long gfn)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                           unsigned long gfn)
 {
        unsigned long i;
        __be64 *hptep;
@@ -874,16 +829,15 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                unlock_rmap(rmapp);
                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
        }
-       return 0;
+       return false;
 }
 
-int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_unmap_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
-       kvm_handle_hva_range(kvm, start, end, handler);
-       return 0;
+       return kvm_unmap_rmapp(kvm, range->slot, range->start);
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
@@ -913,8 +867,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
        }
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                        unsigned long gfn)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                         unsigned long gfn)
 {
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
@@ -968,26 +922,26 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
        return ret;
 }
 
-int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_age_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
-       return kvm_handle_hva_range(kvm, start, end, handler);
+       return kvm_age_rmapp(kvm, range->slot, range->start);
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                             unsigned long gfn)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                              unsigned long gfn)
 {
        struct revmap_entry *rev = kvm->arch.hpt.rev;
        unsigned long head, i, j;
        unsigned long *hp;
-       int ret = 1;
+       bool ret = true;
        unsigned long *rmapp;
 
        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
-               return 1;
+               return true;
 
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_REFERENCED)
@@ -1002,27 +956,27 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
                                goto out;
                } while ((i = j) != head);
        }
-       ret = 0;
+       ret = false;
 
  out:
        unlock_rmap(rmapp);
        return ret;
 }
 
-int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_test_age_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
-       return kvm_handle_hva(kvm, hva, handler);
+       return kvm_test_age_rmapp(kvm, range->slot, range->start);
 }
 
-void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       hva_handler_fn handler;
+       if (kvm_is_radix(kvm))
+               return kvm_unmap_radix(kvm, range->slot, range->start);
 
-       handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
-       kvm_handle_hva(kvm, hva, handler);
+       return kvm_unmap_rmapp(kvm, range->slot, range->start);
 }
 
 static int vcpus_running(struct kvm *kvm)
index e603de7..ec4f58f 100644 (file)
@@ -993,8 +993,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                   unsigned long gfn)
+bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                    unsigned long gfn)
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
@@ -1002,24 +1002,24 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
-               return 0;
+               return false;
        }
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep))
                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                 kvm->arch.lpid);
-       return 0;
+       return false;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                 unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                  unsigned long gfn)
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       int ref = 0;
+       bool ref = false;
        unsigned long old, *rmapp;
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
@@ -1035,26 +1035,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
-               ref = 1;
+               ref = true;
        }
        return ref;
 }
 
 /* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                      unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+                       unsigned long gfn)
+
 {
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       int ref = 0;
+       bool ref = false;
 
        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;
 
        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
-               ref = 1;
+               ref = true;
        return ref;
 }
 
index 13bad6b..07682ad 100644 (file)
@@ -4770,7 +4770,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
                kvmhv_release_all_nested(kvm);
        kvmppc_rmap_reset(kvm);
        kvm->arch.process_table = 0;
-       /* Mutual exclusion with kvm_unmap_hva_range etc. */
+       /* Mutual exclusion with kvm_unmap_gfn_range etc. */
        spin_lock(&kvm->mmu_lock);
        kvm->arch.radix = 0;
        spin_unlock(&kvm->mmu_lock);
@@ -4792,7 +4792,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
        if (err)
                return err;
        kvmppc_rmap_reset(kvm);
-       /* Mutual exclusion with kvm_unmap_hva_range etc. */
+       /* Mutual exclusion with kvm_unmap_gfn_range etc. */
        spin_lock(&kvm->mmu_lock);
        kvm->arch.radix = 1;
        spin_unlock(&kvm->mmu_lock);
@@ -5654,10 +5654,10 @@ static struct kvmppc_ops kvm_ops_hv = {
        .flush_memslot  = kvmppc_core_flush_memslot_hv,
        .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
        .commit_memory_region  = kvmppc_core_commit_memory_region_hv,
-       .unmap_hva_range = kvm_unmap_hva_range_hv,
-       .age_hva  = kvm_age_hva_hv,
-       .test_age_hva = kvm_test_age_hva_hv,
-       .set_spte_hva = kvm_set_spte_hva_hv,
+       .unmap_gfn_range = kvm_unmap_gfn_range_hv,
+       .age_gfn = kvm_age_gfn_hv,
+       .test_age_gfn = kvm_test_age_gfn_hv,
+       .set_spte_gfn = kvm_set_spte_gfn_hv,
        .free_memslot = kvmppc_core_free_memslot_hv,
        .init_vm =  kvmppc_core_init_vm_hv,
        .destroy_vm = kvmppc_core_destroy_vm_hv,
index 913944d..d7733b0 100644 (file)
@@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
 }
 
 /************* MMU Notifiers *************/
-static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
-                            unsigned long end)
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        long i;
        struct kvm_vcpu *vcpu;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
 
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots) {
-               unsigned long hva_start, hva_end;
-               gfn_t gfn, gfn_end;
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+                                     range->end << PAGE_SHIFT);
 
-               hva_start = max(start, memslot->userspace_addr);
-               hva_end = min(end, memslot->userspace_addr +
-                                       (memslot->npages << PAGE_SHIFT));
-               if (hva_start >= hva_end)
-                       continue;
-               /*
-                * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                * {gfn, gfn+1, ..., gfn_end-1}.
-                */
-               gfn = hva_to_gfn_memslot(hva_start, memslot);
-               gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
-                                             gfn_end << PAGE_SHIFT);
-       }
+       return false;
 }
 
-static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
-                                 unsigned long end)
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       do_kvm_unmap_hva(kvm, start, end);
-
-       return 0;
+       return do_kvm_unmap_gfn(kvm, range);
 }
 
-static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
-                         unsigned long end)
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* The page will get remapped properly on its next fault */
-       do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+       return do_kvm_unmap_gfn(kvm, range);
 }
 
 /*****************************************/
@@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = {
        .flush_memslot = kvmppc_core_flush_memslot_pr,
        .prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
        .commit_memory_region = kvmppc_core_commit_memory_region_pr,
-       .unmap_hva_range = kvm_unmap_hva_range_pr,
-       .age_hva  = kvm_age_hva_pr,
-       .test_age_hva = kvm_test_age_hva_pr,
-       .set_spte_hva = kvm_set_spte_hva_pr,
+       .unmap_gfn_range = kvm_unmap_gfn_range_pr,
+       .age_gfn  = kvm_age_gfn_pr,
+       .test_age_gfn = kvm_test_age_gfn_pr,
+       .set_spte_gfn = kvm_set_spte_gfn_pr,
        .free_memslot = kvmppc_core_free_memslot_pr,
        .init_vm = kvmppc_core_init_vm_pr,
        .destroy_vm = kvmppc_core_destroy_vm_pr,
index ed0c9c4..7f16afc 100644 (file)
@@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
 
 /************* MMU Notifiers *************/
 
-static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       trace_kvm_unmap_hva(hva);
-
        /*
         * Flush all shadow tlb entries everywhere. This is slow, but
         * we are 100% sure that we catch the to be unmapped page
         */
-       kvm_flush_remote_tlbs(kvm);
-
-       return 0;
+       return true;
 }
 
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       /* kvm_unmap_hva flushes everything anyways */
-       kvm_unmap_hva(kvm, start);
-
-       return 0;
+       return kvm_e500_mmu_unmap_gfn(kvm, range);
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* XXX could be more clever ;) */
-       return 0;
+       return false;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        /* The page will get remapped properly on its next fault */
-       kvm_unmap_hva(kvm, hva);
-       return 0;
+       return kvm_e500_mmu_unmap_gfn(kvm, range);
 }
 
 /*****************************************/
index 3837842..eff6e82 100644 (file)
@@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit,
                )
 );
 
-TRACE_EVENT(kvm_unmap_hva,
-       TP_PROTO(unsigned long hva),
-       TP_ARGS(hva),
-
-       TP_STRUCT__entry(
-               __field(        unsigned long,  hva             )
-       ),
-
-       TP_fast_assign(
-               __entry->hva            = hva;
-       ),
-
-       TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
 TRACE_EVENT(kvm_booke206_stlb_write,
        TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
        TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
index 764170f..3805519 100644 (file)
@@ -887,7 +887,8 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot,
 
        want_v = hpte_encode_avpn(vpn, psize, ssize);
 
-       flags = (newpp & 7) | H_AVPN;
+       flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN;
+       flags |= (newpp & HPTE_R_KEY_HI) >> 48;
        if (mmu_has_feature(MMU_FTR_KERNEL_RO))
                /* Move pp0 into bit 8 (IBM 55) */
                flags |= (newpp & HPTE_R_PP0) >> 55;
index ea4d6a6..e83e089 100644 (file)
@@ -452,12 +452,28 @@ static int do_suspend(void)
        return ret;
 }
 
+/**
+ * struct pseries_suspend_info - State shared between CPUs for join/suspend.
+ * @counter: Threads are to increment this upon resuming from suspend
+ *           or if an error is received from H_JOIN. The thread which performs
+ *           the first increment (i.e. sets it to 1) is responsible for
+ *           waking the other threads.
+ * @done: False if join/suspend is in progress. True if the operation is
+ *        complete (successful or not).
+ */
+struct pseries_suspend_info {
+       atomic_t counter;
+       bool done;
+};
+
 static int do_join(void *arg)
 {
-       atomic_t *counter = arg;
+       struct pseries_suspend_info *info = arg;
+       atomic_t *counter = &info->counter;
        long hvrc;
        int ret;
 
+retry:
        /* Must ensure MSR.EE off for H_JOIN. */
        hard_irq_disable();
        hvrc = plpar_hcall_norets(H_JOIN);
@@ -473,8 +489,20 @@ static int do_join(void *arg)
        case H_SUCCESS:
                /*
                 * The suspend is complete and this cpu has received a
-                * prod.
+                * prod, or we've received a stray prod from unrelated
+                * code (e.g. paravirt spinlocks) and we need to join
+                * again.
+                *
+                * This barrier orders the return from H_JOIN above vs
+                * the load of info->done. It pairs with the barrier
+                * in the wakeup/prod path below.
                 */
+               smp_mb();
+               if (READ_ONCE(info->done) == false) {
+                       pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying",
+                                           smp_processor_id());
+                       goto retry;
+               }
                ret = 0;
                break;
        case H_BAD_MODE:
@@ -488,6 +516,13 @@ static int do_join(void *arg)
 
        if (atomic_inc_return(counter) == 1) {
                pr_info("CPU %u waking all threads\n", smp_processor_id());
+               WRITE_ONCE(info->done, true);
+               /*
+                * This barrier orders the store to info->done vs subsequent
+                * H_PRODs to wake the other CPUs. It pairs with the barrier
+                * in the H_SUCCESS case above.
+                */
+               smp_mb();
                prod_others();
        }
        /*
@@ -535,11 +570,16 @@ static int pseries_suspend(u64 handle)
        int ret;
 
        while (true) {
-               atomic_t counter = ATOMIC_INIT(0);
+               struct pseries_suspend_info info;
                unsigned long vasi_state;
                int vasi_err;
 
-               ret = stop_machine(do_join, &counter, cpu_online_mask);
+               info = (struct pseries_suspend_info) {
+                       .counter = ATOMIC_INIT(0),
+                       .done = false,
+               };
+
+               ret = stop_machine(do_join, &info, cpu_online_mask);
                if (ret == 0)
                        break;
                /*
index 85d626b..0d0cf67 100644 (file)
@@ -93,7 +93,6 @@ config RISCV
        select PCI_MSI if PCI
        select RISCV_INTC
        select RISCV_TIMER if RISCV_SBI
-       select SPARSEMEM_STATIC if 32BIT
        select SPARSE_IRQ
        select SYSCTL_EXCEPTION_TRACE
        select THREAD_INFO_IN_TASK
@@ -154,7 +153,8 @@ config ARCH_FLATMEM_ENABLE
 config ARCH_SPARSEMEM_ENABLE
        def_bool y
        depends on MMU
-       select SPARSEMEM_VMEMMAP_ENABLE
+       select SPARSEMEM_STATIC if 32BIT && SPARSMEM
+       select SPARSEMEM_VMEMMAP_ENABLE if 64BIT
 
 config ARCH_SELECT_MEMORY_MODEL
        def_bool ARCH_SPARSEMEM_ENABLE
@@ -314,7 +314,7 @@ endchoice
 # Common NUMA Features
 config NUMA
        bool "NUMA Memory Allocation and Scheduler Support"
-       depends on SMP
+       depends on SMP && MMU
        select GENERIC_ARCH_NUMA
        select OF_NUMA
        select ARCH_SUPPORTS_NUMA_BALANCING
index 7efcece..e1b2690 100644 (file)
@@ -31,6 +31,8 @@ config SOC_CANAAN
        select SIFIVE_PLIC
        select ARCH_HAS_RESET_CONTROLLER
        select PINCTRL
+       select COMMON_CLK
+       select COMMON_CLK_K210
        help
          This enables support for Canaan Kendryte K210 SoC platform hardware.
 
index 27e005f..2a652b0 100644 (file)
@@ -9,4 +9,20 @@ long long __lshrti3(long long a, int b);
 long long __ashrti3(long long a, int b);
 long long __ashlti3(long long a, int b);
 
+
+#define DECLARE_DO_ERROR_INFO(name)    asmlinkage void name(struct pt_regs *regs)
+
+DECLARE_DO_ERROR_INFO(do_trap_unknown);
+DECLARE_DO_ERROR_INFO(do_trap_insn_misaligned);
+DECLARE_DO_ERROR_INFO(do_trap_insn_fault);
+DECLARE_DO_ERROR_INFO(do_trap_insn_illegal);
+DECLARE_DO_ERROR_INFO(do_trap_load_fault);
+DECLARE_DO_ERROR_INFO(do_trap_load_misaligned);
+DECLARE_DO_ERROR_INFO(do_trap_store_misaligned);
+DECLARE_DO_ERROR_INFO(do_trap_store_fault);
+DECLARE_DO_ERROR_INFO(do_trap_ecall_u);
+DECLARE_DO_ERROR_INFO(do_trap_ecall_s);
+DECLARE_DO_ERROR_INFO(do_trap_ecall_m);
+DECLARE_DO_ERROR_INFO(do_trap_break);
+
 #endif /* _ASM_RISCV_PROTOTYPES_H */
index 9807ad1..e4c4355 100644 (file)
@@ -12,4 +12,6 @@
 
 #include <asm-generic/irq.h>
 
+extern void __init init_IRQ(void);
+
 #endif /* _ASM_RISCV_IRQ_H */
index 3a24003..021ed64 100644 (file)
@@ -71,6 +71,7 @@ int riscv_of_processor_hartid(struct device_node *node);
 int riscv_of_parent_hartid(struct device_node *node);
 
 extern void riscv_fill_hwcap(void);
+extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 
 #endif /* __ASSEMBLY__ */
 
index cb4abb6..09ad4e9 100644 (file)
@@ -119,6 +119,11 @@ extern int regs_query_register_offset(const char *name);
 extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                               unsigned int n);
 
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+                          unsigned long frame_pointer);
+int do_syscall_trace_enter(struct pt_regs *regs);
+void do_syscall_trace_exit(struct pt_regs *regs);
+
 /**
  * regs_get_register() - get register value from its offset
  * @regs:      pt_regs from which register value is gotten
index 99895d9..d702741 100644 (file)
@@ -51,10 +51,10 @@ enum sbi_ext_rfence_fid {
        SBI_EXT_RFENCE_REMOTE_FENCE_I = 0,
        SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
        SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
-       SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
        SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID,
-       SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
+       SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
        SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID,
+       SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
 };
 
 enum sbi_ext_hsm_fid {
index 81de51e..507cae2 100644 (file)
@@ -88,4 +88,6 @@ static inline int read_current_timer(unsigned long *timer_val)
        return 0;
 }
 
+extern void time_init(void);
+
 #endif /* _ASM_RISCV_TIMEX_H */
index 824b2c9..f944062 100644 (file)
@@ -306,7 +306,9 @@ do {                                                                \
  * data types like structures or arrays.
  *
  * @ptr must have pointer-to-simple-variable type, and @x must be assignable
- * to the result of dereferencing @ptr.
+ * to the result of dereferencing @ptr. The value of @x is copied to avoid
+ * re-ordering where @x is evaluated inside the block that enables user-space
+ * access (thus bypassing user space protection if @x is a function).
  *
  * Caller must check the pointer with access_ok() before calling this
  * function.
@@ -316,12 +318,13 @@ do {                                                              \
 #define __put_user(x, ptr)                                     \
 ({                                                             \
        __typeof__(*(ptr)) __user *__gu_ptr = (ptr);            \
+       __typeof__(*__gu_ptr) __val = (x);                      \
        long __pu_err = 0;                                      \
                                                                \
        __chk_user_ptr(__gu_ptr);                               \
                                                                \
        __enable_user_access();                                 \
-       __put_user_nocheck(x, __gu_ptr, __pu_err);              \
+       __put_user_nocheck(__val, __gu_ptr, __pu_err);          \
        __disable_user_access();                                \
                                                                \
        __pu_err;                                               \
index 3dc0abd..647a47f 100644 (file)
@@ -8,6 +8,7 @@ CFLAGS_REMOVE_ftrace.o  = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_patch.o  = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_sbi.o    = $(CC_FLAGS_FTRACE)
 endif
+CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,)
 
 extra-y += head.o
 extra-y += vmlinux.lds
index 744f320..76274a4 100644 (file)
@@ -447,6 +447,7 @@ ENDPROC(__switch_to)
 #endif
 
        .section ".rodata"
+       .align LGREG
        /* Exception vector table */
 ENTRY(excp_vect_table)
        RISCV_PTR do_trap_insn_misaligned
index e637249..17ca5e9 100644 (file)
@@ -2,39 +2,41 @@
 
 #include <linux/kprobes.h>
 
-/* Ftrace callback handler for kprobes -- called under preepmt disabed */
+/* Ftrace callback handler for kprobes -- called with preemption disabled */
 void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
-                          struct ftrace_ops *ops, struct ftrace_regs *regs)
+                          struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
        struct kprobe *p;
+       struct pt_regs *regs;
        struct kprobe_ctlblk *kcb;
 
        p = get_kprobe((kprobe_opcode_t *)ip);
        if (unlikely(!p) || kprobe_disabled(p))
                return;
 
+       regs = ftrace_get_regs(fregs);
        kcb = get_kprobe_ctlblk();
        if (kprobe_running()) {
                kprobes_inc_nmissed_count(p);
        } else {
-               unsigned long orig_ip = instruction_pointer(&(regs->regs));
+               unsigned long orig_ip = instruction_pointer(regs);
 
-               instruction_pointer_set(&(regs->regs), ip);
+               instruction_pointer_set(regs, ip);
 
                __this_cpu_write(current_kprobe, p);
                kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-               if (!p->pre_handler || !p->pre_handler(p, &(regs->regs))) {
+               if (!p->pre_handler || !p->pre_handler(p, regs)) {
                        /*
                         * Emulate singlestep (and also recover regs->pc)
                         * as if there is a nop
                         */
-                       instruction_pointer_set(&(regs->regs),
+                       instruction_pointer_set(regs,
                                (unsigned long)p->addr + MCOUNT_INSN_SIZE);
                        if (unlikely(p->post_handler)) {
                                kcb->kprobe_status = KPROBE_HIT_SSDONE;
-                               p->post_handler(p, &(regs->regs), 0);
+                               p->post_handler(p, regs, 0);
                        }
-                       instruction_pointer_set(&(regs->regs), orig_ip);
+                       instruction_pointer_set(regs, orig_ip);
                }
 
                /*
index a2ec186..7e2c78e 100644 (file)
@@ -256,8 +256,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
                 * normal page fault.
                 */
                regs->epc = (unsigned long) cur->addr;
-               if (!instruction_pointer(regs))
-                       BUG();
+               BUG_ON(!instruction_pointer(regs));
 
                if (kcb->kprobe_status == KPROBE_REENTER)
                        restore_previous_kprobe(kcb);
index 6f728e7..f9cd57c 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/tick.h>
 #include <linux/ptrace.h>
index f4a7db3..d3bf756 100644 (file)
@@ -116,7 +116,7 @@ void sbi_clear_ipi(void)
 EXPORT_SYMBOL(sbi_clear_ipi);
 
 /**
- * sbi_set_timer_v01() - Program the timer for next timer event.
+ * __sbi_set_timer_v01() - Program the timer for next timer event.
  * @stime_value: The value after which next timer event should fire.
  *
  * Return: None
index e85bacf..f8f1533 100644 (file)
@@ -147,7 +147,8 @@ static void __init init_resources(void)
        bss_res.end = __pa_symbol(__bss_stop) - 1;
        bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) * sizeof(*mem_res);
+       /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */
+       mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt + 1) * sizeof(*mem_res);
        mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES);
        if (!mem_res)
                panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz);
index 3f893c9..2b3e0cb 100644 (file)
@@ -14,7 +14,7 @@
 
 #include <asm/stacktrace.h>
 
-register const unsigned long sp_in_global __asm__("sp");
+register unsigned long sp_in_global __asm__("sp");
 
 #ifdef CONFIG_FRAME_POINTER
 
index 8a5cf99..1b43226 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/delay.h>
 #include <asm/sbi.h>
 #include <asm/processor.h>
+#include <asm/timex.h>
 
 unsigned long riscv_timebase;
 EXPORT_SYMBOL_GPL(riscv_timebase);
index 3ed2c23..0879b5d 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/irq.h>
 
+#include <asm/asm-prototypes.h>
 #include <asm/bug.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
index 3fc18f4..937d13c 100644 (file)
@@ -155,7 +155,7 @@ static void __init kasan_populate(void *start, void *end)
        memset(start, KASAN_SHADOW_INIT, end - start);
 }
 
-void __init kasan_shallow_populate(void *start, void *end)
+static void __init kasan_shallow_populate(void *start, void *end)
 {
        unsigned long vaddr = (unsigned long)start & PAGE_MASK;
        unsigned long vend = PAGE_ALIGN((unsigned long)end);
@@ -187,6 +187,8 @@ void __init kasan_shallow_populate(void *start, void *end)
                }
                vaddr += PAGE_SIZE;
        }
+
+       local_flush_tlb_all();
 }
 
 void __init kasan_init(void)
@@ -214,7 +216,7 @@ void __init kasan_init(void)
                        break;
 
                kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end));
-       };
+       }
 
        for (i = 0; i < PTRS_PER_PTE; i++)
                set_pte(&kasan_early_shadow_pte[i],
index 6bcfc56..8925f39 100644 (file)
@@ -454,6 +454,7 @@ struct kvm_vcpu_stat {
        u64 diagnose_44;
        u64 diagnose_9c;
        u64 diagnose_9c_ignored;
+       u64 diagnose_9c_forward;
        u64 diagnose_258;
        u64 diagnose_308;
        u64 diagnose_500;
@@ -700,6 +701,10 @@ struct kvm_hw_bp_info_arch {
 #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
                (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
 
+#define KVM_GUESTDBG_VALID_MASK \
+               (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+               KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
 struct kvm_guestdbg_info_arch {
        unsigned long cr0;
        unsigned long cr9;
index 053fe8b..a75d94a 100644 (file)
@@ -202,7 +202,7 @@ extern unsigned int s390_pci_no_rid;
 ----------------------------------------------------------------------------- */
 /* Base stuff */
 int zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
-void zpci_remove_device(struct zpci_dev *zdev);
+void zpci_remove_device(struct zpci_dev *zdev, bool set_error);
 int zpci_enable_device(struct zpci_dev *);
 int zpci_disable_device(struct zpci_dev *);
 int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64);
index 01e3600..e317fd4 100644 (file)
@@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void);
 extern void __cpu_die(unsigned int cpu);
 extern int __cpu_disable(void);
 extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
 
 #endif /* __ASM_SMP_H */
index 7b3cdb4..73ee891 100644 (file)
@@ -6,7 +6,7 @@
 #include <vdso/datapage.h>
 
 struct arch_vdso_data {
-       __u64 tod_steering_delta;
+       __s64 tod_steering_delta;
        __u64 tod_steering_end;
 };
 
index bc302b8..2e3e7ed 100644 (file)
@@ -968,7 +968,7 @@ static int cf_diag_all_start(void)
  */
 static size_t cf_diag_needspace(unsigned int sets)
 {
-       struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+       struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events);
        size_t bytes = 0;
        int i;
 
@@ -984,6 +984,7 @@ static size_t cf_diag_needspace(unsigned int sets)
                     sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
        debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__,
                            bytes);
+       put_cpu_ptr(&cpu_cf_events);
        return bytes;
 }
 
index 58c8afa..2fec2b8 100644 (file)
@@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu)
        asm volatile("diag %0,0,0x9c"
                     : : "d" (pcpu_devices[cpu].address));
 }
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
 
 /*
  * Send cpus emergency shutdown signal. This gives the cpus the
index 165da96..326cb8f 100644 (file)
@@ -80,10 +80,12 @@ void __init time_early_init(void)
 {
        struct ptff_qto qto;
        struct ptff_qui qui;
+       int cs;
 
        /* Initialize TOD steering parameters */
        tod_steering_end = tod_clock_base.tod;
-       vdso_data->arch_data.tod_steering_end = tod_steering_end;
+       for (cs = 0; cs < CS_BASES; cs++)
+               vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
 
        if (!test_facility(28))
                return;
@@ -366,6 +368,7 @@ static void clock_sync_global(unsigned long delta)
 {
        unsigned long now, adj;
        struct ptff_qto qto;
+       int cs;
 
        /* Fixup the monotonic sched clock. */
        tod_clock_base.eitod += delta;
@@ -381,7 +384,10 @@ static void clock_sync_global(unsigned long delta)
                panic("TOD clock sync offset %li is too large to drift\n",
                      tod_steering_delta);
        tod_steering_end = now + (abs(tod_steering_delta) << 15);
-       vdso_data->arch_data.tod_steering_end = tod_steering_end;
+       for (cs = 0; cs < CS_BASES; cs++) {
+               vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
+               vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta;
+       }
 
        /* Update LPAR offset. */
        if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
index 73c7afc..f216a1b 100644 (file)
@@ -214,7 +214,7 @@ void vtime_flush(struct task_struct *tsk)
        avg_steal = S390_lowcore.avg_steal_timer / 2;
        if ((s64) steal > 0) {
                S390_lowcore.steal_timer = 0;
-               account_steal_time(steal);
+               account_steal_time(cputime_to_nsecs(steal));
                avg_steal += steal;
        }
        S390_lowcore.avg_steal_timer = avg_steal;
index 5b8ec1c..02c146f 100644 (file)
@@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+       /* Reset the count on a new slice */
+       if (time_after(jiffies, cur_slice)) {
+               cur_slice = jiffies;
+               forward_cnt = diag9c_forwarding_hz / HZ;
+       }
+       return forward_cnt-- <= 0 ? 1 : 0;
+}
+
 static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
 {
        struct kvm_vcpu *tcpu;
@@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
        if (!tcpu)
                goto no_yield;
 
-       /* target already running */
-       if (READ_ONCE(tcpu->cpu) >= 0)
-               goto no_yield;
+       /* target guest VCPU already running */
+       if (READ_ONCE(tcpu->cpu) >= 0) {
+               if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+                       goto no_yield;
+
+               /* target host CPU already running */
+               if (!vcpu_is_preempted(tcpu->cpu))
+                       goto no_yield;
+               smp_yield_cpu(tcpu->cpu);
+               VCPU_EVENT(vcpu, 5,
+                          "diag time slice end directed to %d: yield forwarded",
+                          tid);
+               vcpu->stat.diagnose_9c_forward++;
+               return 0;
+       }
 
        if (kvm_vcpu_yield_to(tcpu) <= 0)
                goto no_yield;
index 6d6b570..b9f85b2 100644 (file)
@@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
  * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ *      successful (return value 0), or to the first invalid DAT entry in
+ *      case of exceptions (return value > 0)
  * @fake: pgt references contiguous guest memory block, not a pgtable
  */
 static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
                        rfte.val = ptr;
                        goto shadow_r2t;
                }
+               *pgt = ptr + vaddr.rfx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
                if (rc)
                        return rc;
@@ -1060,6 +1063,7 @@ shadow_r2t:
                        rste.val = ptr;
                        goto shadow_r3t;
                }
+               *pgt = ptr + vaddr.rsx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
                if (rc)
                        return rc;
@@ -1087,6 +1091,7 @@ shadow_r3t:
                        rtte.val = ptr;
                        goto shadow_sgt;
                }
+               *pgt = ptr + vaddr.rtx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
                if (rc)
                        return rc;
@@ -1123,6 +1128,7 @@ shadow_sgt:
                        ste.val = ptr;
                        goto shadow_pgt;
                }
+               *pgt = ptr + vaddr.sx * 8;
                rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
                if (rc)
                        return rc;
@@ -1157,6 +1163,8 @@ shadow_pgt:
  * @vcpu: virtual cpu
  * @sg: pointer to the shadow guest address space structure
  * @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ *         the valid leaf, plus some flags
  *
  * Returns: - 0 if the shadow fault was successfully resolved
  *         - > 0 (pgm exception code) on exceptions while faulting
@@ -1165,11 +1173,11 @@ shadow_pgt:
  *         - -ENOMEM if out of memory
  */
 int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
-                         unsigned long saddr)
+                         unsigned long saddr, unsigned long *datptr)
 {
        union vaddress vaddr;
        union page_table_entry pte;
-       unsigned long pgt;
+       unsigned long pgt = 0;
        int dat_protection, fake;
        int rc;
 
@@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
                pte.val = pgt + vaddr.px * PAGE_SIZE;
                goto shadow_page;
        }
-       if (!rc)
-               rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+       switch (rc) {
+       case PGM_SEGMENT_TRANSLATION:
+       case PGM_REGION_THIRD_TRANS:
+       case PGM_REGION_SECOND_TRANS:
+       case PGM_REGION_FIRST_TRANS:
+               pgt |= PEI_NOT_PTE;
+               break;
+       case 0:
+               pgt += vaddr.px * 8;
+               rc = gmap_read_table(sg->parent, pgt, &pte.val);
+       }
+       if (datptr)
+               *datptr = pgt | dat_protection * PEI_DAT_PROT;
        if (!rc && pte.i)
                rc = PGM_PAGE_TRANSLATION;
        if (!rc && pte.z)
index f4c5175..7c72a5e 100644 (file)
 
 /**
  * kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
  * @gra - guest real address
  *
  * Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra by applying the given prefix.
  */
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
-                                                unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
 {
-       unsigned long prefix  = kvm_s390_get_prefix(vcpu);
-
        if (gra < 2 * PAGE_SIZE)
                gra += prefix;
        else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
 }
 
 /**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+                                                unsigned long gra)
+{
+       return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+                                                          unsigned long ga)
+{
+       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+               return ga;
+       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+               return ga & ((1UL << 31) - 1);
+       return ga & ((1UL << 24) - 1);
+}
+
+/**
  * kvm_s390_logical_to_effective - convert guest logical to effective address
  * @vcpu: guest virtual cpu
  * @ga: guest logical address
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
 static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
                                                          unsigned long ga)
 {
-       psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
-       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
-               return ga;
-       if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
-               return ga & ((1UL << 31) - 1);
-       return ga & ((1UL << 24) - 1);
+       return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
 }
 
 /*
@@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
 int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
-                         unsigned long saddr);
+                         unsigned long saddr, unsigned long *datptr);
 
 #endif /* __KVM_S390_GACCESS_H */
index 2f09e9d..1296fc1 100644 (file)
@@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("instruction_diag_44", diagnose_44),
        VCPU_STAT("instruction_diag_9c", diagnose_9c),
        VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+       VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
        VCPU_STAT("instruction_diag_258", diagnose_258),
        VCPU_STAT("instruction_diag_308", diagnose_308),
        VCPU_STAT("instruction_diag_500", diagnose_500),
@@ -185,6 +186,11 @@ static bool use_gisa  = true;
 module_param(use_gisa, bool, 0644);
 MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
 
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
 /*
  * For now we handle at most 16 double words as this is what the s390 base
  * kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -544,6 +550,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_DIAG318:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               r = KVM_GUESTDBG_VALID_MASK;
+               break;
        case KVM_CAP_S390_HPAGE_1M:
                r = 0;
                if (hpage && !kvm_is_ucontrol(kvm))
@@ -4307,16 +4316,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu)
        kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
        kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
        if (MACHINE_HAS_GS) {
+               preempt_disable();
                __ctl_set_bit(2, 4);
                if (vcpu->arch.gs_enabled)
                        save_gs_cb(current->thread.gs_cb);
-               preempt_disable();
                current->thread.gs_cb = vcpu->arch.host_gscb;
                restore_gs_cb(vcpu->arch.host_gscb);
-               preempt_enable();
                if (!vcpu->arch.host_gscb)
                        __ctl_clear_bit(2, 4);
                vcpu->arch.host_gscb = NULL;
+               preempt_enable();
        }
        /* SIE will save etoken directly into SDNX and therefore kvm_run */
 }
@@ -4542,7 +4551,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
                /*
                 * As we are starting a second VCPU, we have to disable
                 * the IBS facility on all VCPUs to remove potentially
-                * oustanding ENABLE requests.
+                * outstanding ENABLE requests.
                 */
                __disable_ibs_on_all_vcpus(vcpu->kvm);
        }
index 79dcd64..9fad251 100644 (file)
@@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
  * @kvm: the KVM guest
  */
 void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Maximum number of diag9c forwardings per second; 0 turns forwarding off
+ */
+extern unsigned int diag9c_forwarding_hz;
+
 #endif
index bd803e0..4002a24 100644 (file)
@@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                memcpy((void *)((u64)scb_o + 0xc0),
                       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
                break;
-       case ICPT_PARTEXEC:
-               /* MVPG only */
-               memcpy((void *)((u64)scb_o + 0xc0),
-                      (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
-               break;
        }
 
        if (scb_s->ihcpu != 0xffffU)
@@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
        /* with mso/msl, the prefix lies at offset *mso* */
        prefix += scb_s->mso;
 
-       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+       rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
        if (!rc && (scb_s->ecb & ECB_TE))
                rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                          prefix + PAGE_SIZE);
+                                          prefix + PAGE_SIZE, NULL);
        /*
         * We don't have to mprotect, we will be called for all unshadows.
         * SIE will detect if protection applies and trigger a validity.
@@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                                    current->thread.gmap_addr, 1);
 
        rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                  current->thread.gmap_addr);
+                                  current->thread.gmap_addr, NULL);
        if (rc > 0) {
                rc = inject_fault(vcpu, rc,
                                  current->thread.gmap_addr,
@@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
 {
        if (vsie_page->fault_addr)
                kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-                                     vsie_page->fault_addr);
+                                     vsie_page->fault_addr, NULL);
        vsie_page->fault_addr = 0;
 }
 
@@ -984,6 +979,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 }
 
 /*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+       /* no need to validate the parameter and/or perform error handling */
+       reg &= 0xf;
+       switch (reg) {
+       case 15:
+               return vsie_page->scb_s.gg15;
+       case 14:
+               return vsie_page->scb_s.gg14;
+       default:
+               return vcpu->run->s.regs.gprs[reg];
+       }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+       struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+       unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+       u64 *pei_block = &vsie_page->scb_o->mcic;
+       int edat, rc_dest, rc_src;
+       union ctlreg0 cr0;
+
+       cr0.val = vcpu->arch.sie_block->gcr[0];
+       edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+       mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+       prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+       dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+       dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+       src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+       src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+       rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+       rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+       /*
+        * Either everything went well, or something non-critical went wrong
+        * e.g. because of a race. In either case, simply retry.
+        */
+       if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+               retry_vsie_icpt(vsie_page);
+               return -EAGAIN;
+       }
+       /* Something more serious went wrong, propagate the error */
+       if (rc_dest < 0)
+               return rc_dest;
+       if (rc_src < 0)
+               return rc_src;
+
+       /* The only possible suppressing exception: just deliver it */
+       if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+               clear_vsie_icpt(vsie_page);
+               rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+               WARN_ON_ONCE(rc_dest);
+               return 1;
+       }
+
+       /*
+        * Forward the PEI intercept to the guest if it was a page fault, or
+        * also for segment and region table faults if EDAT applies.
+        */
+       if (edat) {
+               rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+               rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+       } else {
+               rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+               rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+       }
+       if (!rc_dest && !rc_src) {
+               pei_block[0] = pei_dest;
+               pei_block[1] = pei_src;
+               return 1;
+       }
+
+       retry_vsie_icpt(vsie_page);
+
+       /*
+        * The host has edat, and the guest does not, or it was an ASCE type
+        * exception. The host needs to inject the appropriate DAT interrupts
+        * into the guest.
+        */
+       if (rc_dest)
+               return inject_fault(vcpu, rc_dest, dest, 1);
+       return inject_fault(vcpu, rc_src, src, 0);
+}
+
+/*
  * Run the vsie on a shadow scb and a shadow gmap, without any further
  * sanity checks, handling SIE faults.
  *
@@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                if ((scb_s->ipa & 0xf000) != 0xf000)
                        scb_s->ipa += 0x1000;
                break;
+       case ICPT_PARTEXEC:
+               if (scb_s->ipa == 0xb254)
+                       rc = vsie_handle_mvpg(vcpu, vsie_page);
+               break;
        }
        return rc;
 }
index 600881d..9106407 100644 (file)
@@ -682,16 +682,36 @@ int zpci_disable_device(struct zpci_dev *zdev)
 }
 EXPORT_SYMBOL_GPL(zpci_disable_device);
 
-void zpci_remove_device(struct zpci_dev *zdev)
+/* zpci_remove_device - Removes the given zdev from the PCI core
+ * @zdev: the zdev to be removed from the PCI core
+ * @set_error: if true the device's error state is set to permanent failure
+ *
+ * Sets a zPCI device to a configured but offline state; the zPCI
+ * device is still accessible through its hotplug slot and the zPCI
+ * API but is removed from the common code PCI bus, making it
+ * no longer available to drivers.
+ */
+void zpci_remove_device(struct zpci_dev *zdev, bool set_error)
 {
        struct zpci_bus *zbus = zdev->zbus;
        struct pci_dev *pdev;
 
+       if (!zdev->zbus->bus)
+               return;
+
        pdev = pci_get_slot(zbus->bus, zdev->devfn);
        if (pdev) {
-               if (pdev->is_virtfn)
-                       return zpci_iov_remove_virtfn(pdev, zdev->vfn);
+               if (set_error)
+                       pdev->error_state = pci_channel_io_perm_failure;
+               if (pdev->is_virtfn) {
+                       zpci_iov_remove_virtfn(pdev, zdev->vfn);
+                       /* balance pci_get_slot */
+                       pci_dev_put(pdev);
+                       return;
+               }
                pci_stop_and_remove_bus_device_locked(pdev);
+               /* balance pci_get_slot */
+               pci_dev_put(pdev);
        }
 }
 
@@ -765,7 +785,7 @@ void zpci_release_device(struct kref *kref)
        struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);
 
        if (zdev->zbus->bus)
-               zpci_remove_device(zdev);
+               zpci_remove_device(zdev, false);
 
        switch (zdev->state) {
        case ZPCI_FN_STATE_ONLINE:
index b4162da..ac0c65c 100644 (file)
@@ -76,13 +76,10 @@ void zpci_event_error(void *data)
 static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 {
        struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
-       struct pci_dev *pdev = NULL;
        enum zpci_state state;
+       struct pci_dev *pdev;
        int ret;
 
-       if (zdev && zdev->zbus->bus)
-               pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
-
        zpci_err("avail CCDF:\n");
        zpci_err_hex(ccdf, sizeof(*ccdf));
 
@@ -124,8 +121,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
        case 0x0303: /* Deconfiguration requested */
                if (!zdev)
                        break;
-               if (pdev)
-                       zpci_remove_device(zdev);
+               zpci_remove_device(zdev, false);
 
                ret = zpci_disable_device(zdev);
                if (ret)
@@ -140,12 +136,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
        case 0x0304: /* Configured -> Standby|Reserved */
                if (!zdev)
                        break;
-               if (pdev) {
-                       /* Give the driver a hint that the function is
-                        * already unusable. */
-                       pdev->error_state = pci_channel_io_perm_failure;
-                       zpci_remove_device(zdev);
-               }
+               /* Give the driver a hint that the function is
+                * already unusable.
+                */
+               zpci_remove_device(zdev, true);
 
                zdev->fh = ccdf->fh;
                zpci_disable_device(zdev);
index 2792879..35391e9 100644 (file)
@@ -1931,6 +1931,7 @@ config X86_SGX
        depends on CRYPTO_SHA256=y
        select SRCU
        select MMU_NOTIFIER
+       select NUMA_KEEP_MEMINFO if NUMA
        help
          Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
          that can be used by applications to set aside private regions of code
index 2d6d5a2..9a85eae 100644 (file)
@@ -27,7 +27,7 @@ endif
 REALMODE_CFLAGS        := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \
                   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
                   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-                  -mno-mmx -mno-sse
+                  -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
 
 REALMODE_CFLAGS += -ffreestanding
 REALMODE_CFLAGS += -fno-stack-protector
index 7bbb5bb..37ce384 100644 (file)
@@ -3659,6 +3659,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
                return ret;
 
        if (event->attr.precise_ip) {
+               if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
+                       return -EINVAL;
+
                if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
                        event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
                        if (!(event->attr.sample_type &
index 7ebae18..d32b302 100644 (file)
@@ -2010,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
                 */
                if (!pebs_status && cpuc->pebs_enabled &&
                        !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
-                       pebs_status = cpuc->pebs_enabled;
+                       pebs_status = p->status = cpuc->pebs_enabled;
 
                bit = find_first_bit((unsigned long *)&pebs_status,
                                        x86_pmu.max_pebs_events);
index cc96e26..dddc746 100644 (file)
 #define X86_FEATURE_FENCE_SWAPGS_KERNEL        (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
 #define X86_FEATURE_SPLIT_LOCK_DETECT  (11*32+ 6) /* #AC for split lock */
 #define X86_FEATURE_PER_THREAD_MBA     (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1               (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2               (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX_VNNI           (12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVIC               (15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD    (15*32+15) /* Virtual VMSAVE VMLOAD */
 #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL                (15*32+20) /* Virtual SPEC_CTRL */
 #define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
index 9bc091e..ad22d48 100644 (file)
@@ -221,12 +221,22 @@ enum x86_intercept_stage;
 #define DR7_FIXED_1    0x00000400
 #define DR7_VOLATILE   0xffff2bff
 
+#define KVM_GUESTDBG_VALID_MASK \
+       (KVM_GUESTDBG_ENABLE | \
+       KVM_GUESTDBG_SINGLESTEP | \
+       KVM_GUESTDBG_USE_HW_BP | \
+       KVM_GUESTDBG_USE_SW_BP | \
+       KVM_GUESTDBG_INJECT_BP | \
+       KVM_GUESTDBG_INJECT_DB)
+
+
 #define PFERR_PRESENT_BIT 0
 #define PFERR_WRITE_BIT 1
 #define PFERR_USER_BIT 2
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 
@@ -236,6 +246,7 @@ enum x86_intercept_stage;
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 
@@ -884,12 +895,29 @@ struct kvm_hv_syndbg {
        u64 options;
 };
 
+/* Current state of Hyper-V TSC page clocksource */
+enum hv_tsc_page_status {
+       /* TSC page was not set up or disabled */
+       HV_TSC_PAGE_UNSET = 0,
+       /* TSC page MSR was written by the guest, update pending */
+       HV_TSC_PAGE_GUEST_CHANGED,
+       /* TSC page MSR was written by KVM userspace, update pending */
+       HV_TSC_PAGE_HOST_CHANGED,
+       /* TSC page was properly set up and is currently active  */
+       HV_TSC_PAGE_SET,
+       /* TSC page is currently being updated and therefore is inactive */
+       HV_TSC_PAGE_UPDATING,
+       /* TSC page was set up with an inaccessible GPA */
+       HV_TSC_PAGE_BROKEN,
+};
+
 /* Hyper-V emulation context */
 struct kvm_hv {
        struct mutex hv_lock;
        u64 hv_guest_os_id;
        u64 hv_hypercall;
        u64 hv_tsc_page;
+       enum hv_tsc_page_status hv_tsc_page_status;
 
        /* Hyper-v based guest crash (NT kernel bugcheck) parameters */
        u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
@@ -931,6 +959,12 @@ enum kvm_irqchip_mode {
        KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+struct kvm_x86_msr_filter {
+       u8 count;
+       bool default_allow:1;
+       struct msr_bitmap_range ranges[16];
+};
+
 #define APICV_INHIBIT_REASON_DISABLE    0
 #define APICV_INHIBIT_REASON_HYPERV     1
 #define APICV_INHIBIT_REASON_NESTED     2
@@ -1025,16 +1059,14 @@ struct kvm_arch {
        bool guest_can_read_msr_platform_info;
        bool exception_payload_enabled;
 
+       bool bus_lock_detection_enabled;
+
        /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
        u32 user_space_msr_mask;
+       struct kvm_x86_msr_filter __rcu *msr_filter;
 
-       struct {
-               u8 count;
-               bool default_allow:1;
-               struct msr_bitmap_range ranges[16];
-       } msr_filter;
-
-       bool bus_lock_detection_enabled;
+       /* Guest can access the SGX PROVISIONKEY. */
+       bool sgx_provisioning_allowed;
 
        struct kvm_pmu_event_filter __rcu *pmu_event_filter;
        struct task_struct *nx_lpage_recovery_thread;
@@ -1050,25 +1082,36 @@ struct kvm_arch {
        bool tdp_mmu_enabled;
 
        /*
-        * List of struct kvmp_mmu_pages being used as roots.
+        * List of struct kvm_mmu_pages being used as roots.
         * All struct kvm_mmu_pages in the list should have
         * tdp_mmu_page set.
-        * All struct kvm_mmu_pages in the list should have a positive
-        * root_count except when a thread holds the MMU lock and is removing
-        * an entry from the list.
+        *
+        * For reads, this list is protected by:
+        *      the MMU lock in read mode + RCU or
+        *      the MMU lock in write mode
+        *
+        * For writes, this list is protected by:
+        *      the MMU lock in read mode + the tdp_mmu_pages_lock or
+        *      the MMU lock in write mode
+        *
+        * Roots will remain in the list until their tdp_mmu_root_count
+        * drops to zero, at which point the thread that decremented the
+        * count to zero should remove the root from the list and clean
+        * it up, freeing the root after an RCU grace period.
         */
        struct list_head tdp_mmu_roots;
 
        /*
         * List of struct kvmp_mmu_pages not being used as roots.
         * All struct kvm_mmu_pages in the list should have
-        * tdp_mmu_page set and a root_count of 0.
+        * tdp_mmu_page set and a tdp_mmu_root_count of 0.
         */
        struct list_head tdp_mmu_pages;
 
        /*
         * Protects accesses to the following fields when the MMU lock
         * is held in read mode:
+        *  - tdp_mmu_roots (above)
         *  - tdp_mmu_pages (above)
         *  - the link field of struct kvm_mmu_pages used by the TDP MMU
         *  - lpage_disallowed_mmu_pages
@@ -1125,6 +1168,9 @@ struct kvm_vcpu_stat {
        u64 req_event;
        u64 halt_poll_success_ns;
        u64 halt_poll_fail_ns;
+       u64 nested_run;
+       u64 directed_yield_attempted;
+       u64 directed_yield_successful;
 };
 
 struct x86_instruction_info;
@@ -1251,8 +1297,8 @@ struct kvm_x86_ops {
        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
-       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level);
+       void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level);
 
        bool (*has_wbinvd_exit)(void);
 
@@ -1321,6 +1367,7 @@ struct kvm_x86_ops {
        int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
        int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
        int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+       int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
@@ -1339,6 +1386,7 @@ struct kvm_x86_ops {
 struct kvm_x86_nested_ops {
        int (*check_events)(struct kvm_vcpu *vcpu);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+       void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
                         unsigned user_data_size);
@@ -1410,9 +1458,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1422,8 +1467,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1520,6 +1563,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
 
 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -1548,14 +1596,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1596,9 +1644,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -1717,11 +1762,7 @@ asmlinkage void kvm_spurious_fault(void);
        _ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
index dc6d149..f1b9ed5 100644 (file)
@@ -551,15 +551,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
        *size = fpu_kernel_xstate_size;
 }
 
-/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
-
 static inline void
 native_load_sp0(unsigned long sp0)
 {
similarity index 89%
rename from arch/x86/kernel/cpu/sgx/arch.h
rename to arch/x86/include/asm/sgx.h
index dd7602c..a16e2c9 100644 (file)
@@ -2,15 +2,20 @@
 /**
  * Copyright(c) 2016-20 Intel Corporation.
  *
- * Contains data structures defined by the SGX architecture.  Data structures
- * defined by the Linux software stack should not be placed here.
+ * Intel Software Guard Extensions (SGX) support.
  */
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
 
 #include <linux/bits.h>
 #include <linux/types.h>
 
+/*
+ * This file contains both data structures defined by SGX architecture and Linux
+ * defined software data structures and functions.  The two should not be mixed
+ * together for better readability.  The architectural definitions come first.
+ */
+
 /* The SGX specific CPUID function. */
 #define SGX_CPUID              0x12
 /* EPC enumeration. */
 /* The bitmask for the EPC section type. */
 #define SGX_CPUID_EPC_MASK     GENMASK(3, 0)
 
+enum sgx_encls_function {
+       ECREATE = 0x00,
+       EADD    = 0x01,
+       EINIT   = 0x02,
+       EREMOVE = 0x03,
+       EDGBRD  = 0x04,
+       EDGBWR  = 0x05,
+       EEXTEND = 0x06,
+       ELDU    = 0x08,
+       EBLOCK  = 0x09,
+       EPA     = 0x0A,
+       EWB     = 0x0B,
+       ETRACK  = 0x0C,
+       EAUG    = 0x0D,
+       EMODPR  = 0x0E,
+       EMODT   = 0x0F,
+};
+
 /**
  * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
  * %SGX_NOT_TRACKED:           Previous ETRACK's shootdown sequence has not
  *                             been completed yet.
+ * %SGX_CHILD_PRESENT:         SECS has child pages present in the EPC.
  * %SGX_INVALID_EINITTOKEN:    EINITTOKEN is invalid and enclave signer's
  *                             public key does not match IA32_SGXLEPUBKEYHASH.
  * %SGX_UNMASKED_EVENT:                An unmasked event, e.g. INTR, was received
  */
 enum sgx_return_code {
        SGX_NOT_TRACKED                 = 11,
+       SGX_CHILD_PRESENT               = 13,
        SGX_INVALID_EINITTOKEN          = 16,
        SGX_UNMASKED_EVENT              = 128,
 };
@@ -335,4 +360,19 @@ struct sgx_sigstruct {
 
 #define SGX_LAUNCH_TOKEN_SIZE 304
 
-#endif /* _ASM_X86_SGX_ARCH_H */
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+                    int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+                  void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+                     unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
index c0538f8..57ef209 100644 (file)
@@ -132,6 +132,7 @@ void native_play_dead(void);
 void play_dead_common(void);
 void wbinvd_on_cpu(int cpu);
 int wbinvd_on_all_cpus(void);
+bool wakeup_cpu0(void);
 
 void native_smp_send_reschedule(int cpu);
 void native_send_call_func_ipi(const struct cpumask *mask);
index 1c56194..772e60e 100644 (file)
@@ -269,7 +269,9 @@ struct vmcb_save_area {
         * SEV-ES guests when referenced through the GHCB or for
         * saving to the host save area.
         */
-       u8 reserved_7[80];
+       u8 reserved_7[72];
+       u32 spec_ctrl;          /* Guest version of SPEC_CTRL at 0x2E0 */
+       u8 reserved_7b[4];
        u32 pkru;
        u8 reserved_7a[20];
        u64 reserved_8;         /* rax already available at 0x01f8 */
index 0d751d5..06b740b 100644 (file)
@@ -205,10 +205,23 @@ static inline int arch_within_stack_frames(const void * const stack,
 
 #endif
 
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
+
+#ifndef __ASSEMBLY__
 #ifdef CONFIG_COMPAT
 #define TS_I386_REGS_POKED     0x0004  /* regs poked by 32-bit ptracer */
+
+#define arch_set_restart_data(restart) \
+       do { restart->arch_data = current_thread_info()->status; } while (0)
+
 #endif
-#ifndef __ASSEMBLY__
 
 #ifdef CONFIG_X86_32
 #define in_ia32_syscall() true
index 358707f..0ffaa31 100644 (file)
@@ -373,6 +373,7 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_MOV_SS                0x00000002
 #define GUEST_INTR_STATE_SMI           0x00000004
 #define GUEST_INTR_STATE_NMI           0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR  0x00000010
 
 /* GUEST_ACTIVITY_STATE flags */
 #define GUEST_ACTIVITY_ACTIVE          0
index 7068e4b..1a162e5 100644 (file)
@@ -87,18 +87,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 #endif
 
 /*
- * The maximum amount of extra memory compared to the base size.  The
- * main scaling factor is the size of struct page.  At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define XEN_EXTRA_MEM_RATIO    (10)
-
-/*
  * Helper functions to write or read unsigned long values to/from
  * memory, when the access may fault.
  */
index b8e650a..946d761 100644 (file)
@@ -27,6 +27,7 @@
 
 
 #define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE      0x08000000
 
 #define EXIT_REASON_EXCEPTION_NMI       0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
index 7bdc023..14cd318 100644 (file)
@@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void)
        /*
         * Initialize the ACPI boot-time table parser.
         */
-       if (acpi_table_init()) {
+       if (acpi_locate_initial_tables())
                disable_acpi();
-               return;
-       }
+       else
+               acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+       if (acpi_disabled)
+               return 1;
+
+       acpi_table_init_complete();
 
        acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
 
@@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void)
                } else {
                        printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
                        disable_acpi();
-                       return;
+                       return 1;
                }
        }
-}
-
-int __init early_acpi_boot_init(void)
-{
-       /*
-        * If acpi_disabled, bail out
-        */
-       if (acpi_disabled)
-               return 1;
 
        /*
         * Process the Multiple APIC Description Table (MADT), if present
index bda4f2a..4f26700 100644 (file)
@@ -2342,6 +2342,11 @@ static int cpuid_to_apicid[] = {
        [0 ... NR_CPUS - 1] = -1,
 };
 
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+       return phys_id == cpuid_to_apicid[cpu];
+}
+
 #ifdef CONFIG_SMP
 /**
  * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
index c3b60c3..73ff4dd 100644 (file)
@@ -1032,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
        if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
                irq = mp_irqs[idx].srcbusirq;
                legacy = mp_is_legacy_irq(irq);
+               /*
+                * IRQ2 is unusable for historical reasons on systems which
+                * have a legacy PIC. See the comment vs. IRQ2 further down.
+                *
+                * If this gets removed at some point then the related code
+                * in lapic_assign_system_vectors() needs to be adjusted as
+                * well.
+                */
+               if (legacy && irq == PIC_CASCADE_IR)
+                       return -EINVAL;
        }
 
        mutex_lock(&ioapic_mutex);
index 42af31b..defda61 100644 (file)
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
        { X86_FEATURE_AVX512_FP16,              X86_FEATURE_AVX512BW  },
        { X86_FEATURE_ENQCMD,                   X86_FEATURE_XSAVES    },
        { X86_FEATURE_PER_THREAD_MBA,           X86_FEATURE_MBA       },
+       { X86_FEATURE_SGX_LC,                   X86_FEATURE_SGX       },
+       { X86_FEATURE_SGX1,                     X86_FEATURE_SGX       },
+       { X86_FEATURE_SGX2,                     X86_FEATURE_SGX1      },
        {}
 };
 
index 3b1b01f..da696eb 100644 (file)
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
 }
 #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
 
-static void clear_sgx_caps(void)
-{
-       setup_clear_cpu_cap(X86_FEATURE_SGX);
-       setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
 static int __init nosgx(char *str)
 {
-       clear_sgx_caps();
+       setup_clear_cpu_cap(X86_FEATURE_SGX);
 
        return 0;
 }
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
 
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
 {
+       bool enable_sgx_kvm = false, enable_sgx_driver = false;
        bool tboot = tboot_enabled();
-       bool enable_sgx;
+       bool enable_vmx;
        u64 msr;
 
        if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
                clear_cpu_cap(c, X86_FEATURE_VMX);
-               clear_sgx_caps();
+               clear_cpu_cap(c, X86_FEATURE_SGX);
                return;
        }
 
-       /*
-        * Enable SGX if and only if the kernel supports SGX and Launch Control
-        * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
-        */
-       enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
-                    cpu_has(c, X86_FEATURE_SGX_LC) &&
-                    IS_ENABLED(CONFIG_X86_SGX);
+       enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+                    IS_ENABLED(CONFIG_KVM_INTEL);
+
+       if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+               /*
+                * Separate out SGX driver enabling from KVM.  This allows KVM
+                * guests to use SGX even if the kernel SGX driver refuses to
+                * use it.  This happens if flexible Launch Control is not
+                * available.
+                */
+               enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+               enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+       }
 
        if (msr & FEAT_CTL_LOCKED)
                goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
         * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
         * for the kernel, e.g. using VMX to hide malicious code.
         */
-       if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+       if (enable_vmx) {
                msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
                if (tboot)
                        msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
        }
 
-       if (enable_sgx)
-               msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+       if (enable_sgx_kvm || enable_sgx_driver) {
+               msr |= FEAT_CTL_SGX_ENABLED;
+               if (enable_sgx_driver)
+                       msr |= FEAT_CTL_SGX_LC_ENABLED;
+       }
 
        wrmsrl(MSR_IA32_FEAT_CTL, msr);
 
@@ -173,10 +177,29 @@ update_caps:
        }
 
 update_sgx:
-       if (!(msr & FEAT_CTL_SGX_ENABLED) ||
-           !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
-               if (enable_sgx)
-                       pr_err_once("SGX disabled by BIOS\n");
-               clear_sgx_caps();
+       if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+               if (enable_sgx_kvm || enable_sgx_driver)
+                       pr_err_once("SGX disabled by BIOS.\n");
+               clear_cpu_cap(c, X86_FEATURE_SGX);
+               return;
+       }
+
+       /*
+        * VMX feature bit may be cleared due to being disabled in BIOS,
+        * in which case SGX virtualization cannot be supported either.
+        */
+       if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+               pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+               enable_sgx_kvm = 0;
+       }
+
+       if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+               if (!enable_sgx_kvm) {
+                       pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+                       clear_cpu_cap(c, X86_FEATURE_SGX);
+               } else {
+                       pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+                       clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+               }
        }
 }
index 972ec3b..21d1f06 100644 (file)
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
        { X86_FEATURE_CDP_L2,           CPUID_ECX,  2, 0x00000010, 2 },
        { X86_FEATURE_MBA,              CPUID_EBX,  3, 0x00000010, 0 },
        { X86_FEATURE_PER_THREAD_MBA,   CPUID_ECX,  0, 0x00000010, 3 },
+       { X86_FEATURE_SGX1,             CPUID_EAX,  0, 0x00000012, 0 },
+       { X86_FEATURE_SGX2,             CPUID_EAX,  1, 0x00000012, 0 },
        { X86_FEATURE_HW_PSTATE,        CPUID_EDX,  7, 0x80000007, 0 },
        { X86_FEATURE_CPB,              CPUID_EDX,  9, 0x80000007, 0 },
        { X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
index 91d3dc7..9c16567 100644 (file)
@@ -3,3 +3,4 @@ obj-y += \
        encl.o \
        ioctl.o \
        main.o
+obj-$(CONFIG_X86_SGX_KVM)      += virt.o
index 8ce6d83..aa9b8b8 100644 (file)
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
        .get_unmapped_area      = sgx_get_unmapped_area,
 };
 
-const struct file_operations sgx_provision_fops = {
-       .owner                  = THIS_MODULE,
-};
-
 static struct miscdevice sgx_dev_enclave = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
        .fops = &sgx_encl_fops,
 };
 
-static struct miscdevice sgx_dev_provision = {
-       .minor = MISC_DYNAMIC_MINOR,
-       .name = "sgx_provision",
-       .nodename = "sgx_provision",
-       .fops = &sgx_provision_fops,
-};
-
 int __init sgx_drv_init(void)
 {
        unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
        if (ret)
                return ret;
 
-       ret = misc_register(&sgx_dev_provision);
-       if (ret) {
-               misc_deregister(&sgx_dev_enclave);
-               return ret;
-       }
-
        return 0;
 }
index 7449ef3..3be2032 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/suspend.h>
 #include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
 #include "encl.h"
 #include "encls.h"
 #include "sgx.h"
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
 
        ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
        if (ret) {
-               sgx_free_epc_page(epc_page);
+               sgx_encl_free_epc_page(epc_page);
                return ERR_PTR(ret);
        }
 
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
                        if (sgx_unmark_page_reclaimable(entry->epc_page))
                                continue;
 
-                       sgx_free_epc_page(entry->epc_page);
+                       sgx_encl_free_epc_page(entry->epc_page);
                        encl->secs_child_cnt--;
                        entry->epc_page = NULL;
                }
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
        xa_destroy(&encl->page_array);
 
        if (!encl->secs_child_cnt && encl->secs.epc_page) {
-               sgx_free_epc_page(encl->secs.epc_page);
+               sgx_encl_free_epc_page(encl->secs.epc_page);
                encl->secs.epc_page = NULL;
        }
 
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
                va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
                                           list);
                list_del(&va_page->list);
-               sgx_free_epc_page(va_page->epc_page);
+               sgx_encl_free_epc_page(va_page->epc_page);
                kfree(va_page);
        }
 
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
        ret = __epa(sgx_get_epc_virt_addr(epc_page));
        if (ret) {
                WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
-               sgx_free_epc_page(epc_page);
+               sgx_encl_free_epc_page(epc_page);
                return ERR_PTR(-EFAULT);
        }
 
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
 
        return slot == SGX_VA_SLOT_COUNT;
 }
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page:      EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list.  Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+       int ret;
+
+       WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+       ret = __eremove(sgx_get_epc_virt_addr(page));
+       if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+               return;
+
+       sgx_free_epc_page(page);
+}
index d8d30cc..6e74f85 100644 (file)
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
 unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
 void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
 bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
 
 #endif /* _X86_ENCL_H */
index 443188f..9b20484 100644 (file)
 #include <asm/traps.h>
 #include "sgx.h"
 
-enum sgx_encls_function {
-       ECREATE = 0x00,
-       EADD    = 0x01,
-       EINIT   = 0x02,
-       EREMOVE = 0x03,
-       EDGBRD  = 0x04,
-       EDGBWR  = 0x05,
-       EEXTEND = 0x06,
-       ELDU    = 0x08,
-       EBLOCK  = 0x09,
-       EPA     = 0x0A,
-       EWB     = 0x0B,
-       ETRACK  = 0x0C,
-};
-
 /**
  * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
  *
@@ -55,6 +40,19 @@ enum sgx_encls_function {
        } while (0);                                                      \
 }
 
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret:       the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true:     ENCLS leaf faulted.
+ * - false:    Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+       return ret & ENCLS_FAULT_FLAG;
+}
+
 /**
  * encls_failed() - Check if an ENCLS function failed
  * @ret:       the return value of an ENCLS function call
@@ -65,7 +63,7 @@ enum sgx_encls_function {
  */
 static inline bool encls_failed(int ret)
 {
-       if (ret & ENCLS_FAULT_FLAG)
+       if (encls_faulted(ret))
                return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
 
        return !!ret;
index 90a5caf..83df20e 100644 (file)
@@ -2,6 +2,7 @@
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
 #include <asm/mman.h>
+#include <asm/sgx.h>
 #include <linux/mman.h>
 #include <linux/delay.h>
 #include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
        encl->page_cnt--;
 
        if (va_page) {
-               sgx_free_epc_page(va_page->epc_page);
+               sgx_encl_free_epc_page(va_page->epc_page);
                list_del(&va_page->list);
                kfree(va_page);
        }
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
        return 0;
 
 err_out:
-       sgx_free_epc_page(encl->secs.epc_page);
+       sgx_encl_free_epc_page(encl->secs.epc_page);
        encl->secs.epc_page = NULL;
 
 err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
        mmap_read_unlock(current->mm);
 
 err_out_free:
-       sgx_free_epc_page(epc_page);
+       sgx_encl_free_epc_page(epc_page);
        kfree(encl_page);
 
        return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
                         void *token)
 {
        u64 mrsigner[4];
-       int i, j, k;
+       int i, j;
        void *addr;
        int ret;
 
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
 
                        preempt_disable();
 
-                       for (k = 0; k < 4; k++)
-                               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+                       sgx_update_lepubkeyhash(mrsigner);
 
                        ret = __einit(sigstruct, token, addr);
 
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
                }
        }
 
-       if (ret & ENCLS_FAULT_FLAG) {
+       if (encls_faulted(ret)) {
                if (encls_failed(ret))
                        ENCLS_WARN(ret, "EINIT");
 
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
 {
        struct sgx_sigstruct *sigstruct;
        struct sgx_enclave_init init_arg;
-       struct page *initp_page;
        void *token;
        int ret;
 
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
        if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
                return -EFAULT;
 
-       initp_page = alloc_page(GFP_KERNEL);
-       if (!initp_page)
+       /*
+        * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+        * boundary.  kmalloc() will give this alignment when allocating
+        * PAGE_SIZE bytes.
+        */
+       sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!sigstruct)
                return -ENOMEM;
 
-       sigstruct = kmap(initp_page);
        token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
        memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
 
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
        ret = sgx_encl_init(encl, sigstruct, token);
 
 out:
-       kunmap(initp_page);
-       __free_page(initp_page);
+       kfree(sigstruct);
        return ret;
 }
 
@@ -665,24 +667,11 @@ out:
 static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
 {
        struct sgx_enclave_provision params;
-       struct file *file;
 
        if (copy_from_user(&params, arg, sizeof(params)))
                return -EFAULT;
 
-       file = fget(params.fd);
-       if (!file)
-               return -EINVAL;
-
-       if (file->f_op != &sgx_provision_fops) {
-               fput(file);
-               return -EINVAL;
-       }
-
-       encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
-       fput(file);
-       return 0;
+       return sgx_set_attribute(&encl->attributes_mask, params.fd);
 }
 
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
index 8df81a3..ad90474 100644 (file)
@@ -1,14 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
+#include <linux/file.h>
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/miscdevice.h>
 #include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include <asm/sgx.h>
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
@@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
  * with sgx_reclaimer_lock acquired.
  */
 static LIST_HEAD(sgx_active_page_list);
-
 static DEFINE_SPINLOCK(sgx_reclaimer_lock);
 
+/* The free page list lock protected variables prepend the lock. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node.  Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
 /*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
  */
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
 {
        struct sgx_epc_page *page;
        LIST_HEAD(dirty);
        int ret;
 
-       /* init_laundry_list is thread-local, no need for a lock: */
-       while (!list_empty(&section->init_laundry_list)) {
+       /* dirty_page_list is thread-local, no need for a lock: */
+       while (!list_empty(dirty_page_list)) {
                if (kthread_should_stop())
                        return;
 
-               /* needed for access to ->page_list: */
-               spin_lock(&section->lock);
-
-               page = list_first_entry(&section->init_laundry_list,
-                                       struct sgx_epc_page, list);
+               page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
 
                ret = __eremove(sgx_get_epc_virt_addr(page));
-               if (!ret)
-                       list_move(&page->list, &section->page_list);
-               else
+               if (!ret) {
+                       /*
+                        * page is now sanitized.  Make it available via the SGX
+                        * page allocator:
+                        */
+                       list_del(&page->list);
+                       sgx_free_epc_page(page);
+               } else {
+                       /* The page is not yet clean - move to the dirty list. */
                        list_move_tail(&page->list, &dirty);
-
-               spin_unlock(&section->lock);
+               }
 
                cond_resched();
        }
 
-       list_splice(&dirty, &section->init_laundry_list);
+       list_splice(&dirty, dirty_page_list);
 }
 
 static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 
                sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
 
-               sgx_free_epc_page(encl->secs.epc_page);
+               sgx_encl_free_epc_page(encl->secs.epc_page);
                encl->secs.epc_page = NULL;
 
                sgx_encl_put_backing(&secs_backing, true);
@@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void)
        struct sgx_epc_section *section;
        struct sgx_encl_page *encl_page;
        struct sgx_epc_page *epc_page;
+       struct sgx_numa_node *node;
        pgoff_t page_index;
        int cnt = 0;
        int ret;
@@ -379,50 +399,33 @@ skip:
                epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
 
                section = &sgx_epc_sections[epc_page->section];
-               spin_lock(&section->lock);
-               list_add_tail(&epc_page->list, &section->page_list);
-               section->free_cnt++;
-               spin_unlock(&section->lock);
-       }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
-       unsigned long cnt = 0;
-       int i;
-
-       for (i = 0; i < sgx_nr_epc_sections; i++)
-               cnt += sgx_epc_sections[i].free_cnt;
+               node = section->node;
 
-       return cnt;
+               spin_lock(&node->lock);
+               list_add_tail(&epc_page->list, &node->free_page_list);
+               sgx_nr_free_pages++;
+               spin_unlock(&node->lock);
+       }
 }
 
 static bool sgx_should_reclaim(unsigned long watermark)
 {
-       return sgx_nr_free_pages() < watermark &&
-              !list_empty(&sgx_active_page_list);
+       return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
 }
 
 static int ksgxd(void *p)
 {
-       int i;
-
        set_freezable();
 
        /*
         * Sanitize pages in order to recover from kexec(). The 2nd pass is
         * required for SECS pages, whose child pages blocked EREMOVE.
         */
-       for (i = 0; i < sgx_nr_epc_sections; i++)
-               sgx_sanitize_section(&sgx_epc_sections[i]);
-
-       for (i = 0; i < sgx_nr_epc_sections; i++) {
-               sgx_sanitize_section(&sgx_epc_sections[i]);
+       __sgx_sanitize_pages(&sgx_dirty_page_list);
+       __sgx_sanitize_pages(&sgx_dirty_page_list);
 
-               /* Should never happen. */
-               if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
-                       WARN(1, "EPC section %d has unsanitized pages.\n", i);
-       }
+       /* sanity check: */
+       WARN_ON(!list_empty(&sgx_dirty_page_list));
 
        while (!kthread_should_stop()) {
                if (try_to_freeze())
@@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void)
        return true;
 }
 
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
 {
-       struct sgx_epc_page *page;
+       struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+       struct sgx_epc_page *page = NULL;
 
-       spin_lock(&section->lock);
+       spin_lock(&node->lock);
 
-       if (list_empty(&section->page_list)) {
-               spin_unlock(&section->lock);
+       if (list_empty(&node->free_page_list)) {
+               spin_unlock(&node->lock);
                return NULL;
        }
 
-       page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+       page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
        list_del_init(&page->list);
-       section->free_cnt--;
+       sgx_nr_free_pages--;
+
+       spin_unlock(&node->lock);
 
-       spin_unlock(&section->lock);
        return page;
 }
 
 /**
  * __sgx_alloc_epc_page() - Allocate an EPC page
  *
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
+ * from the NUMA node where the caller is executing.
  *
  * Return:
- *   an EPC page,
- *   -errno on error
+ * - an EPC page:      A free EPC page was available.
+ * - NULL:             Out of EPC pages.
  */
 struct sgx_epc_page *__sgx_alloc_epc_page(void)
 {
-       struct sgx_epc_section *section;
        struct sgx_epc_page *page;
-       int i;
+       int nid_of_current = numa_node_id();
+       int nid = nid_of_current;
 
-       for (i = 0; i < sgx_nr_epc_sections; i++) {
-               section = &sgx_epc_sections[i];
+       if (node_isset(nid_of_current, sgx_numa_mask)) {
+               page = __sgx_alloc_epc_page_from_node(nid_of_current);
+               if (page)
+                       return page;
+       }
+
+       /* Fall back to the non-local NUMA nodes: */
+       while (true) {
+               nid = next_node_in(nid, sgx_numa_mask);
+               if (nid == nid_of_current)
+                       break;
 
-               page = __sgx_alloc_epc_page_from_section(section);
+               page = __sgx_alloc_epc_page_from_node(nid);
                if (page)
                        return page;
        }
@@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
  * sgx_free_epc_page() - Free an EPC page
  * @page:      an EPC page
  *
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
  */
 void sgx_free_epc_page(struct sgx_epc_page *page)
 {
        struct sgx_epc_section *section = &sgx_epc_sections[page->section];
-       int ret;
+       struct sgx_numa_node *node = section->node;
 
-       WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+       spin_lock(&node->lock);
 
-       ret = __eremove(sgx_get_epc_virt_addr(page));
-       if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
-               return;
+       list_add_tail(&page->list, &node->free_page_list);
+       sgx_nr_free_pages++;
 
-       spin_lock(&section->lock);
-       list_add_tail(&page->list, &section->page_list);
-       section->free_cnt++;
-       spin_unlock(&section->lock);
+       spin_unlock(&node->lock);
 }
 
 static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
        }
 
        section->phys_addr = phys_addr;
-       spin_lock_init(&section->lock);
-       INIT_LIST_HEAD(&section->page_list);
-       INIT_LIST_HEAD(&section->init_laundry_list);
 
        for (i = 0; i < nr_pages; i++) {
                section->pages[i].section = index;
                section->pages[i].flags = 0;
                section->pages[i].owner = NULL;
-               list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+               list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
        }
 
-       section->free_cnt = nr_pages;
        return true;
 }
 
@@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void)
 {
        u32 eax, ebx, ecx, edx, type;
        u64 pa, size;
+       int nid;
        int i;
 
+       sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+       if (!sgx_numa_nodes)
+               return false;
+
        for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
                cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
 
@@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void)
                        break;
                }
 
+               nid = numa_map_to_online_node(phys_to_target_node(pa));
+               if (nid == NUMA_NO_NODE) {
+                       /* The physical address is already printed above. */
+                       pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+                       nid = 0;
+               }
+
+               if (!node_isset(nid, sgx_numa_mask)) {
+                       spin_lock_init(&sgx_numa_nodes[nid].lock);
+                       INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+                       node_set(nid, sgx_numa_mask);
+               }
+
+               sgx_epc_sections[i].node =  &sgx_numa_nodes[nid];
+
                sgx_nr_epc_sections++;
        }
 
@@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void)
        return true;
 }
 
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
+ * The bare-metal driver must set them to the hash of the enclave's signer
+ * before EINIT. KVM must set them to the guest's virtual MSR values
+ * before doing EINIT on behalf of the guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+       int i;
+
+       WARN_ON_ONCE(preemptible());
+
+       for (i = 0; i < 4; i++)
+               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+       .owner                  = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "sgx_provision",
+       .nodename = "sgx_provision",
+       .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes:                Pointer to allowed enclave attributes
+ * @attribute_fd:              File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0:         SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL:    Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+                     unsigned int attribute_fd)
+{
+       struct file *file;
+
+       file = fget(attribute_fd);
+       if (!file)
+               return -EINVAL;
+
+       if (file->f_op != &sgx_provision_fops) {
+               fput(file);
+               return -EINVAL;
+       }
+
+       *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+       fput(file);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
 static int __init sgx_init(void)
 {
        int ret;
@@ -716,12 +806,28 @@ static int __init sgx_init(void)
                goto err_page_cache;
        }
 
-       ret = sgx_drv_init();
+       ret = misc_register(&sgx_dev_provision);
        if (ret)
                goto err_kthread;
 
+       /*
+        * Always try to initialize the native *and* KVM drivers.
+        * The KVM driver is less picky than the native one and
+        * can function if the native one is not supported on the
+        * current system or fails to initialize.
+        *
+        * Error out only if both fail to initialize.
+        */
+       ret = sgx_drv_init();
+
+       if (sgx_vepc_init() && ret)
+               goto err_provision;
+
        return 0;
 
+err_provision:
+       misc_deregister(&sgx_dev_provision);
+
 err_kthread:
        kthread_stop(ksgxd_tsk);
 
index 5fa42d1..4628ace 100644 (file)
@@ -8,11 +8,15 @@
 #include <linux/rwsem.h>
 #include <linux/types.h>
 #include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
 
 #undef pr_fmt
 #define pr_fmt(fmt) "sgx: " fmt
 
+#define EREMOVE_ERROR_MESSAGE \
+       "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+       "Refer to Documentation/x86/sgx.rst for more information."
+
 #define SGX_MAX_EPC_SECTIONS           8
 #define SGX_EEXTEND_BLOCK_SIZE         256
 #define SGX_NR_TO_SCAN                 16
@@ -30,28 +34,25 @@ struct sgx_epc_page {
 };
 
 /*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+       struct list_head free_page_list;
+       spinlock_t lock;
+};
+
+/*
  * The firmware can define multiple chunks of EPC to the different areas of the
  * physical memory e.g. for memory areas of the each node. This structure is
  * used to store EPC pages for one EPC section and virtual memory area where
  * the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
  */
 struct sgx_epc_section {
        unsigned long phys_addr;
        void *virt_addr;
        struct sgx_epc_page *pages;
-
-       spinlock_t lock;
-       struct list_head page_list;
-       unsigned long free_cnt;
-
-       /*
-        * Pages which need EREMOVE run on them before they can be
-        * used.  Only safe to be accessed in ksgxd and init code.
-        * Not protected by locks.
-        */
-       struct list_head init_laundry_list;
+       struct sgx_numa_node *node;
 };
 
 extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
 int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
 
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+       return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
 #endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644 (file)
index 0000000..6ad165a
--- /dev/null
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+       struct xarray page_array;
+       struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having children in
+ * other virtual EPC instances, and the lock that protects the list.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+                           struct vm_area_struct *vma, unsigned long addr)
+{
+       struct sgx_epc_page *epc_page;
+       unsigned long index, pfn;
+       int ret;
+
+       WARN_ON(!mutex_is_locked(&vepc->lock));
+
+       /* Calculate index of EPC page in virtual EPC's page_array */
+       index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+       epc_page = xa_load(&vepc->page_array, index);
+       if (epc_page)
+               return 0;
+
+       epc_page = sgx_alloc_epc_page(vepc, false);
+       if (IS_ERR(epc_page))
+               return PTR_ERR(epc_page);
+
+       ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+       if (ret)
+               goto err_free;
+
+       pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+       ret = vmf_insert_pfn(vma, addr, pfn);
+       if (ret != VM_FAULT_NOPAGE) {
+               ret = -EFAULT;
+               goto err_delete;
+       }
+
+       return 0;
+
+err_delete:
+       xa_erase(&vepc->page_array, index);
+err_free:
+       sgx_free_epc_page(epc_page);
+       return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct sgx_vepc *vepc = vma->vm_private_data;
+       int ret;
+
+       mutex_lock(&vepc->lock);
+       ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+       mutex_unlock(&vepc->lock);
+
+       if (!ret)
+               return VM_FAULT_NOPAGE;
+
+       if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+               mmap_read_unlock(vma->vm_mm);
+               return VM_FAULT_RETRY;
+       }
+
+       return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+       .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct sgx_vepc *vepc = file->private_data;
+
+       if (!(vma->vm_flags & VM_SHARED))
+               return -EINVAL;
+
+       vma->vm_ops = &sgx_vepc_vm_ops;
+       /* Don't copy VMA in fork() */
+       vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+       vma->vm_private_data = vepc;
+
+       return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+       int ret;
+
+       /*
+        * Take a previously guest-owned EPC page and return it to the
+        * general EPC page pool.
+        *
+        * Guests can not be trusted to have left this page in a good
+        * state, so run EREMOVE on the page unconditionally.  In the
+        * case that a guest properly EREMOVE'd this page, a superfluous
+        * EREMOVE is harmless.
+        */
+       ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+       if (ret) {
+               /*
+                * Only SGX_CHILD_PRESENT is expected, which results from
+                * EREMOVE'ing an SECS that still has children. That case is
+                * handled by EREMOVE'ing the SECS again after all pages in the
+                * virtual EPC have been EREMOVE'd. See the comments in
+                * sgx_vepc_release() below.
+                *
+                * The user of virtual EPC (KVM) needs to guarantee there's no
+                * logical processor still running in the enclave in the guest,
+                * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+                * handled here.
+                */
+               WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+                         ret, ret);
+               return ret;
+       }
+
+       sgx_free_epc_page(epc_page);
+
+       return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+       struct sgx_vepc *vepc = file->private_data;
+       struct sgx_epc_page *epc_page, *tmp, *entry;
+       unsigned long index;
+
+       LIST_HEAD(secs_pages);
+
+       xa_for_each(&vepc->page_array, index, entry) {
+               /*
+                * Remove all normal, child pages.  sgx_vepc_free_page()
+                * will fail if EREMOVE fails, but this is OK and expected on
+                * SECS pages.  Those can only be EREMOVE'd *after* all their
+                * child pages. Retries below will clean them up.
+                */
+               if (sgx_vepc_free_page(entry))
+                       continue;
+
+               xa_erase(&vepc->page_array, index);
+       }
+
+       /*
+        * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
+        * only had children in this 'epc' area.
+        */
+       xa_for_each(&vepc->page_array, index, entry) {
+               epc_page = entry;
+               /*
+                * An EREMOVE failure here means that the SECS page still
+                * has children.  But, since all children in this 'sgx_vepc'
+                * have been removed, the SECS page must have a child on
+                * another instance.
+                */
+               if (sgx_vepc_free_page(epc_page))
+                       list_add_tail(&epc_page->list, &secs_pages);
+
+               xa_erase(&vepc->page_array, index);
+       }
+
+       /*
+        * SECS pages are "pinned" by child pages, and "unpinned" once all
+        * children have been EREMOVE'd.  A child page in this instance
+        * may have pinned an SECS page encountered in an earlier release(),
+        * creating a zombie.  Since some children were EREMOVE'd above,
+        * try to EREMOVE all zombies in the hopes that one was unpinned.
+        */
+       mutex_lock(&zombie_secs_pages_lock);
+       list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+               /*
+                * Speculatively remove the page from the list of zombies,
+                * if the page is successfully EREMOVE'd it will be added to
+                * the list of free pages.  If EREMOVE fails, throw the page
+                * on the local list, which will be spliced on at the end.
+                */
+               list_del(&epc_page->list);
+
+               if (sgx_vepc_free_page(epc_page))
+                       list_add_tail(&epc_page->list, &secs_pages);
+       }
+
+       if (!list_empty(&secs_pages))
+               list_splice_tail(&secs_pages, &zombie_secs_pages);
+       mutex_unlock(&zombie_secs_pages_lock);
+
+       kfree(vepc);
+
+       return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+       struct sgx_vepc *vepc;
+
+       vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+       if (!vepc)
+               return -ENOMEM;
+       mutex_init(&vepc->lock);
+       xa_init(&vepc->page_array);
+
+       file->private_data = vepc;
+
+       return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = sgx_vepc_open,
+       .release        = sgx_vepc_release,
+       .mmap           = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+       .minor          = MISC_DYNAMIC_MINOR,
+       .name           = "sgx_vepc",
+       .nodename       = "sgx_vepc",
+       .fops           = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+       /* SGX virtualization requires KVM to work */
+       if (!cpu_feature_enabled(X86_FEATURE_VMX))
+               return -ENODEV;
+
+       INIT_LIST_HEAD(&zombie_secs_pages);
+       mutex_init(&zombie_secs_pages_lock);
+
+       return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo:  Pointer to PAGEINFO structure
+ * @secs:      Userspace pointer to SECS page
+ * @trapnr:    trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * -  0:       ECREATE was successful.
+ * - <0:       on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+                    int *trapnr)
+{
+       int ret;
+
+       /*
+        * @secs is an untrusted, userspace-provided address.  It comes from
+        * KVM and is assumed to be a valid pointer which points somewhere in
+        * userspace.  This can fault and call SGX or other fault handlers when
+        * userspace mapping @secs doesn't exist.
+        *
+        * Add a WARN() to make sure @secs is already a valid userspace pointer
+        * from the caller (KVM), who should already have handled the invalid pointer
+        * case (for instance, made by malicious guest).  All other checks,
+        * such as alignment of @secs, are deferred to ENCLS itself.
+        */
+       if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+               return -EINVAL;
+
+       __uaccess_begin();
+       ret = __ecreate(pageinfo, (void *)secs);
+       __uaccess_end();
+
+       if (encls_faulted(ret)) {
+               *trapnr = ENCLS_TRAPNR(ret);
+               return -EFAULT;
+       }
+
+       /* ECREATE doesn't return an error code, it faults or succeeds. */
+       WARN_ON_ONCE(ret);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+                           void __user *secs)
+{
+       int ret;
+
+       /*
+        * Make sure all userspace pointers from caller (KVM) are valid.
+        * All other checks deferred to ENCLS itself.  Also see comment
+        * for @secs in sgx_virt_ecreate().
+        */
+#define SGX_EINITTOKEN_SIZE    304
+       if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+                        !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+                        !access_ok(secs, PAGE_SIZE)))
+               return -EINVAL;
+
+       __uaccess_begin();
+       ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+       __uaccess_end();
+
+       return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct:         Userspace pointer to SIGSTRUCT structure
+ * @token:             Userspace pointer to EINITTOKEN structure
+ * @secs:              Userspace pointer to SECS page
+ * @lepubkeyhash:      Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr:            trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at will, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * -  0:       EINIT was successful.
+ * - <0:       on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+                  void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+       int ret;
+
+       if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+               ret = __sgx_virt_einit(sigstruct, token, secs);
+       } else {
+               preempt_disable();
+
+               sgx_update_lepubkeyhash(lepubkeyhash);
+
+               ret = __sgx_virt_einit(sigstruct, token, secs);
+               preempt_enable();
+       }
+
+       /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+       if (ret == -EINVAL)
+               return ret;
+
+       if (encls_faulted(ret)) {
+               *trapnr = ENCLS_TRAPNR(ret);
+               return -EFAULT;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
index 373e5fa..51c7f52 100644 (file)
@@ -12,7 +12,7 @@
 
 #include "common.h"
 
-/* Ftrace callback handler for kprobes -- called under preepmt disabed */
+/* Ftrace callback handler for kprobes -- called under preempt disabled */
 void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
                           struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
index 5e78e01..bd01a61 100644 (file)
@@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void)
        }
 }
 
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
 static bool pv_tlb_flush_supported(void)
 {
        return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
@@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void)
                kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
-
-#ifdef CONFIG_SMP
-
 static bool pv_ipi_supported(void)
 {
        return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
@@ -574,6 +574,49 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
        }
 }
 
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+                       const struct flush_tlb_info *info)
+{
+       u8 state;
+       int cpu;
+       struct kvm_steal_time *src;
+       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+       cpumask_copy(flushmask, cpumask);
+       /*
+        * We have to call flush only on online vCPUs. And
+        * queue flush_on_enter for pre-empted vCPUs
+        */
+       for_each_cpu(cpu, flushmask) {
+               src = &per_cpu(steal_time, cpu);
+               state = READ_ONCE(src->preempted);
+               if ((state & KVM_VCPU_PREEMPTED)) {
+                       if (try_cmpxchg(&src->preempted, &state,
+                                       state | KVM_VCPU_FLUSH_TLB))
+                               __cpumask_clear_cpu(cpu, flushmask);
+               }
+       }
+
+       native_flush_tlb_others(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+       int cpu;
+
+       if (!kvm_para_available() || nopv)
+               return 0;
+
+       if (pv_tlb_flush_supported() || pv_ipi_supported())
+               for_each_possible_cpu(cpu) {
+                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+                               GFP_KERNEL, cpu_to_node(cpu));
+               }
+
+       return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
        /*
@@ -611,33 +654,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
        local_irq_enable();
        return 0;
 }
-#endif
 
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
-                       const struct flush_tlb_info *info)
-{
-       u8 state;
-       int cpu;
-       struct kvm_steal_time *src;
-       struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
-
-       cpumask_copy(flushmask, cpumask);
-       /*
-        * We have to call flush only on online vCPUs. And
-        * queue flush_on_enter for pre-empted vCPUs
-        */
-       for_each_cpu(cpu, flushmask) {
-               src = &per_cpu(steal_time, cpu);
-               state = READ_ONCE(src->preempted);
-               if ((state & KVM_VCPU_PREEMPTED)) {
-                       if (try_cmpxchg(&src->preempted, &state,
-                                       state | KVM_VCPU_FLUSH_TLB))
-                               __cpumask_clear_cpu(cpu, flushmask);
-               }
-       }
-
-       native_flush_tlb_others(flushmask, info);
-}
+#endif
 
 static void __init kvm_guest_init(void)
 {
@@ -653,12 +671,6 @@ static void __init kvm_guest_init(void)
                pv_ops.time.steal_clock = kvm_steal_clock;
        }
 
-       if (pv_tlb_flush_supported()) {
-               pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
-               pv_ops.mmu.tlb_remove_table = tlb_remove_table;
-               pr_info("KVM setup pv remote TLB flush\n");
-       }
-
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
@@ -668,6 +680,12 @@ static void __init kvm_guest_init(void)
        }
 
 #ifdef CONFIG_SMP
+       if (pv_tlb_flush_supported()) {
+               pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+               pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+               pr_info("KVM setup pv remote TLB flush\n");
+       }
+
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        if (pv_sched_yield_supported()) {
                smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
@@ -734,7 +752,7 @@ static uint32_t __init kvm_detect(void)
 
 static void __init kvm_apic_init(void)
 {
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
        if (pv_ipi_supported())
                kvm_setup_pv_ipi();
 #endif
@@ -794,32 +812,6 @@ static __init int activate_jump_labels(void)
 }
 arch_initcall(activate_jump_labels);
 
-static __init int kvm_alloc_cpumask(void)
-{
-       int cpu;
-       bool alloc = false;
-
-       if (!kvm_para_available() || nopv)
-               return 0;
-
-       if (pv_tlb_flush_supported())
-               alloc = true;
-
-#if defined(CONFIG_SMP)
-       if (pv_ipi_supported())
-               alloc = true;
-#endif
-
-       if (alloc)
-               for_each_possible_cpu(cpu) {
-                       zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
-                               GFP_KERNEL, cpu_to_node(cpu));
-               }
-
-       return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
@@ -836,28 +828,25 @@ static void kvm_kick_cpu(int cpu)
 
 static void kvm_wait(u8 *ptr, u8 val)
 {
-       unsigned long flags;
-
        if (in_nmi())
                return;
 
-       local_irq_save(flags);
-
-       if (READ_ONCE(*ptr) != val)
-               goto out;
-
        /*
         * halt until it's our turn and kicked. Note that we do safe halt
         * for irq enabled case to avoid hang when lock info is overwritten
         * in irq spinlock slowpath and no spurious interrupt occur to save us.
         */
-       if (arch_irqs_disabled_flags(flags))
-               halt();
-       else
-               safe_halt();
+       if (irqs_disabled()) {
+               if (READ_ONCE(*ptr) == val)
+                       halt();
+       } else {
+               local_irq_disable();
 
-out:
-       local_irq_restore(flags);
+               if (READ_ONCE(*ptr) == val)
+                       safe_halt();
+
+               local_irq_enable();
+       }
 }
 
 #ifdef CONFIG_X86_32
index d883176..5ecd69a 100644 (file)
@@ -1045,6 +1045,9 @@ void __init setup_arch(char **cmdline_p)
 
        cleanup_highmap();
 
+       /* Look for ACPI tables and reserve memory occupied by them. */
+       acpi_boot_table_init();
+
        memblock_set_current_limit(ISA_END_ADDRESS);
        e820__memblock_setup();
 
@@ -1136,11 +1139,6 @@ void __init setup_arch(char **cmdline_p)
 
        early_platform_quirks();
 
-       /*
-        * Parse the ACPI tables for possible boot-time SMP configuration.
-        */
-       acpi_boot_table_init();
-
        early_acpi_boot_init();
 
        initmem_init();
index ea794a0..f306e85 100644 (file)
@@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 
 static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
 {
-       /*
-        * This function is fundamentally broken as currently
-        * implemented.
-        *
-        * The idea is that we want to trigger a call to the
-        * restart_block() syscall and that we want in_ia32_syscall(),
-        * in_x32_syscall(), etc. to match whatever they were in the
-        * syscall being restarted.  We assume that the syscall
-        * instruction at (regs->ip - 2) matches whatever syscall
-        * instruction we used to enter in the first place.
-        *
-        * The problem is that we can get here when ptrace pokes
-        * syscall-like values into regs even if we're not in a syscall
-        * at all.
-        *
-        * For now, we maintain historical behavior and guess based on
-        * stored state.  We could do better by saving the actual
-        * syscall arch in restart_block or (with caveats on x32) by
-        * checking if regs->ip points to 'int $0x80'.  The current
-        * behavior is incorrect if a tracer has a different bitness
-        * than the tracee.
-        */
 #ifdef CONFIG_IA32_EMULATION
-       if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+       if (current->restart_block.arch_data & TS_COMPAT)
                return __NR_ia32_restart_syscall;
 #endif
 #ifdef CONFIG_X86_X32_ABI
index 02813a7..f877150 100644 (file)
@@ -1659,7 +1659,7 @@ void play_dead_common(void)
        local_irq_disable();
 }
 
-static bool wakeup_cpu0(void)
+bool wakeup_cpu0(void)
 {
        if (smp_processor_id() == 0 && enable_start_cpu0)
                return true;
index a788d51..f6b93a3 100644 (file)
@@ -84,6 +84,18 @@ config KVM_INTEL
          To compile this as a module, choose M here: the module
          will be called kvm-intel.
 
+config X86_SGX_KVM
+       bool "Software Guard eXtensions (SGX) Virtualization"
+       depends on X86_SGX && KVM_INTEL
+       help
+
+         Enables KVM guests to create SGX enclaves.
+
+         This includes support to expose "raw" unreclaimable enclave memory to
+         guests via a device node, e.g. /dev/sgx_vepc.
+
+         If unsure, say N.
+
 config KVM_AMD
        tristate "KVM for AMD processors support"
        depends on KVM
index 1b4766f..c589db5 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-ccflags-y += -Iarch/x86/kvm
+ccflags-y += -I $(srctree)/arch/x86/kvm
 ccflags-$(CONFIG_KVM_WERROR) += -Werror
 
 ifeq ($(CONFIG_FRAME_POINTER),y)
@@ -23,6 +23,8 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
 
 kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
                           vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM)        += vmx/sgx.o
+
 kvm-amd-y              += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)      += kvm.o
index 6bd2f8b..2ae0615 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -28,7 +29,7 @@
  * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
  * aligned to sizeof(unsigned long) because it's not accessed via bitops.
  */
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 }
 
 #define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
 
 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
        struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -170,6 +172,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                vcpu->arch.guest_supported_xcr0 =
                        (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
 
+       /*
+        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+        * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+        * '1' even on CPUs that don't support XSAVE.
+        */
+       best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+       if (best) {
+               best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+               best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+               best->ecx |= XFEATURE_MASK_FPSSE;
+       }
+
        kvm_update_pv_runtime(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -347,13 +364,13 @@ out:
        return r;
 }
 
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(enum cpuid_leafs leaf)
 {
        const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
        struct kvm_cpuid_entry2 entry;
 
        reverse_cpuid_check(leaf);
-       kvm_cpu_caps[leaf] &= mask;
 
        cpuid_count(cpuid.function, cpuid.index,
                    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -361,6 +378,26 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
        kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
 }
 
+static __always_inline void kvm_cpu_cap_init_scattered(enum cpuid_leafs leaf, u32 mask)
+{
+       /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+       BUILD_BUG_ON(leaf < NCAPINTS);
+
+       kvm_cpu_caps[leaf] = mask;
+
+       __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+       /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+       BUILD_BUG_ON(leaf >= NCAPINTS);
+
+       kvm_cpu_caps[leaf] &= mask;
+
+       __kvm_cpu_cap_mask(leaf);
+}
+
 void kvm_set_cpu_caps(void)
 {
        unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -371,12 +408,13 @@ void kvm_set_cpu_caps(void)
        unsigned int f_gbpages = 0;
        unsigned int f_lm = 0;
 #endif
+       memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
 
-       BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+       BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
                     sizeof(boot_cpu_data.x86_capability));
 
        memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
-              sizeof(kvm_cpu_caps));
+              sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
 
        kvm_cpu_cap_mask(CPUID_1_ECX,
                /*
@@ -407,7 +445,7 @@ void kvm_set_cpu_caps(void)
        );
 
        kvm_cpu_cap_mask(CPUID_7_0_EBX,
-               F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+               F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
                F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
                F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
                F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -418,7 +456,8 @@ void kvm_set_cpu_caps(void)
                F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
                F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
                F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+               F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+               F(SGX_LC)
        );
        /* Set LA57 based on hardware capability. */
        if (cpuid_ecx(7) & F(LA57))
@@ -457,6 +496,10 @@ void kvm_set_cpu_caps(void)
                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
        );
 
+       kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+               SF(SGX1) | SF(SGX2)
+       );
+
        kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
                F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
                F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -778,6 +821,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                        entry->edx = 0;
                }
                break;
+       case 0x12:
+               /* Intel SGX */
+               if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+                       entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                       break;
+               }
+
+               /*
+                * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+                * and max enclave sizes.  The SGX sub-features and MISCSELECT
+                * are restricted by kernel and KVM capabilities (like most
+                * feature flags), while enclave size is unrestricted.
+                */
+               cpuid_entry_override(entry, CPUID_12_EAX);
+               entry->ebx &= SGX_MISC_EXINFO;
+
+               entry = do_host_cpuid(array, function, 1);
+               if (!entry)
+                       goto out;
+
+               /*
+                * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
+                * feature flags.  Advertise all supported flags, including
+                * privileged attributes that require explicit opt-in from
+                * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
+                * expected to derive it from supported XCR0.
+                */
+               entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+                             SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+                             SGX_ATTR_KSS;
+               entry->ebx &= 0;
+               break;
        /* Intel PT */
        case 0x14:
                if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
index 2a0c506..888e88b 100644 (file)
@@ -7,7 +7,25 @@
 #include <asm/processor.h>
 #include <uapi/asm/kvm_para.h>
 
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM.  Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+       CPUID_12_EAX     = NCAPINTS,
+       NR_KVM_CPU_CAPS,
+
+       NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f)          ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1           KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2           KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 void kvm_set_cpu_caps(void);
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -80,6 +98,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
        [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
        [CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
        [CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
+       [CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
 };
 
 /*
@@ -101,6 +120,25 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
 }
 
 /*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+       if (x86_feature == X86_FEATURE_SGX1)
+               return KVM_X86_FEATURE_SGX1;
+       else if (x86_feature == X86_FEATURE_SGX2)
+               return KVM_X86_FEATURE_SGX2;
+
+       return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+       return __feature_translate(x86_feature) / 32;
+}
+
+/*
  * Retrieve the bit mask from an X86_FEATURE_* definition.  Features contain
  * the hardware defined bit number (stored in bits 4:0) and a software defined
  * "word" (stored in bits 31:5).  The word is used to index into arrays of
@@ -108,6 +146,8 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
  */
 static __always_inline u32 __feature_bit(int x86_feature)
 {
+       x86_feature = __feature_translate(x86_feature);
+
        reverse_cpuid_check(x86_feature / 32);
        return 1 << (x86_feature & 31);
 }
@@ -116,7 +156,7 @@ static __always_inline u32 __feature_bit(int x86_feature)
 
 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        return reverse_cpuid[x86_leaf];
@@ -248,6 +288,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
                is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
 }
 
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = kvm_find_cpuid_entry(vcpu, 0, 0);
+       return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
 static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
@@ -308,7 +356,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
 
 static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -316,7 +364,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
 
 static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -324,7 +372,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
 
 static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
 {
-       unsigned int x86_leaf = x86_feature / 32;
+       unsigned int x86_leaf = __feature_leaf(x86_feature);
 
        reverse_cpuid_check(x86_leaf);
        return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
index 58fa8c0..f98370a 100644 (file)
@@ -520,10 +520,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
        u64 tsc;
 
        /*
-        * The guest has not set up the TSC page or the clock isn't
-        * stable, fall back to get_kvmclock_ns.
+        * Fall back to get_kvmclock_ns() when TSC page hasn't been set up,
+        * is broken, disabled or being updated.
         */
-       if (!hv->tsc_ref.tsc_sequence)
+       if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
                return div_u64(get_kvmclock_ns(kvm), 100);
 
        vcpu = kvm_get_vcpu(kvm, 0);
@@ -1077,6 +1077,21 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
        return true;
 }
 
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications and TSC
+ * access emulation and Hyper-V is known to expect the values in TSC page to
+ * stay constant before TSC access emulation is disabled from guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC
+ * frequency and guest visible TSC value across migration (and prevent it when
+ * TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+       return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+               hv->hv_tsc_emulation_control;
+}
+
 void kvm_hv_setup_tsc_page(struct kvm *kvm,
                           struct pvclock_vcpu_time_info *hv_clock)
 {
@@ -1087,7 +1102,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
        BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
        BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
 
-       if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+       if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+           hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
                return;
 
        mutex_lock(&hv->hv_lock);
@@ -1101,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
         */
        if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
                                    &tsc_seq, sizeof(tsc_seq))))
+               goto out_err;
+
+       if (tsc_seq && tsc_page_update_unsafe(hv)) {
+               if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+                       goto out_err;
+
+               hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
                goto out_unlock;
+       }
 
        /*
         * While we're computing and writing the parameters, force the
@@ -1110,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
        hv->tsc_ref.tsc_sequence = 0;
        if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
                            &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
-               goto out_unlock;
+               goto out_err;
 
        if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
-               goto out_unlock;
+               goto out_err;
 
        /* Ensure sequence is zero before writing the rest of the struct.  */
        smp_wmb();
        if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
-               goto out_unlock;
+               goto out_err;
 
        /*
         * Now switch to the TSC page mechanism by writing the sequence.
@@ -1131,8 +1155,45 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
        smp_wmb();
 
        hv->tsc_ref.tsc_sequence = tsc_seq;
-       kvm_write_guest(kvm, gfn_to_gpa(gfn),
-                       &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                           &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+               goto out_err;
+
+       hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+       goto out_unlock;
+
+out_err:
+       hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+out_unlock:
+       mutex_unlock(&hv->hv_lock);
+}
+
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+{
+       struct kvm_hv *hv = to_kvm_hv(kvm);
+       u64 gfn;
+
+       if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+           hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
+           tsc_page_update_unsafe(hv))
+               return;
+
+       mutex_lock(&hv->hv_lock);
+
+       if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+               goto out_unlock;
+
+       /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
+       if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
+               hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+
+       gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+
+       hv->tsc_ref.tsc_sequence = 0;
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                           &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+               hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+
 out_unlock:
        mutex_unlock(&hv->hv_lock);
 }
@@ -1193,8 +1254,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
        }
        case HV_X64_MSR_REFERENCE_TSC:
                hv->hv_tsc_page = data;
-               if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+               if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+                       if (!host)
+                               hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+                       else
+                               hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+               } else {
+                       hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+               }
                break;
        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
                return kvm_hv_msr_set_crash_data(kvm,
@@ -1229,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                hv->hv_tsc_emulation_control = data;
                break;
        case HV_X64_MSR_TSC_EMULATION_STATUS:
+               if (data && !host)
+                       return 1;
+
                hv->hv_tsc_emulation_status = data;
                break;
        case HV_X64_MSR_TIME_REF_COUNT:
index e951af1..60547d5 100644 (file)
@@ -133,6 +133,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
 void kvm_hv_setup_tsc_page(struct kvm *kvm,
                           struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
 
 void kvm_hv_init_vm(struct kvm *kvm);
 void kvm_hv_destroy_vm(struct kvm *kvm);
index cc369b9..0050f39 100644 (file)
@@ -2869,7 +2869,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
                return;
 
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        return;
                /*
index c68bfc3..88d0ed5 100644 (file)
@@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e)
        return ((2ULL << (e - s)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len);
 
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
        if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(root_hpa))
                return;
 
-       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
-                                vcpu->arch.mmu->shadow_root_level);
+       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+                                         vcpu->arch.mmu->shadow_root_level);
 }
 
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * write-protects guest page to sync the guest modification, b) another one is
  * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
  * between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
  * 2) the first case requires flushing tlb immediately avoiding corrupting
  *    shadow page table between all vcpus so it should be in the protection of
  *    mmu-lock. And the another case does not need to flush tlb until returning
@@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  * So, there is the problem: the first case can meet the corrupted tlb caused
  * by another case which write-protects pages but without flush tlb
  * immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set, it works since another case never touches MMU-writable bit.
  *
  * Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
  * readonly, if that happens, we need to flush tlb. Fortunately,
  * mmu_spte_update() has already handled it perfectly.
  *
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
  * - if we want to see if it has writable tlb entry or if the spte can be
- *   writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ *   writable on the mmu mapping, check MMU-writable, this is the most
  *   case, otherwise
  * - if we fix page fault on the spte or do write-protection by dirty logging,
  *   check PT_WRITABLE_MASK.
index d75524b..930ac8a 100644 (file)
@@ -48,6 +48,7 @@
 #include <asm/memtype.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/set_memory.h>
 #include <asm/vmx.h>
 #include <asm/kvm_page_track.h>
 #include "trace.h"
@@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void)
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
                           unsigned int access)
 {
-       u64 mask = make_mmio_spte(vcpu, gfn, access);
+       u64 spte = make_mmio_spte(vcpu, gfn, access);
 
-       trace_mark_mmio_spte(sptep, gfn, mask);
-       mmu_spte_set(sptep, mask);
+       trace_mark_mmio_spte(sptep, gfn, spte);
+       mmu_spte_set(sptep, spte);
 }
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte)
        return spte & shadow_mmio_access_mask;
 }
 
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
-                         kvm_pfn_t pfn, unsigned int access)
-{
-       if (unlikely(is_noslot_pfn(pfn))) {
-               mark_mmio_spte(vcpu, sptep, gfn, access);
-               return true;
-       }
-
-       return false;
-}
-
 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
        u64 kvm_gen, spte_gen, gen;
@@ -725,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
  * handling slots that are not large page aligned.
  */
 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
-                                             struct kvm_memory_slot *slot,
-                                             int level)
+               const struct kvm_memory_slot *slot, int level)
 {
        unsigned long idx;
 
@@ -1118,7 +1107,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
        rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        if (pt_protect)
-               spte &= ~SPTE_MMU_WRITEABLE;
+               spte &= ~shadow_mmu_writable_mask;
        spte = spte & ~PT_WRITABLE_MASK;
 
        return mmu_spte_update(sptep, spte);
@@ -1308,26 +1297,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        return flush;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                          unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                           pte_t unused)
 {
        return kvm_zap_rmapp(kvm, rmap_head, slot);
 }
 
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                            unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                             pte_t pte)
 {
        u64 *sptep;
        struct rmap_iterator iter;
        int need_flush = 0;
        u64 new_spte;
-       pte_t *ptep = (pte_t *)data;
        kvm_pfn_t new_pfn;
 
-       WARN_ON(pte_huge(*ptep));
-       new_pfn = pte_pfn(*ptep);
+       WARN_ON(pte_huge(pte));
+       new_pfn = pte_pfn(pte);
 
 restart:
        for_each_rmap_spte(rmap_head, &iter, sptep) {
@@ -1336,7 +1324,7 @@ restart:
 
                need_flush = 1;
 
-               if (pte_write(*ptep)) {
+               if (pte_write(pte)) {
                        pte_list_remove(rmap_head, sptep);
                        goto restart;
                } else {
@@ -1424,93 +1412,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
 
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
-                    unsigned long start,
-                    unsigned long end,
-                    unsigned long data,
-                    int (*handler)(struct kvm *kvm,
-                                   struct kvm_rmap_head *rmap_head,
-                                   struct kvm_memory_slot *slot,
-                                   gfn_t gfn,
-                                   int level,
-                                   unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+                                                struct kvm_gfn_range *range,
+                                                rmap_handler_t handler)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        struct slot_rmap_walk_iterator iterator;
-       int ret = 0;
-       int i;
-
-       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-               slots = __kvm_memslots(kvm, i);
-               kvm_for_each_memslot(memslot, slots) {
-                       unsigned long hva_start, hva_end;
-                       gfn_t gfn_start, gfn_end;
+       bool ret = false;
 
-                       hva_start = max(start, memslot->userspace_addr);
-                       hva_end = min(end, memslot->userspace_addr +
-                                     (memslot->npages << PAGE_SHIFT));
-                       if (hva_start >= hva_end)
-                               continue;
-                       /*
-                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                        */
-                       gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-                       gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-                       for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
-                                                KVM_MAX_HUGEPAGE_LEVEL,
-                                                gfn_start, gfn_end - 1,
-                                                &iterator)
-                               ret |= handler(kvm, iterator.rmap, memslot,
-                                              iterator.gfn, iterator.level, data);
-               }
-       }
+       for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+                                range->start, range->end - 1, &iterator)
+               ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+                              iterator.level, range->pte);
 
        return ret;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-                         unsigned long data,
-                         int (*handler)(struct kvm *kvm,
-                                        struct kvm_rmap_head *rmap_head,
-                                        struct kvm_memory_slot *slot,
-                                        gfn_t gfn, int level,
-                                        unsigned long data))
-{
-       return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-                       unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int r;
+       bool flush;
 
-       r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+       flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
        if (is_tdp_mmu_enabled(kvm))
-               r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+               flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
 
-       return r;
+       return flush;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int r;
+       bool flush;
 
-       r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+       flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
 
        if (is_tdp_mmu_enabled(kvm))
-               r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+               flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
 
-       return r;
+       return flush;
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                        struct kvm_memory_slot *slot, gfn_t gfn, int level,
-                        unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
+                         pte_t unused)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1519,13 +1466,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        for_each_rmap_spte(rmap_head, &iter, sptep)
                young |= mmu_spte_age(sptep);
 
-       trace_kvm_age_page(gfn, level, slot, young);
        return young;
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-                             struct kvm_memory_slot *slot, gfn_t gfn,
-                             int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+                              struct kvm_memory_slot *slot, gfn_t gfn,
+                              int level, pte_t unused)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1547,29 +1493,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
 
-       kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+       kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
        kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
                        KVM_PAGES_PER_HPAGE(sp->role.level));
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int young = false;
+       bool young;
+
+       young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
 
-       young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
        if (is_tdp_mmu_enabled(kvm))
-               young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+               young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
 
        return young;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       int young = false;
+       bool young;
+
+       young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
 
-       young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
        if (is_tdp_mmu_enabled(kvm))
-               young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+               young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
 
        return young;
 }
@@ -2421,6 +2369,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
 
        kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
 
+       /*
+        * Note, this check is intentionally soft, it only guarantees that one
+        * page is available, while the caller may end up allocating as many as
+        * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
+        * exceeding the (arbitrary by default) limit will not harm the host,
+        * being too aggressive may unnecessarily kill the guest, and getting an
+        * exact count is far more trouble than it's worth, especially in the
+        * page fault paths.
+        */
        if (!kvm_mmu_available_pages(vcpu->kvm))
                return -ENOSPC;
        return 0;
@@ -2561,9 +2518,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        struct kvm_mmu_page *sp;
        int ret;
 
-       if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
-               return 0;
-
        sp = sptep_to_sp(sptep);
 
        ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
@@ -2593,6 +2547,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
                 *sptep, write_fault, gfn);
 
+       if (unlikely(is_noslot_pfn(pfn))) {
+               mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+               return RET_PF_EMULATE;
+       }
+
        if (is_shadow_present_pte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2626,9 +2585,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
                                KVM_PAGES_PER_HPAGE(level));
 
-       if (unlikely(is_mmio_spte(*sptep)))
-               ret = RET_PF_EMULATE;
-
        /*
         * The fault is fully spurious if and only if the new SPTE and old SPTE
         * are identical, and emulation is not required.
@@ -2745,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-                                 struct kvm_memory_slot *slot)
+                                 const struct kvm_memory_slot *slot)
 {
        unsigned long hva;
        pte_t *pte;
@@ -2771,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
        return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-                             gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             kvm_pfn_t pfn, int max_level)
 {
        struct kvm_lpage_info *linfo;
 
@@ -2946,9 +2903,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                return true;
        }
 
-       if (unlikely(is_noslot_pfn(pfn)))
+       if (unlikely(is_noslot_pfn(pfn))) {
                vcpu_cache_mmio_info(vcpu, gva, gfn,
                                     access & shadow_mmio_access_mask);
+               /*
+                * If MMIO caching is disabled, emulate immediately without
+                * touching the shadow page tables as attempting to install an
+                * MMIO SPTE will just be an expensive nop.
+                */
+               if (unlikely(!shadow_mmio_value)) {
+                       *ret_val = RET_PF_EMULATE;
+                       return true;
+               }
+       }
 
        return false;
 }
@@ -3061,6 +3028,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                        if (!is_shadow_present_pte(spte))
                                break;
 
+               if (!is_shadow_present_pte(spte))
+                       break;
+
                sp = sptep_to_sp(iterator.sptep);
                if (!is_last_spte(spte, sp->role.level))
                        break;
@@ -3150,12 +3120,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 
        sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-       if (kvm_mmu_put_root(kvm, sp)) {
-               if (is_tdp_mmu_page(sp))
-                       kvm_tdp_mmu_free_root(kvm, sp);
-               else if (sp->role.invalid)
-                       kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
-       }
+       if (is_tdp_mmu_page(sp))
+               kvm_tdp_mmu_put_root(kvm, sp, false);
+       else if (!--sp->root_count && sp->role.invalid)
+               kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
        *root_hpa = INVALID_PAGE;
 }
@@ -3193,14 +3161,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
                    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
                        mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
-               } else {
-                       for (i = 0; i < 4; ++i)
-                               if (mmu->pae_root[i] != 0)
-                                       mmu_free_root_page(kvm,
-                                                          &mmu->pae_root[i],
-                                                          &invalid_list);
-                       mmu->root_hpa = INVALID_PAGE;
+               } else if (mmu->pae_root) {
+                       for (i = 0; i < 4; ++i) {
+                               if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+                                       continue;
+
+                               mmu_free_root_page(kvm, &mmu->pae_root[i],
+                                                  &invalid_list);
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
+                       }
                }
+               mmu->root_hpa = INVALID_PAGE;
                mmu->root_pgd = 0;
        }
 
@@ -3226,155 +3197,208 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 {
        struct kvm_mmu_page *sp;
 
-       write_lock(&vcpu->kvm->mmu_lock);
-
-       if (make_mmu_pages_available(vcpu)) {
-               write_unlock(&vcpu->kvm->mmu_lock);
-               return INVALID_PAGE;
-       }
        sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
        ++sp->root_count;
 
-       write_unlock(&vcpu->kvm->mmu_lock);
        return __pa(sp->spt);
 }
 
 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 {
-       u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u8 shadow_root_level = mmu->shadow_root_level;
        hpa_t root;
        unsigned i;
+       int r;
+
+       write_lock(&vcpu->kvm->mmu_lock);
+       r = make_mmu_pages_available(vcpu);
+       if (r < 0)
+               goto out_unlock;
 
        if (is_tdp_mmu_enabled(vcpu->kvm)) {
                root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               mmu->root_hpa = root;
        } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
-               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
-                                     true);
-
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+               root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+               mmu->root_hpa = root;
        } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+               if (WARN_ON_ONCE(!mmu->pae_root)) {
+                       r = -EIO;
+                       goto out_unlock;
+               }
+
                for (i = 0; i < 4; ++i) {
-                       MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+                       WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
                        root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
                                              i << 30, PT32_ROOT_LEVEL, true);
-                       if (!VALID_PAGE(root))
-                               return -ENOSPC;
-                       vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+                       mmu->pae_root[i] = root | PT_PRESENT_MASK |
+                                          shadow_me_mask;
                }
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
-       } else
-               BUG();
+               mmu->root_hpa = __pa(mmu->pae_root);
+       } else {
+               WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+               r = -EIO;
+               goto out_unlock;
+       }
 
        /* root_pgd is ignored for direct MMUs. */
-       vcpu->arch.mmu->root_pgd = 0;
-
-       return 0;
+       mmu->root_pgd = 0;
+out_unlock:
+       write_unlock(&vcpu->kvm->mmu_lock);
+       return r;
 }
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-       u64 pdptr, pm_mask;
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 pdptrs[4], pm_mask;
        gfn_t root_gfn, root_pgd;
        hpa_t root;
-       int i;
+       unsigned i;
+       int r;
 
-       root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+       root_pgd = mmu->get_guest_pgd(vcpu);
        root_gfn = root_pgd >> PAGE_SHIFT;
 
        if (mmu_check_root(vcpu, root_gfn))
                return 1;
 
        /*
+        * On SVM, reading PDPTRs might access guest memory, which might fault
+        * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
+        */
+       if (mmu->root_level == PT32E_ROOT_LEVEL) {
+               for (i = 0; i < 4; ++i) {
+                       pdptrs[i] = mmu->get_pdptr(vcpu, i);
+                       if (!(pdptrs[i] & PT_PRESENT_MASK))
+                               continue;
+
+                       if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+                               return 1;
+               }
+       }
+
+       write_lock(&vcpu->kvm->mmu_lock);
+       r = make_mmu_pages_available(vcpu);
+       if (r < 0)
+               goto out_unlock;
+
+       /*
         * Do we shadow a long mode page table? If so we need to
         * write-protect the guests page table root.
         */
-       if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+       if (mmu->root_level >= PT64_ROOT_4LEVEL) {
                root = mmu_alloc_root(vcpu, root_gfn, 0,
-                                     vcpu->arch.mmu->shadow_root_level, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->root_hpa = root;
+                                     mmu->shadow_root_level, false);
+               mmu->root_hpa = root;
                goto set_root_pgd;
        }
 
+       if (WARN_ON_ONCE(!mmu->pae_root)) {
+               r = -EIO;
+               goto out_unlock;
+       }
+
        /*
         * We shadow a 32 bit page table. This may be a legacy 2-level
         * or a PAE 3-level page table. In either case we need to be aware that
         * the shadow page table may be a PAE or a long mode page table.
         */
-       pm_mask = PT_PRESENT_MASK;
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+       pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
+               if (WARN_ON_ONCE(!mmu->lm_root)) {
+                       r = -EIO;
+                       goto out_unlock;
+               }
+
+               mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+       }
+
        for (i = 0; i < 4; ++i) {
-               MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
-               if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
-                       pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
-                       if (!(pdptr & PT_PRESENT_MASK)) {
-                               vcpu->arch.mmu->pae_root[i] = 0;
+               WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+               if (mmu->root_level == PT32E_ROOT_LEVEL) {
+                       if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+                               mmu->pae_root[i] = INVALID_PAE_ROOT;
                                continue;
                        }
-                       root_gfn = pdptr >> PAGE_SHIFT;
-                       if (mmu_check_root(vcpu, root_gfn))
-                               return 1;
+                       root_gfn = pdptrs[i] >> PAGE_SHIFT;
                }
 
                root = mmu_alloc_root(vcpu, root_gfn, i << 30,
                                      PT32_ROOT_LEVEL, false);
-               if (!VALID_PAGE(root))
-                       return -ENOSPC;
-               vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+               mmu->pae_root[i] = root | pm_mask;
        }
-       vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+       if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+               mmu->root_hpa = __pa(mmu->lm_root);
+       else
+               mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+       mmu->root_pgd = root_pgd;
+out_unlock:
+       write_unlock(&vcpu->kvm->mmu_lock);
+
+       return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
+       u64 *lm_root, *pae_root;
 
        /*
-        * If we shadow a 32 bit page table with a long mode page
-        * table we enter this path.
+        * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+        * tables are allocated and initialized at root creation as there is no
+        * equivalent level in the guest's NPT to shadow.  Allocate the tables
+        * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
         */
-       if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
-               if (vcpu->arch.mmu->lm_root == NULL) {
-                       /*
-                        * The additional page necessary for this is only
-                        * allocated on demand.
-                        */
+       if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+           mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+               return 0;
 
-                       u64 *lm_root;
+       /*
+        * This mess only works with 4-level paging and needs to be updated to
+        * work with 5-level paging.
+        */
+       if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+               return -EIO;
 
-                       lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-                       if (lm_root == NULL)
-                               return 1;
+       if (mmu->pae_root && mmu->lm_root)
+               return 0;
 
-                       lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+       /*
+        * The special roots should always be allocated in concert.  Yell and
+        * bail if KVM ends up in a state where only one of the roots is valid.
+        */
+       if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+               return -EIO;
 
-                       vcpu->arch.mmu->lm_root = lm_root;
-               }
+       /*
+        * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+        * doesn't need to be decrypted.
+        */
+       pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!pae_root)
+               return -ENOMEM;
 
-               vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+       lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+       if (!lm_root) {
+               free_page((unsigned long)pae_root);
+               return -ENOMEM;
        }
 
-set_root_pgd:
-       vcpu->arch.mmu->root_pgd = root_pgd;
+       mmu->pae_root = pae_root;
+       mmu->lm_root = lm_root;
 
        return 0;
 }
 
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.mmu->direct_map)
-               return mmu_alloc_direct_roots(vcpu);
-       else
-               return mmu_alloc_shadow_roots(vcpu);
-}
-
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
        int i;
@@ -3422,7 +3446,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        mmu_sync_children(vcpu, sp);
@@ -3554,11 +3578,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                            __is_rsvd_bits_set(rsvd_check, sptes[level], level);
 
        if (reserved) {
-               pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+               pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
                       __func__, addr);
                for (level = root; level >= leaf; level--)
-                       pr_err("------ spte 0x%llx level %d.\n",
-                              sptes[level], level);
+                       pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+                              sptes[level], level,
+                              rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
        }
 
        return reserved;
@@ -3653,6 +3678,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        bool async;
 
+       /*
+        * Retry the page fault if the gfn hit a memslot that is being deleted
+        * or moved.  This ensures any existing SPTEs for the old memslot will
+        * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+        */
+       if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+               return true;
+
        /* Don't expose private memslots to L2. */
        if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
                *pfn = KVM_PFN_NOSLOT;
@@ -4615,12 +4648,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
        struct kvm_mmu *context = &vcpu->arch.guest_mmu;
        union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
 
-       context->shadow_root_level = new_role.base.level;
-
        __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
 
-       if (new_role.as_u64 != context->mmu_role.as_u64)
+       if (new_role.as_u64 != context->mmu_role.as_u64) {
                shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+               /*
+                * Override the level set by the common init helper, nested TDP
+                * always uses the host's TDP configuration.
+                */
+               context->shadow_root_level = new_role.base.level;
+       }
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
 
@@ -4802,16 +4840,23 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
        if (r)
                goto out;
-       r = mmu_alloc_roots(vcpu);
-       kvm_mmu_sync_roots(vcpu);
+       r = mmu_alloc_special_roots(vcpu);
+       if (r)
+               goto out;
+       if (vcpu->arch.mmu->direct_map)
+               r = mmu_alloc_direct_roots(vcpu);
+       else
+               r = mmu_alloc_shadow_roots(vcpu);
        if (r)
                goto out;
+
+       kvm_mmu_sync_roots(vcpu);
+
        kvm_mmu_load_pgd(vcpu);
        static_call(kvm_x86_tlb_flush_current)(vcpu);
 out:
        return r;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
@@ -4820,7 +4865,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
        WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
 static bool need_remote_flush(u64 old, u64 new)
 {
@@ -5169,10 +5213,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        slot_level_handler fn, int start_level, int end_level,
-                       gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+                       gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+                       bool flush)
 {
        struct slot_rmap_walk_iterator iterator;
-       bool flush = false;
 
        for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
                        end_gfn, &iterator) {
@@ -5180,7 +5224,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        flush |= fn(kvm, iterator.rmap, memslot);
 
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-                       if (flush && lock_flush_tlb) {
+                       if (flush && flush_on_yield) {
                                kvm_flush_remote_tlbs_with_address(kvm,
                                                start_gfn,
                                                iterator.gfn - start_gfn + 1);
@@ -5190,36 +5234,32 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                }
        }
 
-       if (flush && lock_flush_tlb) {
-               kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
-                                                  end_gfn - start_gfn + 1);
-               flush = false;
-       }
-
        return flush;
 }
 
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
                  slot_level_handler fn, int start_level, int end_level,
-                 bool lock_flush_tlb)
+                 bool flush_on_yield)
 {
        return slot_handle_level_range(kvm, memslot, fn, start_level,
                        end_level, memslot->base_gfn,
                        memslot->base_gfn + memslot->npages - 1,
-                       lock_flush_tlb);
+                       flush_on_yield, false);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-                slot_level_handler fn, bool lock_flush_tlb)
+                slot_level_handler fn, bool flush_on_yield)
 {
        return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
-                                PG_LEVEL_4K, lock_flush_tlb);
+                                PG_LEVEL_4K, flush_on_yield);
 }
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
 {
+       if (!tdp_enabled && mmu->pae_root)
+               set_memory_encrypted((unsigned long)mmu->pae_root, 1);
        free_page((unsigned long)mmu->pae_root);
        free_page((unsigned long)mmu->lm_root);
 }
@@ -5240,9 +5280,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
         * while the PDP table is a per-vCPU construct that's allocated at MMU
         * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
         * x86_64.  Therefore we need to allocate the PDP table in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.  Except for
-        * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
-        * skip allocating the PDP table.
+        * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
+        * generally doesn't use PAE paging and can skip allocating the PDP
+        * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
+        * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+        * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
         */
        if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
                return 0;
@@ -5252,8 +5294,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
                return -ENOMEM;
 
        mmu->pae_root = page_address(page);
+
+       /*
+        * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+        * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
+        * that KVM's writes and the CPU's reads get along.  Note, this is
+        * only necessary when using shadow paging, as 64-bit NPT can get at
+        * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+        * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+        */
+       if (!tdp_enabled)
+               set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+       else
+               WARN_ON_ONCE(shadow_me_mask);
+
        for (i = 0; i < 4; ++i)
-               mmu->pae_root[i] = INVALID_PAGE;
+               mmu->pae_root[i] = INVALID_PAE_ROOT;
 
        return 0;
 }
@@ -5365,6 +5421,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
         */
        kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
+       /* In order to ensure all threads see this change when
+        * handling the MMU reload signal, this must happen in the
+        * same critical section as kvm_reload_remote_mmus, and
+        * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
+        * could drop the MMU lock and yield.
+        */
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_invalidate_all_roots(kvm);
+
        /*
         * Notify all vcpus to reload its shadow page table and flush TLB.
         * Then all vcpus will switch to new shadow page table with the new
@@ -5377,10 +5442,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
        kvm_zap_obsolete_pages(kvm);
 
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_all(kvm);
-
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               kvm_tdp_mmu_zap_invalidated_roots(kvm);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5420,7 +5488,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int i;
-       bool flush;
+       bool flush = false;
 
        write_lock(&kvm->mmu_lock);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -5433,20 +5501,31 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                        if (start >= end)
                                continue;
 
-                       slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
-                                               PG_LEVEL_4K,
-                                               KVM_MAX_HUGEPAGE_LEVEL,
-                                               start, end - 1, true);
+                       flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+                                                       PG_LEVEL_4K,
+                                                       KVM_MAX_HUGEPAGE_LEVEL,
+                                                       start, end - 1, true, flush);
                }
        }
 
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+       write_unlock(&kvm->mmu_lock);
+
        if (is_tdp_mmu_enabled(kvm)) {
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+                                                         gfn_end, flush, true);
                if (flush)
-                       kvm_flush_remote_tlbs(kvm);
-       }
+                       kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+                                                          gfn_end);
 
-       write_unlock(&kvm->mmu_lock);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5465,10 +5544,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
        write_lock(&kvm->mmu_lock);
        flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
                                start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
        write_unlock(&kvm->mmu_lock);
 
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+               read_unlock(&kvm->mmu_lock);
+       }
+
        /*
         * We can flush all the TLBs out of the mmu lock without TLB
         * corruption since we just change the spte from writable to
@@ -5476,9 +5559,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
         * spte from present to present (changing the spte from present
         * to nonpresent will flush all the TLBs immediately), in other
         * words, the only case we care is mmu_spte_update() where we
-        * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
-        * instead of PT_WRITABLE_MASK, that means it does not depend
-        * on PT_WRITABLE_MASK anymore.
+        * have checked Host-writable | MMU-writable instead of
+        * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+        * anymore.
         */
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5529,21 +5612,32 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 {
        /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
        struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+       bool flush;
 
        write_lock(&kvm->mmu_lock);
-       slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+       flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 
-       if (is_tdp_mmu_enabled(kvm))
-               kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+       if (flush)
+               kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+               if (flush)
+                       kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot)
+                                       const struct kvm_memory_slot *memslot)
 {
        /*
         * All current use cases for flushing the TLBs for a specific memslot
-        * are related to dirty logging, and do the TLB flush out of mmu_lock.
+        * relate to dirty logging, and many do the TLB flush out of mmu_lock.
         * The interaction between the various operations on memslot must be
         * serialized by slots_locks to ensure the TLB flush from one operation
         * is observed by any other operation on the same memslot.
@@ -5560,10 +5654,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 
        write_lock(&kvm->mmu_lock);
        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
-       if (is_tdp_mmu_enabled(kvm))
-               flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
        write_unlock(&kvm->mmu_lock);
 
+       if (is_tdp_mmu_enabled(kvm)) {
+               read_lock(&kvm->mmu_lock);
+               flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+               read_unlock(&kvm->mmu_lock);
+       }
+
        /*
         * It's also safe to flush TLBs out of mmu lock here as currently this
         * function is only used for dirty logging, in which case flushing TLB
@@ -5701,25 +5799,6 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
-static void kvm_set_mmio_spte_mask(void)
-{
-       u64 mask;
-
-       /*
-        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
-        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
-        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
-        * 52-bit physical addresses then there are no reserved PA bits in the
-        * PTEs and so the reserved PA approach must be disabled.
-        */
-       if (shadow_phys_bits < 52)
-               mask = BIT_ULL(51) | PT_PRESENT_MASK;
-       else
-               mask = 0;
-
-       kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
 static bool get_nx_auto_mode(void)
 {
        /* Return true when CPU has the bug, and mitigations are ON */
@@ -5785,8 +5864,6 @@ int kvm_mmu_module_init(void)
 
        kvm_mmu_reset_all_pte_masks();
 
-       kvm_set_mmio_spte_mask();
-
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
                                            0, SLAB_ACCOUNT, NULL);
@@ -5884,6 +5961,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
        struct kvm_mmu_page *sp;
        unsigned int ratio;
        LIST_HEAD(invalid_list);
+       bool flush = false;
        ulong to_zap;
 
        rcu_idx = srcu_read_lock(&kvm->srcu);
@@ -5905,19 +5983,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
                                      lpage_disallowed_link);
                WARN_ON_ONCE(!sp->lpage_disallowed);
                if (is_tdp_mmu_page(sp)) {
-                       kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
-                               sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+                       flush = kvm_tdp_mmu_zap_sp(kvm, sp);
                } else {
                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
                        WARN_ON_ONCE(sp->lpage_disallowed);
                }
 
                if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
-                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+                       kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
                        cond_resched_rwlock_write(&kvm->mmu_lock);
+                       flush = false;
                }
        }
-       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+       kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 
        write_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, rcu_idx);
index ced15fd..cedc17b 100644 (file)
@@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu->pae_root[i];
 
-               if (root && VALID_PAGE(root)) {
+               if (IS_VALID_PAE_ROOT(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = to_shadow_page(root);
                        __mmu_spte_walk(vcpu, sp, fn, 2);
index ec4fc28..f2546d6 100644 (file)
@@ -20,6 +20,16 @@ extern bool dbg;
 #define MMU_WARN_ON(x) do { } while (0)
 #endif
 
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set.  Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT       0
+#define IS_VALID_PAE_ROOT(x)   (!!(x))
+
 struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
@@ -40,7 +50,11 @@ struct kvm_mmu_page {
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-       int root_count;          /* Currently serving as active root */
+       /* Currently serving as active root */
+       union {
+               int root_count;
+               refcount_t tdp_mmu_root_count;
+       };
        unsigned int unsync_children;
        struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
        DECLARE_BITMAP(unsync_child_bitmap, 512);
@@ -78,6 +92,16 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
        return to_shadow_page(__pa(sptep));
 }
 
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+       return role.smm ? 1 : 0;
+}
+
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+       return kvm_mmu_role_as_id(sp->role);
+}
+
 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
 {
        /*
@@ -103,22 +127,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
                                        u64 start_gfn, u64 pages);
 
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       BUG_ON(!sp->root_count);
-       lockdep_assert_held(&kvm->mmu_lock);
-
-       ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       lockdep_assert_held(&kvm->mmu_lock);
-       --sp->root_count;
-
-       return !sp->root_count;
-}
-
 /*
  * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
  *
@@ -141,8 +149,9 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
 #define SET_SPTE_SPURIOUS              BIT(2)
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-                             gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+                             const struct kvm_memory_slot *slot, gfn_t gfn,
+                             kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
                            int max_level, kvm_pfn_t *pfnp,
                            bool huge_page_disallowed, int *req_level);
index 55d7b47..70b7e44 100644 (file)
@@ -503,6 +503,7 @@ error:
 #endif
        walker->fault.address = addr;
        walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+       walker->fault.async_page_fault = false;
 
        trace_kvm_mmu_walker_error(walker->fault.error_code);
        return 0;
@@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
                nr_present++;
 
-               host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+               host_writable = sp->spt[i] & shadow_host_writable_mask;
 
                set_spte_ret |= set_spte(vcpu, &sp->spt[i],
                                         pte_access, PG_LEVEL_4K,
index ef55f0b..66d43ce 100644 (file)
 #include "spte.h"
 
 #include <asm/e820/api.h>
+#include <asm/vmx.h>
 
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
 u64 __read_mostly shadow_nx_mask;
 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
 u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
@@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
        u64 mask;
 
        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
-       BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
        mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
 {
        u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
-       u64 mask = generation_mmio_spte_mask(gen);
+       u64 spte = generation_mmio_spte_mask(gen);
        u64 gpa = gfn << PAGE_SHIFT;
 
+       WARN_ON_ONCE(!shadow_mmio_value);
+
        access &= shadow_mmio_access_mask;
-       mask |= shadow_mmio_value | access;
-       mask |= gpa | shadow_nonpresent_or_rsvd_mask;
-       mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+       spte |= shadow_mmio_value | access;
+       spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+       spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
                << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
 
-       return mask;
+       return spte;
 }
 
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                     bool can_unsync, bool host_writable, bool ad_disabled,
                     u64 *new_spte)
 {
-       u64 spte = 0;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
        int ret = 0;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else if (kvm_vcpu_ad_need_write_protect(vcpu))
-               spte |= SPTE_AD_WRPROT_ONLY_MASK;
+               spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+       /*
+        * Bits 62:52 of PAE SPTEs are reserved.  WARN if said bits are set
+        * if PAE paging may be employed (shadow paging or any 32-bit KVM).
+        */
+       WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+                    (spte & SPTE_TDP_AD_MASK));
 
        /*
         * For the EPT case, shadow_present_mask is 0 if hardware
@@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                        kvm_is_mmio_pfn(pfn));
 
        if (host_writable)
-               spte |= SPTE_HOST_WRITEABLE;
+               spte |= shadow_host_writable_mask;
        else
                pte_access &= ~ACC_WRITE_MASK;
 
@@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
        spte |= (u64)pfn << PAGE_SHIFT;
 
        if (pte_access & ACC_WRITE_MASK) {
-               spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+               spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
 
                /*
                 * Optimization: for pte sync, if spte was writable the hash
@@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                                 __func__, gfn);
                        ret |= SET_SPTE_WRITE_PROTECTED_PT;
                        pte_access &= ~ACC_WRITE_MASK;
-                       spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
                }
        }
 
@@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
                spte = mark_spte_for_access_track(spte);
 
 out:
+       WARN_ON(is_mmio_spte(spte));
        *new_spte = spte;
        return ret;
 }
 
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 {
-       u64 spte;
+       u64 spte = SPTE_MMU_PRESENT_MASK;
 
-       spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-              shadow_user_mask | shadow_x_mask | shadow_me_mask;
+       spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+               shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
        if (ad_disabled)
-               spte |= SPTE_AD_DISABLED_MASK;
+               spte |= SPTE_TDP_AD_DISABLED_MASK;
        else
                spte |= shadow_accessed_mask;
 
@@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
        new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
        new_spte &= ~PT_WRITABLE_MASK;
-       new_spte &= ~SPTE_HOST_WRITEABLE;
+       new_spte &= ~shadow_host_writable_mask;
 
        new_spte = mark_spte_for_access_track(new_spte);
 
@@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte)
        return spte;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
 {
        BUG_ON((u64)(unsigned)access_mask != access_mask);
-       WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
        WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
-       shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+       if (!enable_mmio_caching)
+               mmio_value = 0;
+
+       /*
+        * Disable MMIO caching if the MMIO value collides with the bits that
+        * are used to hold the relocated GFN when the L1TF mitigation is
+        * enabled.  This should never fire as there is no known hardware that
+        * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+        * MMIO value are not susceptible to L1TF.
+        */
+       if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+                                 SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+               mmio_value = 0;
+
+       /*
+        * The masked MMIO value must obviously match itself and a removed SPTE
+        * must not get a false positive.  Removed SPTEs and MMIO SPTEs should
+        * never collide as MMIO must set some RWX bits, and removed SPTEs must
+        * not set any RWX bits.
+        */
+       if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+           WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+               mmio_value = 0;
+
+       shadow_mmio_value = mmio_value;
+       shadow_mmio_mask  = mmio_mask;
        shadow_mmio_access_mask = access_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- *  - Setting either @accessed_mask or @dirty_mask requires setting both
- *  - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-               u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 {
-       BUG_ON(!dirty_mask != !accessed_mask);
-       BUG_ON(!accessed_mask && !acc_track_mask);
-       BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
-       shadow_user_mask = user_mask;
-       shadow_accessed_mask = accessed_mask;
-       shadow_dirty_mask = dirty_mask;
-       shadow_nx_mask = nx_mask;
-       shadow_x_mask = x_mask;
-       shadow_present_mask = p_mask;
-       shadow_acc_track_mask = acc_track_mask;
-       shadow_me_mask = me_mask;
+       shadow_user_mask        = VMX_EPT_READABLE_MASK;
+       shadow_accessed_mask    = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+       shadow_dirty_mask       = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+       shadow_nx_mask          = 0ull;
+       shadow_x_mask           = VMX_EPT_EXECUTABLE_MASK;
+       shadow_present_mask     = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+       shadow_acc_track_mask   = VMX_EPT_RWX_MASK;
+       shadow_me_mask          = 0ull;
+
+       shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+       shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
+
+       /*
+        * EPT Misconfigurations are generated if the value of bits 2:0
+        * of an EPT paging-structure entry is 110b (write/execute).
+        */
+       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+                                  VMX_EPT_RWX_MASK, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
 
 void kvm_mmu_reset_all_pte_masks(void)
 {
        u8 low_phys_bits;
-
-       shadow_user_mask = 0;
-       shadow_accessed_mask = 0;
-       shadow_dirty_mask = 0;
-       shadow_nx_mask = 0;
-       shadow_x_mask = 0;
-       shadow_present_mask = 0;
-       shadow_acc_track_mask = 0;
+       u64 mask;
 
        shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void)
 
        shadow_nonpresent_or_rsvd_lower_gfn_mask =
                GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+       shadow_user_mask        = PT_USER_MASK;
+       shadow_accessed_mask    = PT_ACCESSED_MASK;
+       shadow_dirty_mask       = PT_DIRTY_MASK;
+       shadow_nx_mask          = PT64_NX_MASK;
+       shadow_x_mask           = 0;
+       shadow_present_mask     = PT_PRESENT_MASK;
+       shadow_acc_track_mask   = 0;
+       shadow_me_mask          = sme_me_mask;
+
+       shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+       shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
+
+       /*
+        * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+        * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
+        * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+        * 52-bit physical addresses then there are no reserved PA bits in the
+        * PTEs and so the reserved PA approach must be disabled.
+        */
+       if (shadow_phys_bits < 52)
+               mask = BIT_ULL(51) | PT_PRESENT_MASK;
+       else
+               mask = 0;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
index 6de3950..bca0ba1 100644 (file)
@@ -5,18 +5,33 @@
 
 #include "mmu_internal.h"
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+.  MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK          BIT_ULL(11)
 
 /*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE.  This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
  */
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT              52
+#define SPTE_TDP_AD_MASK               (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK       (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK      (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK   (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 
 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.  This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+                                         PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK    (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+                                        SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE         BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE          BIT_ULL(58)
 
-#define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
 
 /*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
  * the memslots generation and is derived as follows:
  *
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
  *
  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
  * the MMIO generation number, as doing so would require stealing a bit from
  */
 
 #define MMIO_SPTE_GEN_LOW_START                3
-#define MMIO_SPTE_GEN_LOW_END          11
+#define MMIO_SPTE_GEN_LOW_END          10
 
-#define MMIO_SPTE_GEN_HIGH_START       PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START       52
 #define MMIO_SPTE_GEN_HIGH_END         62
 
 #define MMIO_SPTE_GEN_LOW_MASK         GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
                                                    MMIO_SPTE_GEN_LOW_START)
 #define MMIO_SPTE_GEN_HIGH_MASK                GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
                                                    MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+               (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
 
 #define MMIO_SPTE_GEN_LOW_BITS         (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
 #define MMIO_SPTE_GEN_HIGH_BITS                (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
 
 /* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
 
 #define MMIO_SPTE_GEN_LOW_SHIFT                (MMIO_SPTE_GEN_LOW_START - 0)
 #define MMIO_SPTE_GEN_HIGH_SHIFT       (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
 
 #define MMIO_SPTE_GEN_MASK             GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
 
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
 extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
 extern u64 __read_mostly shadow_me_mask;
 
 /*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
  * pages.
  */
@@ -121,28 +171,21 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
 
 /*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
-                                         PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
-/*
  * If a thread running without exclusive control of the MMU lock must perform a
  * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
  * non-present intermediate value. Other threads which encounter this value
  * should not modify the SPTE.
  *
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability.  Use only low bits to avoid 64-bit immediates.
  *
  * Only used by the TDP MMU.
  */
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE   0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
 
 static inline bool is_removed_spte(u64 spte)
 {
@@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits;
 
 static inline bool is_mmio_spte(u64 spte)
 {
-       return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+       return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+              likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+       return !!(pte & SPTE_MMU_PRESENT_MASK);
 }
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 
 static inline bool spte_ad_enabled(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
 }
 
 static inline bool spte_ad_need_write_protect(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
-       return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
+       /*
+        * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+        * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
+        * TDP and do the A/D type check unconditionally.
+        */
+       return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
 }
 
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 }
 
 static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
-       MMU_WARN_ON(is_mmio_spte(spte));
+       MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 }
 
@@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte)
        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
-static inline bool is_shadow_present_pte(u64 pte)
-{
-       return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
 static inline bool is_large_pte(u64 pte)
 {
        return pte & PT_PAGE_SIZE_MASK;
@@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte)
 
 static inline bool spte_can_locklessly_be_made_writable(u64 spte)
 {
-       return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
-               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+       return (spte & shadow_host_writable_mask) &&
+              (spte & shadow_mmu_writable_mask);
 }
 
 static inline u64 get_mmio_spte_generation(u64 spte)
index e5f1481..b3ed302 100644 (file)
@@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
 }
 
 /*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+       iter->yielded_gfn = iter->next_last_level_gfn;
+       iter->level = iter->root_level;
+
+       iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+       tdp_iter_refresh_sptep(iter);
+
+       iter->valid = true;
+}
+
+/*
  * Sets a TDP iterator to walk a pre-order traversal of the paging structure
  * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
  */
@@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
        WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
 
        iter->next_last_level_gfn = next_last_level_gfn;
-       iter->yielded_gfn = iter->next_last_level_gfn;
        iter->root_level = root_level;
        iter->min_level = min_level;
-       iter->level = root_level;
-       iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
-
-       iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
-       tdp_iter_refresh_sptep(iter);
+       iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+       iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
 
-       iter->valid = true;
+       tdp_iter_restart(iter);
 }
 
 /*
@@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
        iter->valid = false;
 }
 
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
-{
-       return iter->pt_path[iter->root_level - 1];
-}
-
index 4cc177d..b1748b9 100644 (file)
@@ -36,6 +36,8 @@ struct tdp_iter {
        int min_level;
        /* The iterator's current level within the paging structure */
        int level;
+       /* The address space ID, i.e. SMM vs. regular. */
+       int as_id;
        /* A snapshot of the value at sptep */
        u64 old_spte;
        /*
@@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
                    int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
index d789150..83cbdbe 100644 (file)
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+                                                            bool shared)
+{
+       if (shared)
+               lockdep_assert_held_read(&kvm->mmu_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
        if (!kvm->arch.tdp_mmu_enabled)
@@ -41,32 +50,85 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
        rcu_barrier();
 }
 
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-       if (kvm_mmu_put_root(kvm, root))
-               kvm_tdp_mmu_free_root(kvm, root);
+       free_page((unsigned long)sp->spt);
+       kmem_cache_free(mmu_page_header_cache, sp);
 }
 
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
-                                          struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 {
-       lockdep_assert_held_write(&kvm->mmu_lock);
+       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+                                              rcu_head);
 
-       if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
-               return false;
+       tdp_mmu_free_sp(sp);
+}
 
-       kvm_mmu_get_root(kvm, root);
-       return true;
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared)
+{
+       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
+       if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+               return;
+
+       WARN_ON(!root->tdp_mmu_page);
+
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       list_del_rcu(&root->link);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+       zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+
+       call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
 
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-                                                    struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+                                             struct kvm_mmu_page *prev_root,
+                                             bool shared)
 {
        struct kvm_mmu_page *next_root;
 
-       next_root = list_next_entry(root, link);
-       tdp_mmu_put_root(kvm, root);
+       rcu_read_lock();
+
+       if (prev_root)
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &prev_root->link,
+                                                 typeof(*prev_root), link);
+       else
+               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                  typeof(*next_root), link);
+
+       while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                               &next_root->link, typeof(*next_root), link);
+
+       rcu_read_unlock();
+
+       if (prev_root)
+               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
        return next_root;
 }
 
@@ -75,35 +137,24 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * This makes it safe to release the MMU lock and yield within the loop, but
  * if exiting the loop early, the caller must drop the reference to the most
  * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
  */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                          \
-       for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,        \
-                                     typeof(*_root), link);            \
-            tdp_mmu_next_root_valid(_kvm, _root);                      \
-            _root = tdp_mmu_next_root(_kvm, _root))
-
-#define for_each_tdp_mmu_root(_kvm, _root)                             \
-       list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield);
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
-       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
-       lockdep_assert_held_write(&kvm->mmu_lock);
-
-       WARN_ON(root->root_count);
-       WARN_ON(!root->tdp_mmu_page);
-
-       list_del(&root->link);
-
-       zap_gfn_range(kvm, root, 0, max_gfn, false);
-
-       free_page((unsigned long)root->spt);
-       kmem_cache_free(mmu_page_header_cache, root);
-}
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
+            _root;                                                     \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+               } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id)                             \
+       list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,         \
+                               lockdep_is_held_type(&kvm->mmu_lock, 0) ||      \
+                               lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
+               } else
 
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
@@ -137,86 +188,46 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
        return sp;
 }
 
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 {
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;
 
-       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
-       write_lock(&kvm->mmu_lock);
+       role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
        /* Check for an existing root before allocating a new one. */
-       for_each_tdp_mmu_root(kvm, root) {
-               if (root->role.word == role.word) {
-                       kvm_mmu_get_root(kvm, root);
-                       write_unlock(&kvm->mmu_lock);
-                       return root;
-               }
+       for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+               if (root->role.word == role.word &&
+                   kvm_tdp_mmu_get_root(kvm, root))
+                       goto out;
        }
 
        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
-       root->root_count = 1;
-
-       list_add(&root->link, &kvm->arch.tdp_mmu_roots);
-
-       write_unlock(&kvm->mmu_lock);
-
-       return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *root;
+       refcount_set(&root->tdp_mmu_root_count, 1);
 
-       root = get_tdp_mmu_vcpu_root(vcpu);
-       if (!root)
-               return INVALID_PAGE;
+       spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+       spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
+out:
        return __pa(root->spt);
 }
 
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-       free_page((unsigned long)sp->spt);
-       kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
-       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
-                                              rcu_head);
-
-       tdp_mmu_free_sp(sp);
-}
-
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);
 
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
-{
-       return sp->role.smm ? 1 : 0;
-}
-
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
 {
-       bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;
 
        if (is_accessed_spte(old_spte) &&
-           (!is_accessed_spte(new_spte) || pfn_changed))
+           (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+            spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
 
@@ -301,11 +312,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
                                        bool shared)
 {
-       struct kvm_mmu_page *sp = sptep_to_sp(pt);
+       struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
        int level = sp->role.level;
        gfn_t base_gfn = sp->gfn;
        u64 old_child_spte;
@@ -318,7 +334,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
        tdp_mmu_unlink_page(kvm, sp, shared);
 
        for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-               sptep = pt + i;
+               sptep = rcu_dereference(pt) + i;
                gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
 
                if (shared) {
@@ -455,7 +471,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 
 
        if (was_leaf && is_dirty_spte(old_spte) &&
-           (!is_dirty_spte(new_spte) || pfn_changed))
+           (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
        /*
@@ -479,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 }
 
 /*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -488,30 +505,39 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
  * Returns: true if the SPTE was set, false if it was not. If false is returned,
  *         this function will have no side-effects.
  */
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-                                          struct tdp_iter *iter,
-                                          u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+                                                       struct tdp_iter *iter,
+                                                       u64 new_spte)
 {
-       u64 *root_pt = tdp_iter_root_pt(iter);
-       struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-       int as_id = kvm_mmu_page_as_id(root);
-
        lockdep_assert_held_read(&kvm->mmu_lock);
 
        /*
         * Do not change removed SPTEs. Only the thread that froze the SPTE
         * may modify it.
         */
-       if (iter->old_spte == REMOVED_SPTE)
+       if (is_removed_spte(iter->old_spte))
                return false;
 
        if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
                      new_spte) != iter->old_spte)
                return false;
 
-       handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-                           iter->level, true);
+       __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+                             new_spte, iter->level, true);
+       handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+
+       return true;
+}
+
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+                                          struct tdp_iter *iter,
+                                          u64 new_spte)
+{
+       if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+               return false;
 
+       handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+                                     iter->old_spte, new_spte, iter->level);
        return true;
 }
 
@@ -538,7 +564,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
         * here since the SPTE is going from non-present
         * to non-present.
         */
-       WRITE_ONCE(*iter->sptep, 0);
+       WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 
        return true;
 }
@@ -564,10 +590,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                      u64 new_spte, bool record_acc_track,
                                      bool record_dirty_log)
 {
-       tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
-       struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-       int as_id = kvm_mmu_page_as_id(root);
-
        lockdep_assert_held_write(&kvm->mmu_lock);
 
        /*
@@ -577,17 +599,17 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
         * should be used. If operating under the MMU lock in write mode, the
         * use of the removed SPTE should not be necessary.
         */
-       WARN_ON(iter->old_spte == REMOVED_SPTE);
+       WARN_ON(is_removed_spte(iter->old_spte));
 
        WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
-       __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-                             iter->level, false);
+       __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+                             new_spte, iter->level, false);
        if (record_acc_track)
                handle_changed_spte_acc_track(iter->old_spte, new_spte,
                                              iter->level);
        if (record_dirty_log)
-               handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
+               handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
                                              iter->old_spte, new_spte,
                                              iter->level);
 }
@@ -642,7 +664,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
  * Return false if a yield was not needed.
  */
 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-                                            struct tdp_iter *iter, bool flush)
+                                            struct tdp_iter *iter, bool flush,
+                                            bool shared)
 {
        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -654,14 +677,16 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
 
-               cond_resched_rwlock_write(&kvm->mmu_lock);
+               if (shared)
+                       cond_resched_rwlock_read(&kvm->mmu_lock);
+               else
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
+
                rcu_read_lock();
 
                WARN_ON(iter->gfn > iter->next_last_level_gfn);
 
-               tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
-                              iter->root_level, iter->min_level,
-                              iter->next_last_level_gfn);
+               tdp_iter_restart(iter);
 
                return true;
        }
@@ -674,24 +699,33 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
  * function cannot yield, it will not release the MMU lock or reschedule and
  * the caller must ensure it does not supply too large a GFN range, or the
  * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield)
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared)
 {
        struct tdp_iter iter;
-       bool flush_needed = false;
+
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
+retry:
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
-                       flush_needed = false;
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
+                       flush = false;
                        continue;
                }
 
@@ -708,12 +742,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-               flush_needed = true;
+               if (!shared) {
+                       tdp_mmu_set_spte(kvm, &iter, 0);
+                       flush = true;
+               } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
        }
 
        rcu_read_unlock();
-       return flush_needed;
+       return flush;
 }
 
 /*
@@ -721,14 +764,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared)
 {
        struct kvm_mmu_page *root;
-       bool flush = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root)
-               flush |= zap_gfn_range(kvm, root, start, end, true);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+                                     shared);
 
        return flush;
 }
@@ -736,13 +786,115 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 {
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-       bool flush;
+       bool flush = false;
+       int i;
+
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+                                                 flush, false);
 
-       flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
 }
 
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+                                                 struct kvm_mmu_page *prev_root)
+{
+       struct kvm_mmu_page *next_root;
+
+       if (prev_root)
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &prev_root->link,
+                                                 typeof(*prev_root), link);
+       else
+               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                  typeof(*next_root), link);
+
+       while (next_root && !(next_root->role.invalid &&
+                             refcount_read(&next_root->tdp_mmu_root_count)))
+               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+                                                 &next_root->link,
+                                                 typeof(*next_root), link);
+
+       return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+       gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+       struct kvm_mmu_page *next_root;
+       struct kvm_mmu_page *root;
+       bool flush = false;
+
+       lockdep_assert_held_read(&kvm->mmu_lock);
+
+       rcu_read_lock();
+
+       root = next_invalidated_root(kvm, NULL);
+
+       while (root) {
+               next_root = next_invalidated_root(kvm, root);
+
+               rcu_read_unlock();
+
+               flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+                                     true);
+
+               /*
+                * Put the reference acquired in
+                * kvm_tdp_mmu_invalidate_all_roots
+                */
+               kvm_tdp_mmu_put_root(kvm, root, true);
+
+               root = next_root;
+
+               rcu_read_lock();
+       }
+
+       rcu_read_unlock();
+
+       if (flush)
+               kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * Mark each TDP MMU root as invalid so that other threads
+ * will drop their references and allow the root count to
+ * go to 0.
+ *
+ * Also take a reference on all roots so that this thread
+ * can do the bulk of the work required to free the roots
+ * once they are invalidated. Without this reference, a
+ * vCPU thread might drop the last reference to a root and
+ * get stuck with tearing down the entire paging structure.
+ *
+ * Roots which have a zero refcount should be skipped as
+ * they're already being torn down.
+ * Already invalid roots should be referenced again so that
+ * they aren't freed before kvm_tdp_mmu_zap_all_fast is
+ * done with them.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+       struct kvm_mmu_page *root;
+
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+               if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+                       root->role.invalid = true;
+}
+
 /*
  * Installs a last-level SPTE to handle a TDP page fault.
  * (NPT/EPT violation/misconfiguration)
@@ -785,12 +937,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
-       } else
+       } else {
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));
+       }
 
-       trace_kvm_mmu_set_spte(iter->level, iter->gfn,
-                              rcu_dereference(iter->sptep));
        if (!prefault)
                vcpu->stat.pf_fixed++;
 
@@ -890,199 +1041,139 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
        return ret;
 }
 
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
-                            unsigned long start,
-                            unsigned long end,
-                            unsigned long data,
-                            int (*handler)(struct kvm *kvm,
-                                           struct kvm_memory_slot *slot,
-                                           struct kvm_mmu_page *root,
-                                           gfn_t start,
-                                           gfn_t end,
-                                           unsigned long data))
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush)
 {
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
-       int ret = 0;
-       int as_id;
-
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               as_id = kvm_mmu_page_as_id(root);
-               slots = __kvm_memslots(kvm, as_id);
-               kvm_for_each_memslot(memslot, slots) {
-                       unsigned long hva_start, hva_end;
-                       gfn_t gfn_start, gfn_end;
-
-                       hva_start = max(start, memslot->userspace_addr);
-                       hva_end = min(end, memslot->userspace_addr +
-                                     (memslot->npages << PAGE_SHIFT));
-                       if (hva_start >= hva_end)
-                               continue;
-                       /*
-                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
-                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-                        */
-                       gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-                       gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-                       ret |= handler(kvm, memslot, root, gfn_start,
-                                      gfn_end, data);
-               }
-       }
+       for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+               flush |= zap_gfn_range(kvm, root, range->start, range->end,
+                                      range->may_block, flush, false);
 
-       return ret;
+       return flush;
 }
 
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
-                                    struct kvm_memory_slot *slot,
-                                    struct kvm_mmu_page *root, gfn_t start,
-                                    gfn_t end, unsigned long unused)
-{
-       return zap_gfn_range(kvm, root, start, end, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+                             struct kvm_gfn_range *range);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+                                                  struct kvm_gfn_range *range,
+                                                  tdp_handler_t handler)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-                                           zap_gfn_range_hva_wrapper);
+       struct kvm_mmu_page *root;
+       struct tdp_iter iter;
+       bool ret = false;
+
+       rcu_read_lock();
+
+       /*
+        * Don't support rescheduling, none of the MMU notifiers that funnel
+        * into this helper allow blocking; it'd be dead, wasteful code.
+        */
+       for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+               tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+                       ret |= handler(kvm, &iter, range);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
 }
 
 /*
  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
  * if any of the GFNs in the range have been accessed.
  */
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
-                        struct kvm_mmu_page *root, gfn_t start, gfn_t end,
-                        unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+                         struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-       int young = 0;
        u64 new_spte = 0;
 
-       rcu_read_lock();
+       /* If we have a non-accessed entry we don't need to change the pte. */
+       if (!is_accessed_spte(iter->old_spte))
+               return false;
 
-       tdp_root_for_each_leaf_pte(iter, root, start, end) {
+       new_spte = iter->old_spte;
+
+       if (spte_ad_enabled(new_spte)) {
+               new_spte &= ~shadow_accessed_mask;
+       } else {
                /*
-                * If we have a non-accessed entry we don't need to change the
-                * pte.
+                * Capture the dirty status of the page, so that it doesn't get
+                * lost when the SPTE is marked for access tracking.
                 */
-               if (!is_accessed_spte(iter.old_spte))
-                       continue;
-
-               new_spte = iter.old_spte;
-
-               if (spte_ad_enabled(new_spte)) {
-                       clear_bit((ffs(shadow_accessed_mask) - 1),
-                                 (unsigned long *)&new_spte);
-               } else {
-                       /*
-                        * Capture the dirty status of the page, so that it doesn't get
-                        * lost when the SPTE is marked for access tracking.
-                        */
-                       if (is_writable_pte(new_spte))
-                               kvm_set_pfn_dirty(spte_to_pfn(new_spte));
-
-                       new_spte = mark_spte_for_access_track(new_spte);
-               }
-               new_spte &= ~shadow_dirty_mask;
-
-               tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
-               young = 1;
+               if (is_writable_pte(new_spte))
+                       kvm_set_pfn_dirty(spte_to_pfn(new_spte));
 
-               trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+               new_spte = mark_spte_for_access_track(new_spte);
        }
 
-       rcu_read_unlock();
+       tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
 
-       return young;
+       return true;
 }
 
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-                                           age_gfn_range);
+       return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 }
 
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+                        struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-
-       tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
-               if (is_accessed_spte(iter.old_spte))
-                       return 1;
-
-       return 0;
+       return is_accessed_spte(iter->old_spte);
 }
 
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
-                                           test_age_gfn);
+       return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
-                       struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
-                       unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+                        struct kvm_gfn_range *range)
 {
-       struct tdp_iter iter;
-       pte_t *ptep = (pte_t *)data;
-       kvm_pfn_t new_pfn;
        u64 new_spte;
-       int need_flush = 0;
-
-       rcu_read_lock();
 
-       WARN_ON(pte_huge(*ptep));
+       /* Huge pages aren't expected to be modified without first being zapped. */
+       WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
 
-       new_pfn = pte_pfn(*ptep);
-
-       tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
-               if (iter.level != PG_LEVEL_4K)
-                       continue;
-
-               if (!is_shadow_present_pte(iter.old_spte))
-                       break;
-
-               tdp_mmu_set_spte(kvm, &iter, 0);
-
-               kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+       if (iter->level != PG_LEVEL_4K ||
+           !is_shadow_present_pte(iter->old_spte))
+               return false;
 
-               if (!pte_write(*ptep)) {
-                       new_spte = kvm_mmu_changed_pte_notifier_make_spte(
-                                       iter.old_spte, new_pfn);
+       /*
+        * Note, when changing a read-only SPTE, it's not strictly necessary to
+        * zero the SPTE before setting the new PFN, but doing so preserves the
+        * invariant that the PFN of a present leaf SPTE can never change.
+        * See __handle_changed_spte().
+        */
+       tdp_mmu_set_spte(kvm, iter, 0);
 
-                       tdp_mmu_set_spte(kvm, &iter, new_spte);
-               }
+       if (!pte_write(range->pte)) {
+               new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+                                                                 pte_pfn(range->pte));
 
-               need_flush = 1;
+               tdp_mmu_set_spte(kvm, iter, new_spte);
        }
 
-       if (need_flush)
-               kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
-       rcu_read_unlock();
-
-       return 0;
+       return true;
 }
 
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-                            pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * range->pte holds the new host pte_t for the single GFN named by the MMU
+ * notifier.  Any needed TLB flush is performed here before returning, so
+ * this currently always returns false (see the FIXME in the body).
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-       return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
-                                           (unsigned long)host_ptep,
-                                           set_tdp_spte);
+       bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+       /* FIXME: return 'flush' instead of flushing here. */
+       if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+       return false;
 }
 
 /*
@@ -1103,7 +1194,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1113,7 +1205,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 
-               tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+               if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+                                                         new_spte)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
                spte_set = true;
        }
 
@@ -1130,17 +1230,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);
-       }
 
        return spte_set;
 }
@@ -1162,7 +1258,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        rcu_read_lock();
 
        tdp_root_for_each_leaf_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;
 
                if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1177,7 +1274,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                continue;
                }
 
-               tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+               if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+                                                         new_spte)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
                spte_set = true;
        }
 
@@ -1195,17 +1300,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);
-       }
 
        return spte_set;
 }
@@ -1267,37 +1368,32 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       bool wrprot)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
 
        lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_tdp_mmu_root(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
+       for_each_tdp_mmu_root(kvm, root, slot->as_id)
                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
-       }
 }
 
 /*
  * Clear leaf entries which could be replaced by large mappings, for
  * GFNs within the slot.
  */
-static void zap_collapsible_spte_range(struct kvm *kvm,
+static bool zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
-                                      struct kvm_memory_slot *slot)
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush)
 {
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        kvm_pfn_t pfn;
-       bool spte_set = false;
 
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
-                       spte_set = false;
+retry:
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+                       flush = false;
                        continue;
                }
 
@@ -1311,38 +1407,43 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
                                                            pfn, PG_LEVEL_NUM))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-
-               spte_set = true;
+               if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
+               flush = true;
        }
 
        rcu_read_unlock();
-       if (spte_set)
-               kvm_flush_remote_tlbs(kvm);
+
+       return flush;
 }
 
 /*
  * Clear non-leaf entries (and free associated page tables) which could
  * be replaced by large mappings, for GFNs within the slot.
  */
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
-               zap_collapsible_spte_range(kvm, root, slot);
-       }
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+               flush = zap_collapsible_spte_range(kvm, root, slot, flush);
+
+       return flush;
 }
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1359,7 +1460,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                        break;
 
                new_spte = iter.old_spte &
-                       ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+                       ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
@@ -1372,24 +1473,19 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 
 /*
  * Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
  * Returns true if an SPTE was set and a TLB flush is needed.
  */
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn)
 {
        struct kvm_mmu_page *root;
-       int root_as_id;
        bool spte_set = false;
 
        lockdep_assert_held_write(&kvm->mmu_lock);
-       for_each_tdp_mmu_root(kvm, root) {
-               root_as_id = kvm_mmu_page_as_id(root);
-               if (root_as_id != slot->as_id)
-                       continue;
-
+       for_each_tdp_mmu_root(kvm, root, slot->as_id)
                spte_set |= write_protect_gfn(kvm, root, gfn);
-       }
+
        return spte_set;
 }
 
index 3b761c1..5fdf630 100644 (file)
@@ -6,24 +6,60 @@
 #include <linux/kvm_host.h>
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+                                                    struct kvm_mmu_page *root)
+{
+       if (root->role.invalid)
+               return false;
+
+       return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
+
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+                                            gfn_t start, gfn_t end, bool flush,
+                                            bool shared)
+{
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+                                          shared);
+}
+static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
+
+       /*
+        * Don't allow yielding, as the caller may have a flush pending.  Note,
+        * if mmu_lock is held for write, zapping will never yield in this case,
+        * but explicitly disallow it for safety.  The TDP MMU does not yield
+        * until it has made forward progress (steps sideways), and when zapping
+        * a single shadow page that it's guaranteed to see (thus the mmu_lock
+        * requirement), its "step sideways" will always step beyond the bounds
+        * of the shadow page's gfn range and stop iterating before yielding.
+        */
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
+                                          sp->gfn, end, false, false, false);
+}
+
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
 
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-                             unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-                            pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+                                bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 
 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level);
@@ -33,8 +69,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                      struct kvm_memory_slot *slot);
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+                                      const struct kvm_memory_slot *slot,
+                                      bool flush);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn);
index 78bdcfa..cd0285f 100644 (file)
@@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
                return -EINVAL;
 
-       if (!svm->vcpu.arch.apic->regs)
+       if (!vcpu->arch.apic->regs)
                return -EINVAL;
 
        if (kvm_apicv_activated(vcpu->kvm)) {
@@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
                        return ret;
        }
 
-       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+       svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
 
        /* Setting AVIC backing page address in the phy APIC ID table */
        entry = avic_get_physical_id_entry(vcpu, id);
@@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
        }
 }
 
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
        u32 icrl = svm->vmcb->control.exit_info_1;
        u32 id = svm->vmcb->control.exit_info_2 >> 32;
        u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
-       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+       trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
 
        switch (id) {
        case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
@@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
                 * set the appropriate IRR bits on the valid target
                 * vcpus. So, we just need to kick the appropriate vcpu.
                 */
-               avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+               avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
                break;
        case AVIC_IPI_FAILURE_INVALID_TARGET:
                WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
-                         index, svm->vcpu.vcpu_id, icrh, icrl);
+                         index, vcpu->vcpu_id, icrh, icrl);
                break;
        case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
                WARN_ONCE(1, "Invalid backing page\n");
@@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
        return ret;
 }
 
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret = 0;
        u32 offset = svm->vmcb->control.exit_info_1 &
                     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                     AVIC_UNACCEL_ACCESS_WRITE_MASK;
        bool trap = is_avic_unaccelerated_access_trap(offset);
 
-       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+       trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
                                            trap, write, vector);
        if (trap) {
                /* Handling Trap */
@@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
                ret = avic_unaccel_trap_write(svm);
        } else {
                /* Handling Fault */
-               ret = kvm_emulate_instruction(&svm->vcpu, 0);
+               ret = kvm_emulate_instruction(vcpu, 0);
        }
 
        return ret;
@@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
        if (!avic || !irqchip_in_kernel(vcpu->kvm))
                return 0;
 
-       ret = avic_init_backing_page(&svm->vcpu);
+       ret = avic_init_backing_page(vcpu);
        if (ret)
                return ret;
 
index 35891d9..540d43b 100644 (file)
@@ -29,6 +29,8 @@
 #include "lapic.h"
 #include "svm.h"
 
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
 
        WARN_ON(mmu_is_nested(vcpu));
 
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+                               svm->vmcb01.ptr->save.efer,
                                svm->nested.ctl.nested_cr3);
        vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
                return;
 
        c = &svm->vmcb->control;
-       h = &svm->nested.hsave->control;
+       h = &svm->vmcb01.ptr->control;
        g = &svm->nested.ctl;
 
        for (i = 0; i < MAX_INTERCEPT; i++)
@@ -213,69 +215,92 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
        return true;
 }
 
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (WARN_ON(!is_guest_mode(vcpu)))
-               return true;
-
-       if (!nested_svm_vmrun_msrpm(svm)) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror =
-                       KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
-               return false;
-       }
+       u64 addr = PAGE_ALIGN(pa);
 
-       return true;
+       return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+           kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
 }
 
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+                                      struct vmcb_control_area *control)
 {
-       if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+       if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
                return false;
 
-       if (control->asid == 0)
+       if (CC(control->asid == 0))
                return false;
 
-       if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-           !npt_enabled)
+       if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+               return false;
+
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+                                          MSRPM_SIZE)))
+               return false;
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+                                          IOPM_SIZE)))
                return false;
 
        return true;
 }
 
-static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+                                     struct vmcb_save_area *save)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       bool vmcb12_lma;
+       /*
+        * These checks are also performed by KVM_SET_SREGS,
+        * except that EFER.LMA is not checked by SVM against
+        * CR0.PG && EFER.LME.
+        */
+       if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+               if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+                   CC(!(save->cr0 & X86_CR0_PE)) ||
+                   CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+                       return false;
+       }
 
-       if ((vmcb12->save.efer & EFER_SVME) == 0)
+       if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
                return false;
 
-       if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+       return true;
+}
+
+/* Common checks that apply to both L1 and L2 state.  */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+                                   struct vmcb_save_area *save)
+{
+       /*
+        * FIXME: these should be done after copying the fields,
+        * to avoid TOC/TOU races.  For these save area checks
+        * the possible damage is limited since kvm_set_cr0 and
+        * kvm_set_cr4 handle failure; EFER_SVME is an exception
+        * so it is force-set later in nested_prepare_vmcb_save.
+        */
+       if (CC(!(save->efer & EFER_SVME)))
                return false;
 
-       if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+       if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+           CC(save->cr0 & ~0xffffffffULL))
                return false;
 
-       vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+       if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+               return false;
 
-       if (vmcb12_lma) {
-               if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
-                   !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                   kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
-                       return false;
-       }
-       if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+       if (!nested_vmcb_check_cr3_cr4(vcpu, save))
                return false;
 
-       return nested_vmcb_check_controls(&vmcb12->control);
+       if (CC(!kvm_valid_efer(vcpu, save->efer)))
+               return false;
+
+       return true;
 }
 
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
-                                    struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                           struct vmcb_control_area *control)
 {
        copy_vmcb_control_area(&svm->nested.ctl, control);
 
@@ -287,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 
 /*
  * Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
  */
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 {
        u32 mask;
        svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
@@ -317,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
  * Transfer any event that L0 or L1 wanted to inject into L2 to
  * EXIT_INT_INFO.
  */
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
-                                          struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+                                               struct vmcb *vmcb12)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
        u32 exit_int_info = 0;
@@ -362,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
                               bool nested_npt)
 {
-       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
                return -EINVAL;
 
        if (!nested_npt && is_pae_paging(vcpu) &&
            (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
-               if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
                        return -EINVAL;
        }
 
@@ -386,20 +411,56 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
        return 0;
 }
 
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 {
+       if (!svm->nested.vmcb02.ptr)
+               return;
+
+       /* FIXME: merge g_pat from vmcb01 and vmcb12.  */
+       svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+       bool new_vmcb12 = false;
+
+       nested_vmcb02_compute_g_pat(svm);
+
        /* Load the nested guest state */
-       svm->vmcb->save.es = vmcb12->save.es;
-       svm->vmcb->save.cs = vmcb12->save.cs;
-       svm->vmcb->save.ss = vmcb12->save.ss;
-       svm->vmcb->save.ds = vmcb12->save.ds;
-       svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-       svm->vmcb->save.idtr = vmcb12->save.idtr;
+       if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+               new_vmcb12 = true;
+               svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+               svm->vmcb->save.es = vmcb12->save.es;
+               svm->vmcb->save.cs = vmcb12->save.cs;
+               svm->vmcb->save.ss = vmcb12->save.ss;
+               svm->vmcb->save.ds = vmcb12->save.ds;
+               svm->vmcb->save.cpl = vmcb12->save.cpl;
+               vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+               svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+               svm->vmcb->save.idtr = vmcb12->save.idtr;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+       }
+
        kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+
+       /*
+        * Force-set EFER_SVME even though it is checked earlier on the
+        * VMCB12, because the guest can flip the bit between the check
+        * and now.  Clearing EFER_SVME would call svm_free_nested.
+        */
+       svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
+
        svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
        svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+       svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
        kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
        kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
        kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -408,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
        svm->vmcb->save.rax = vmcb12->save.rax;
        svm->vmcb->save.rsp = vmcb12->save.rsp;
        svm->vmcb->save.rip = vmcb12->save.rip;
-       svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-       svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+       /* These bits will be set properly on the first execution when new_vmcb12 is true */
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+               svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+               svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+       }
 }
 
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
        const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 
+       /*
+        * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+        * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+        */
+
+       /*
+        * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+        * avic_physical_id.
+        */
+       WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+       /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
+       svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+       svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+       svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+       /* Done at vmrun: asid.  */
+
+       /* Also overwritten later if necessary.  */
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+       /* nested_cr3.  */
        if (nested_npt_enabled(svm))
                nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -425,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 
        svm->vmcb->control.int_ctl             =
                (svm->nested.ctl.int_ctl & ~mask) |
-               (svm->nested.hsave->control.int_ctl & mask);
+               (svm->vmcb01.ptr->control.int_ctl & mask);
 
        svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
        svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
@@ -440,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
        enter_guest_mode(&svm->vcpu);
 
        /*
-        * Merge guest and host intercepts - must be called  with vcpu in
-        * guest-mode to take affect here
+        * Merge guest and host intercepts - must be called with vcpu in
+        * guest-mode to take effect.
         */
        recalc_intercepts(svm);
+}
 
-       vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+       /*
+        * Some VMCB state is shared between L1 and L2 and thus has to be
+        * moved at the time of nested vmrun and vmexit.
+        *
+        * VMLOAD/VMSAVE state would also belong in this category, but KVM
+        * always performs VMLOAD and VMSAVE from the VMCB01.
+        */
+       to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
                         struct vmcb *vmcb12)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
 
        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -468,9 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
 
 
        svm->nested.vmcb12_gpa = vmcb12_gpa;
-       load_nested_vmcb_control(svm, &vmcb12->control);
-       nested_prepare_vmcb_control(svm);
-       nested_prepare_vmcb_save(svm, vmcb12);
+
+       WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+       nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+       nested_vmcb02_prepare_control(svm);
+       nested_vmcb02_prepare_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
@@ -478,44 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
                return ret;
 
        if (!npt_enabled)
-               svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+               vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
 
        svm_set_gif(svm, true);
 
        return 0;
 }
 
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
        u64 vmcb12_gpa;
 
-       if (is_smm(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       ++vcpu->stat.nested_run;
+
+       if (is_smm(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
        vmcb12_gpa = svm->vmcb->save.rax;
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
        if (ret == -EINVAL) {
-               kvm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        } else if (ret) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
        vmcb12 = map.hva;
 
        if (WARN_ON_ONCE(!svm->nested.initialized))
                return -EINVAL;
 
-       if (!nested_vmcb_checks(svm, vmcb12)) {
+       nested_load_control_from_vmcb12(svm, &vmcb12->control);
+
+       if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+           !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
                vmcb12->control.exit_code    = SVM_EXIT_ERR;
                vmcb12->control.exit_code_hi = 0;
                vmcb12->control.exit_info_1  = 0;
@@ -525,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
 
        /* Clear internal status */
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        /*
-        * Save the old vmcb, so we don't need to pick what we save, but can
-        * restore everything when a VMEXIT occurs
+        * Since vmcb01 is not in use, we can use it to store some of the L1
+        * state.
         */
-       hsave->save.es     = vmcb->save.es;
-       hsave->save.cs     = vmcb->save.cs;
-       hsave->save.ss     = vmcb->save.ss;
-       hsave->save.ds     = vmcb->save.ds;
-       hsave->save.gdtr   = vmcb->save.gdtr;
-       hsave->save.idtr   = vmcb->save.idtr;
-       hsave->save.efer   = svm->vcpu.arch.efer;
-       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       hsave->save.cr4    = svm->vcpu.arch.cr4;
-       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
-       hsave->save.rsp    = vmcb->save.rsp;
-       hsave->save.rax    = vmcb->save.rax;
-       if (npt_enabled)
-               hsave->save.cr3    = vmcb->save.cr3;
-       else
-               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
-
-       copy_vmcb_control_area(&hsave->control, &vmcb->control);
+       svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
+       svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
+       svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
+       svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+       svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);
+
+       if (!npt_enabled)
+               svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
 
        svm->nested.nested_run_pending = 1;
 
-       if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+       if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
                goto out_exit_err;
 
        if (nested_svm_vmrun_msrpm(svm))
@@ -571,7 +667,7 @@ out_exit_err:
        nested_svm_vmexit(svm);
 
 out:
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
@@ -594,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-       int rc;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
+       int rc;
+
+       /* Triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+       rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        vmcb12 = map.hva;
 
        /* Exit Guest-Mode */
-       leave_guest_mode(&svm->vcpu);
+       leave_guest_mode(vcpu);
        svm->nested.vmcb12_gpa = 0;
        WARN_ON_ONCE(svm->nested.nested_run_pending);
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* in case we halted in L2 */
        svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -628,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->save.gdtr   = vmcb->save.gdtr;
        vmcb12->save.idtr   = vmcb->save.idtr;
        vmcb12->save.efer   = svm->vcpu.arch.efer;
-       vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       vmcb12->save.cr0    = kvm_read_cr0(vcpu);
+       vmcb12->save.cr3    = kvm_read_cr3(vcpu);
        vmcb12->save.cr2    = vmcb->save.cr2;
        vmcb12->save.cr4    = svm->vcpu.arch.cr4;
-       vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
-       vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
-       vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
-       vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
+       vmcb12->save.rflags = kvm_get_rflags(vcpu);
+       vmcb12->save.rip    = kvm_rip_read(vcpu);
+       vmcb12->save.rsp    = kvm_rsp_read(vcpu);
+       vmcb12->save.rax    = kvm_rax_read(vcpu);
        vmcb12->save.dr7    = vmcb->save.dr7;
        vmcb12->save.dr6    = svm->vcpu.arch.dr6;
        vmcb12->save.cpl    = vmcb->save.cpl;
@@ -647,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
 
        if (vmcb12->control.exit_code != SVM_EXIT_ERR)
-               nested_vmcb_save_pending_event(svm, vmcb12);
+               nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
        if (svm->nrips_enabled)
                vmcb12->control.next_rip  = vmcb->control.next_rip;
@@ -662,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.pause_filter_thresh =
                svm->vmcb->control.pause_filter_thresh;
 
-       /* Restore the original control entries */
-       copy_vmcb_control_area(&vmcb->control, &hsave->control);
+       nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
 
-       /* On vmexit the  GIF is set to false */
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
+
+       /*
+        * On vmexit the GIF is set to false and
+        * no event can be injected in L1.
+        */
        svm_set_gif(svm, false);
+       svm->vmcb->control.exit_int_info = 0;
 
-       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
-               svm->vcpu.arch.l1_tsc_offset;
+       svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+       if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+               svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       }
 
        svm->nested.ctl.nested_cr3 = 0;
 
-       /* Restore selected save entries */
-       svm->vmcb->save.es = hsave->save.es;
-       svm->vmcb->save.cs = hsave->save.cs;
-       svm->vmcb->save.ss = hsave->save.ss;
-       svm->vmcb->save.ds = hsave->save.ds;
-       svm->vmcb->save.gdtr = hsave->save.gdtr;
-       svm->vmcb->save.idtr = hsave->save.idtr;
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, hsave->save.efer);
-       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
-       kvm_rax_write(&svm->vcpu, hsave->save.rax);
-       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
-       kvm_rip_write(&svm->vcpu, hsave->save.rip);
-       svm->vmcb->save.dr7 = DR7_FIXED_1;
-       svm->vmcb->save.cpl = 0;
-       svm->vmcb->control.exit_int_info = 0;
+       /*
+        * Restore processor state that had been saved in vmcb01
+        */
+       kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+       svm_set_efer(vcpu, svm->vmcb->save.efer);
+       svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+       kvm_rax_write(vcpu, svm->vmcb->save.rax);
+       kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+       kvm_rip_write(vcpu, svm->vmcb->save.rip);
 
-       vmcb_mark_all_dirty(svm->vmcb);
+       svm->vcpu.arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(&svm->vcpu);
 
        trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
                                       vmcb12->control.exit_info_1,
@@ -701,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb12->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
-       nested_svm_uninit_mmu_context(&svm->vcpu);
+       nested_svm_uninit_mmu_context(vcpu);
 
-       rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+       rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
        if (rc)
                return 1;
 
-       if (npt_enabled)
-               svm->vmcb->save.cr3 = hsave->save.cr3;
-
        /*
         * Drop what we picked up for L2 via svm_complete_interrupts() so it
         * doesn't end up in L1.
         */
        svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
+
+       /*
+        * If we are here following the completion of a VMRUN that
+        * is being single-stepped, queue the pending #DB intercept
+        * right now so that it can be accounted for before we execute
+        * L1's next instruction.
+        */
+       if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+               kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
 
        return 0;
 }
 
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
 int svm_allocate_nested(struct vcpu_svm *svm)
 {
-       struct page *hsave_page;
+       struct page *vmcb02_page;
 
        if (svm->nested.initialized)
                return 0;
 
-       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!hsave_page)
+       vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb02_page)
                return -ENOMEM;
-       svm->nested.hsave = page_address(hsave_page);
+       svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+       svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
 
        svm->nested.msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->nested.msrpm)
-               goto err_free_hsave;
+               goto err_free_vmcb02;
        svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
        svm->nested.initialized = true;
        return 0;
 
-err_free_hsave:
-       __free_page(hsave_page);
+err_free_vmcb02:
+       __free_page(vmcb02_page);
        return -ENOMEM;
 }
 
@@ -756,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm)
        svm_vcpu_free_msrpm(svm->nested.msrpm);
        svm->nested.msrpm = NULL;
 
-       __free_page(virt_to_page(svm->nested.hsave));
-       svm->nested.hsave = NULL;
+       __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+       svm->nested.vmcb02.ptr = NULL;
 
        svm->nested.initialized = false;
 }
@@ -767,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm)
  */
 void svm_leave_nested(struct vcpu_svm *svm)
 {
-       if (is_guest_mode(&svm->vcpu)) {
-               struct vmcb *hsave = svm->nested.hsave;
-               struct vmcb *vmcb = svm->vmcb;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
 
+       if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
-               leave_guest_mode(&svm->vcpu);
-               copy_vmcb_control_area(&vmcb->control, &hsave->control);
-               nested_svm_uninit_mmu_context(&svm->vcpu);
+               leave_guest_mode(vcpu);
+
+               svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+               nested_svm_uninit_mmu_context(vcpu);
                vmcb_mark_all_dirty(svm->vmcb);
        }
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 }
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -887,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
        return vmexit;
 }
 
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 {
-       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
-           !is_paging(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
-       if (svm->vmcb->save.cpl) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       if (to_svm(vcpu)->vmcb->save.cpl) {
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -944,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
        nested_svm_vmexit(svm);
 }
 
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
 }
 
-static void nested_svm_init(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1001,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_init(svm))
                        return 0;
-               nested_svm_init(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
                return 0;
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               /*
+                * Only a pending nested run can block a pending exception.
+                * Otherwise an injected NMI/interrupt should either be
+                * lost or delivered to the nested hypervisor in the EXITINTINFO
+                * vmcb field, while delivering the pending exception.
+                */
+               if (svm->nested.nested_run_pending)
                         return -EBUSY;
                if (!nested_exit_on_exception(svm))
                        return 0;
@@ -1019,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_smi(svm))
                        return 0;
-               nested_svm_smi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
                return 0;
        }
 
@@ -1028,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_nmi(svm))
                        return 0;
-               nested_svm_nmi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
                return 0;
        }
 
@@ -1037,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_intr(svm))
                        return 0;
-               nested_svm_intr(svm);
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
                return 0;
        }
 
@@ -1056,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 
-               if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
-                               excp_bits)
+               if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+                   excp_bits)
                        return NESTED_EXIT_HOST;
                else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
                         svm->vcpu.arch.apf.host_apf_flags)
@@ -1121,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
        if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
                         sizeof(user_vmcb->control)))
                return -EFAULT;
-       if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+       if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
                         sizeof(user_vmcb->save)))
                return -EFAULT;
-
 out:
        return kvm_state.size;
 }
@@ -1134,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state *kvm_state)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb __user *user_vmcb = (struct vmcb __user *)
                &user_kvm_nested_state->data.svm[0];
        struct vmcb_control_area *ctl;
@@ -1179,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        ret  = -ENOMEM;
-       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
-       save = kzalloc(sizeof(*save), GFP_KERNEL);
+       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL_ACCOUNT);
+       save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
        if (!ctl || !save)
                goto out_free;
 
@@ -1191,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                goto out_free;
 
        ret = -EINVAL;
-       if (!nested_vmcb_check_controls(ctl))
+       if (!nested_vmcb_check_controls(vcpu, ctl))
                goto out_free;
 
        /*
         * Processor state contains L2 state.  Check that it is
-        * valid for guest mode (see nested_vmcb_checks).
+        * valid for guest mode (see nested_vmcb_check_save).
         */
        cr0 = kvm_read_cr0(vcpu);
         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1205,27 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        /*
         * Validate host state saved from before VMRUN (see
         * nested_svm_check_permissions).
-        * TODO: validate reserved bits for all saved state.
         */
-       if (!(save->cr0 & X86_CR0_PG))
+       if (!(save->cr0 & X86_CR0_PG) ||
+           !(save->cr0 & X86_CR0_PE) ||
+           (save->rflags & X86_EFLAGS_VM) ||
+           !nested_vmcb_valid_sregs(vcpu, save))
                goto out_free;
 
        /*
-        * All checks done, we can enter guest mode.  L1 control fields
-        * come from the nested save state.  Guest state is already
-        * in the registers, the save area of the nested state instead
-        * contains saved L1 state.
+        * All checks done, we can enter guest mode. Userspace provides
+        * vmcb12.control, which will be combined with L1 and stored into
+        * vmcb02, and the L1 save state which we store in vmcb01.
+        * L2 registers, if needed, are moved from the current VMCB to VMCB02.
         */
 
        svm->nested.nested_run_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
-       copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
-       hsave->save = *save;
-
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
-       load_nested_vmcb_control(svm, ctl);
-       nested_prepare_vmcb_control(svm);
+       if (svm->current_vmcb == &svm->vmcb01)
+               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+       svm->vmcb01.ptr->save.es = save->es;
+       svm->vmcb01.ptr->save.cs = save->cs;
+       svm->vmcb01.ptr->save.ss = save->ss;
+       svm->vmcb01.ptr->save.ds = save->ds;
+       svm->vmcb01.ptr->save.gdtr = save->gdtr;
+       svm->vmcb01.ptr->save.idtr = save->idtr;
+       svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+       svm->vmcb01.ptr->save.efer = save->efer;
+       svm->vmcb01.ptr->save.cr0 = save->cr0;
+       svm->vmcb01.ptr->save.cr3 = save->cr3;
+       svm->vmcb01.ptr->save.cr4 = save->cr4;
+       svm->vmcb01.ptr->save.rax = save->rax;
+       svm->vmcb01.ptr->save.rsp = save->rsp;
+       svm->vmcb01.ptr->save.rip = save->rip;
+       svm->vmcb01.ptr->save.cpl = 0;
+
+       nested_load_control_from_vmcb12(svm, ctl);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+       nested_vmcb02_prepare_control(svm);
 
        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        ret = 0;
@@ -1236,8 +1336,31 @@ out_free:
        return ret;
 }
 
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (WARN_ON(!is_guest_mode(vcpu)))
+               return true;
+
+       if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+                               nested_npt_enabled(svm)))
+               return false;
+
+       if (!nested_svm_vmrun_msrpm(svm)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return false;
+       }
+
+       return true;
+}
+
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,
index 035da07..fdf587f 100644 (file)
@@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr)
 static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
                                             enum pmu_type type)
 {
+       struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
        switch (msr) {
        case MSR_F15H_PERF_CTL0:
        case MSR_F15H_PERF_CTL1:
@@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
        case MSR_F15H_PERF_CTL3:
        case MSR_F15H_PERF_CTL4:
        case MSR_F15H_PERF_CTL5:
+               if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+                       return NULL;
+               fallthrough;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
                if (type != PMU_TYPE_EVNTSEL)
                        return NULL;
@@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
        case MSR_F15H_PERF_CTR3:
        case MSR_F15H_PERF_CTR4:
        case MSR_F15H_PERF_CTR5:
+               if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+                       return NULL;
+               fallthrough;
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
                if (type != PMU_TYPE_COUNTER)
                        return NULL;
index 874ea30..2632852 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/psp-sev.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
+#include <linux/misc_cgroup.h>
 #include <linux/processor.h>
 #include <linux/trace_events.h>
 #include <asm/fpu/internal.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
 static u8 sev_enc_bit;
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
 static DEFINE_MUTEX(sev_bitmap_lock);
 unsigned int max_sev_asid;
 static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
 static unsigned long *sev_asid_bitmap;
 static unsigned long *sev_reclaim_asid_bitmap;
 
@@ -66,6 +83,11 @@ static int sev_flush_asids(void)
        return ret;
 }
 
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+       return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
 /* Must be called with the sev_bitmap_lock held */
 static bool __sev_recycle_asids(int min_asid, int max_asid)
 {
@@ -89,8 +111,19 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
 
 static int sev_asid_new(struct kvm_sev_info *sev)
 {
-       int pos, min_asid, max_asid;
+       int pos, min_asid, max_asid, ret;
        bool retry = true;
+       enum misc_res_type type;
+
+       type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+       WARN_ON(sev->misc_cg);
+       sev->misc_cg = get_current_misc_cg();
+       ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+       if (ret) {
+               put_misc_cg(sev->misc_cg);
+               sev->misc_cg = NULL;
+               return ret;
+       }
 
        mutex_lock(&sev_bitmap_lock);
 
@@ -108,7 +141,8 @@ again:
                        goto again;
                }
                mutex_unlock(&sev_bitmap_lock);
-               return -EBUSY;
+               ret = -EBUSY;
+               goto e_uncharge;
        }
 
        __set_bit(pos, sev_asid_bitmap);
@@ -116,6 +150,11 @@ again:
        mutex_unlock(&sev_bitmap_lock);
 
        return pos + 1;
+e_uncharge:
+       misc_cg_uncharge(type, sev->misc_cg, 1);
+       put_misc_cg(sev->misc_cg);
+       sev->misc_cg = NULL;
+       return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -125,14 +164,15 @@ static int sev_get_asid(struct kvm *kvm)
        return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
 {
        struct svm_cpu_data *sd;
        int cpu, pos;
+       enum misc_res_type type;
 
        mutex_lock(&sev_bitmap_lock);
 
-       pos = asid - 1;
+       pos = sev->asid - 1;
        __set_bit(pos, sev_reclaim_asid_bitmap);
 
        for_each_possible_cpu(cpu) {
@@ -141,53 +181,51 @@ static void sev_asid_free(int asid)
        }
 
        mutex_unlock(&sev_bitmap_lock);
+
+       type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+       misc_cg_uncharge(type, sev->misc_cg, 1);
+       put_misc_cg(sev->misc_cg);
+       sev->misc_cg = NULL;
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
 {
-       struct sev_data_decommission *decommission;
-       struct sev_data_deactivate *data;
+       struct sev_data_decommission decommission;
+       struct sev_data_deactivate deactivate;
 
        if (!handle)
                return;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return;
-
-       /* deactivate handle */
-       data->handle = handle;
+       deactivate.handle = handle;
 
        /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
        down_read(&sev_deactivate_lock);
-       sev_guest_deactivate(data, NULL);
+       sev_guest_deactivate(&deactivate, NULL);
        up_read(&sev_deactivate_lock);
 
-       kfree(data);
-
-       decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
-       if (!decommission)
-               return;
-
        /* decommission handle */
-       decommission->handle = handle;
-       sev_guest_decommission(decommission, NULL);
-
-       kfree(decommission);
+       decommission.handle = handle;
+       sev_guest_decommission(&decommission, NULL);
 }
 
 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       bool es_active = argp->id == KVM_SEV_ES_INIT;
        int asid, ret;
 
+       if (kvm->created_vcpus)
+               return -EINVAL;
+
        ret = -EBUSY;
        if (unlikely(sev->active))
                return ret;
 
+       sev->es_active = es_active;
        asid = sev_asid_new(sev);
        if (asid < 0)
-               return ret;
+               goto e_no_asid;
+       sev->asid = asid;
 
        ret = sev_platform_init(&argp->error);
        if (ret)
@@ -200,35 +238,23 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
        return 0;
 
 e_free:
-       sev_asid_free(asid);
+       sev_asid_free(sev);
+       sev->asid = 0;
+e_no_asid:
+       sev->es_active = false;
        return ret;
 }
 
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       if (!sev_es)
-               return -ENOTTY;
-
-       to_kvm_svm(kvm)->sev_info.es_active = true;
-
-       return sev_guest_init(kvm, argp);
-}
-
 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
 {
-       struct sev_data_activate *data;
+       struct sev_data_activate activate;
        int asid = sev_get_asid(kvm);
        int ret;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
        /* activate ASID on the given handle */
-       data->handle = handle;
-       data->asid   = asid;
-       ret = sev_guest_activate(data, error);
-       kfree(data);
+       activate.handle = handle;
+       activate.asid   = asid;
+       ret = sev_guest_activate(&activate, error);
 
        return ret;
 }
@@ -258,7 +284,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_start *start;
+       struct sev_data_launch_start start;
        struct kvm_sev_launch_start params;
        void *dh_blob, *session_blob;
        int *error = &argp->error;
@@ -270,20 +296,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
-       if (!start)
-               return -ENOMEM;
+       memset(&start, 0, sizeof(start));
 
        dh_blob = NULL;
        if (params.dh_uaddr) {
                dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
-               if (IS_ERR(dh_blob)) {
-                       ret = PTR_ERR(dh_blob);
-                       goto e_free;
-               }
+               if (IS_ERR(dh_blob))
+                       return PTR_ERR(dh_blob);
 
-               start->dh_cert_address = __sme_set(__pa(dh_blob));
-               start->dh_cert_len = params.dh_len;
+               start.dh_cert_address = __sme_set(__pa(dh_blob));
+               start.dh_cert_len = params.dh_len;
        }
 
        session_blob = NULL;
@@ -294,40 +316,38 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
                        goto e_free_dh;
                }
 
-               start->session_address = __sme_set(__pa(session_blob));
-               start->session_len = params.session_len;
+               start.session_address = __sme_set(__pa(session_blob));
+               start.session_len = params.session_len;
        }
 
-       start->handle = params.handle;
-       start->policy = params.policy;
+       start.handle = params.handle;
+       start.policy = params.policy;
 
        /* create memory encryption context */
-       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
        if (ret)
                goto e_free_session;
 
        /* Bind ASID to this guest */
-       ret = sev_bind_asid(kvm, start->handle, error);
+       ret = sev_bind_asid(kvm, start.handle, error);
        if (ret)
                goto e_free_session;
 
        /* return handle to userspace */
-       params.handle = start->handle;
+       params.handle = start.handle;
        if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
-               sev_unbind_asid(kvm, start->handle);
+               sev_unbind_asid(kvm, start.handle);
                ret = -EFAULT;
                goto e_free_session;
        }
 
-       sev->handle = start->handle;
+       sev->handle = start.handle;
        sev->fd = argp->sev_fd;
 
 e_free_session:
        kfree(session_blob);
 e_free_dh:
        kfree(dh_blob);
-e_free:
-       kfree(start);
        return ret;
 }
 
@@ -446,7 +466,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_launch_update_data params;
-       struct sev_data_launch_update_data *data;
+       struct sev_data_launch_update_data data;
        struct page **inpages;
        int ret;
 
@@ -456,20 +476,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
        vaddr = params.uaddr;
        size = params.len;
        vaddr_end = vaddr + size;
 
        /* Lock the user memory. */
        inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
-       if (IS_ERR(inpages)) {
-               ret = PTR_ERR(inpages);
-               goto e_free;
-       }
+       if (IS_ERR(inpages))
+               return PTR_ERR(inpages);
 
        /*
         * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
@@ -477,6 +491,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
         */
        sev_clflush_pages(inpages, npages);
 
+       data.reserved = 0;
+       data.handle = sev->handle;
+
        for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
                int offset, len;
 
@@ -491,10 +508,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
                len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
 
-               data->handle = sev->handle;
-               data->len = len;
-               data->address = __sme_page_pa(inpages[i]) + offset;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+               data.len = len;
+               data.address = __sme_page_pa(inpages[i]) + offset;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
                if (ret)
                        goto e_unpin;
 
@@ -510,8 +526,6 @@ e_unpin:
        }
        /* unlock the user pages */
        sev_unpin_memory(kvm, inpages, npages);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -563,23 +577,22 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_update_vmsa *vmsa;
+       struct sev_data_launch_update_vmsa vmsa;
+       struct kvm_vcpu *vcpu;
        int i, ret;
 
        if (!sev_es_guest(kvm))
                return -ENOTTY;
 
-       vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
-       if (!vmsa)
-               return -ENOMEM;
+       vmsa.reserved = 0;
 
-       for (i = 0; i < kvm->created_vcpus; i++) {
-               struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               struct vcpu_svm *svm = to_svm(vcpu);
 
                /* Perform some pre-encryption checks against the VMSA */
                ret = sev_es_sync_vmsa(svm);
                if (ret)
-                       goto e_free;
+                       return ret;
 
                /*
                 * The LAUNCH_UPDATE_VMSA command will perform in-place
@@ -589,27 +602,25 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
                 */
                clflush_cache_range(svm->vmsa, PAGE_SIZE);
 
-               vmsa->handle = sev->handle;
-               vmsa->address = __sme_pa(svm->vmsa);
-               vmsa->len = PAGE_SIZE;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+               vmsa.handle = sev->handle;
+               vmsa.address = __sme_pa(svm->vmsa);
+               vmsa.len = PAGE_SIZE;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
                                    &argp->error);
                if (ret)
-                       goto e_free;
+                       return ret;
 
                svm->vcpu.arch.guest_state_protected = true;
        }
 
-e_free:
-       kfree(vmsa);
-       return ret;
+       return 0;
 }
 
 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        void __user *measure = (void __user *)(uintptr_t)argp->data;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_measure *data;
+       struct sev_data_launch_measure data;
        struct kvm_sev_launch_measure params;
        void __user *p = NULL;
        void *blob = NULL;
@@ -621,9 +632,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, measure, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* User wants to query the blob length */
        if (!params.len)
@@ -631,23 +640,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        p = (void __user *)(uintptr_t)params.uaddr;
        if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
+               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
-                       goto e_free;
+                       return -ENOMEM;
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
+               data.address = __psp_pa(blob);
+               data.len = params.len;
        }
 
 cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
 
        /*
         * If we query the session length, FW responded with expected data.
@@ -664,63 +670,50 @@ cmd:
        }
 
 done:
-       params.len = data->len;
+       params.len = data.len;
        if (copy_to_user(measure, &params, sizeof(params)))
                ret = -EFAULT;
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_finish *data;
-       int ret;
+       struct sev_data_launch_finish data;
 
        if (!sev_guest(kvm))
                return -ENOTTY;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
-       kfree(data);
-       return ret;
+       data.handle = sev->handle;
+       return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
 }
 
 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
        struct kvm_sev_guest_status params;
-       struct sev_data_guest_status *data;
+       struct sev_data_guest_status data;
        int ret;
 
        if (!sev_guest(kvm))
                return -ENOTTY;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
        if (ret)
-               goto e_free;
+               return ret;
 
-       params.policy = data->policy;
-       params.state = data->state;
-       params.handle = data->handle;
+       params.policy = data.policy;
+       params.state = data.state;
+       params.handle = data.handle;
 
        if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
                ret = -EFAULT;
-e_free:
-       kfree(data);
+
        return ret;
 }
 
@@ -729,23 +722,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
                               int *error, bool enc)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_dbg *data;
-       int ret;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       struct sev_data_dbg data;
 
-       data->handle = sev->handle;
-       data->dst_addr = dst;
-       data->src_addr = src;
-       data->len = size;
+       data.reserved = 0;
+       data.handle = sev->handle;
+       data.dst_addr = dst;
+       data.src_addr = src;
+       data.len = size;
 
-       ret = sev_issue_cmd(kvm,
-                           enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
-                           data, error);
-       kfree(data);
-       return ret;
+       return sev_issue_cmd(kvm,
+                            enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+                            &data, error);
 }
 
 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
@@ -965,7 +952,7 @@ err:
 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_secret *data;
+       struct sev_data_launch_secret data;
        struct kvm_sev_launch_secret params;
        struct page **pages;
        void *blob, *hdr;
@@ -997,41 +984,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
                goto e_unpin_memory;
        }
 
-       ret = -ENOMEM;
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               goto e_unpin_memory;
+       memset(&data, 0, sizeof(data));
 
        offset = params.guest_uaddr & (PAGE_SIZE - 1);
-       data->guest_address = __sme_page_pa(pages[0]) + offset;
-       data->guest_len = params.guest_len;
+       data.guest_address = __sme_page_pa(pages[0]) + offset;
+       data.guest_len = params.guest_len;
 
        blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
        if (IS_ERR(blob)) {
                ret = PTR_ERR(blob);
-               goto e_free;
+               goto e_unpin_memory;
        }
 
-       data->trans_address = __psp_pa(blob);
-       data->trans_len = params.trans_len;
+       data.trans_address = __psp_pa(blob);
+       data.trans_len = params.trans_len;
 
        hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
        if (IS_ERR(hdr)) {
                ret = PTR_ERR(hdr);
                goto e_free_blob;
        }
-       data->hdr_address = __psp_pa(hdr);
-       data->hdr_len = params.hdr_len;
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
 
        kfree(hdr);
 
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
 e_unpin_memory:
        /* content of memory is updated, mark pages dirty */
        for (i = 0; i < n; i++) {
@@ -1046,7 +1028,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
        void __user *report = (void __user *)(uintptr_t)argp->data;
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_attestation_report *data;
+       struct sev_data_attestation_report data;
        struct kvm_sev_attestation_report params;
        void __user *p;
        void *blob = NULL;
@@ -1058,9 +1040,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
        if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* User wants to query the blob length */
        if (!params.len)
@@ -1068,23 +1048,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
        p = (void __user *)(uintptr_t)params.uaddr;
        if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+               if (params.len > SEV_FW_BLOB_MAX_SIZE)
+                       return -EINVAL;
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
+               blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
                if (!blob)
-                       goto e_free;
+                       return -ENOMEM;
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
-               memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+               data.address = __psp_pa(blob);
+               data.len = params.len;
+               memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
        }
 cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
        /*
         * If we query the session length, FW responded with expected data.
         */
@@ -1100,16 +1077,411 @@ cmd:
        }
 
 done:
-       params.len = data->len;
+       params.len = data.len;
        if (copy_to_user(report, &params, sizeof(params)))
                ret = -EFAULT;
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
+/*
+ * Userspace wants to query session length.
+ *
+ * Issues SEND_START with only the guest handle filled in and copies the
+ * session_len reported by the firmware back to userspace through argp->data.
+ *
+ * Returns the sev_issue_cmd() result, or -EFAULT if the copy-out fails.
+ */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+                                     struct kvm_sev_send_start *params)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_start data;
+       int ret;
+
+       /*
+        * The command buffer lives on the stack: zero it so the address and
+        * length fields handed to the firmware are not leftover stack bytes.
+        */
+       memset(&data, 0, sizeof(data));
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+       /*
+        * NOTE(review): a length query is expected to complete with a
+        * firmware "invalid length" status while still filling session_len,
+        * so the length and the error are both passed on to userspace
+        * instead of bailing out early - confirm against the SEV API spec.
+        */
+       params->session_len = data.session_len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+                               sizeof(struct kvm_sev_send_start)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+/*
+ * Handle KVM_SEV_SEND_START.
+ *
+ * With params.session_len == 0 this only queries the session blob length.
+ * Otherwise the PDH, platform and AMD certificate blobs are copied in from
+ * userspace, SEND_START is issued, and the session blob plus the updated
+ * parameters (policy, session_len) are copied back out.
+ *
+ * Returns 0 on success, -ENOTTY if @kvm is not an SEV guest, -EINVAL on bad
+ * parameters, -ENOMEM / -EFAULT on allocation or user-copy failure, or the
+ * error reported by the blob copy helper or by sev_issue_cmd().
+ */
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_start data;
+       struct kvm_sev_send_start params;
+       void *amd_certs, *session_data;
+       void *pdh_cert, *plat_certs;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                               sizeof(struct kvm_sev_send_start)))
+               return -EFAULT;
+
+       /* if session_len is zero, userspace wants to query the session length */
+       if (!params.session_len)
+               return __sev_send_start_query_session_length(kvm, argp,
+                               &params);
+
+       /* some sanity checks */
+       if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+           !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EINVAL;
+
+       /*
+        * Allocate the memory to hold the session data blob.  The whole
+        * buffer (params.session_len bytes) is copied to userspace below, so
+        * it is zeroed to avoid exposing stale heap contents in any bytes
+        * the firmware leaves untouched.
+        */
+       session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+       if (!session_data)
+               return -ENOMEM;
+
+       /* copy the certificate blobs from userspace */
+       pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+                               params.pdh_cert_len);
+       if (IS_ERR(pdh_cert)) {
+               ret = PTR_ERR(pdh_cert);
+               goto e_free_session;
+       }
+
+       plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+                               params.plat_certs_len);
+       if (IS_ERR(plat_certs)) {
+               ret = PTR_ERR(plat_certs);
+               goto e_free_pdh;
+       }
+
+       amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+                               params.amd_certs_len);
+       if (IS_ERR(amd_certs)) {
+               ret = PTR_ERR(amd_certs);
+               goto e_free_plat_cert;
+       }
+
+       /* populate the FW SEND_START field with system physical address */
+       memset(&data, 0, sizeof(data));
+       data.pdh_cert_address = __psp_pa(pdh_cert);
+       data.pdh_cert_len = params.pdh_cert_len;
+       data.plat_certs_address = __psp_pa(plat_certs);
+       data.plat_certs_len = params.plat_certs_len;
+       data.amd_certs_address = __psp_pa(amd_certs);
+       data.amd_certs_len = params.amd_certs_len;
+       data.session_address = __psp_pa(session_data);
+       data.session_len = params.session_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+       /* The session blob is only handed out when the command succeeded. */
+       if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+                       session_data, params.session_len)) {
+               ret = -EFAULT;
+               goto e_free_amd_cert;
+       }
+
+       /*
+        * policy and session_len are reported back even when the command
+        * failed; ret keeps the command status unless this copy fails too.
+        */
+       params.policy = data.policy;
+       params.session_len = data.session_len;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+                               sizeof(struct kvm_sev_send_start)))
+               ret = -EFAULT;
+
+e_free_amd_cert:
+       kfree(amd_certs);
+e_free_plat_cert:
+       kfree(plat_certs);
+e_free_pdh:
+       kfree(pdh_cert);
+e_free_session:
+       kfree(session_data);
+       return ret;
+}
+
+/*
+ * Userspace wants to query either header or trans length.
+ *
+ * Issues SEND_UPDATE_DATA with only the guest handle filled in and copies
+ * the hdr_len / trans_len reported by the firmware back to userspace
+ * through argp->data.
+ *
+ * Returns the sev_issue_cmd() result, or -EFAULT if the copy-out fails.
+ */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+                                    struct kvm_sev_send_update_data *params)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_update_data data;
+       int ret;
+
+       /*
+        * The command buffer lives on the stack: zero it so the address and
+        * length fields handed to the firmware are not leftover stack bytes.
+        */
+       memset(&data, 0, sizeof(data));
+       data.handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+       /*
+        * NOTE(review): a length query is expected to complete with a
+        * firmware "invalid length" status while still filling the length
+        * fields, so the lengths and the error are both passed on to
+        * userspace instead of bailing out early - confirm against the SEV
+        * API spec.
+        */
+       params->hdr_len = data.hdr_len;
+       params->trans_len = data.trans_len;
+
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+                        sizeof(struct kvm_sev_send_update_data)))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+/*
+ * Handle KVM_SEV_SEND_UPDATE_DATA.
+ *
+ * With a zero trans_len or hdr_len this only queries the required lengths.
+ * Otherwise the (single) guest page is pinned, SEND_UPDATE_DATA is issued
+ * and the resulting transport buffer and packet header are copied to
+ * userspace.
+ *
+ * Returns 0 on success, -ENOTTY if @kvm is not an SEV guest, -EINVAL on bad
+ * parameters, -ENOMEM / -EFAULT on allocation, pinning or user-copy failure,
+ * or the error reported by sev_issue_cmd().
+ */
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_send_update_data data;
+       struct kvm_sev_send_update_data params;
+       void *hdr, *trans_data;
+       struct page **guest_page;
+       unsigned long n;
+       int ret, offset;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_send_update_data)))
+               return -EFAULT;
+
+       /* userspace wants to query either header or trans length */
+       if (!params.trans_len || !params.hdr_len)
+               return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+       if (!params.trans_uaddr || !params.guest_uaddr ||
+           !params.guest_len || !params.hdr_uaddr)
+               return -EINVAL;
+
+       /*
+        * Check if we are crossing the page boundary.  guest_len is tested
+        * on its own first so that a huge value cannot wrap the sum below
+        * back into range.
+        */
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       if (params.guest_len > PAGE_SIZE ||
+           (params.guest_len + offset > PAGE_SIZE))
+               return -EINVAL;
+
+       /* Pin guest memory */
+       guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+                                   PAGE_SIZE, &n, 0);
+       /* Cope with the helper signalling failure as NULL or as ERR_PTR(). */
+       if (!guest_page)
+               return -EFAULT;
+       if (IS_ERR(guest_page))
+               return PTR_ERR(guest_page);
+
+       /*
+        * Allocate memory for header and transport buffer.  Both buffers are
+        * copied to userspace in full, so they are zeroed to avoid exposing
+        * stale heap contents in any bytes the firmware leaves untouched.
+        */
+       ret = -ENOMEM;
+       hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+       if (!hdr)
+               goto e_unpin;
+
+       trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+       if (!trans_data)
+               goto e_free_hdr;
+
+       memset(&data, 0, sizeof(data));
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
+       data.trans_address = __psp_pa(trans_data);
+       data.trans_len = params.trans_len;
+
+       /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+       data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+       data.guest_address |= sev_me_mask;
+       data.guest_len = params.guest_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+       if (ret)
+               goto e_free_trans_data;
+
+       /* copy transport buffer to user space */
+       if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+                        trans_data, params.trans_len)) {
+               ret = -EFAULT;
+               goto e_free_trans_data;
+       }
+
+       /*
+        * Copy packet header to userspace.  copy_to_user() returns the number
+        * of bytes left uncopied, which must not leak out as the status.
+        */
+       if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+                        params.hdr_len))
+               ret = -EFAULT;
+
+e_free_trans_data:
+       kfree(trans_data);
+e_free_hdr:
+       kfree(hdr);
+e_unpin:
+       sev_unpin_memory(kvm, guest_page, n);
+
+       return ret;
+}
+
+/*
+ * Handle KVM_SEV_SEND_FINISH: issue SEND_FINISH for this guest's handle.
+ *
+ * Returns -ENOTTY if @kvm is not an SEV guest, otherwise whatever
+ * sev_issue_cmd() returns; the firmware status lands in argp->error.
+ */
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct sev_data_send_finish cmd;
+       int *fw_error = &argp->error;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       cmd.handle = to_kvm_svm(kvm)->sev_info.handle;
+
+       return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &cmd, fw_error);
+}
+
+/*
+ * Handle KVM_SEV_SEND_CANCEL: issue SEND_CANCEL for this guest's handle.
+ *
+ * Returns -ENOTTY if @kvm is not an SEV guest, otherwise whatever
+ * sev_issue_cmd() returns; the firmware status lands in argp->error.
+ */
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct sev_data_send_cancel cmd;
+       int *fw_error = &argp->error;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       cmd.handle = to_kvm_svm(kvm)->sev_info.handle;
+
+       return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &cmd, fw_error);
+}
+
+/*
+ * Handle KVM_SEV_RECEIVE_START.
+ *
+ * Copies the PDH and session blobs in from userspace, issues RECEIVE_START
+ * on the SEV device given by argp->sev_fd, binds the returned handle to this
+ * guest's ASID, reports the handle back to userspace and finally records the
+ * handle and fd in the guest's kvm_sev_info.
+ *
+ * Returns 0 on success, -ENOTTY if @kvm is not an SEV guest, -EINVAL on
+ * missing parameters, -EFAULT on user-copy failure, or the error reported by
+ * the blob copy helper, __sev_issue_cmd() or sev_bind_asid().
+ */
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_receive_start start;
+       struct kvm_sev_receive_start params;
+       int *error = &argp->error;
+       void *session_data;
+       void *pdh_data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       /* Get parameter from the userspace */
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_receive_start)))
+               return -EFAULT;
+
+       /* some sanity checks */
+       if (!params.pdh_uaddr || !params.pdh_len ||
+           !params.session_uaddr || !params.session_len)
+               return -EINVAL;
+
+       pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+       if (IS_ERR(pdh_data))
+               return PTR_ERR(pdh_data);
+
+       session_data = psp_copy_user_blob(params.session_uaddr,
+                       params.session_len);
+       if (IS_ERR(session_data)) {
+               ret = PTR_ERR(session_data);
+               goto e_free_pdh;
+       }
+
+       /* Zero the on-stack command buffer before filling in the fields. */
+       memset(&start, 0, sizeof(start));
+       start.handle = params.handle;
+       start.policy = params.policy;
+       start.pdh_cert_address = __psp_pa(pdh_data);
+       start.pdh_cert_len = params.pdh_len;
+       start.session_address = __psp_pa(session_data);
+       start.session_len = params.session_len;
+
+       /* create memory encryption context */
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+                               error);
+       if (ret)
+               goto e_free_session;
+
+       /*
+        * Bind ASID to this guest.
+        * NOTE(review): if this fails, nothing visible here releases the
+        * firmware context created by RECEIVE_START above - confirm whether
+        * a decommission is needed on this path.
+        */
+       ret = sev_bind_asid(kvm, start.handle, error);
+       if (ret)
+               goto e_free_session;
+
+       params.handle = start.handle;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data,
+                        &params, sizeof(struct kvm_sev_receive_start))) {
+               ret = -EFAULT;
+               /* Undo the binding: userspace never learned the handle. */
+               sev_unbind_asid(kvm, start.handle);
+               goto e_free_session;
+       }
+
+       /* Publish the handle and fd only once everything has succeeded. */
+       sev->handle = start.handle;
+       sev->fd = argp->sev_fd;
+
+       /* Success falls through: the temporary blobs are always freed. */
+e_free_session:
+       kfree(session_data);
+e_free_pdh:
+       kfree(pdh_data);
+
+       return ret;
+}
+
+/*
+ * Handle KVM_SEV_RECEIVE_UPDATE_DATA.
+ *
+ * Copies the packet header and transport buffer in from userspace, pins the
+ * (single) destination guest page and issues RECEIVE_UPDATE_DATA for it.
+ *
+ * Returns 0 on success, -EINVAL if @kvm is not an SEV guest or the
+ * parameters are bad, -EFAULT on user-copy or pinning failure, or the error
+ * reported by the blob copy helper, sev_pin_memory() or sev_issue_cmd().
+ */
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct kvm_sev_receive_update_data params;
+       struct sev_data_receive_update_data data;
+       void *hdr = NULL, *trans = NULL;
+       struct page **guest_page;
+       unsigned long n;
+       int ret, offset;
+
+       if (!sev_guest(kvm))
+               return -EINVAL;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+                       sizeof(struct kvm_sev_receive_update_data)))
+               return -EFAULT;
+
+       if (!params.hdr_uaddr || !params.hdr_len ||
+           !params.guest_uaddr || !params.guest_len ||
+           !params.trans_uaddr || !params.trans_len)
+               return -EINVAL;
+
+       /*
+        * Check if we are crossing the page boundary.  guest_len is tested
+        * on its own first so that a huge value cannot wrap the sum below
+        * back into range.
+        */
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       if (params.guest_len > PAGE_SIZE ||
+           (params.guest_len + offset > PAGE_SIZE))
+               return -EINVAL;
+
+       hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+       if (IS_ERR(hdr))
+               return PTR_ERR(hdr);
+
+       trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto e_free_hdr;
+       }
+
+       memset(&data, 0, sizeof(data));
+       data.hdr_address = __psp_pa(hdr);
+       data.hdr_len = params.hdr_len;
+       data.trans_address = __psp_pa(trans);
+       data.trans_len = params.trans_len;
+
+       /*
+        * Pin guest memory.  The command stores the received data into the
+        * guest page, so the page is pinned for write.
+        * NOTE(review): assumes the last sev_pin_memory() argument is the
+        * write flag - confirm against its definition.
+        */
+       guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+                                   PAGE_SIZE, &n, 1);
+       /* Cope with the helper signalling failure as NULL or as ERR_PTR(). */
+       if (IS_ERR_OR_NULL(guest_page)) {
+               ret = guest_page ? PTR_ERR(guest_page) : -EFAULT;
+               goto e_free_trans;
+       }
+
+       /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+       data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+       data.guest_address |= sev_me_mask;
+       data.guest_len = params.guest_len;
+       data.handle = sev->handle;
+
+       ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+                               &argp->error);
+
+       sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+       kfree(trans);
+e_free_hdr:
+       kfree(hdr);
+
+       return ret;
+}
+
+/*
+ * Handle KVM_SEV_RECEIVE_FINISH: issue RECEIVE_FINISH for this guest's
+ * handle.
+ *
+ * Returns -ENOTTY if @kvm is not an SEV guest, otherwise whatever
+ * sev_issue_cmd() returns; the firmware status lands in argp->error.
+ */
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct sev_data_receive_finish cmd;
+       int *fw_error = &argp->error;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       cmd.handle = to_kvm_svm(kvm)->sev_info.handle;
+
+       return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &cmd, fw_error);
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
        struct kvm_sev_cmd sev_cmd;
@@ -1126,13 +1498,22 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 
        mutex_lock(&kvm->lock);
 
+       /* enc_context_owner handles all memory enc operations */
+       if (is_mirroring_enc_context(kvm)) {
+               r = -EINVAL;
+               goto out;
+       }
+
        switch (sev_cmd.id) {
+       case KVM_SEV_ES_INIT:
+               if (!sev_es) {
+                       r = -ENOTTY;
+                       goto out;
+               }
+               fallthrough;
        case KVM_SEV_INIT:
                r = sev_guest_init(kvm, &sev_cmd);
                break;
-       case KVM_SEV_ES_INIT:
-               r = sev_es_guest_init(kvm, &sev_cmd);
-               break;
        case KVM_SEV_LAUNCH_START:
                r = sev_launch_start(kvm, &sev_cmd);
                break;
@@ -1163,6 +1544,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
        case KVM_SEV_GET_ATTESTATION_REPORT:
                r = sev_get_attestation_report(kvm, &sev_cmd);
                break;
+       case KVM_SEV_SEND_START:
+               r = sev_send_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_UPDATE_DATA:
+               r = sev_send_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_FINISH:
+               r = sev_send_finish(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_SEND_CANCEL:
+               r = sev_send_cancel(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_START:
+               r = sev_receive_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_UPDATE_DATA:
+               r = sev_receive_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_RECEIVE_FINISH:
+               r = sev_receive_finish(kvm, &sev_cmd);
+               break;
        default:
                r = -EINVAL;
                goto out;
@@ -1186,6 +1588,10 @@ int svm_register_enc_region(struct kvm *kvm,
        if (!sev_guest(kvm))
                return -ENOTTY;
 
+       /* If kvm is mirroring encryption context it isn't responsible for it */
+       if (is_mirroring_enc_context(kvm))
+               return -EINVAL;
+
        if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
                return -EINVAL;
 
@@ -1252,6 +1658,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
        struct enc_region *region;
        int ret;
 
+       /* If kvm is mirroring encryption context it isn't responsible for it */
+       if (is_mirroring_enc_context(kvm))
+               return -EINVAL;
+
        mutex_lock(&kvm->lock);
 
        if (!sev_guest(kvm)) {
@@ -1282,6 +1692,71 @@ failed:
        return ret;
 }
 
+/*
+ * Make @kvm a mirror of the SEV VM referred to by @source_fd: @kvm takes a
+ * reference on the source VM, records it as enc_context_owner and copies
+ * its ASID.
+ *
+ * Returns 0 on success, -EBADF if @source_fd is not a KVM VM file, or
+ * -EINVAL if the source is not an SEV guest, is itself a mirror, is @kvm
+ * itself, or if @kvm is already an SEV guest.
+ */
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+       struct file *source_kvm_file;
+       struct kvm *source_kvm;
+       struct kvm_sev_info *mirror_sev;
+       unsigned int asid;
+       int ret;
+
+       source_kvm_file = fget(source_fd);
+       /*
+        * fget() yields NULL for a bad descriptor.  There is no reference to
+        * drop in that case, and fput() must not be handed a NULL file.
+        */
+       if (!source_kvm_file)
+               return -EBADF;
+
+       if (!file_is_kvm(source_kvm_file)) {
+               ret = -EBADF;
+               goto e_source_put;
+       }
+
+       source_kvm = source_kvm_file->private_data;
+       mutex_lock(&source_kvm->lock);
+
+       if (!sev_guest(source_kvm)) {
+               ret = -EINVAL;
+               goto e_source_unlock;
+       }
+
+       /* Mirrors of mirrors should work, but let's not get silly */
+       if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+               ret = -EINVAL;
+               goto e_source_unlock;
+       }
+
+       asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+       /*
+        * The mirror kvm holds an enc_context_owner ref so its asid can't
+        * disappear until we're done with it
+        */
+       kvm_get_kvm(source_kvm);
+
+       /* The VM reference taken above outlives the file reference. */
+       fput(source_kvm_file);
+       mutex_unlock(&source_kvm->lock);
+       mutex_lock(&kvm->lock);
+
+       if (sev_guest(kvm)) {
+               ret = -EINVAL;
+               goto e_mirror_unlock;
+       }
+
+       /* Set enc_context_owner and copy its encryption context over */
+       mirror_sev = &to_kvm_svm(kvm)->sev_info;
+       mirror_sev->enc_context_owner = source_kvm;
+       mirror_sev->asid = asid;
+       mirror_sev->active = true;
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+
+e_mirror_unlock:
+       mutex_unlock(&kvm->lock);
+       kvm_put_kvm(source_kvm);
+       return ret;
+e_source_unlock:
+       mutex_unlock(&source_kvm->lock);
+e_source_put:
+       fput(source_kvm_file);
+       return ret;
+}
+
 void sev_vm_destroy(struct kvm *kvm)
 {
        struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1291,6 +1766,12 @@ void sev_vm_destroy(struct kvm *kvm)
        if (!sev_guest(kvm))
                return;
 
+       /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+       if (is_mirroring_enc_context(kvm)) {
+               kvm_put_kvm(sev->enc_context_owner);
+               return;
+       }
+
        mutex_lock(&kvm->lock);
 
        /*
@@ -1315,12 +1796,12 @@ void sev_vm_destroy(struct kvm *kvm)
        mutex_unlock(&kvm->lock);
 
        sev_unbind_asid(kvm, sev->handle);
-       sev_asid_free(sev->asid);
+       sev_asid_free(sev);
 }
 
 void __init sev_hardware_setup(void)
 {
-       unsigned int eax, ebx, ecx, edx;
+       unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
        bool sev_es_supported = false;
        bool sev_supported = false;
 
@@ -1342,6 +1823,7 @@ void __init sev_hardware_setup(void)
 
        /* Minimum ASID value that should be used for SEV guest */
        min_sev_asid = edx;
+       sev_me_mask = 1UL << (ebx & 0x3f);
 
        /* Initialize SEV ASID bitmaps */
        sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
@@ -1352,7 +1834,11 @@ void __init sev_hardware_setup(void)
        if (!sev_reclaim_asid_bitmap)
                goto out;
 
-       pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+       sev_asid_count = max_sev_asid - min_sev_asid + 1;
+       if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
+               goto out;
+
+       pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
        sev_supported = true;
 
        /* SEV-ES support requested? */
@@ -1367,7 +1853,11 @@ void __init sev_hardware_setup(void)
        if (min_sev_asid == 1)
                goto out;
 
-       pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+       sev_es_asid_count = min_sev_asid - 1;
+       if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count))
+               goto out;
+
+       pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
        sev_es_supported = true;
 
 out:
@@ -1382,6 +1872,8 @@ void sev_hardware_teardown(void)
 
        bitmap_free(sev_asid_bitmap);
        bitmap_free(sev_reclaim_asid_bitmap);
+       misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+       misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
 
        sev_flush_asids();
 }
@@ -1775,7 +2267,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
                               len, GHCB_SCRATCH_AREA_LIMIT);
                        return false;
                }
-               scratch_va = kzalloc(len, GFP_KERNEL);
+               scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
                if (!scratch_va)
                        return false;
 
@@ -1849,7 +2341,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
                vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
                vcpu->arch.regs[VCPU_REGS_RCX] = 0;
 
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
                if (!ret) {
                        ret = -EINVAL;
                        break;
@@ -1899,8 +2391,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
        return ret;
 }
 
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        u64 ghcb_gpa, exit_code;
        struct ghcb *ghcb;
@@ -1912,13 +2405,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                return sev_handle_vmgexit_msr_protocol(svm);
 
        if (!ghcb_gpa) {
-               vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+               vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
                return -EINVAL;
        }
 
-       if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+       if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
                /* Unable to map GHCB from guest */
-               vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+               vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
                            ghcb_gpa);
                return -EINVAL;
        }
@@ -1926,7 +2419,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
        svm->ghcb = svm->ghcb_map.hva;
        ghcb = svm->ghcb_map.hva;
 
-       trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+       trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
 
        exit_code = ghcb_get_sw_exit_code(ghcb);
 
@@ -1944,7 +2437,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_read(&svm->vcpu,
+               ret = kvm_sev_es_mmio_read(vcpu,
                                           control->exit_info_1,
                                           control->exit_info_2,
                                           svm->ghcb_sa);
@@ -1953,19 +2446,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
                        break;
 
-               ret = kvm_sev_es_mmio_write(&svm->vcpu,
+               ret = kvm_sev_es_mmio_write(vcpu,
                                            control->exit_info_1,
                                            control->exit_info_2,
                                            svm->ghcb_sa);
                break;
        case SVM_VMGEXIT_NMI_COMPLETE:
-               ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+               ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
                break;
        case SVM_VMGEXIT_AP_HLT_LOOP:
-               ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+               ret = kvm_emulate_ap_reset_hold(vcpu);
                break;
        case SVM_VMGEXIT_AP_JUMP_TABLE: {
-               struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+               struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
 
                switch (control->exit_info_1) {
                case 0:
@@ -1990,12 +2483,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
                break;
        }
        case SVM_VMGEXIT_UNSUPPORTED_EVENT:
-               vcpu_unimpl(&svm->vcpu,
+               vcpu_unimpl(vcpu,
                            "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
                            control->exit_info_1, control->exit_info_2);
                break;
        default:
-               ret = svm_invoke_exit_handler(svm, exit_code);
+               ret = svm_invoke_exit_handler(vcpu, exit_code);
        }
 
        return ret;
@@ -2104,5 +2597,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
         * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
         * non-zero value.
         */
+       if (!svm->ghcb)
+               return;
+
        ghcb_set_sw_exit_info_2(svm->ghcb, 1);
 }
index 58a45bb..cd8c333 100644 (file)
@@ -56,9 +56,6 @@ static const struct x86_cpu_id svm_cpu_id[] = {
 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #endif
 
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
@@ -95,6 +92,8 @@ static const struct svm_direct_access_msrs {
 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
        { .index = MSR_STAR,                            .always = true  },
        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
+       { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
+       { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
 #ifdef CONFIG_X86_64
        { .index = MSR_GS_BASE,                         .always = true  },
        { .index = MSR_FS_BASE,                         .always = true  },
@@ -279,7 +278,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                         * In this case we will return to the nested guest
                         * as soon as we leave SMM.
                         */
-                       if (!is_smm(&svm->vcpu))
+                       if (!is_smm(vcpu))
                                svm_free_nested(svm);
 
                } else {
@@ -363,10 +362,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
        bool has_error_code = vcpu->arch.exception.has_error_code;
        u32 error_code = vcpu->arch.exception.error_code;
 
-       kvm_deliver_exception_payload(&svm->vcpu);
+       kvm_deliver_exception_payload(vcpu);
 
        if (nr == BP_VECTOR && !nrips) {
-               unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+               unsigned long rip, old_rip = kvm_rip_read(vcpu);
 
                /*
                 * For guest debugging where we have to reinject #BP if some
@@ -375,8 +374,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
                 * raises a fault that is not intercepted. Still better than
                 * failing in all cases.
                 */
-               (void)skip_emulated_instruction(&svm->vcpu);
-               rip = kvm_rip_read(&svm->vcpu);
+               (void)skip_emulated_instruction(vcpu);
+               rip = kvm_rip_read(vcpu);
                svm->int3_rip = rip + svm->vmcb->save.cs.base;
                svm->int3_injected = rip - old_rip;
        }
@@ -681,14 +680,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 
 u32 *svm_vcpu_alloc_msrpm(void)
 {
-       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+       unsigned int order = get_order(MSRPM_SIZE);
+       struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
        u32 *msrpm;
 
        if (!pages)
                return NULL;
 
        msrpm = page_address(pages);
-       memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+       memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 
        return msrpm;
 }
@@ -707,7 +707,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 
 void svm_vcpu_free_msrpm(u32 *msrpm)
 {
-       __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+       __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 }
 
 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
@@ -881,7 +881,7 @@ static __init void svm_adjust_mmio_mask(void)
         */
        mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
 
-       kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+       kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
 }
 
 static void svm_hardware_teardown(void)
@@ -894,7 +894,8 @@ static void svm_hardware_teardown(void)
        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
 
-       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+       __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+       get_order(IOPM_SIZE));
        iopm_base = 0;
 }
 
@@ -930,14 +931,15 @@ static __init int svm_hardware_setup(void)
        struct page *iopm_pages;
        void *iopm_va;
        int r;
+       unsigned int order = get_order(IOPM_SIZE);
 
-       iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+       iopm_pages = alloc_pages(GFP_KERNEL, order);
 
        if (!iopm_pages)
                return -ENOMEM;
 
        iopm_va = page_address(iopm_pages);
-       memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+       memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
        init_msrpm_offsets();
@@ -1084,8 +1086,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        if (is_guest_mode(vcpu)) {
                /* Write L1's TSC offset.  */
                g_tsc_offset = svm->vmcb->control.tsc_offset -
-                              svm->nested.hsave->control.tsc_offset;
-               svm->nested.hsave->control.tsc_offset = offset;
+                              svm->vmcb01.ptr->control.tsc_offset;
+               svm->vmcb01.ptr->control.tsc_offset = offset;
        }
 
        trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1113,12 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
        }
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       svm->vcpu.arch.hflags = 0;
+       vcpu->arch.hflags = 0;
 
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1129,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+       if (!kvm_vcpu_apicv_active(vcpu))
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 
        set_dr_intercepts(svm);
@@ -1170,12 +1173,12 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);
 
-       if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_mwait_in_guest(vcpu->kvm)) {
                svm_set_intercept(svm, INTERCEPT_MONITOR);
                svm_set_intercept(svm, INTERCEPT_MWAIT);
        }
 
-       if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+       if (!kvm_hlt_in_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_HLT);
 
        control->iopm_base_pa = __sme_set(iopm_base);
@@ -1201,19 +1204,19 @@ static void init_vmcb(struct vcpu_svm *svm)
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_cr4(&svm->vcpu, 0);
-       svm_set_efer(&svm->vcpu, 0);
+       svm_set_cr4(vcpu, 0);
+       svm_set_efer(vcpu, 0);
        save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        save->rip = 0x0000fff0;
-       svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
 
        /*
         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         * It also updates the guest-visible cr0 value.
         */
-       svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(&svm->vcpu);
+       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+       kvm_mmu_reset_context(vcpu);
 
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
@@ -1225,17 +1228,18 @@ static void init_vmcb(struct vcpu_svm *svm)
                clr_exception_intercept(svm, PF_VECTOR);
                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
-               save->g_pat = svm->vcpu.arch.pat;
+               save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
                save->cr4 = 0;
        }
-       svm->asid_generation = 0;
+       svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
        svm->nested.vmcb12_gpa = 0;
-       svm->vcpu.arch.hflags = 0;
+       svm->nested.last_vmcb12_gpa = 0;
+       vcpu->arch.hflags = 0;
 
-       if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+       if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
@@ -1246,18 +1250,15 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm_check_invpcid(svm);
 
-       if (kvm_vcpu_apicv_active(&svm->vcpu))
-               avic_init_vmcb(svm);
-
        /*
-        * If hardware supports Virtual VMLOAD VMSAVE then enable it
-        * in VMCB and clear intercepts to avoid #VMEXIT.
+        * If the host supports V_SPEC_CTRL then disable the interception
+        * of MSR_IA32_SPEC_CTRL.
         */
-       if (vls) {
-               svm_clr_intercept(svm, INTERCEPT_VMLOAD);
-               svm_clr_intercept(svm, INTERCEPT_VMSAVE);
-               svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-       }
+       if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_init_vmcb(svm);
 
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1265,11 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm)
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }
 
-       if (sev_guest(svm->vcpu.kvm)) {
+       if (sev_guest(vcpu->kvm)) {
                svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
                clr_exception_intercept(svm, UD_VECTOR);
 
-               if (sev_es_guest(svm->vcpu.kvm)) {
+               if (sev_es_guest(vcpu->kvm)) {
                        /* Perform SEV-ES specific VMCB updates */
                        sev_es_init_vmcb(svm);
                }
@@ -1291,12 +1292,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        svm->virt_spec_ctrl = 0;
 
        if (!init_event) {
-               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                          MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                      MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(vcpu))
+                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        }
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
@@ -1305,10 +1306,16 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+       svm->current_vmcb = target_vmcb;
+       svm->vmcb = target_vmcb->ptr;
+}
+
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm;
-       struct page *vmcb_page;
+       struct page *vmcb01_page;
        struct page *vmsa_page = NULL;
        int err;
 
@@ -1316,11 +1323,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
        svm = to_svm(vcpu);
 
        err = -ENOMEM;
-       vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!vmcb_page)
+       vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb01_page)
                goto out;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests require a separate VMSA page used to contain
                 * the encrypted register state of the guest.
@@ -1356,20 +1363,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 
        svm_vcpu_init_msrpm(vcpu, svm->msrpm);
 
-       svm->vmcb = page_address(vmcb_page);
-       svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+       svm->vmcb01.ptr = page_address(vmcb01_page);
+       svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
 
        if (vmsa_page)
                svm->vmsa = page_address(vmsa_page);
 
-       svm->asid_generation = 0;
        svm->guest_state_loaded = false;
-       init_vmcb(svm);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       init_vmcb(vcpu);
 
        svm_init_osvw(vcpu);
        vcpu->arch.microcode_version = 0x01000065;
 
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                /* Perform SEV-ES specific VMCB creation updates */
                sev_es_create_vcpu(svm);
 
@@ -1379,7 +1387,7 @@ error_free_vmsa_page:
        if (vmsa_page)
                __free_page(vmsa_page);
 error_free_vmcb_page:
-       __free_page(vmcb_page);
+       __free_page(vmcb01_page);
 out:
        return err;
 }
@@ -1407,8 +1415,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
        sev_free_vcpu(vcpu);
 
-       __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
-       __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+       __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+       __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -1432,7 +1440,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
         * Save additional host state that will be restored on VMEXIT (sev-es)
         * or subsequent vmload of host save area.
         */
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                sev_es_prepare_guest_switch(svm, vcpu->cpu);
        } else {
                vmsave(__sme_page_pa(sd->save_area));
@@ -1476,11 +1484,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
-       if (unlikely(cpu != vcpu->cpu)) {
-               svm->asid_generation = 0;
-               vmcb_mark_all_dirty(svm->vmcb);
-       }
-
        if (sd->current_vmcb != svm->vmcb) {
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
@@ -1564,7 +1567,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= mask;
        if (is_guest_mode(&svm->vcpu)) {
-               svm->nested.hsave->control.int_ctl &= mask;
+               svm->vmcb01.ptr->control.int_ctl &= mask;
 
                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -1577,16 +1580,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
 {
        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+       struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
 
        switch (seg) {
        case VCPU_SREG_CS: return &save->cs;
        case VCPU_SREG_DS: return &save->ds;
        case VCPU_SREG_ES: return &save->es;
-       case VCPU_SREG_FS: return &save->fs;
-       case VCPU_SREG_GS: return &save->gs;
+       case VCPU_SREG_FS: return &save01->fs;
+       case VCPU_SREG_GS: return &save01->gs;
        case VCPU_SREG_SS: return &save->ss;
-       case VCPU_SREG_TR: return &save->tr;
-       case VCPU_SREG_LDTR: return &save->ldtr;
+       case VCPU_SREG_TR: return &save01->tr;
+       case VCPU_SREG_LDTR: return &save01->ldtr;
        }
        BUG();
        return NULL;
@@ -1709,37 +1713,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
-       ulong gcr0;
-       u64 *hcr0;
-
-       /*
-        * SEV-ES guests must always keep the CR intercepts cleared. CR
-        * tracking is done using the CR write traps.
-        */
-       if (sev_es_guest(svm->vcpu.kvm))
-               return;
-
-       gcr0 = svm->vcpu.arch.cr0;
-       hcr0 = &svm->vmcb->save.cr0;
-       *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
-               | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
-       vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
-       if (gcr0 == *hcr0) {
-               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
-               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
-       } else {
-               svm_set_intercept(svm, INTERCEPT_CR0_READ);
-               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
-       }
-}
-
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       u64 hcr0 = cr0;
 
 #ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1757,7 +1734,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vcpu->arch.cr0 = cr0;
 
        if (!npt_enabled)
-               cr0 |= X86_CR0_PG | X86_CR0_WP;
+               hcr0 |= X86_CR0_PG | X86_CR0_WP;
 
        /*
         * re-enable caching here because the QEMU bios
@@ -1765,10 +1742,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         * reboot
         */
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-               cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
-       svm->vmcb->save.cr0 = cr0;
+               hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+       svm->vmcb->save.cr0 = hcr0;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-       update_cr0_intercept(svm);
+
+       /*
+        * SEV-ES guests must always keep the CR intercepts cleared. CR
+        * tracking is done using the CR write traps.
+        */
+       if (sev_es_guest(vcpu->kvm))
+               return;
+
+       if (hcr0 == cr0) {
+               /* Selective CR0 write remains on.  */
+               svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+               svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+       } else {
+               svm_set_intercept(svm, INTERCEPT_CR0_READ);
+               svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+       }
 }
 
 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1847,7 +1840,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }
 
-       svm->asid_generation = sd->asid_generation;
+       svm->current_vmcb->asid_generation = sd->asid_generation;
        svm->asid = sd->next_asid++;
 }
 
@@ -1896,39 +1889,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
 }
 
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
 {
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
-       return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+       return kvm_handle_page_fault(vcpu, error_code, fault_address,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+
        u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
-       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+       return kvm_mmu_page_fault(vcpu, fault_address, error_code,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
 }
 
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       if (!(svm->vcpu.guest_debug &
+       if (!(vcpu->guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
-               kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+               kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
                return 1;
        }
 
@@ -1938,7 +1935,7 @@ static int db_interception(struct vcpu_svm *svm)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       if (svm->vcpu.guest_debug &
+       if (vcpu->guest_debug &
            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1949,10 @@ static int db_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
 
        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1960,14 @@ static int bp_interception(struct vcpu_svm *svm)
        return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
 {
-       return handle_ud(&svm->vcpu);
+       return handle_ud(vcpu);
 }
 
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+       kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
        return 1;
 }
 
@@ -2012,7 +2010,7 @@ static bool is_erratum_383(void)
        return true;
 }
 
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
 {
        if (is_erratum_383()) {
                /*
@@ -2021,7 +2019,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
                 */
                pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 
                return;
        }
@@ -2033,20 +2031,21 @@ static void svm_handle_mce(struct vcpu_svm *svm)
        kvm_machine_check();
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
+       struct kvm_run *kvm_run = vcpu->run;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        /*
         * The VM save area has already been encrypted so it
         * cannot be reinitialized - just terminate.
         */
-       if (sev_es_guest(svm->vcpu.kvm))
+       if (sev_es_guest(vcpu->kvm))
                return -EINVAL;
 
        /*
@@ -2054,20 +2053,20 @@ static int shutdown_interception(struct vcpu_svm *svm)
         * so reinitialize it.
         */
        clear_page(svm->vmcb);
-       init_vmcb(svm);
+       init_vmcb(vcpu);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
        unsigned port;
 
-       ++svm->vcpu.stat.io_exits;
+       ++vcpu->stat.io_exits;
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        port = io_info >> 16;
@@ -2082,93 +2081,69 @@ static int io_interception(struct vcpu_svm *svm)
 
        svm->next_rip = svm->vmcb->control.exit_info_2;
 
-       return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
-       return 1;
+       return kvm_fast_pio(vcpu, size, port, in);
 }
 
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
 {
-       ++svm->vcpu.stat.irq_exits;
        return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
 {
+       ++vcpu->stat.irq_exits;
        return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
 {
-       return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
-       struct vmcb *nested_vmcb;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb12;
        struct kvm_host_map map;
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
        if (ret) {
                if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
-       nested_vmcb = map.hva;
+       vmcb12 = map.hva;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
-       nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       if (vmload) {
+               nested_svm_vmloadsave(vmcb12, svm->vmcb);
+               svm->sysenter_eip_hi = 0;
+               svm->sysenter_esp_hi = 0;
+       } else
+               nested_svm_vmloadsave(svm->vmcb, vmcb12);
+
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
 {
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       int ret;
-
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-       if (ret) {
-               if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
-
-       nested_vmcb = map.hva;
-
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-       nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       return vmload_vmsave_interception(vcpu, true);
+}
 
-       return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+       return vmload_vmsave_interception(vcpu, false);
 }
 
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
 {
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       return nested_svm_vmrun(svm);
+       return nested_svm_vmrun(vcpu);
 }
 
 enum {
@@ -2207,7 +2182,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
        };
-       int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+       int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
                [SVM_INSTR_VMRUN] = vmrun_interception,
                [SVM_INSTR_VMLOAD] = vmload_interception,
                [SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2191,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
        int ret;
 
        if (is_guest_mode(vcpu)) {
-               svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
-               svm->vmcb->control.exit_info_1 = 0;
-               svm->vmcb->control.exit_info_2 = 0;
-
                /* Returns '1' or -errno on failure, '0' on success. */
-               ret = nested_svm_vmexit(svm);
+               ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
                if (ret)
                        return ret;
                return 1;
        }
-       return svm_instr_handlers[opcode](svm);
+       return svm_instr_handlers[opcode](vcpu);
 }
 
 /*
@@ -2237,9 +2208,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
  *      regions (e.g. SMM memory on host).
  *   2) VMware backdoor
  */
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        u32 error_code = svm->vmcb->control.exit_info_1;
        int opcode;
 
@@ -2304,73 +2275,52 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
        }
 }
 
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, true);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), true);
        return ret;
 }
 
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
 {
        int ret;
 
-       if (nested_svm_check_permissions(svm))
+       if (nested_svm_check_permissions(vcpu))
                return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       svm_set_gif(svm, false);
+       ret = kvm_skip_emulated_instruction(vcpu);
+       svm_set_gif(to_svm(vcpu), false);
        return ret;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-
-       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
-                         kvm_rax_read(&svm->vcpu));
+       trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, kvm_rcx_read(vcpu),
+                         kvm_rax_read(vcpu));
 
        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-       kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
-
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+       kvm_mmu_invlpg(vcpu, kvm_rax_read(vcpu));
 
-static int skinit_interception(struct vcpu_svm *svm)
-{
-       trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
-
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wbinvd(&svm->vcpu);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
 {
-       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
-       u32 index = kvm_rcx_read(&svm->vcpu);
+       trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
 
-       int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
 
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u16 tss_selector;
        int reason;
        int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2349,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
        if (reason == TASK_SWITCH_GATE) {
                switch (type) {
                case SVM_EXITINTINFO_TYPE_NMI:
-                       svm->vcpu.arch.nmi_injected = false;
+                       vcpu->arch.nmi_injected = false;
                        break;
                case SVM_EXITINTINFO_TYPE_EXEPT:
                        if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2358,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
                                error_code =
                                        (u32)svm->vmcb->control.exit_info_2;
                        }
-                       kvm_clear_exception_queue(&svm->vcpu);
+                       kvm_clear_exception_queue(vcpu);
                        break;
                case SVM_EXITINTINFO_TYPE_INTR:
-                       kvm_clear_interrupt_queue(&svm->vcpu);
+                       kvm_clear_interrupt_queue(vcpu);
                        break;
                default:
                        break;
@@ -2422,77 +2372,58 @@ static int task_switch_interception(struct vcpu_svm *svm)
            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-               if (!skip_emulated_instruction(&svm->vcpu))
+               if (!skip_emulated_instruction(vcpu))
                        return 0;
        }
 
        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
                int_vec = -1;
 
-       return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+       return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
                               has_error_code, error_code);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_cpuid(&svm->vcpu);
-}
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-static int iret_interception(struct vcpu_svm *svm)
-{
-       ++svm->vcpu.stat.nmi_window_exits;
-       svm->vcpu.arch.hflags |= HF_IRET_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       ++vcpu->stat.nmi_window_exits;
+       vcpu->arch.hflags |= HF_IRET_MASK;
+       if (!sev_es_guest(vcpu->kvm)) {
                svm_clr_intercept(svm, INTERCEPT_IRET);
-               svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+               svm->nmi_iret_rip = kvm_rip_read(vcpu);
        }
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
 }
 
-static int invd_interception(struct vcpu_svm *svm)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
 {
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return kvm_emulate_instruction(&svm->vcpu, 0);
-
-       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+               return kvm_emulate_instruction(vcpu, 0);
 
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_instruction(&svm->vcpu, 0);
+       kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int rsm_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
-static int rdpmc_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
 {
-       int err;
-
-       if (!nrips)
-               return emulate_on_interception(svm);
-
-       err = kvm_rdpmc(&svm->vcpu);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
 }
 
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
                                            unsigned long val)
 {
-       unsigned long cr0 = svm->vcpu.arch.cr0;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long cr0 = vcpu->arch.cr0;
        bool ret = false;
 
-       if (!is_guest_mode(&svm->vcpu) ||
+       if (!is_guest_mode(vcpu) ||
            (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;
 
@@ -2509,17 +2440,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
 
 #define CR_VALID (1ULL << 63)
 
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, cr;
        unsigned long val;
        int err;
 
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2462,61 @@ static int cr_interception(struct vcpu_svm *svm)
        err = 0;
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
+               val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       if (!check_selective_cr0_intercepted(svm, val))
-                               err = kvm_set_cr0(&svm->vcpu, val);
+                       if (!check_selective_cr0_intercepted(vcpu, val))
+                               err = kvm_set_cr0(vcpu, val);
                        else
                                return 1;
 
                        break;
                case 3:
-                       err = kvm_set_cr3(&svm->vcpu, val);
+                       err = kvm_set_cr3(vcpu, val);
                        break;
                case 4:
-                       err = kvm_set_cr4(&svm->vcpu, val);
+                       err = kvm_set_cr4(vcpu, val);
                        break;
                case 8:
-                       err = kvm_set_cr8(&svm->vcpu, val);
+                       err = kvm_set_cr8(vcpu, val);
                        break;
                default:
                        WARN(1, "unhandled write to CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
        } else { /* mov from cr */
                switch (cr) {
                case 0:
-                       val = kvm_read_cr0(&svm->vcpu);
+                       val = kvm_read_cr0(vcpu);
                        break;
                case 2:
-                       val = svm->vcpu.arch.cr2;
+                       val = vcpu->arch.cr2;
                        break;
                case 3:
-                       val = kvm_read_cr3(&svm->vcpu);
+                       val = kvm_read_cr3(vcpu);
                        break;
                case 4:
-                       val = kvm_read_cr4(&svm->vcpu);
+                       val = kvm_read_cr4(vcpu);
                        break;
                case 8:
-                       val = kvm_get_cr8(&svm->vcpu);
+                       val = kvm_get_cr8(vcpu);
                        break;
                default:
                        WARN(1, "unhandled read from CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_register_write(vcpu, reg, val);
                trace_kvm_cr_read(cr, val);
        }
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr_trap(struct vcpu_svm *svm)
+static int cr_trap(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_value, new_value;
        unsigned int cr;
        int ret = 0;
@@ -2606,7 +2538,7 @@ static int cr_trap(struct vcpu_svm *svm)
                kvm_post_set_cr4(vcpu, old_value, new_value);
                break;
        case 8:
-               ret = kvm_set_cr8(&svm->vcpu, new_value);
+               ret = kvm_set_cr8(vcpu, new_value);
                break;
        default:
                WARN(1, "unhandled CR%d write trap", cr);
@@ -2617,57 +2549,57 @@ static int cr_trap(struct vcpu_svm *svm)
        return kvm_complete_insn_gp(vcpu, ret);
 }
 
-static int dr_interception(struct vcpu_svm *svm)
+static int dr_interception(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int reg, dr;
        unsigned long val;
        int err = 0;
 
-       if (svm->vcpu.guest_debug == 0) {
+       if (vcpu->guest_debug == 0) {
                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                clr_dr_intercepts(svm);
-               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }
 
        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
+               return emulate_on_interception(vcpu);
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
        if (dr >= 16) { /* mov to DRn  */
                dr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
-               err = kvm_set_dr(&svm->vcpu, dr, val);
+               val = kvm_register_read(vcpu, reg);
+               err = kvm_set_dr(vcpu, dr, val);
        } else {
-               kvm_get_dr(&svm->vcpu, dr, &val);
-               kvm_register_write(&svm->vcpu, reg, val);
+               kvm_get_dr(vcpu, dr, &val);
+               kvm_register_write(vcpu, reg, val);
        }
 
-       return kvm_complete_insn_gp(&svm->vcpu, err);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm)
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *kvm_run = svm->vcpu.run;
        int r;
 
-       u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+       u8 cr8_prev = kvm_get_cr8(vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       r = cr_interception(svm);
-       if (lapic_in_kernel(&svm->vcpu))
+       r = cr_interception(vcpu);
+       if (lapic_in_kernel(vcpu))
                return r;
-       if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+       if (cr8_prev <= kvm_get_cr8(vcpu))
                return r;
-       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
 }
 
-static int efer_trap(struct vcpu_svm *svm)
+static int efer_trap(struct kvm_vcpu *vcpu)
 {
        struct msr_data msr_info;
        int ret;
@@ -2680,10 +2612,10 @@ static int efer_trap(struct vcpu_svm *svm)
         */
        msr_info.host_initiated = false;
        msr_info.index = MSR_EFER;
-       msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME;
-       ret = kvm_set_msr_common(&svm->vcpu, &msr_info);
+       msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+       ret = kvm_set_msr_common(vcpu, &msr_info);
 
-       return kvm_complete_insn_gp(&svm->vcpu, ret);
+       return kvm_complete_insn_gp(vcpu, ret);
 }
 
 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -2710,30 +2642,34 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
        switch (msr_info->index) {
        case MSR_STAR:
-               msr_info->data = svm->vmcb->save.star;
+               msr_info->data = svm->vmcb01.ptr->save.star;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               msr_info->data = svm->vmcb->save.lstar;
+               msr_info->data = svm->vmcb01.ptr->save.lstar;
                break;
        case MSR_CSTAR:
-               msr_info->data = svm->vmcb->save.cstar;
+               msr_info->data = svm->vmcb01.ptr->save.cstar;
                break;
        case MSR_KERNEL_GS_BASE:
-               msr_info->data = svm->vmcb->save.kernel_gs_base;
+               msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
                break;
        case MSR_SYSCALL_MASK:
-               msr_info->data = svm->vmcb->save.sfmask;
+               msr_info->data = svm->vmcb01.ptr->save.sfmask;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               msr_info->data = svm->vmcb->save.sysenter_cs;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
                break;
        case MSR_IA32_SYSENTER_EIP:
-               msr_info->data = svm->sysenter_eip;
+               msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+               if (guest_cpuid_is_intel(vcpu))
+                       msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
                break;
        case MSR_IA32_SYSENTER_ESP:
-               msr_info->data = svm->sysenter_esp;
+               msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+               if (guest_cpuid_is_intel(vcpu))
+                       msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
@@ -2771,7 +2707,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;
 
-               msr_info->data = svm->spec_ctrl;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       msr_info->data = svm->vmcb->save.spec_ctrl;
+               else
+                       msr_info->data = svm->spec_ctrl;
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr_info->host_initiated &&
@@ -2809,8 +2748,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       if (!sev_es_guest(svm->vcpu.kvm) || !err)
-               return kvm_complete_insn_gp(&svm->vcpu, err);
+       if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb))
+               return kvm_complete_insn_gp(vcpu, err);
 
        ghcb_set_sw_exit_info_1(svm->ghcb, 1);
        ghcb_set_sw_exit_info_2(svm->ghcb,
@@ -2820,11 +2759,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
        return 1;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2861,7 +2795,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
                        return 1;
                vcpu->arch.pat = data;
-               svm->vmcb->save.g_pat = data;
+               svm->vmcb01.ptr->save.g_pat = data;
+               if (is_guest_mode(vcpu))
+                       nested_vmcb02_compute_g_pat(svm);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
@@ -2872,7 +2808,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                if (kvm_spec_ctrl_test_value(data))
                        return 1;
 
-               svm->spec_ctrl = data;
+               if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+                       svm->vmcb->save.spec_ctrl = data;
+               else
+                       svm->spec_ctrl = data;
                if (!data)
                        break;
 
@@ -2915,32 +2854,39 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->virt_spec_ctrl = data;
                break;
        case MSR_STAR:
-               svm->vmcb->save.star = data;
+               svm->vmcb01.ptr->save.star = data;
                break;
 #ifdef CONFIG_X86_64
        case MSR_LSTAR:
-               svm->vmcb->save.lstar = data;
+               svm->vmcb01.ptr->save.lstar = data;
                break;
        case MSR_CSTAR:
-               svm->vmcb->save.cstar = data;
+               svm->vmcb01.ptr->save.cstar = data;
                break;
        case MSR_KERNEL_GS_BASE:
-               svm->vmcb->save.kernel_gs_base = data;
+               svm->vmcb01.ptr->save.kernel_gs_base = data;
                break;
        case MSR_SYSCALL_MASK:
-               svm->vmcb->save.sfmask = data;
+               svm->vmcb01.ptr->save.sfmask = data;
                break;
 #endif
        case MSR_IA32_SYSENTER_CS:
-               svm->vmcb->save.sysenter_cs = data;
+               svm->vmcb01.ptr->save.sysenter_cs = data;
                break;
        case MSR_IA32_SYSENTER_EIP:
-               svm->sysenter_eip = data;
-               svm->vmcb->save.sysenter_eip = data;
+               svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
+               /*
+                * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
+                * when we spoof an Intel vendor ID (for cross vendor migration).
+                * In this case we use this intercept to track the high
+                * 32 bit part of these msrs to support Intel's
+                * implementation of SYSENTER/SYSEXIT.
+                */
+               svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_IA32_SYSENTER_ESP:
-               svm->sysenter_esp = data;
-               svm->vmcb->save.sysenter_esp = data;
+               svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
+               svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_TSC_AUX:
                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
@@ -3006,38 +2952,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
        return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
+static int msr_interception(struct kvm_vcpu *vcpu)
 {
-       if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm);
+       if (to_svm(vcpu)->vmcb->control.exit_info_1)
+               return kvm_emulate_wrmsr(vcpu);
        else
-               return rdmsr_interception(svm);
+               return kvm_emulate_rdmsr(vcpu);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm)
+static int interrupt_window_interception(struct kvm_vcpu *vcpu)
 {
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       svm_clear_vintr(svm);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       svm_clear_vintr(to_svm(vcpu));
 
        /*
         * For AVIC, the only reason to end up here is ExtINTs.
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+       svm_toggle_avic_for_irq_window(vcpu, true);
 
-       ++svm->vcpu.stat.irq_window_exits;
+       ++vcpu->stat.irq_window_exits;
        return 1;
 }
 
-static int pause_interception(struct vcpu_svm *svm)
+static int pause_interception(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
        bool in_kernel;
 
        /*
@@ -3045,35 +2985,18 @@ static int pause_interception(struct vcpu_svm *svm)
         * vcpu->arch.preempted_in_kernel can never be true.  Just
         * set in_kernel to false as well.
         */
-       in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0;
+       in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);
 
        kvm_vcpu_on_spin(vcpu, in_kernel);
-       return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
-       return kvm_skip_emulated_instruction(&(svm->vcpu));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int monitor_interception(struct vcpu_svm *svm)
+static int invpcid_interception(struct kvm_vcpu *vcpu)
 {
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int invpcid_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long type;
        gva_t gva;
 
@@ -3098,7 +3021,7 @@ static int invpcid_interception(struct vcpu_svm *svm)
        return kvm_handle_invpcid(vcpu, type, gva);
 }
 
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
@@ -3133,15 +3056,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = nop_on_interception,
-       [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
+       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
-       [SVM_EXIT_RDPMC]                        = rdpmc_interception,
-       [SVM_EXIT_CPUID]                        = cpuid_interception,
+       [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
+       [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
        [SVM_EXIT_IRET]                         = iret_interception,
-       [SVM_EXIT_INVD]                         = invd_interception,
+       [SVM_EXIT_INVD]                         = kvm_emulate_invd,
        [SVM_EXIT_PAUSE]                        = pause_interception,
-       [SVM_EXIT_HLT]                          = halt_interception,
+       [SVM_EXIT_HLT]                          = kvm_emulate_halt,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
@@ -3149,17 +3072,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
        [SVM_EXIT_VMRUN]                        = vmrun_interception,
-       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
+       [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
        [SVM_EXIT_VMLOAD]                       = vmload_interception,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
-       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
-       [SVM_EXIT_MONITOR]                      = monitor_interception,
-       [SVM_EXIT_MWAIT]                        = mwait_interception,
-       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
-       [SVM_EXIT_RDPRU]                        = rdpru_interception,
+       [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
+       [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
+       [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
+       [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
+       [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
@@ -3177,6 +3100,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
+       struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
 
        if (!dump_invalid_vmcb) {
                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
@@ -3239,28 +3163,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               save->ds.limit, save->ds.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "fs:",
-              save->fs.selector, save->fs.attrib,
-              save->fs.limit, save->fs.base);
+              save01->fs.selector, save01->fs.attrib,
+              save01->fs.limit, save01->fs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gs:",
-              save->gs.selector, save->gs.attrib,
-              save->gs.limit, save->gs.base);
+              save01->gs.selector, save01->gs.attrib,
+              save01->gs.limit, save01->gs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gdtr:",
               save->gdtr.selector, save->gdtr.attrib,
               save->gdtr.limit, save->gdtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ldtr:",
-              save->ldtr.selector, save->ldtr.attrib,
-              save->ldtr.limit, save->ldtr.base);
+              save01->ldtr.selector, save01->ldtr.attrib,
+              save01->ldtr.limit, save01->ldtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "idtr:",
               save->idtr.selector, save->idtr.attrib,
               save->idtr.limit, save->idtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "tr:",
-              save->tr.selector, save->tr.attrib,
-              save->tr.limit, save->tr.base);
+              save01->tr.selector, save01->tr.attrib,
+              save01->tr.limit, save01->tr.base);
        pr_err("cpl:            %d                efer:         %016llx\n",
                save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3274,15 +3198,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rsp:", save->rsp, "rax:", save->rax);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "star:", save->star, "lstar:", save->lstar);
+              "star:", save01->star, "lstar:", save01->lstar);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "cstar:", save->cstar, "sfmask:", save->sfmask);
+              "cstar:", save01->cstar, "sfmask:", save01->sfmask);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "kernel_gs_base:", save->kernel_gs_base,
-              "sysenter_cs:", save->sysenter_cs);
+              "kernel_gs_base:", save01->kernel_gs_base,
+              "sysenter_cs:", save01->sysenter_cs);
        pr_err("%-15s %016llx %-13s %016llx\n",
-              "sysenter_esp:", save->sysenter_esp,
-              "sysenter_eip:", save->sysenter_eip);
+              "sysenter_esp:", save01->sysenter_esp,
+              "sysenter_eip:", save01->sysenter_eip);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
        pr_err("%-15s %016llx %-13s %016llx\n",
@@ -3309,24 +3233,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
        return -EINVAL;
 }
 
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code)
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (svm_handle_invalid_exit(&svm->vcpu, exit_code))
+       if (svm_handle_invalid_exit(vcpu, exit_code))
                return 0;
 
 #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
-               return msr_interception(svm);
+               return msr_interception(vcpu);
        else if (exit_code == SVM_EXIT_VINTR)
-               return interrupt_window_interception(svm);
+               return interrupt_window_interception(vcpu);
        else if (exit_code == SVM_EXIT_INTR)
-               return intr_interception(svm);
+               return intr_interception(vcpu);
        else if (exit_code == SVM_EXIT_HLT)
-               return halt_interception(svm);
+               return kvm_emulate_halt(vcpu);
        else if (exit_code == SVM_EXIT_NPF)
-               return npf_interception(svm);
+               return npf_interception(vcpu);
 #endif
-       return svm_exit_handlers[exit_code](svm);
+       return svm_exit_handlers[exit_code](vcpu);
 }
 
 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
@@ -3395,7 +3319,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
 
-       return svm_invoke_exit_handler(svm, exit_code);
+       return svm_invoke_exit_handler(vcpu, exit_code);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
@@ -3406,15 +3330,27 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        load_TR_desc();
 }
 
-static void pre_svm_run(struct vcpu_svm *svm)
+static void pre_svm_run(struct kvm_vcpu *vcpu)
 {
-       struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       /*
+        * If the previous vmrun of the vmcb occurred on a different physical
+        * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
+        * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
+        */
+       if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
+               svm->current_vmcb->asid_generation = 0;
+               vmcb_mark_all_dirty(svm->vmcb);
+               svm->current_vmcb->cpu = vcpu->cpu;
+        }
 
-       if (sev_guest(svm->vcpu.kvm))
-               return pre_sev_run(svm, svm->vcpu.cpu);
+       if (sev_guest(vcpu->kvm))
+               return pre_sev_run(svm, vcpu->cpu);
 
        /* FIXME: handle wraparound of asid_generation */
-       if (svm->asid_generation != sd->asid_generation)
+       if (svm->current_vmcb->asid_generation != sd->asid_generation)
                new_asid(svm, sd);
 }
 
@@ -3424,7 +3360,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
        vcpu->arch.hflags |= HF_NMI_MASK;
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                svm_set_intercept(svm, INTERCEPT_IRET);
        ++vcpu->stat.nmi_injections;
 }
@@ -3478,7 +3414,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
                return false;
 
        ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-             (svm->vcpu.arch.hflags & HF_NMI_MASK);
+             (vcpu->arch.hflags & HF_NMI_MASK);
 
        return ret;
 }
@@ -3498,9 +3434,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+       return !!(vcpu->arch.hflags & HF_NMI_MASK);
 }
 
 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3508,12 +3442,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (masked) {
-               svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags |= HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_set_intercept(svm, INTERCEPT_IRET);
        } else {
-               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               if (!sev_es_guest(svm->vcpu.kvm))
+               vcpu->arch.hflags &= ~HF_NMI_MASK;
+               if (!sev_es_guest(vcpu->kvm))
                        svm_clr_intercept(svm, INTERCEPT_IRET);
        }
 }
@@ -3526,7 +3460,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        if (!gif_set(svm))
                return true;
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
+       if (sev_es_guest(vcpu->kvm)) {
                /*
                 * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
                 * bit to determine the state of the IF flag.
@@ -3536,7 +3470,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
        } else if (is_guest_mode(vcpu)) {
                /* As long as interrupts are being delivered...  */
                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
-                   ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
+                   ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;
 
@@ -3595,8 +3529,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
-           == HF_NMI_MASK)
+       if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
                return; /* IRET will cause a vm exit */
 
        if (!gif_set(svm)) {
@@ -3638,7 +3571,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
-               svm->asid_generation--;
+               svm->current_vmcb->asid_generation--;
 }
 
 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3675,8 +3608,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
-static void svm_complete_interrupts(struct vcpu_svm *svm)
+static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        u8 vector;
        int type;
        u32 exitintinfo = svm->vmcb->control.exit_int_info;
@@ -3688,28 +3622,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
         * If we've made progress since setting HF_IRET_MASK, we've
         * executed an IRET and can allow NMI injection.
         */
-       if ((svm->vcpu.arch.hflags & HF_IRET_MASK) &&
-           (sev_es_guest(svm->vcpu.kvm) ||
-            kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) {
-               svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+           (sev_es_guest(vcpu->kvm) ||
+            kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+               vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
 
-       svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
                return;
 
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
        switch (type) {
        case SVM_EXITINTINFO_TYPE_NMI:
-               svm->vcpu.arch.nmi_injected = true;
+               vcpu->arch.nmi_injected = true;
                break;
        case SVM_EXITINTINFO_TYPE_EXEPT:
                /*
@@ -3725,21 +3659,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
                 */
                if (kvm_exception_is_soft(vector)) {
                        if (vector == BP_VECTOR && int3_injected &&
-                           kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
-                               kvm_rip_write(&svm->vcpu,
-                                             kvm_rip_read(&svm->vcpu) -
-                                             int3_injected);
+                           kvm_is_linear_rip(vcpu, svm->int3_rip))
+                               kvm_rip_write(vcpu,
+                                             kvm_rip_read(vcpu) - int3_injected);
                        break;
                }
                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
                        u32 err = svm->vmcb->control.exit_int_info_err;
-                       kvm_requeue_exception_e(&svm->vcpu, vector, err);
+                       kvm_requeue_exception_e(vcpu, vector, err);
 
                } else
-                       kvm_requeue_exception(&svm->vcpu, vector);
+                       kvm_requeue_exception(vcpu, vector);
                break;
        case SVM_EXITINTINFO_TYPE_INTR:
-               kvm_queue_interrupt(&svm->vcpu, vector, false);
+               kvm_queue_interrupt(vcpu, vector, false);
                break;
        default:
                break;
@@ -3754,7 +3687,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
        control->exit_int_info = control->event_inj;
        control->exit_int_info_err = control->event_inj_err;
        control->event_inj = 0;
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 }
 
 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
@@ -3766,9 +3699,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
        return EXIT_FASTPATH_NONE;
 }
 
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
-                                       struct vcpu_svm *svm)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned long vmcb_pa = svm->current_vmcb->pa;
+
        /*
         * VMENTER enables interrupts (host state), but the kernel state is
         * interrupts disabled when this is invoked. Also tell RCU about
@@ -3789,12 +3724,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
        guest_enter_irqoff();
        lockdep_hardirqs_on(CALLER_ADDR0);
 
-       if (sev_es_guest(svm->vcpu.kvm)) {
-               __svm_sev_es_vcpu_run(svm->vmcb_pa);
+       if (sev_es_guest(vcpu->kvm)) {
+               __svm_sev_es_vcpu_run(vmcb_pa);
        } else {
                struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
 
-               __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+               /*
+                * Use a single vmcb (vmcb01 because it's always valid) for
+                * context switching guest state via VMLOAD/VMSAVE, that way
+                * the state doesn't need to be copied between vmcb01 and
+                * vmcb02 when switching vmcbs for nested virtualization.
+                */
+               vmload(svm->vmcb01.pa);
+               __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
+               vmsave(svm->vmcb01.pa);
 
                vmload(__sme_page_pa(sd->save_area));
        }
@@ -3845,7 +3788,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                smp_send_reschedule(vcpu->cpu);
        }
 
-       pre_svm_run(svm);
+       pre_svm_run(vcpu);
 
        sync_lapic_to_cr8(vcpu);
 
@@ -3859,7 +3802,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
         */
-       if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+       if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
                svm_set_dr6(svm, vcpu->arch.dr6);
        else
                svm_set_dr6(svm, DR6_ACTIVE_LOW);
@@ -3875,9 +3818,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
-       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       svm_vcpu_enter_exit(vcpu, svm);
+       svm_vcpu_enter_exit(vcpu);
 
        /*
         * We do not use IBRS in the kernel. If this vCPU has used the
@@ -3894,15 +3838,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
-       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
+           unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-       if (!sev_es_guest(svm->vcpu.kvm))
+       if (!sev_es_guest(vcpu->kvm))
                reload_tss(vcpu);
 
-       x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+       if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+               x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       if (!sev_es_guest(svm->vcpu.kvm)) {
+       if (!sev_es_guest(vcpu->kvm)) {
                vcpu->arch.cr2 = svm->vmcb->save.cr2;
                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
@@ -3910,7 +3856,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_before_interrupt(&svm->vcpu);
+               kvm_before_interrupt(vcpu);
 
        kvm_load_host_xsave_state(vcpu);
        stgi();
@@ -3918,13 +3864,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        /* Any pending NMI will happen here */
 
        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_after_interrupt(&svm->vcpu);
+               kvm_after_interrupt(vcpu);
 
        sync_cr8_to_lapic(vcpu);
 
        svm->next_rip = 0;
-       if (is_guest_mode(&svm->vcpu)) {
-               sync_nested_vmcb_control(svm);
+       if (is_guest_mode(vcpu)) {
+               nested_sync_control_from_vmcb02(svm);
                svm->nested.nested_run_pending = 0;
        }
 
@@ -3933,7 +3879,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-               svm->vcpu.arch.apf.host_apf_flags =
+               vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();
 
        if (npt_enabled) {
@@ -3947,9 +3893,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
         */
        if (unlikely(svm->vmcb->control.exit_code ==
                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
-               svm_handle_mce(svm);
+               svm_handle_mce(vcpu);
 
-       svm_complete_interrupts(svm);
+       svm_complete_interrupts(vcpu);
 
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;
@@ -3957,21 +3903,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        return svm_exit_handlers_fastpath(vcpu);
 }
 
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                             int root_level)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;
 
-       cr3 = __sme_set(root);
        if (npt_enabled) {
-               svm->vmcb->control.nested_cr3 = cr3;
+               svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
 
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
                cr3 = vcpu->arch.cr3;
+       } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+               cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
+       } else {
+               /* PCID in the guest should be impossible with a 32-bit MMU. */
+               WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
+               cr3 = root_hpa;
        }
 
        svm->vmcb->save.cr3 = cr3;
@@ -4048,7 +3999,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        /* Update nrips enabled cache */
        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
-                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+                            guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
 
        /* Check again if INVPCID interception if required */
        svm_check_invpcid(svm);
@@ -4060,24 +4011,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                        vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
        }
 
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
+       if (kvm_vcpu_apicv_active(vcpu)) {
+               /*
+                * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+                * is exposed to the guest, disable AVIC.
+                */
+               if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+                       kvm_request_apicv_update(vcpu->kvm, false,
+                                                APICV_INHIBIT_REASON_X2APIC);
 
-       /*
-        * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
-        * is exposed to the guest, disable AVIC.
-        */
-       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_X2APIC);
+               /*
+                * Currently, AVIC does not work with nested virtualization.
+                * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+                */
+               if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+                       kvm_request_apicv_update(vcpu->kvm, false,
+                                                APICV_INHIBIT_REASON_NESTED);
+       }
 
-       /*
-        * Currently, AVIC does not work with nested virtualization.
-        * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
-        */
-       if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_NESTED);
+       if (guest_cpuid_is_intel(vcpu)) {
+               /*
+                * We must intercept SYSENTER_EIP and SYSENTER_ESP
+                * accesses because the processor only stores 32 bits.
+                * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+                */
+               svm_set_intercept(svm, INTERCEPT_VMLOAD);
+               svm_set_intercept(svm, INTERCEPT_VMSAVE);
+               svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+       } else {
+               /*
+                * If hardware supports Virtual VMLOAD VMSAVE then enable it
+                * in VMCB and clear intercepts to avoid #VMEXIT.
+                */
+               if (vls) {
+                       svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+                       svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+                       svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+               }
+               /* No need to intercept these MSRs */
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+               set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+       }
 }
 
 static bool svm_has_wbinvd_exit(void)
@@ -4349,15 +4326,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (!(saved_efer & EFER_SVME))
                                return 1;
 
-                       if (kvm_vcpu_map(&svm->vcpu,
+                       if (kvm_vcpu_map(vcpu,
                                         gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
                                return 1;
 
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
-                       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       kvm_vcpu_unmap(vcpu, &map, true);
                }
        }
 
@@ -4612,6 +4589,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .mem_enc_reg_region = svm_register_enc_region,
        .mem_enc_unreg_region = svm_unregister_enc_region,
 
+       .vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
        .can_emulate_instruction = svm_can_emulate_instruction,
 
        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
index 39e071f..d620619 100644 (file)
@@ -28,7 +28,10 @@ static const u32 host_save_user_msrs[] = {
 };
 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 
-#define MAX_DIRECT_ACCESS_MSRS 18
+#define        IOPM_SIZE (PAGE_SIZE * 3)       /* parenthesized so it expands safely in any expression */
+#define        MSRPM_SIZE (PAGE_SIZE * 2)      /* parenthesized so it expands safely in any expression */
+
+#define MAX_DIRECT_ACCESS_MSRS 20
 #define MSRPM_OFFSETS  16
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
@@ -65,6 +68,8 @@ struct kvm_sev_info {
        unsigned long pages_locked; /* Number of pages locked */
        struct list_head regions_list;  /* List of registered regions */
        u64 ap_jump_table;      /* SEV-ES AP Jump Table address */
+       struct kvm *enc_context_owner; /* Owner of copied encryption context */
+       struct misc_cg *misc_cg; /* For misc cgroup accounting */
 };
 
 struct kvm_svm {
@@ -81,11 +86,19 @@ struct kvm_svm {
 
 struct kvm_vcpu;
 
+struct kvm_vmcb_info {         /* per-VMCB bookkeeping; used for vmcb01 and the nested vmcb02 */
+       struct vmcb *ptr;       /* the VMCB; svm->vmcb is a shorthand for current_vmcb->ptr */
+       unsigned long pa;       /* address handed to vmload()/vmsave() and the vcpu-run asm helpers */
+       int cpu;                /* cpu of this VMCB's previous vmrun; a mismatch marks the VMCB all-dirty */
+       uint64_t asid_generation;       /* compared with the per-cpu generation to decide on a new ASID */
+};
+
 struct svm_nested_state {
-       struct vmcb *hsave;
+       struct kvm_vmcb_info vmcb02;
        u64 hsave_msr;
        u64 vm_cr_msr;
        u64 vmcb12_gpa;
+       u64 last_vmcb12_gpa;
 
        /* These are the merged vectors */
        u32 *msrpm;
@@ -102,13 +115,14 @@ struct svm_nested_state {
 
 struct vcpu_svm {
        struct kvm_vcpu vcpu;
+       /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */
        struct vmcb *vmcb;
-       unsigned long vmcb_pa;
+       struct kvm_vmcb_info vmcb01;
+       struct kvm_vmcb_info *current_vmcb;
        struct svm_cpu_data *svm_data;
        u32 asid;
-       uint64_t asid_generation;
-       uint64_t sysenter_esp;
-       uint64_t sysenter_eip;
+       u32 sysenter_esp_hi;
+       u32 sysenter_eip_hi;
        uint64_t tsc_aux;
 
        u64 msr_decfg;
@@ -239,17 +253,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
        vmcb->control.clean &= ~(1 << bit);
 }
 
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
 {
-       return container_of(vcpu, struct vcpu_svm, vcpu);
+        return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
 }
 
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(&svm->vcpu))
-               return svm->nested.hsave;
-       else
-               return svm->vmcb;
+       return container_of(vcpu, struct vcpu_svm, vcpu);
 }
 
 static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
@@ -272,7 +283,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
 
 static inline void set_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        if (!sev_es_guest(svm->vcpu.kvm)) {
                vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
@@ -299,7 +310,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb->control.intercepts[INTERCEPT_DR] = 0;
 
@@ -314,7 +325,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 
 static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -324,7 +335,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        WARN_ON_ONCE(bit >= 32);
        vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit);
@@ -334,7 +345,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit)
 
 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_set_intercept(&vmcb->control, bit);
 
@@ -343,7 +354,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
 
 static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
 {
-       struct vmcb *vmcb = get_host_vmcb(svm);
+       struct vmcb *vmcb = svm->vmcb01.ptr;
 
        vmcb_clr_intercept(&vmcb->control, bit);
 
@@ -405,7 +416,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu);
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
 void svm_set_gif(struct vcpu_svm *svm, bool value);
-int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code);
+int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
                          int read, int write);
 
@@ -437,20 +448,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-                        struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
 void svm_leave_nested(struct vcpu_svm *svm);
 void svm_free_nested(struct vcpu_svm *svm);
 int svm_allocate_nested(struct vcpu_svm *svm);
-int nested_svm_vmrun(struct vcpu_svm *svm);
+int nested_svm_vmrun(struct kvm_vcpu *vcpu);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
+
+static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
+{
+       svm->vmcb->control.exit_code   = exit_code;     /* caller-supplied exit code */
+       svm->vmcb->control.exit_info_1 = 0;             /* both exit_info fields are cleared */
+       svm->vmcb->control.exit_info_2 = 0;
+       return nested_svm_vmexit(svm);                  /* result of nested_svm_vmexit() is passed through */
+}
+
 int nested_svm_exit_handled(struct vcpu_svm *svm);
-int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
-void sync_nested_vmcb_control(struct vcpu_svm *svm);
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
 
 extern struct kvm_x86_nested_ops svm_nested_ops;
 
@@ -491,8 +512,8 @@ void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm);
 void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
 int avic_init_vcpu(struct vcpu_svm *svm);
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
@@ -561,11 +582,12 @@ int svm_register_enc_region(struct kvm *kvm,
                            struct kvm_enc_region *range);
 int svm_unregister_enc_region(struct kvm *kvm,
                              struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
 void __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
 void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct vcpu_svm *svm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
 void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_create_vcpu(struct vcpu_svm *svm);
index 6feb8c0..4fa17df 100644 (file)
@@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run)
 
        /* Enter guest mode */
        sti
-1:     vmload %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     vmrun %_ASM_AX
-       jmp 5f
-4:     cmpb $0, kvm_rebooting
-       jne 5f
-       ud2
-       _ASM_EXTABLE(3b, 4b)
+1:     vmrun %_ASM_AX
 
-5:     vmsave %_ASM_AX
-       jmp 7f
-6:     cmpb $0, kvm_rebooting
-       jne 7f
-       ud2
-       _ASM_EXTABLE(5b, 6b)
-7:
-       cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_vcpu_run)
 
 /**
@@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        push %_ASM_BX
 
-       /* Enter guest mode */
+       /* Move @vmcb to RAX. */
        mov %_ASM_ARG1, %_ASM_AX
+
+       /* Enter guest mode */
        sti
 
 1:     vmrun %_ASM_AX
-       jmp 3f
-2:     cmpb $0, kvm_rebooting
-       jne 3f
-       ud2
-       _ASM_EXTABLE(1b, 2b)
 
-3:     cli
+2:     cli
 
 #ifdef CONFIG_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
@@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 #endif
        pop %_ASM_BP
        ret
+
+3:     cmpb $0, kvm_rebooting
+       jne 2b
+       ud2
+
+       _ASM_EXTABLE(1b, 3b)
+
 SYM_FUNC_END(__svm_sev_es_vcpu_run)
index bcca0b8..8b11168 100644 (file)
@@ -11,6 +11,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmx.h"
 #include "x86.h"
@@ -21,13 +22,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested_early_check = 0;
 module_param(nested_early_check, bool, S_IRUGO);
 
-#define CC(consistency_check)                                          \
-({                                                                     \
-       bool failed = (consistency_check);                              \
-       if (failed)                                                     \
-               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
-       failed;                                                         \
-})
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
 
 /*
  * Hyper-V requires all of these, so mark them as supported even though
@@ -2306,6 +2301,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
                    exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+               if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+                       vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
+
                secondary_exec_controls_set(vmx, exec_control);
        }
 
@@ -3453,6 +3451,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
        enum nested_evmptrld_status evmptrld_status;
 
+       ++vcpu->stat.nested_run;
+
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
@@ -3810,9 +3810,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 
        /*
         * Process any exceptions that are not debug traps before MTF.
+        *
+        * Note that only a pending nested run can block a pending exception.
+        * Otherwise an injected NMI/interrupt should either be
+        * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
+        * while delivering the pending exception.
         */
+
        if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-               if (block_nested_events)
+               if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -3829,7 +3835,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                if (!nested_vmx_check_exception(vcpu, &exit_qual))
                        goto no_vmexit;
@@ -4105,6 +4111,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
        /* update exit information fields: */
        vmcs12->vm_exit_reason = vm_exit_reason;
+       if (to_vmx(vcpu)->exit_reason.enclave_mode)
+               vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
        vmcs12->exit_qualification = exit_qualification;
        vmcs12->vm_exit_intr_info = exit_intr_info;
 
@@ -4422,6 +4430,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+       /* Similarly, triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
        kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* Service the TLB flush request for L2 before switching to L1. */
@@ -4558,6 +4569,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
        vmx->fail = 0;
 }
 
+static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+}
+
 /*
  * Decode the memory-address operand of a vmx instruction, as recorded on an
  * exit caused by such an instruction (run by a guest hypervisor).
@@ -5479,16 +5495,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
                if (!nested_vmx_check_eptp(vcpu, new_eptp))
                        return 1;
 
-               kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = new_eptp;
-               /*
-                * TODO: Check what's the correct approach in case
-                * mmu reload fails. Currently, we just let the next
-                * reload potentially fail
-                */
-               kvm_mmu_reload(vcpu);
+
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
        }
 
        return 0;
@@ -5705,6 +5716,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
        return false;
 }
 
+/*
+ * Decide whether an ENCLS exit should be reflected based on vmcs12's
+ * ENCLS-exiting bitmap.  Returns false when the guest has no SGX in CPUID or
+ * vmcs12 does not enable ENCLS exiting; otherwise returns the bitmap bit
+ * selected by the leaf number in RAX.
+ */
+static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       u32 leaf;
+
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               return false;
+       if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
+               return false;
+
+       /* Every leaf number above 62 is folded onto bit 63 of the bitmap. */
+       leaf = kvm_rax_read(vcpu);
+       if (leaf >= 63)
+               leaf = 63;
+
+       return (vmcs12->encls_exiting_bitmap & BIT_ULL(leaf)) != 0;
+}
+
 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
        struct vmcs12 *vmcs12, gpa_t bitmap)
 {
@@ -5801,9 +5827,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_VMFUNC:
                /* VM functions are emulated through L2->L0 vmexits. */
                return true;
-       case EXIT_REASON_ENCLS:
-               /* SGX is never exposed to L1 */
-               return true;
        default:
                break;
        }
@@ -5927,6 +5950,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
        case EXIT_REASON_TPAUSE:
                return nested_cpu_has2(vmcs12,
                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
+       case EXIT_REASON_ENCLS:
+               return nested_vmx_exit_handled_encls(vcpu, vmcs12);
        default:
                return true;
        }
@@ -6502,6 +6527,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
                msrs->secondary_ctls_high |=
                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 
+       if (enable_sgx)
+               msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC,
                msrs->misc_low,
@@ -6599,6 +6627,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 struct kvm_x86_nested_ops vmx_nested_ops = {
        .check_events = vmx_check_nested_events,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
+       .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
        .get_nested_state_pages = vmx_get_nested_state_pages,
index 197148d..184418b 100644 (file)
@@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
                PIN_BASED_EXT_INTR_MASK;
 }
 
+/* True if @vmcs12 sets SECONDARY_EXEC_ENCLS_EXITING in its secondary controls. */
+static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING);
+}
+
 /*
  * if fixed0[i] == 1: val[i] must be 1
  * if fixed1[i] == 0: val[i] must be 0
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644 (file)
index 0000000..6693ebd
--- /dev/null
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2021 Intel Corporation. */
+
+#include <asm/sgx.h>
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "nested.h"
+#include "sgx.h"
+#include "vmx.h"
+#include "x86.h"
+
+bool __read_mostly enable_sgx = 1;
+module_param_named(sgx, enable_sgx, bool, 0444);
+
+/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode.  Related prefixes are ignored.
+ *
+ * Compute the linear address of an ENCLS memory operand from @offset and
+ * store it in @gva.  Outside of long mode the DS base is added, the result
+ * is truncated to 32 bits and the DS segment is checked (usable, type 2 or
+ * 3, and limit checks that take @size into account); in long mode only a
+ * canonical-address check is done.  An address that is not aligned to
+ * @alignment always faults.
+ *
+ * Returns 0 on success, or -EINVAL after injecting #GP(0) into the guest.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+                            int size, int alignment, gva_t *gva)
+{
+       struct kvm_segment s;
+       bool fault;
+
+       /*
+        * Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs.  As a
+        * result @s is only populated, and only consumed, outside long mode.
+        */
+       *gva = offset;
+       if (!is_long_mode(vcpu)) {
+               vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
+               *gva += s.base;
+       }
+
+       if (!IS_ALIGNED(*gva, alignment)) {
+               fault = true;
+       } else if (likely(is_long_mode(vcpu))) {
+               fault = is_noncanonical_address(*gva, vcpu);
+       } else {
+               *gva &= 0xffffffff;
+               fault = (s.unusable) ||
+                       (s.type != 2 && s.type != 3) ||
+                       (*gva > s.limit) ||
+                       ((s.base != 0 || s.limit != 0xffffffff) &&
+                       (((u64)*gva + size - 1) > s.limit + 1));
+       }
+       if (fault)
+               kvm_inject_gp(vcpu, 0);
+       return fault ? -EINVAL : 0;
+}
+
+/*
+ * Fill in vcpu->run for a KVM_EXIT_INTERNAL_ERROR / emulation-failure exit,
+ * passing the offending address and access size to userspace as data[0..1].
+ */
+static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
+                                        unsigned int size)
+{
+       struct kvm_run *run = vcpu->run;
+
+       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       run->internal.ndata = 2;
+       run->internal.data[0] = addr;
+       run->internal.data[1] = size;
+}
+
+/*
+ * Copy @size bytes from the host userspace address @hva into @data.  On a
+ * failed copy, set up an emulation-failure exit and return -EFAULT;
+ * otherwise return 0.
+ */
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+                       unsigned int size)
+{
+       unsigned long uncopied;
+
+       uncopied = __copy_from_user(data, (void __user *)hva, size);
+       if (!uncopied)
+               return 0;
+
+       sgx_handle_emulation_failure(vcpu, hva, size);
+       return -EFAULT;
+}
+
+/*
+ * Translate @gva to a guest physical address for a read or, if @write, a
+ * write access, storing the result in @gpa.  If the translation yields
+ * UNMAPPED_GVA, inject the resulting page fault into the guest and return
+ * -EFAULT; otherwise return 0.
+ */
+static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+                         gpa_t *gpa)
+{
+       struct x86_exception fault;
+       gpa_t translated;
+
+       translated = write ? kvm_mmu_gva_to_gpa_write(vcpu, gva, &fault) :
+                            kvm_mmu_gva_to_gpa_read(vcpu, gva, &fault);
+       *gpa = translated;
+       if (translated != UNMAPPED_GVA)
+               return 0;
+
+       kvm_inject_emulated_page_fault(vcpu, &fault);
+       return -EFAULT;
+}
+
+/*
+ * Resolve @gpa to a host virtual address, keeping the offset within the
+ * page.  If the containing frame has no valid HVA, set up an
+ * emulation-failure exit and return -EFAULT (with the error HVA left in
+ * @hva); otherwise return 0.
+ */
+static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
+{
+       unsigned long page_hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
+
+       *hva = page_hva;
+       if (kvm_is_error_hva(page_hva)) {
+               sgx_handle_emulation_failure(vcpu, gpa, 1);
+               return -EFAULT;
+       }
+
+       /* Re-apply the in-page offset that PFN_DOWN() discarded. */
+       *hva = page_hva | (gpa & ~PAGE_MASK);
+       return 0;
+}
+
+/*
+ * Handle a fault (@trapnr) hit while executing an ENCLS leaf on the guest's
+ * behalf, with @gva used as the faulting address of any injected #PF.
+ *
+ * Returns 1 after injecting an exception (#PF with PFERR_SGX_MASK, or
+ * #GP(0)) into the guest, or 0 after filling vcpu->run for a
+ * KVM_EXIT_INTERNAL_ERROR exit.
+ */
+static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
+{
+       struct x86_exception ex;
+
+       /*
+        * A non-EPCM #PF indicates a bad userspace HVA.  This *should* check
+        * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
+        * but the error code isn't (yet) plumbed through the ENCLS helpers.
+        */
+       if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return 0;
+       }
+
+       /*
+        * If the guest thinks it's running on SGX2 hardware, inject an SGX
+        * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
+        * #PF on SGX2).  The assumption is that EPCM faults are much more
+        * likely than a bad userspace address.
+        */
+       if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
+           guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+               memset(&ex, 0, sizeof(ex));
+               ex.vector = PF_VECTOR;
+               ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
+                               PFERR_SGX_MASK;
+               ex.address = gva;
+               ex.error_code_valid = true;
+               ex.nested_page_fault = false;
+               kvm_inject_page_fault(vcpu, &ex);
+       } else {
+               kvm_inject_gp(vcpu, 0);
+       }
+       return 1;
+}
+
+/*
+ * Validate a guest ECREATE against the vCPU's SGX CPUID leaves (0x12.0 and
+ * 0x12.1) and the VM's PROVISIONKEY policy, then run ECREATE on the guest's
+ * behalf via sgx_virt_ecreate().
+ *
+ * @pageinfo:  PAGEINFO whose ->contents is dereferenced here as the SECS to
+ *             validate.
+ * @secs_hva:  host virtual address of the target SECS page.
+ * @secs_gva:  guest virtual address of the same page, used as the faulting
+ *             address if ECREATE itself faults.
+ *
+ * Returns 1 after injecting #GP for a rejected SECS, 0 after filling
+ * vcpu->run for an internal-error exit, the result of
+ * kvm_skip_emulated_instruction() or sgx_inject_fault() once ECREATE has
+ * been attempted, or any other error from sgx_virt_ecreate() as-is.
+ */
+static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
+                                 struct sgx_pageinfo *pageinfo,
+                                 unsigned long secs_hva,
+                                 gva_t secs_gva)
+{
+       struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
+       struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
+       u64 attributes, xfrm, size;
+       u32 miscselect;
+       u8 max_size_log2;
+       int trapnr, ret;
+
+       sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+       sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+       if (!sgx_12_0 || !sgx_12_1) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return 0;
+       }
+
+       miscselect = contents->miscselect;
+       attributes = contents->attributes;
+       xfrm = contents->xfrm;
+       size = contents->size;
+
+       /* Enforce restriction of access to the PROVISIONKEY. */
+       if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
+           (attributes & SGX_ATTR_PROVISIONKEY)) {
+               if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
+                       pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+       if ((u32)miscselect & ~sgx_12_0->ebx ||
+           (u32)attributes & ~sgx_12_1->eax ||
+           (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
+           (u32)xfrm & ~sgx_12_1->ecx ||
+           (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /*
+        * Enforce CPUID restriction on max enclave size.  The u8 truncation
+        * picks EDX[15:8] for 64-bit enclaves and EDX[7:0] otherwise.  Bail
+        * after injecting #GP: falling through would run ECREATE on the host
+        * for an enclave the guest's CPUID model forbids.
+        */
+       max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
+                                                           sgx_12_0->edx;
+       if (size >= BIT_ULL(max_size_log2)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       /*
+        * sgx_virt_ecreate() returns:
+        *  1) 0:       ECREATE was successful
+        *  2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
+        *              exception number.
+        *  3) -EINVAL: access_ok() on @secs_hva failed. This should never
+        *              happen as KVM checks host addresses at memslot creation.
+        *              sgx_virt_ecreate() has already warned in this case.
+        */
+       ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
+       if (!ret)
+               return kvm_skip_emulated_instruction(vcpu);
+       if (ret == -EFAULT)
+               return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+       return ret;
+}
+
+/*
+ * Handle an intercepted ENCLS[ECREATE].  RBX supplies the PAGEINFO pointer
+ * and RCX the SECS pointer.  PAGEINFO is read from guest memory, its
+ * metadata/contents pointers and the SECS pointer are translated
+ * GVA -> GPA -> HVA, and the contents page is copied into a kernel page
+ * before being handed to __handle_encls_ecreate().
+ *
+ * Returns 1 when a fault was injected into the guest, 0 when vcpu->run has
+ * been set up for an exit to userspace, -ENOMEM if the bounce page cannot be
+ * allocated, or whatever __handle_encls_ecreate() returns.
+ */
+static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+       gva_t pageinfo_gva, secs_gva;
+       gva_t metadata_gva, contents_gva;
+       gpa_t metadata_gpa, contents_gpa, secs_gpa;
+       unsigned long metadata_hva, contents_hva, secs_hva;
+       struct sgx_pageinfo pageinfo;
+       struct sgx_secs *contents;
+       struct x86_exception ex;
+       int r;
+
+       if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
+           sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
+               return 1;
+
+       /*
+        * Copy the PAGEINFO to local memory, its pointers need to be
+        * translated, i.e. we need to do a deep copy/translate.
+        */
+       r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
+                               sizeof(pageinfo), &ex);
+       if (r == X86EMUL_PROPAGATE_FAULT) {
+               kvm_inject_emulated_page_fault(vcpu, &ex);
+               return 1;
+       } else if (r != X86EMUL_CONTINUE) {
+               sgx_handle_emulation_failure(vcpu, pageinfo_gva,
+                                            sizeof(pageinfo));
+               return 0;
+       }
+
+       if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
+           sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
+                             &contents_gva))
+               return 1;
+
+       /*
+        * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
+        * Resume the guest on failure to inject a #PF.
+        */
+       if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
+           sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
+           sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
+               return 1;
+
+       /*
+        * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+        * KVM doesn't have to fully process one address at a time.  Exit to
+        * userspace if a GPA is invalid.
+        */
+       if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
+           sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
+           sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
+               return 0;
+
+       /*
+        * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
+        * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
+        * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
+        * enforce restriction of access to the PROVISIONKEY.
+        */
+       contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+       if (!contents)
+               return -ENOMEM;
+
+       /* Exit to userspace if copying from a host userspace address fails. */
+       if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
+               free_page((unsigned long)contents);
+               return 0;
+       }
+
+       /*
+        * Repoint PAGEINFO at host addresses: metadata at its HVA, contents at
+        * the kernel copy made above.
+        */
+       pageinfo.metadata = metadata_hva;
+       pageinfo.contents = (u64)contents;
+
+       r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);
+
+       free_page((unsigned long)contents);
+
+       return r;
+}
+
+/*
+ * Handle an intercepted ENCLS[EINIT].  RBX, RCX and RDX supply the SIGSTRUCT,
+ * SECS and TOKEN pointers.  Each is translated GVA -> GPA -> HVA and EINIT is
+ * run through sgx_virt_einit() with the vCPU's SGXLEPUBKEYHASH values.
+ *
+ * Returns 1 when a fault was injected into the guest, 0 when vcpu->run has
+ * been set up for an exit to userspace, a negative errno from
+ * sgx_virt_einit() other than -EFAULT, or, once EINIT has run, the result of
+ * sgx_inject_fault() / kvm_skip_emulated_instruction().
+ */
+static int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+       unsigned long sig_hva, secs_hva, token_hva, rflags;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       gva_t sig_gva, secs_gva, token_gva;
+       gpa_t sig_gpa, secs_gpa, token_gpa;
+       int ret, trapnr;
+
+       if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
+           sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
+           sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
+               return 1;
+
+       /*
+        * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
+        * Resume the guest on failure to inject a #PF.
+        */
+       if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
+           sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
+           sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
+               return 1;
+
+       /*
+        * ...and then to HVA.  The order of accesses isn't architectural, i.e.
+        * KVM doesn't have to fully process one address at a time.  Exit to
+        * userspace if a GPA is invalid.  Note, all structures are aligned and
+        * cannot split pages.
+        */
+       if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
+           sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
+           sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
+               return 0;
+
+       ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
+                            (void __user *)secs_hva,
+                            vmx->msr_ia32_sgxlepubkeyhash, &trapnr);
+
+       if (ret == -EFAULT)
+               return sgx_inject_fault(vcpu, secs_gva, trapnr);
+
+       /*
+        * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
+        * @token_hva or @secs_hva. This should never happen as KVM checks host
+        * addresses at memslot creation. sgx_virt_einit() has already warned
+        * in this case, so just return.
+        */
+       if (ret < 0)
+               return ret;
+
+       /*
+        * EINIT ran: clear CF/PF/AF/SF/OF, set ZF iff the result is non-zero,
+        * and hand the result code back to the guest in RAX.
+        */
+       rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
+                                         X86_EFLAGS_AF | X86_EFLAGS_SF |
+                                         X86_EFLAGS_OF);
+       if (ret)
+               rflags |= X86_EFLAGS_ZF;
+       else
+               rflags &= ~X86_EFLAGS_ZF;
+       vmx_set_rflags(vcpu, rflags);
+
+       kvm_rax_write(vcpu, ret);
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
+/*
+ * Report whether ENCLS @leaf is available to the guest: SGX must be enabled
+ * in KVM and present in guest CPUID, and the leaf must fall in the SGX1
+ * range (ECREATE..ETRACK) or SGX2 range (EAUG..EMODT) with the matching
+ * guest CPUID feature.  Any other leaf is reported as unavailable.
+ */
+static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
+{
+       bool sgx1_leaf = leaf >= ECREATE && leaf <= ETRACK;
+       bool sgx2_leaf = leaf >= EAUG && leaf <= EMODT;
+
+       if (!enable_sgx)
+               return false;
+       if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               return false;
+
+       if (sgx1_leaf)
+               return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+
+       return sgx2_leaf && guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+}
+
+/*
+ * True if the guest's IA32_FEATURE_CONTROL value has every bit of both
+ * FEAT_CTL_SGX_ENABLED and FEAT_CTL_LOCKED set.
+ */
+static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
+{
+       const u64 required = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;
+       u64 feat_ctl = to_vmx(vcpu)->msr_ia32_feature_control;
+
+       /* No required bit may be missing from the guest's value. */
+       return !(~feat_ctl & required);
+}
+
+/*
+ * ENCLS exit handler.  Inject #UD if the leaf in RAX is not available to the
+ * guest, #GP(0) if the guest's feature-control MSR does not enable SGX, and
+ * otherwise dispatch ECREATE/EINIT to their handlers.  Any other leaf is
+ * unexpected: warn and exit to userspace with KVM_EXIT_UNKNOWN.
+ */
+int handle_encls(struct kvm_vcpu *vcpu)
+{
+       u32 leaf = (u32)kvm_rax_read(vcpu);
+
+       if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (!sgx_enabled_in_guest_bios(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       switch (leaf) {
+       case ECREATE:
+               return handle_encls_ecreate(vcpu);
+       case EINIT:
+               return handle_encls_einit(vcpu);
+       default:
+               break;
+       }
+
+       WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+       vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+       vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
+       return 0;
+}
+
+/* Initialize sgx_pubkey_hash[], the initial value of each vCPU's hash MSRs. */
+void setup_default_sgx_lepubkeyhash(void)
+{
+       /*
+        * If SGX is enabled in KVM, the host does not report SGX_LC, and the
+        * hash MSRs can nevertheless be read, the MSRs exist but are
+        * read-only (locked and not writable): use their current values.
+        * MSR_IA32_SGXLEPUBKEYHASH0 is read by the condition itself.
+        */
+       if (enable_sgx && !boot_cpu_has(X86_FEATURE_SGX_LC) &&
+           !rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+               rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+               return;
+       }
+
+       /*
+        * Otherwise use Intel's default value for Skylake hardware: either
+        * Launch Control is not supported, i.e. Intel's hash is hardcoded
+        * into silicon, or it is supported and enabled, i.e. mimic the reset
+        * value and let the guest write the MSRs at will.
+        */
+       sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+       sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+       sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+       sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+}
+
+/* Seed the vCPU's virtual SGXLEPUBKEYHASH MSRs from sgx_pubkey_hash[]. */
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
+{
+       void *guest_hash = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash;
+
+       memcpy(guest_hash, sgx_pubkey_hash, sizeof(sgx_pubkey_hash));
+}
+
+/*
+ * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
+ * restrictions if the guest's allowed-1 settings diverge from hardware.
+ *
+ * Returns true (intercept) if the VM is not allowed the PROVISIONKEY, if the
+ * guest lacks CPUID 0x12 sub-leaf 0 or 1, or if the guest's values for
+ * 0x12.0 EBX/EDX or 0x12.1 EAX/EBX/ECX/EDX differ from the host's.  Returns
+ * false only when all of them match the host.
+ */
+static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *guest_cpuid;
+       u32 eax, ebx, ecx, edx;
+
+       if (!vcpu->kvm->arch.sgx_provisioning_allowed)
+               return true;
+
+       guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0);
+       if (!guest_cpuid)
+               return true;
+
+       cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
+       if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
+               return true;
+
+       guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1);
+       if (!guest_cpuid)
+               return true;
+
+       cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
+       if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
+           guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
+               return true;
+
+       return false;
+}
+
+/*
+ * Compute and write the ENCLS_EXITING_BITMAP VMCS field for @vcpu.  The
+ * bitmap starts with every bit set; bits for the SGX1/SGX2 leaf ranges are
+ * cleared only when the guest has SGX in CPUID and enabled in its
+ * feature-control MSR.  ECREATE and EINIT are then re-added as needed,
+ * followed by vmcs12's bitmap.
+ *
+ * @vmcs12 may be NULL, in which case the vCPU's current vmcs12 is used if
+ * the vCPU is in guest mode.
+ */
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+       /*
+        * There is no software enable bit for SGX that is virtualized by
+        * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
+        * guest (either by the host or by the guest's BIOS) but enabled in the
+        * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
+        * the expected system behavior for ENCLS.
+        */
+       u64 bitmap = -1ull;
+
+       /* Nothing to do if hardware doesn't support SGX */
+       if (!cpu_has_vmx_encls_vmexit())
+               return;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+           sgx_enabled_in_guest_bios(vcpu)) {
+               if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+                       bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
+                       if (sgx_intercept_encls_ecreate(vcpu))
+                               bitmap |= (1 << ECREATE);
+               }
+
+               if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+                       bitmap &= ~GENMASK_ULL(EMODT, EAUG);
+
+               /*
+                * Trap and execute EINIT if launch control is enabled in the
+                * host using the guest's values for launch control MSRs, even
+                * if the guest's values are fixed to hardware default values.
+                * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
+                * the MSRs is extraordinarily expensive.
+                */
+               if (boot_cpu_has(X86_FEATURE_SGX_LC))
+                       bitmap |= (1 << EINIT);
+
+               /* Also trap every leaf that vmcs12's own bitmap selects. */
+               if (!vmcs12 && is_guest_mode(vcpu))
+                       vmcs12 = get_vmcs12(vcpu);
+               if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
+                       bitmap |= vmcs12->encls_exiting_bitmap;
+       }
+       vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
+}
diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h
new file mode 100644 (file)
index 0000000..a400888
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SGX_H
+#define __KVM_X86_SGX_H
+
+#include <linux/kvm_host.h>
+
+#include "capabilities.h"
+#include "vmx_ops.h"
+
+#ifdef CONFIG_X86_SGX_KVM
+extern bool __read_mostly enable_sgx;
+
+int handle_encls(struct kvm_vcpu *vcpu);
+
+void setup_default_sgx_lepubkeyhash(void);
+void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu);
+
+void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+#else
+#define enable_sgx 0
+
+static inline void setup_default_sgx_lepubkeyhash(void) { }
+static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { }
+
+/*
+ * Stub for builds without CONFIG_X86_SGX_KVM, where enable_sgx is defined
+ * as 0: set every bit in the ENCLS-exiting bitmap when the hardware has the
+ * control.  @vcpu and @vmcs12 are unused.
+ */
+static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       /* Nothing to do if hardware doesn't support SGX */
+       if (cpu_has_vmx_encls_vmexit())
+               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+#endif
+
+#endif /* __KVM_X86_SGX_H */
index c8e51c0..034adb6 100644 (file)
@@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VMREAD_BITMAP, vmread_bitmap),
        FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+       FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
index 80232da..1349495 100644 (file)
@@ -69,7 +69,8 @@ struct __packed vmcs12 {
        u64 vm_function_control;
        u64 eptp_list_address;
        u64 pml_address;
-       u64 padding64[3]; /* room for future expansion */
+       u64 encls_exiting_bitmap;
+       u64 padding64[2]; /* room for future expansion */
        /*
         * To allow migration of L1 (complete with its L2 guests) between
         * machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void)
        CHECK_OFFSET(vm_function_control, 296);
        CHECK_OFFSET(eptp_list_address, 304);
        CHECK_OFFSET(pml_address, 312);
+       CHECK_OFFSET(encls_exiting_bitmap, 320);
        CHECK_OFFSET(cr0_guest_host_mask, 344);
        CHECK_OFFSET(cr4_guest_host_mask, 352);
        CHECK_OFFSET(cr0_read_shadow, 360);
index 32cf828..6501d66 100644 (file)
@@ -57,6 +57,7 @@
 #include "mmu.h"
 #include "nested.h"
 #include "pmu.h"
+#include "sgx.h"
 #include "trace.h"
 #include "vmcs.h"
 #include "vmcs12.h"
@@ -472,26 +473,6 @@ static const u32 vmx_uret_msrs_list[] = {
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
-       struct kvm_vcpu *vcpu;
-       u64 tmp_eptp = INVALID_PAGE;
-       int i;
-
-       kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (!VALID_PAGE(tmp_eptp)) {
-                       tmp_eptp = to_vmx(vcpu)->ept_pointer;
-               } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_MISMATCH;
-                       return;
-               }
-       }
-
-       to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
                void *data)
 {
@@ -501,47 +482,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
                        range->pages);
 }
 
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
-               struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_root_ept(hpa_t root_ept,
+                                          struct kvm_tlb_range *range)
 {
-       u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
-       /*
-        * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-        * of the base of EPT PML4 table, strip off EPT configuration
-        * information.
-        */
        if (range)
-               return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
+               return hyperv_flush_guest_mapping_range(root_ept,
                                kvm_fill_hv_flush_list_func, (void *)range);
        else
-               return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
+               return hyperv_flush_guest_mapping(root_ept);
 }
 
 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
                struct kvm_tlb_range *range)
 {
+       struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
        struct kvm_vcpu *vcpu;
-       int ret = 0, i;
+       int ret = 0, i, nr_unique_valid_roots;
+       hpa_t root;
 
-       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_lock(&kvm_vmx->hv_root_ept_lock);
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
-               check_ept_pointer_match(kvm);
+       if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
+               nr_unique_valid_roots = 0;
 
-       if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+               /*
+                * Flush all valid roots, and see if all vCPUs have converged
+                * on a common root, in which case future flushes can skip the
+                * loop and flush the common root.
+                */
                kvm_for_each_vcpu(i, vcpu, kvm) {
-                       /* If ept_pointer is invalid pointer, bypass flush request. */
-                       if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
-                               ret |= __hv_remote_flush_tlb_with_range(
-                                       kvm, vcpu, range);
+                       root = to_vmx(vcpu)->hv_root_ept;
+                       if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
+                               continue;
+
+                       /*
+                        * Set the tracked root to the first valid root.  Keep
+                        * this root for the entirety of the loop even if more
+                        * roots are encountered as a low effort optimization
+                        * to avoid flushing the same (first) root again.
+                        */
+                       if (++nr_unique_valid_roots == 1)
+                               kvm_vmx->hv_root_ept = root;
+
+                       if (!ret)
+                               ret = hv_remote_flush_root_ept(root, range);
+
+                       /*
+                        * Stop processing roots if a failure occurred and
+                        * multiple valid roots have already been detected.
+                        */
+                       if (ret && nr_unique_valid_roots > 1)
+                               break;
                }
+
+               /*
+                * The optimized flush of a single root can't be used if there
+                * are multiple valid roots (obviously).
+                */
+               if (nr_unique_valid_roots > 1)
+                       kvm_vmx->hv_root_ept = INVALID_PAGE;
        } else {
-               ret = __hv_remote_flush_tlb_with_range(kvm,
-                               kvm_get_vcpu(kvm, 0), range);
+               ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
        }
 
-       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+       spin_unlock(&kvm_vmx->hv_root_ept_lock);
        return ret;
 }
 static int hv_remote_flush_tlb(struct kvm *kvm)
@@ -559,7 +563,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
         * evmcs in singe VM shares same assist page.
         */
        if (!*p_hv_pa_pg)
-               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 
        if (!*p_hv_pa_pg)
                return -ENOMEM;
@@ -576,6 +580,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
+/*
+ * Record @root_ept as this vCPU's root for Hyper-V remote TLB flushing, and
+ * invalidate the VM-wide common root if it differs.  Does nothing unless
+ * CONFIG_HYPERV is enabled and hv_remote_flush_tlb is the active
+ * tlb_remote_flush hook.
+ */
+static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+       struct kvm_vmx *kvm_vmx;
+
+       if (kvm_x86_ops.tlb_remote_flush != hv_remote_flush_tlb)
+               return;
+
+       kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+       spin_lock(&kvm_vmx->hv_root_ept_lock);
+       to_vmx(vcpu)->hv_root_ept = root_ept;
+       /* A differing root means the vCPUs no longer share one common root. */
+       if (kvm_vmx->hv_root_ept != root_ept)
+               kvm_vmx->hv_root_ept = INVALID_PAGE;
+       spin_unlock(&kvm_vmx->hv_root_ept_lock);
+#endif
+}
+
 /*
  * Comment's format: document - errata name - stepping - processor name.
  * Refer from
@@ -1570,12 +1589,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+       /*
+        * Emulation of instructions in SGX enclaves is impossible as RIP does
+        * not point to the failing instruction, and even if it did, the code
+        * stream is inaccessible.  Inject #UD instead of exiting to userspace
+        * so that guest userspace can't DoS the guest simply by triggering
+        * emulation (enclaves are CPL3 only).
+        */
+       if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return false;
+       }
        return true;
 }
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+       union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
        unsigned long rip, orig_rip;
+       u32 instr_len;
 
        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1586,9 +1618,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
         * i.e. we end up advancing IP with some random value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-           to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+           exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+               instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+               /*
+                * Emulating an enclave's instructions isn't supported as KVM
+                * cannot access the enclave's memory or its true RIP, e.g. the
+                * vmcs.GUEST_RIP points at the exit point of the enclave, not
+                * the RIP that actually triggered the VM-Exit.  But, because
+                * most instructions that cause VM-Exit will #UD in an enclave,
+                * most instruction-based VM-Exits simply do not occur.
+                *
+                * There are a few exceptions, notably the debug instructions
+                * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+                * and generate #DB/#BP as expected, which KVM might intercept.
+                * But again, the CPU does the dirty work and saves an instr
+                * length of zero so VMMs don't shoot themselves in the foot.
+                * WARN if KVM tries to skip a non-zero length instruction on
+                * a VM-Exit from an enclave.
+                */
+               if (!instr_len)
+                       goto rip_updated;
+
+               WARN(exit_reason.enclave_mode,
+                    "KVM: skipping instruction after SGX enclave VM-Exit");
+
                orig_rip = kvm_rip_read(vcpu);
-               rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               rip = orig_rip + instr_len;
 #ifdef CONFIG_X86_64
                /*
                 * We need to mask out the high 32 bits of RIP if not in 64-bit
@@ -1604,6 +1660,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
                        return 0;
        }
 
+rip_updated:
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);
 
@@ -1865,6 +1922,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_FEAT_CTL:
                msr_info->data = vmx->msr_ia32_feature_control;
                break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+                       return 1;
+               msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+                       [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+               break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!nested_vmx_allowed(vcpu))
                        return 1;
@@ -2158,6 +2222,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);
+
+               /* SGX may be enabled/disabled by guest's firmware */
+               vmx_write_encls_bitmap(vcpu, NULL);
+               break;
+       case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+               /*
+                * On real hardware, the LE hash MSRs are writable before
+                * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+                * at which point SGX related bits in IA32_FEATURE_CONTROL
+                * become writable.
+                *
+                * KVM does not emulate SGX activation for simplicity, so
+                * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+                * is unlocked.  This is technically not architectural
+                * behavior, but it's close enough.
+                */
+               if (!msr_info->host_initiated &&
+                   (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+                   ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+                   !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
+                       return 1;
+               vmx->msr_ia32_sgxlepubkeyhash
+                       [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                if (!msr_info->host_initiated)
@@ -3088,8 +3175,7 @@ static int vmx_get_max_tdp_level(void)
        return 4;
 }
 
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level)
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
        u64 eptp = VMX_EPTP_MT_WB;
 
@@ -3098,13 +3184,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
        if (enable_ept_ad_bits &&
            (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
                eptp |= VMX_EPTP_AD_ENABLE_BIT;
-       eptp |= (root_hpa & PAGE_MASK);
+       eptp |= root_hpa;
 
        return eptp;
 }
 
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
-                            int pgd_level)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+                            int root_level)
 {
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
@@ -3112,16 +3198,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
        u64 eptp;
 
        if (enable_ept) {
-               eptp = construct_eptp(vcpu, pgd, pgd_level);
+               eptp = construct_eptp(vcpu, root_hpa, root_level);
                vmcs_write64(EPT_POINTER, eptp);
 
-               if (kvm_x86_ops.tlb_remote_flush) {
-                       spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-                       to_vmx(vcpu)->ept_pointer = eptp;
-                       to_kvm_vmx(kvm)->ept_pointers_match
-                               = EPT_POINTERS_CHECK;
-                       spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-               }
+               hv_track_root_ept(vcpu, root_hpa);
 
                if (!enable_unrestricted_guest && !is_paging(vcpu))
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3131,7 +3211,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
-               guest_cr3 = pgd;
+               guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
        }
 
        if (update_guest_cr3)
@@ -4314,15 +4394,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        vmx->secondary_exec_control = exec_control;
 }
 
-static void ept_set_mmio_spte_mask(void)
-{
-       /*
-        * EPT Misconfigurations can be generated if the value of bits 2:0
-        * of an EPT paging-structure entry is 110b (write/execute).
-        */
-       kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
 #define VMX_XSS_EXIT_BITMAP 0
 
 /*
@@ -4410,8 +4481,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 
-       if (cpu_has_vmx_encls_vmexit())
-               vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+       vmx_write_encls_bitmap(&vmx->vcpu, NULL);
 
        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
@@ -5184,17 +5254,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
-       /* Treat an INVD instruction as a NOP and just skip it. */
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
@@ -5203,28 +5262,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
-       int err;
-
-       err = kvm_rdpmc(vcpu);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
-       return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
-       u64 new_bv = kvm_read_edx_eax(vcpu);
-       u32 index = kvm_rcx_read(vcpu);
-
-       int err = kvm_set_xcr(vcpu, index, new_bv);
-       return kvm_complete_insn_gp(vcpu, err);
-}
-
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5384,6 +5421,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
 
+       if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+               return 1;
+
        /*
         * A nested guest cannot optimize MMIO vmexits, because we have an
         * nGPA here instead of the required GPA.
@@ -5485,18 +5525,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static void vmx_enable_tdp(void)
-{
-       kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-               enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-               enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-               0ull, VMX_EPT_EXECUTABLE_MASK,
-               cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-               VMX_EPT_RWX_MASK, 0ull);
-
-       ept_set_mmio_spte_mask();
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5516,34 +5544,11 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-       return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
-       kvm_queue_exception(vcpu, UD_VECTOR);
-       return 1;
-}
-
 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 {
        return 1;
 }
 
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
-
 static int handle_invpcid(struct kvm_vcpu *vcpu)
 {
        u32 vmx_instruction_info;
@@ -5632,16 +5637,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+#ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
        /*
-        * SGX virtualization is not yet supported.  There is no software
-        * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-        * to prevent the guest from executing ENCLS.
+        * SGX virtualization is disabled.  There is no software enable bit for
+        * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+        * the guest from executing ENCLS (when SGX is supported by hardware).
         */
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
+#endif /* CONFIG_X86_SGX_KVM */
 
 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
 {
@@ -5668,10 +5675,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
-       [EXIT_REASON_INVD]                    = handle_invd,
+       [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
-       [EXIT_REASON_RDPMC]                   = handle_rdpmc,
-       [EXIT_REASON_VMCALL]                  = handle_vmcall,
+       [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
+       [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
@@ -5685,8 +5692,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
-       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
-       [EXIT_REASON_XSETBV]                  = handle_xsetbv,
+       [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
+       [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
@@ -5694,13 +5701,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-       [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
+       [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
-       [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
+       [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
-       [EXIT_REASON_RDRAND]                  = handle_invalid_op,
-       [EXIT_REASON_RDSEED]                  = handle_invalid_op,
+       [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
+       [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
@@ -5787,12 +5794,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
               vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
 }
 
-void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
+{
+       unsigned int i;
+       struct vmx_msr_entry *e;
+
+       pr_err("MSR %s:\n", name);
+       for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+               pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmentry_ctl, vmexit_ctl;
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
        unsigned long cr4;
-       u64 efer;
+       int efer_slot;
 
        if (!dump_invalid_vmcs) {
                pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5804,7 +5822,6 @@ void dump_vmcs(void)
        cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        cr4 = vmcs_readl(GUEST_CR4);
-       efer = vmcs_read64(GUEST_IA32_EFER);
        secondary_exec_control = 0;
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5816,9 +5833,7 @@ void dump_vmcs(void)
        pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
               cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
        pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
-       if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
-           (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
-       {
+       if (cpu_has_vmx_ept()) {
                pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
                       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
                pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
@@ -5841,10 +5856,20 @@ void dump_vmcs(void)
        vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
        vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
        vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
-       if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
-           (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
-               pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
-                      efer, vmcs_read64(GUEST_IA32_PAT));
+       efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+       else if (efer_slot >= 0)
+               pr_err("EFER= 0x%016llx (autoload)\n",
+                      vmx->msr_autoload.guest.val[efer_slot].value);
+       else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer | (EFER_LMA | EFER_LME));
+       else
+               pr_err("EFER= 0x%016llx (effective)\n",
+                      vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+       if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
        pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
               vmcs_read64(GUEST_IA32_DEBUGCTL),
               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5860,6 +5885,10 @@ void dump_vmcs(void)
        if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                pr_err("InterruptStatus = %04x\n",
                       vmcs_read16(GUEST_INTR_STATUS));
+       if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+       if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+               vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
 
        pr_err("*** Host State ***\n");
        pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
@@ -5881,14 +5910,16 @@ void dump_vmcs(void)
               vmcs_readl(HOST_IA32_SYSENTER_ESP),
               vmcs_read32(HOST_IA32_SYSENTER_CS),
               vmcs_readl(HOST_IA32_SYSENTER_EIP));
-       if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
-               pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
-                      vmcs_read64(HOST_IA32_EFER),
-                      vmcs_read64(HOST_IA32_PAT));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+               pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+       if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+               pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
        if (cpu_has_load_perf_global_ctrl() &&
            vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                pr_err("PerfGlobCtl = 0x%016llx\n",
                       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+       if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+               vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
 
        pr_err("*** Control State ***\n");
        pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -5997,7 +6028,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (exit_reason.failed_vmentry) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason.full;
@@ -6006,7 +6037,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        }
 
        if (unlikely(vmx->fail)) {
-               dump_vmcs();
+               dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6092,7 +6123,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 unexpected_vmexit:
        vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
                    exit_reason.full);
-       dump_vmcs();
+       dump_vmcs(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
@@ -6976,6 +7007,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        else
                memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+       vcpu_setup_sgx_lepubkeyhash(vcpu);
+
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
 
@@ -6989,8 +7022,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
        vmx->pi_desc.nv = POSTED_INTR_VECTOR;
        vmx->pi_desc.sn = 1;
 
-       vmx->ept_pointer = INVALID_PAGE;
-
+#if IS_ENABLED(CONFIG_HYPERV)
+       vmx->hv_root_ept = INVALID_PAGE;
+#endif
        return 0;
 
 free_vmcs:
@@ -7007,7 +7041,9 @@ free_vpid:
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-       spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#if IS_ENABLED(CONFIG_HYPERV)
+       spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
+#endif
 
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
@@ -7302,6 +7338,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 
        set_cr4_guest_host_mask(vmx);
 
+       vmx_write_encls_bitmap(vcpu, NULL);
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+               vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+               vmx->msr_ia32_feature_control_valid_bits |=
+                       FEAT_CTL_SGX_LC_ENABLED;
+       else
+               vmx->msr_ia32_feature_control_valid_bits &=
+                       ~FEAT_CTL_SGX_LC_ENABLED;
+
        /* Refresh #PF interception to account for MAXPHYADDR changes. */
        vmx_update_exception_bitmap(vcpu);
 }
@@ -7322,6 +7371,13 @@ static __init void vmx_set_cpu_caps(void)
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
 
+       if (!enable_sgx) {
+               kvm_cpu_cap_clear(X86_FEATURE_SGX);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+               kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+       }
+
        if (vmx_umip_emulated())
                kvm_cpu_cap_set(X86_FEATURE_UMIP);
 
@@ -7848,7 +7904,8 @@ static __init int hardware_setup(void)
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        if (enable_ept)
-               vmx_enable_tdp();
+               kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+                                     cpu_has_vmx_ept_execute_only());
 
        if (!enable_ept)
                ept_lpage_level = 0;
@@ -7909,6 +7966,8 @@ static __init int hardware_setup(void)
        if (!enable_ept || !cpu_has_vmx_intel_pt())
                pt_mode = PT_MODE_SYSTEM;
 
+       setup_default_sgx_lepubkeyhash();
+
        if (nested) {
                nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
                                           vmx_capability.ept);
index 89da5e1..19fe09f 100644 (file)
@@ -325,7 +325,12 @@ struct vcpu_vmx {
         */
        u64 msr_ia32_feature_control;
        u64 msr_ia32_feature_control_valid_bits;
-       u64 ept_pointer;
+       /* SGX Launch Control public key hash */
+       u64 msr_ia32_sgxlepubkeyhash[4];
+
+#if IS_ENABLED(CONFIG_HYPERV)
+       u64 hv_root_ept;
+#endif
 
        struct pt_desc pt_desc;
        struct lbr_desc lbr_desc;
@@ -338,12 +343,6 @@ struct vcpu_vmx {
        } shadow_msr_intercept;
 };
 
-enum ept_pointers_status {
-       EPT_POINTERS_CHECK = 0,
-       EPT_POINTERS_MATCH = 1,
-       EPT_POINTERS_MISMATCH = 2
-};
-
 struct kvm_vmx {
        struct kvm kvm;
 
@@ -351,8 +350,10 @@ struct kvm_vmx {
        bool ept_identity_pagetable_done;
        gpa_t ept_identity_map_addr;
 
-       enum ept_pointers_status ept_pointers_match;
-       spinlock_t ept_pointer_lock;
+#if IS_ENABLED(CONFIG_HYPERV)
+       hpa_t hv_root_ept;
+       spinlock_t hv_root_ept_lock;
+#endif
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -376,8 +377,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
 void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
-                  int root_level);
+u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
 
 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
@@ -543,6 +543,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
        return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
 }
 
-void dump_vmcs(void);
+void dump_vmcs(struct kvm_vcpu *vcpu);
 
 #endif /* __KVM_X86_VMX_H */
index 692b0c3..164b64f 100644 (file)
@@ -37,6 +37,10 @@ static __always_inline void vmcs_check32(unsigned long field)
 {
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
                         "32-bit accessor invalid for 16-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+                        "32-bit accessor invalid for 64-bit field");
+       BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+                        "32-bit accessor invalid for 64-bit high field");
        BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
                         "32-bit accessor invalid for natural width field");
 }
index 47e021b..f0d0b6e 100644 (file)
@@ -75,6 +75,7 @@
 #include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -245,6 +246,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("l1d_flush", l1d_flush),
        VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+       VCPU_STAT("nested_run", nested_run),
+       VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
+       VCPU_STAT("directed_yield_successful", directed_yield_successful),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -271,8 +275,7 @@ static struct kmem_cache *x86_emulator_cache;
  * When called, it means the previous get/set msr reached an invalid msr.
  * Return true if we want to ignore/silent this failed msr access.
  */
-static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
-                                 u64 data, bool write)
+static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
 {
        const char *op = write ? "wrmsr" : "rdmsr";
 
@@ -544,8 +547,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
        queue:
-               if (has_error && !is_protmode(vcpu))
-                       has_error = false;
                if (reinject) {
                        /*
                         * On vmentry, vcpu->arch.exception.pending is only
@@ -984,14 +985,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
        return 0;
 }
 
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
 {
-       if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
-               return __kvm_set_xcr(vcpu, index, xcr);
+       if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+           __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
 
-       return 1;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_set_xcr);
+EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
 
 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
@@ -1192,20 +1196,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;
-       int err;
 
-       err = kvm_pmu_rdpmc(vcpu, ecx, &data);
-       if (err)
-               return err;
+       if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        kvm_rax_write(vcpu, (u32)data);
        kvm_rdx_write(vcpu, data >> 32);
-       return err;
+       return kvm_skip_emulated_instruction(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_rdpmc);
+EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -1445,7 +1450,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
        if (r == KVM_MSR_RET_INVALID) {
                /* Unconditionally clear the output for simplicity */
                *data = 0;
-               if (kvm_msr_ignored_check(vcpu, index, 0, false))
+               if (kvm_msr_ignored_check(index, 0, false))
                        r = 0;
        }
 
@@ -1526,35 +1531,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 
 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
 {
+       struct kvm_x86_msr_filter *msr_filter;
+       struct msr_bitmap_range *ranges;
        struct kvm *kvm = vcpu->kvm;
-       struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
-       u32 count = kvm->arch.msr_filter.count;
-       u32 i;
-       bool r = kvm->arch.msr_filter.default_allow;
+       bool allowed;
        int idx;
+       u32 i;
 
-       /* MSR filtering not set up or x2APIC enabled, allow everything */
-       if (!count || (index >= 0x800 && index <= 0x8ff))
+       /* x2APIC MSRs do not support filtering. */
+       if (index >= 0x800 && index <= 0x8ff)
                return true;
 
-       /* Prevent collision with set_msr_filter */
        idx = srcu_read_lock(&kvm->srcu);
 
-       for (i = 0; i < count; i++) {
+       msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+       if (!msr_filter) {
+               allowed = true;
+               goto out;
+       }
+
+       allowed = msr_filter->default_allow;
+       ranges = msr_filter->ranges;
+
+       for (i = 0; i < msr_filter->count; i++) {
                u32 start = ranges[i].base;
                u32 end = start + ranges[i].nmsrs;
                u32 flags = ranges[i].flags;
                unsigned long *bitmap = ranges[i].bitmap;
 
                if ((index >= start) && (index < end) && (flags & type)) {
-                       r = !!test_bit(index - start, bitmap);
+                       allowed = !!test_bit(index - start, bitmap);
                        break;
                }
        }
 
+out:
        srcu_read_unlock(&kvm->srcu, idx);
 
-       return r;
+       return allowed;
 }
 EXPORT_SYMBOL_GPL(kvm_msr_allowed);
 
@@ -1611,7 +1625,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
        int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
 
        if (ret == KVM_MSR_RET_INVALID)
-               if (kvm_msr_ignored_check(vcpu, index, data, true))
+               if (kvm_msr_ignored_check(index, data, true))
                        ret = 0;
 
        return ret;
@@ -1649,7 +1663,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
        if (ret == KVM_MSR_RET_INVALID) {
                /* Unconditionally clear *data for simplicity */
                *data = 0;
-               if (kvm_msr_ignored_check(vcpu, index, 0, false))
+               if (kvm_msr_ignored_check(index, 0, false))
                        ret = 0;
        }
 
@@ -1783,6 +1797,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
+{
+       return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
+
+int kvm_emulate_invd(struct kvm_vcpu *vcpu)
+{
+       /* Treat an INVD instruction as a NOP and just skip it. */
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+       pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n");
+       return kvm_emulate_as_nop(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
        xfer_to_guest_mode_prepare();
@@ -2320,7 +2368,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-       spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+       spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
        if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
        } else if (!already_matched) {
@@ -2328,7 +2376,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        }
 
        kvm_track_tsc_matching(vcpu);
-       spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+       spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
 }
 
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2550,11 +2598,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        int i;
        struct kvm_vcpu *vcpu;
        struct kvm_arch *ka = &kvm->arch;
+       unsigned long flags;
+
+       kvm_hv_invalidate_tsc_page(kvm);
 
-       spin_lock(&ka->pvclock_gtod_sync_lock);
        kvm_make_mclock_inprogress_request(kvm);
+
        /* no guest entries from this point */
+       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        pvclock_update_vm_gtod_copy(kvm);
+       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2562,8 +2615,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        /* guest entries allowed */
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-
-       spin_unlock(&ka->pvclock_gtod_sync_lock);
 #endif
 }
 
@@ -2571,17 +2622,18 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct pvclock_vcpu_time_info hv_clock;
+       unsigned long flags;
        u64 ret;
 
-       spin_lock(&ka->pvclock_gtod_sync_lock);
+       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
-               spin_unlock(&ka->pvclock_gtod_sync_lock);
+               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
                return get_kvmclock_base_ns() + ka->kvmclock_offset;
        }
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-       spin_unlock(&ka->pvclock_gtod_sync_lock);
+       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
        get_cpu();
@@ -2675,13 +2727,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * If the host uses TSC clock, then passthrough TSC as stable
         * to the guest.
         */
-       spin_lock(&ka->pvclock_gtod_sync_lock);
+       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        use_master_clock = ka->use_master_clock;
        if (use_master_clock) {
                host_tsc = ka->master_cycle_now;
                kernel_ns = ka->master_kernel_ns;
        }
-       spin_unlock(&ka->pvclock_gtod_sync_lock);
+       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -3370,6 +3422,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = 0;
                break;
        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+               if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+                       return kvm_pmu_get_msr(vcpu, msr_info);
+               if (!msr_info->host_initiated)
+                       return 1;
+               msr_info->data = 0;
+               break;
        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -3759,8 +3817,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_X86_USER_SPACE_MSR:
        case KVM_CAP_X86_MSR_FILTER:
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+       case KVM_CAP_SGX_ATTRIBUTE:
+#endif
+       case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
                r = 1;
                break;
+       case KVM_CAP_SET_GUEST_DEBUG2:
+               return KVM_GUESTDBG_VALID_MASK;
 #ifdef CONFIG_KVM_XEN
        case KVM_CAP_XEN_HVM:
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
@@ -4663,7 +4727,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                        kvm_update_pv_runtime(vcpu);
 
                return 0;
-
        default:
                return -EINVAL;
        }
@@ -5345,6 +5408,28 @@ split_irqchip_unlock:
                        kvm->arch.bus_lock_detection_enabled = true;
                r = 0;
                break;
+#ifdef CONFIG_X86_SGX_KVM
+       case KVM_CAP_SGX_ATTRIBUTE: {
+               unsigned long allowed_attributes = 0;
+
+               r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+               if (r)
+                       break;
+
+               /* KVM only supports the PROVISIONKEY privileged attribute. */
+               if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+                   !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+                       kvm->arch.sgx_provisioning_allowed = true;
+               else
+                       r = -EINVAL;
+               break;
+       }
+#endif
+       case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+               r = -EINVAL;
+               if (kvm_x86_ops.vm_copy_enc_context_from)
+                       r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+               return r;
        default:
                r = -EINVAL;
                break;
@@ -5352,25 +5437,34 @@ split_irqchip_unlock:
        return r;
 }
 
-static void kvm_clear_msr_filter(struct kvm *kvm)
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+       struct kvm_x86_msr_filter *msr_filter;
+
+       msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+       if (!msr_filter)
+               return NULL;
+
+       msr_filter->default_allow = default_allow;
+       return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
 {
        u32 i;
-       u32 count = kvm->arch.msr_filter.count;
-       struct msr_bitmap_range ranges[16];
 
-       mutex_lock(&kvm->lock);
-       kvm->arch.msr_filter.count = 0;
-       memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
-       mutex_unlock(&kvm->lock);
-       synchronize_srcu(&kvm->srcu);
+       if (!msr_filter)
+               return;
+
+       for (i = 0; i < msr_filter->count; i++)
+               kfree(msr_filter->ranges[i].bitmap);
 
-       for (i = 0; i < count; i++)
-               kfree(ranges[i].bitmap);
+       kfree(msr_filter);
 }
 
-static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+                             struct kvm_msr_filter_range *user_range)
 {
-       struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
        struct msr_bitmap_range range;
        unsigned long *bitmap = NULL;
        size_t bitmap_size;
@@ -5404,11 +5498,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user
                goto err;
        }
 
-       /* Everything ok, add this range identifier to our global pool */
-       ranges[kvm->arch.msr_filter.count] = range;
-       /* Make sure we filled the array before we tell anyone to walk it */
-       smp_wmb();
-       kvm->arch.msr_filter.count++;
+       /* Everything ok, add this range identifier. */
+       msr_filter->ranges[msr_filter->count] = range;
+       msr_filter->count++;
 
        return 0;
 err:
@@ -5419,10 +5511,11 @@ err:
 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
 {
        struct kvm_msr_filter __user *user_msr_filter = argp;
+       struct kvm_x86_msr_filter *new_filter, *old_filter;
        struct kvm_msr_filter filter;
        bool default_allow;
-       int r = 0;
        bool empty = true;
+       int r = 0;
        u32 i;
 
        if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
@@ -5435,25 +5528,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
        if (empty && !default_allow)
                return -EINVAL;
 
-       kvm_clear_msr_filter(kvm);
-
-       kvm->arch.msr_filter.default_allow = default_allow;
+       new_filter = kvm_alloc_msr_filter(default_allow);
+       if (!new_filter)
+               return -ENOMEM;
 
-       /*
-        * Protect from concurrent calls to this function that could trigger
-        * a TOCTOU violation on kvm->arch.msr_filter.count.
-        */
-       mutex_lock(&kvm->lock);
        for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
-               r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
-               if (r)
-                       break;
+               r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
+               if (r) {
+                       kvm_free_msr_filter(new_filter);
+                       return r;
+               }
        }
 
+       mutex_lock(&kvm->lock);
+
+       /* The per-VM filter is protected by kvm->lock... */
+       old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
+
+       rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+       synchronize_srcu(&kvm->srcu);
+
+       kvm_free_msr_filter(old_filter);
+
        kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
        mutex_unlock(&kvm->lock);
 
-       return r;
+       return 0;
 }
 
 long kvm_arch_vm_ioctl(struct file *filp,
@@ -5700,6 +5800,7 @@ set_pit2_out:
        }
 #endif
        case KVM_SET_CLOCK: {
+               struct kvm_arch *ka = &kvm->arch;
                struct kvm_clock_data user_ns;
                u64 now_ns;
 
@@ -5718,8 +5819,22 @@ set_pit2_out:
                 * pvclock_update_vm_gtod_copy().
                 */
                kvm_gen_update_masterclock(kvm);
-               now_ns = get_kvmclock_ns(kvm);
-               kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
+
+               /*
+                * This pairs with kvm_guest_time_update(): when masterclock is
+                * in use, we use master_kernel_ns + kvmclock_offset to set
+                * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+                * is slightly ahead) here we risk going negative on unsigned
+                * 'system_time' when 'user_ns.clock' is very small.
+                */
+               spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+               if (kvm->arch.use_master_clock)
+                       now_ns = ka->master_kernel_ns;
+               else
+                       now_ns = get_kvmclock_base_ns();
+               ka->kvmclock_offset = user_ns.clock - now_ns;
+               spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+
                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
                break;
        }
@@ -5959,6 +6074,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception)
@@ -5975,6 +6091,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
        access |= PFERR_WRITE_MASK;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -6603,7 +6720,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
                int cpu = get_cpu();
 
                cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
-               smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+               on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
                                wbinvd_ipi, NULL, 1);
                put_cpu();
                cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
@@ -7698,6 +7815,7 @@ static void kvm_hyperv_tsc_notifier(void)
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int cpu;
+       unsigned long flags;
 
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
@@ -7713,17 +7831,15 @@ static void kvm_hyperv_tsc_notifier(void)
        list_for_each_entry(kvm, &vm_list, vm_list) {
                struct kvm_arch *ka = &kvm->arch;
 
-               spin_lock(&ka->pvclock_gtod_sync_lock);
-
+               spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
                pvclock_update_vm_gtod_copy(kvm);
+               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-
-               spin_unlock(&ka->pvclock_gtod_sync_lock);
        }
        mutex_unlock(&kvm_lock);
 }
@@ -8004,9 +8120,6 @@ int kvm_arch_init(void *opaque)
        if (r)
                goto out_free_percpu;
 
-       kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0,
-                       PT_PRESENT_MASK, 0, sme_me_mask);
        kvm_timer_init();
 
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -8166,21 +8279,35 @@ void kvm_apicv_init(struct kvm *kvm, bool enable)
 }
 EXPORT_SYMBOL_GPL(kvm_apicv_init);
 
-static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
 {
        struct kvm_vcpu *target = NULL;
        struct kvm_apic_map *map;
 
+       vcpu->stat.directed_yield_attempted++;
+
        rcu_read_lock();
-       map = rcu_dereference(kvm->arch.apic_map);
+       map = rcu_dereference(vcpu->kvm->arch.apic_map);
 
        if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
                target = map->phys_map[dest_id]->vcpu;
 
        rcu_read_unlock();
 
-       if (target && READ_ONCE(target->ready))
-               kvm_vcpu_yield_to(target);
+       if (!target || !READ_ONCE(target->ready))
+               goto no_yield;
+
+       /* Ignore requests to yield to self */
+       if (vcpu == target)
+               goto no_yield;
+
+       if (kvm_vcpu_yield_to(target) <= 0)
+               goto no_yield;
+
+       vcpu->stat.directed_yield_successful++;
+
+no_yield:
+       return;
 }
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -8227,7 +8354,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                        break;
 
                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
-               kvm_sched_yield(vcpu->kvm, a1);
+               kvm_sched_yield(vcpu, a1);
                ret = 0;
                break;
 #ifdef CONFIG_X86_64
@@ -8245,7 +8372,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
                        break;
 
-               kvm_sched_yield(vcpu->kvm, a0);
+               kvm_sched_yield(vcpu, a0);
                ret = 0;
                break;
        default:
@@ -8328,6 +8455,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
 }
 
+
+int kvm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+               return -EIO;
+
+       if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+               return 1;
+       }
+
+       return kvm_x86_ops.nested_ops->check_events(vcpu);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+               vcpu->arch.exception.error_code = false;
+       static_call(kvm_x86_queue_exception)(vcpu);
+}
+
 static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
        int r;
@@ -8336,7 +8484,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        /* try to reinject previous events if any */
 
        if (vcpu->arch.exception.injected) {
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
        /*
@@ -8373,7 +8521,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
         * from L2 to L1.
         */
        if (is_guest_mode(vcpu)) {
-               r = kvm_x86_ops.nested_ops->check_events(vcpu);
+               r = kvm_check_nested_events(vcpu);
                if (r < 0)
                        goto busy;
        }
@@ -8399,7 +8547,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                        }
                }
 
-               static_call(kvm_x86_queue_exception)(vcpu);
+               kvm_inject_exception(vcpu);
                can_inject = false;
        }
 
@@ -8936,10 +9084,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
-                       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
-                       vcpu->mmio_needed = 0;
-                       r = 0;
-                       goto out;
+                       if (is_guest_mode(vcpu)) {
+                               kvm_x86_ops.nested_ops->triple_fault(vcpu);
+                       } else {
+                               vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+                               vcpu->mmio_needed = 0;
+                               r = 0;
+                               goto out;
+                       }
                }
                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
                        /* Page is swapped out. Do synthetic halt */
@@ -9237,7 +9389,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu))
-               kvm_x86_ops.nested_ops->check_events(vcpu);
+               kvm_check_nested_events(vcpu);
 
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted);
@@ -10634,8 +10786,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-       u32 i;
-
        if (current->mm == kvm->mm) {
                /*
                 * Free memory regions allocated on behalf of userspace,
@@ -10651,8 +10801,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                mutex_unlock(&kvm->slots_lock);
        }
        static_call_cond(kvm_x86_vm_destroy)(kvm);
-       for (i = 0; i < kvm->arch.msr_filter.count; i++)
-               kfree(kvm->arch.msr_filter.ranges[i].bitmap);
+       kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
        kvm_pic_destroy(kvm);
        kvm_ioapic_destroy(kvm);
        kvm_free_vcpus(kvm);
@@ -10982,6 +11131,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
        return false;
 }
 
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+               return true;
+
+       return false;
+}
+
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.preempted_in_kernel;
@@ -11503,7 +11660,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
 
                fallthrough;
        case INVPCID_TYPE_ALL_INCL_GLOBAL:
-               kvm_mmu_unload(vcpu);
+               kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
                return kvm_skip_emulated_instruction(vcpu);
 
        default:
index 39eb048..5334bf4 100644 (file)
@@ -8,6 +8,14 @@
 #include "kvm_cache_regs.h"
 #include "kvm_emulate.h"
 
+#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
+({                                                                     \
+       bool failed = (consistency_check);                              \
+       if (failed)                                                     \
+               trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
+       failed;                                                         \
+})
+
 #define KVM_DEFAULT_PLE_GAP            128
 #define KVM_VMX_DEFAULT_PLE_WINDOW     4096
 #define KVM_DEFAULT_PLE_WINDOW_GROW    2
@@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
 
 #define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
 
+int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.exception.pending = false;
@@ -250,7 +260,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
 void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
index 4b01f7d..ae78cef 100644 (file)
@@ -262,7 +262,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
        if (pgprot_val(old_prot) == pgprot_val(new_prot))
                return;
 
-       pa = pfn << page_level_shift(level);
+       pa = pfn << PAGE_SHIFT;
        size = page_level_size(level);
 
        /*
index 6926d0c..b35fc80 100644 (file)
@@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
  * add rsp, 8                      // skip eth_type_trans's frame
  * ret                             // return to its caller
  */
-int arch_prepare_bpf_trampoline(void *image, void *image_end,
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_progs *tprogs,
                                void *orig_call)
@@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 
        save_regs(m, &prog, nr_args, stack_size);
 
+       if (flags & BPF_TRAMP_F_CALL_ORIG) {
+               /* arg1: mov rdi, im */
+               emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
+               if (emit_call(&prog, __bpf_tramp_enter, prog)) {
+                       ret = -EINVAL;
+                       goto cleanup;
+               }
+       }
+
        if (fentry->nr_progs)
                if (invoke_bpf(m, &prog, fentry, stack_size))
                        return -EINVAL;
@@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
        }
 
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
-               if (fentry->nr_progs || fmod_ret->nr_progs)
-                       restore_regs(m, &prog, nr_args, stack_size);
+               restore_regs(m, &prog, nr_args, stack_size);
 
                /* call original function */
                if (emit_call(&prog, orig_call, prog)) {
@@ -2003,6 +2011,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
                }
                /* remember return value in a stack for bpf prog to access */
                emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
+               im->ip_after_call = prog;
+               memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+               prog += X86_PATCH_SIZE;
        }
 
        if (fmod_ret->nr_progs) {
@@ -2033,9 +2044,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
         * the return value is only updated on the stack and still needs to be
         * restored to R0.
         */
-       if (flags & BPF_TRAMP_F_CALL_ORIG)
+       if (flags & BPF_TRAMP_F_CALL_ORIG) {
+               im->ip_epilogue = prog;
+               /* arg1: mov rdi, im */
+               emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
+               if (emit_call(&prog, __bpf_tramp_exit, prog)) {
+                       ret = -EINVAL;
+                       goto cleanup;
+               }
                /* restore original return value back into RAX */
                emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
+       }
 
        EMIT1(0x5B); /* pop rbx */
        EMIT1(0xC9); /* leave */
@@ -2225,7 +2244,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                padding = true;
                goto skip_init_addrs;
        }
-       addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
+       addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
        if (!addrs) {
                prog = orig_prog;
                goto out_addrs;
@@ -2317,7 +2336,7 @@ out_image:
                if (image)
                        bpf_prog_fill_jited_linfo(prog, addrs + 1);
 out_addrs:
-               kfree(addrs);
+               kvfree(addrs);
                kfree(jit_data);
                prog->aux->jit_data = NULL;
        }
index 1ac8578..b42bfda 100644 (file)
@@ -27,7 +27,6 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
 MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
-MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
 
 static bool force;
 
index 17d80f7..ac06ca3 100644 (file)
@@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size);
 unsigned long xen_max_p2m_pfn __read_mostly;
 EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
 
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
-#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
+#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
 #else
 #define P2M_LIMIT 0
 #endif
@@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void)
        xen_p2m_last_pfn = xen_max_p2m_pfn;
 
        p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
-       if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC))
-               p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO;
-
        vm.flags = VM_ALLOC;
        vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
                        PMD_SIZE * PMDS_PER_MID_PAGE);
index 1a3b756..8bfc103 100644 (file)
@@ -59,6 +59,18 @@ static struct {
 } xen_remap_buf __initdata __aligned(PAGE_SIZE);
 static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
 
+/*
+ * The maximum amount of extra memory compared to the base size.  The
+ * main scaling factor is the size of struct page.  At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO                (10)
+
 static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
 
 static void __init xen_parse_512gb(void)
@@ -778,13 +790,13 @@ char * __init xen_memory_setup(void)
                extra_pages += max_pages - max_pfn;
 
        /*
-        * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO
+        * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
         * factor the base size.
         *
         * Make sure we have no memory above max_pages, as this area
         * isn't handled by the p2m management.
         */
-       extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+       extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
                           extra_pages, max_pages - max_pfn);
        i = 0;
        addr = xen_e820_table.entries[0].addr;
index c426b84..45cc0ae 100644 (file)
        LOAD_CP_REGS_TAB(7)
 
 /*
- * coprocessor_flush(struct thread_info*, index)
- *                             a2        a3
- *
- * Save coprocessor registers for coprocessor 'index'.
- * The register values are saved to or loaded from the coprocessor area 
- * inside the task_info structure.
- *
- * Note that this function doesn't update the coprocessor_owner information!
- *
- */
-
-ENTRY(coprocessor_flush)
-
-       /* reserve 4 bytes on stack to save a0 */
-       abi_entry(4)
-
-       s32i    a0, a1, 0
-       movi    a0, .Lsave_cp_regs_jump_table
-       addx8   a3, a3, a0
-       l32i    a4, a3, 4
-       l32i    a3, a3, 0
-       add     a2, a2, a4
-       beqz    a3, 1f
-       callx0  a3
-1:     l32i    a0, a1, 0
-
-       abi_ret(4)
-
-ENDPROC(coprocessor_flush)
-
-/*
  * Entry condition:
  *
  *   a0:       trashed, original value saved on stack (PT_AREG0)
@@ -245,6 +214,39 @@ ENTRY(fast_coprocessor)
 
 ENDPROC(fast_coprocessor)
 
+       .text
+
+/*
+ * coprocessor_flush(struct thread_info*, index)
+ *                             a2        a3
+ *
+ * Save coprocessor registers for coprocessor 'index'.
+ * The register values are saved to or loaded from the coprocessor area
+ * inside the task_info structure.
+ *
+ * Note that this function doesn't update the coprocessor_owner information!
+ *
+ */
+
+ENTRY(coprocessor_flush)
+
+       /* reserve 4 bytes on stack to save a0 */
+       abi_entry(4)
+
+       /* a0 is reused as a scratch register below, so park it on the stack */
+       s32i    a0, a1, 0
+       movi    a0, .Lsave_cp_regs_jump_table
+       /* a3 = table base + index * 8, i.e. each table entry is two words */
+       addx8   a3, a3, a0
+       /* second word of the entry: value added to a2 before the call */
+       l32i    a4, a3, 4
+       /* first word of the entry: routine to call, or 0 for "nothing to do" */
+       l32i    a3, a3, 0
+       add     a2, a2, a4
+       /* skip the call when the table holds no routine for this index */
+       beqz    a3, 1f
+       callx0  a3
+       /* restore the caller's a0 saved at entry */
+1:     l32i    a0, a1, 0
+
+       abi_ret(4)
+
+ENDPROC(coprocessor_flush)
+
        .data
 
 ENTRY(coprocessor_owner)
index 7666408..95a7489 100644 (file)
@@ -112,8 +112,11 @@ good_area:
         */
        fault = handle_mm_fault(vma, address, flags, regs);
 
-       if (fault_signal_pending(fault, regs))
+       if (fault_signal_pending(fault, regs)) {
+               if (!user_mode(regs))
+                       goto bad_page_fault;
                return;
+       }
 
        if (unlikely(fault & VM_FAULT_ERROR)) {
                if (fault & VM_FAULT_OOM)
index 26b7f72..50e5790 100644 (file)
@@ -277,7 +277,7 @@ static struct bio *__bio_chain_endio(struct bio *bio)
 {
        struct bio *parent = bio->bi_private;
 
-       if (!parent->bi_status)
+       if (bio->bi_status && !parent->bi_status)
                parent->bi_status = bio->bi_status;
        bio_put(bio);
        return parent;
@@ -949,7 +949,7 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
 }
 EXPORT_SYMBOL_GPL(bio_release_pages);
 
-static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 {
        WARN_ON_ONCE(bio->bi_max_vecs);
 
@@ -959,11 +959,26 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
        bio->bi_iter.bi_size = iter->count;
        bio_set_flag(bio, BIO_NO_PAGE_REF);
        bio_set_flag(bio, BIO_CLONED);
+}
 
+static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+{
+       __bio_iov_bvec_set(bio, iter);
        iov_iter_advance(iter, iter->count);
        return 0;
 }
 
+/*
+ * Attach the bvecs described by @iter to @bio, limited to the queue's
+ * zone-append maximum.
+ *
+ * The limit is applied to a private copy of @iter, so the caller's iterator
+ * is only advanced by the byte count left in that copy.  Always returns 0.
+ */
+static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
+{
+       struct iov_iter capped = *iter;
+       struct request_queue *queue = bio->bi_bdev->bd_disk->queue;
+
+       /* The queue limit is in sectors; << 9 scales it to a byte count. */
+       iov_iter_truncate(&capped, queue_max_zone_append_sectors(queue) << 9);
+
+       __bio_iov_bvec_set(bio, &capped);
+       iov_iter_advance(iter, capped.count);
+
+       return 0;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -1094,8 +1109,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
        int ret = 0;
 
        if (iov_iter_is_bvec(iter)) {
-               if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
-                       return -EINVAL;
+               if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+                       return bio_iov_bvec_set_append(bio, iter);
                return bio_iov_bvec_set(bio, iter);
        }
 
index ffb4aa0..4d97fb6 100644 (file)
@@ -382,6 +382,14 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
        switch (bio_op(rq->bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
+               if (queue_max_discard_segments(rq->q) > 1) {
+                       struct bio *bio = rq->bio;
+
+                       for_each_bio(bio)
+                               nr_phys_segs++;
+                       return nr_phys_segs;
+               }
+               return 1;
        case REQ_OP_WRITE_ZEROES:
                return 0;
        case REQ_OP_WRITE_SAME:
index 9ebb344..271f659 100644 (file)
@@ -302,7 +302,6 @@ static const char *const rqf_name[] = {
        RQF_NAME(QUIET),
        RQF_NAME(ELVPRIV),
        RQF_NAME(IO_STAT),
-       RQF_NAME(ALLOCED),
        RQF_NAME(PM),
        RQF_NAME(HASHED),
        RQF_NAME(STATS),
index 1a75589..46f055b 100644 (file)
@@ -323,6 +323,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
        int err;
 
        /*
+        * disk_max_parts() won't be zero, either GENHD_FL_EXT_DEVT is set
+        * or 'minors' is passed to alloc_disk().
+        */
+       if (partno >= disk_max_parts(disk))
+               return ERR_PTR(-EINVAL);
+
+       /*
         * Partitions are not supported on zoned block devices that are used as
         * such.
         */
index 3f045b5..a0c1a66 100644 (file)
@@ -99,13 +99,12 @@ acpi_status acpi_ns_root_initialize(void)
                 * just create and link the new node(s) here.
                 */
                new_node =
-                   ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_namespace_node));
+                   acpi_ns_create_node(*ACPI_CAST_PTR(u32, init_val->name));
                if (!new_node) {
                        status = AE_NO_MEMORY;
                        goto unlock_and_exit;
                }
 
-               ACPI_COPY_NAMESEG(new_node->name.ascii, init_val->name);
                new_node->descriptor_type = ACPI_DESC_TYPE_NAMED;
                new_node->type = init_val->type;
 
index e6a5d99..cb8f708 100644 (file)
@@ -9,6 +9,8 @@
 #ifndef _ACPI_INTERNAL_H_
 #define _ACPI_INTERNAL_H_
 
+#include <linux/idr.h>
+
 #define PREFIX "ACPI: "
 
 int early_acpi_osi_init(void);
@@ -96,9 +98,11 @@ void acpi_scan_table_handler(u32 event, void *table, void *context);
 
 extern struct list_head acpi_bus_id_list;
 
+#define ACPI_MAX_DEVICE_INSTANCES      4096
+
 struct acpi_device_bus_id {
        const char *bus_id;
-       unsigned int instance_no;
+       struct ida instance_ida;
        struct list_head node;
 };
 
index d93e400..768a6b4 100644 (file)
@@ -29,6 +29,7 @@
  */
 #ifdef CONFIG_X86
 #include <asm/apic.h>
+#include <asm/cpu.h>
 #endif
 
 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
@@ -541,6 +542,12 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index)
                        wait_for_freeze();
                } else
                        return -ENODEV;
+
+#if defined(CONFIG_X86) && defined(CONFIG_HOTPLUG_CPU)
+               /* If NMI wants to wake up CPU0, start CPU0. */
+               if (wakeup_cpu0())
+                       start_cpu0();
+#endif
        }
 
        /* Never reached */
index a184529..6efe7ed 100644 (file)
@@ -479,9 +479,8 @@ static void acpi_device_del(struct acpi_device *device)
        list_for_each_entry(acpi_device_bus_id, &acpi_bus_id_list, node)
                if (!strcmp(acpi_device_bus_id->bus_id,
                            acpi_device_hid(device))) {
-                       if (acpi_device_bus_id->instance_no > 0)
-                               acpi_device_bus_id->instance_no--;
-                       else {
+                       ida_simple_remove(&acpi_device_bus_id->instance_ida, device->pnp.instance_no);
+                       if (ida_is_empty(&acpi_device_bus_id->instance_ida)) {
                                list_del(&acpi_device_bus_id->node);
                                kfree_const(acpi_device_bus_id->bus_id);
                                kfree(acpi_device_bus_id);
@@ -631,6 +630,21 @@ static struct acpi_device_bus_id *acpi_device_bus_id_match(const char *dev_id)
        return NULL;
 }
 
+/*
+ * Allocate an instance number for @device from the IDA of its bus-id entry
+ * and name the device "<bus_id>:<instance>", with the instance printed as
+ * at least two hex digits.
+ *
+ * Returns 0 on success, or the negative value returned by ida_simple_get()
+ * when no instance number could be allocated.
+ */
+static int acpi_device_set_name(struct acpi_device *device,
+                               struct acpi_device_bus_id *acpi_device_bus_id)
+{
+       int instance_no;
+
+       instance_no = ida_simple_get(&acpi_device_bus_id->instance_ida, 0,
+                                    ACPI_MAX_DEVICE_INSTANCES, GFP_KERNEL);
+       if (instance_no < 0)
+               return instance_no;
+
+       /* Remember the number so it can be handed back to the IDA later. */
+       device->pnp.instance_no = instance_no;
+       dev_set_name(&device->dev, "%s:%02x", acpi_device_bus_id->bus_id,
+                    instance_no);
+
+       return 0;
+}
+
 int acpi_device_add(struct acpi_device *device,
                    void (*release)(struct device *))
 {
@@ -665,7 +679,9 @@ int acpi_device_add(struct acpi_device *device,
 
        acpi_device_bus_id = acpi_device_bus_id_match(acpi_device_hid(device));
        if (acpi_device_bus_id) {
-               acpi_device_bus_id->instance_no++;
+               result = acpi_device_set_name(device, acpi_device_bus_id);
+               if (result)
+                       goto err_unlock;
        } else {
                acpi_device_bus_id = kzalloc(sizeof(*acpi_device_bus_id),
                                             GFP_KERNEL);
@@ -681,9 +697,16 @@ int acpi_device_add(struct acpi_device *device,
                        goto err_unlock;
                }
 
+               ida_init(&acpi_device_bus_id->instance_ida);
+
+               result = acpi_device_set_name(device, acpi_device_bus_id);
+               if (result) {
+                       kfree(acpi_device_bus_id);
+                       goto err_unlock;
+               }
+
                list_add_tail(&acpi_device_bus_id->node, &acpi_bus_id_list);
        }
-       dev_set_name(&device->dev, "%s:%02x", acpi_device_bus_id->bus_id, acpi_device_bus_id->instance_no);
 
        if (device->parent)
                list_add_tail(&device->node, &device->parent->children);
@@ -1647,6 +1670,8 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
        device_initialize(&device->dev);
        dev_set_uevent_suppress(&device->dev, true);
        acpi_init_coherency(device);
+       /* Assume there are unmet deps to start with. */
+       device->dep_unmet = 1;
 }
 
 void acpi_device_add_finalize(struct acpi_device *device)
@@ -1910,6 +1935,8 @@ static void acpi_scan_dep_init(struct acpi_device *adev)
 {
        struct acpi_dep_data *dep;
 
+       adev->dep_unmet = 0;
+
        mutex_lock(&acpi_dep_list_lock);
 
        list_for_each_entry(dep, &acpi_dep_list, node) {
@@ -1957,7 +1984,13 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, bool check_dep,
                return AE_CTRL_DEPTH;
 
        acpi_scan_init_hotplug(device);
-       if (!check_dep)
+       /*
+        * If check_dep is true at this point, the device has no dependencies,
+        * or the creation of the device object would have been postponed above.
+        */
+       if (check_dep)
+               device->dep_unmet = 0;
+       else
                acpi_scan_dep_init(device);
 
 out:
index e48690a..9d58104 100644 (file)
@@ -780,7 +780,7 @@ acpi_status acpi_os_table_override(struct acpi_table_header *existing_table,
 }
 
 /*
- * acpi_table_init()
+ * acpi_locate_initial_tables()
  *
  * find RSDP, find and checksum SDT/XSDT.
  * checksum all tables, print SDT/XSDT
@@ -788,7 +788,7 @@ acpi_status acpi_os_table_override(struct acpi_table_header *existing_table,
  * result: sdt_entry[] is initialized
  */
 
-int __init acpi_table_init(void)
+int __init acpi_locate_initial_tables(void)
 {
        acpi_status status;
 
@@ -803,9 +803,45 @@ int __init acpi_table_init(void)
        status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0);
        if (ACPI_FAILURE(status))
                return -EINVAL;
-       acpi_table_initrd_scan();
 
+       return 0;
+}
+
+/*
+ * Reserve, via memblock_reserve(), the memory range of every table recorded
+ * in initial_tables[].
+ *
+ * The walk stops at the first descriptor whose address or length is zero,
+ * which is treated as the end of the populated part of the array.
+ */
+void __init acpi_reserve_initial_tables(void)
+{
+       int i;
+
+       for (i = 0; i < ACPI_MAX_TABLES; i++) {
+               struct acpi_table_desc *table_desc = &initial_tables[i];
+               u64 start = table_desc->address;
+               u64 size = table_desc->length;
+
+               if (!start || !size)
+                       break;
+
+               /*
+                * The signature is a fixed four-character field with no room
+                * for a terminating NUL, so bound the read with a precision
+                * ("%4.4s") instead of only requesting a minimum width.
+                */
+               pr_info("Reserving %4.4s table memory at [mem 0x%llx-0x%llx]\n",
+                       table_desc->signature.ascii, start, start + size - 1);
+
+               memblock_reserve(start, size);
+       }
+}
+
+/*
+ * Run acpi_table_initrd_scan() followed by check_multiple_madt().
+ *
+ * acpi_table_init() calls this once acpi_locate_initial_tables() has
+ * succeeded.
+ */
+void __init acpi_table_init_complete(void)
+{
+       acpi_table_initrd_scan();
        check_multiple_madt();
+}
+
+/*
+ * Locate the initial ACPI tables and, if that succeeds, run the remaining
+ * table initialization steps.
+ *
+ * Returns the non-zero value from acpi_locate_initial_tables() on failure,
+ * 0 otherwise.
+ */
+int __init acpi_table_init(void)
+{
+       int ret;
+
+       ret = acpi_locate_initial_tables();
+       if (ret)
+               return ret;
+
+       acpi_table_init_complete();
+
        return 0;
 }
 
index 811d298..83cd4c9 100644 (file)
@@ -147,6 +147,7 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
                },
        },
        {
+       .callback = video_detect_force_vendor,
        .ident = "Sony VPCEH3U1E",
        .matches = {
                DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
index 9a70bee..495fd0a 100644 (file)
@@ -100,8 +100,6 @@ static LIST_HEAD(fore200e_boards);
 
 MODULE_AUTHOR("Christophe Lizzi - credits to Uwe Dannowski and Heikki Vatiainen");
 MODULE_DESCRIPTION("FORE Systems 200E-series ATM driver - version " FORE200E_VERSION);
-MODULE_SUPPORTED_DEVICE("PCA-200E, SBA-200E");
-
 
 static const int fore200e_rx_buf_nbr[ BUFFER_SCHEME_NBR ][ BUFFER_MAGN_NBR ] = {
     { BUFFER_S1_NBR, BUFFER_L1_NBR },
index f43430e..24fd6f3 100644 (file)
@@ -470,12 +470,14 @@ static ssize_t charlcd_write(struct file *file, const char __user *buf,
        char c;
 
        for (; count-- > 0; (*ppos)++, tmp++) {
-               if (!in_interrupt() && (((count + 1) & 0x1f) == 0))
+               if (((count + 1) & 0x1f) == 0) {
                        /*
-                        * let's be a little nice with other processes
-                        * that need some CPU
+                        * charlcd_write() is invoked as a VFS->write() callback
+                        * and as such it is always invoked from preemptible
+                        * context and may sleep.
                         */
-                       schedule();
+                       cond_resched();
+               }
 
                if (get_user(c, tmp))
                        return -EFAULT;
@@ -537,12 +539,8 @@ static void charlcd_puts(struct charlcd *lcd, const char *s)
        int count = strlen(s);
 
        for (; count-- > 0; tmp++) {
-               if (!in_interrupt() && (((count + 1) & 0x1f) == 0))
-                       /*
-                        * let's be a little nice with other processes
-                        * that need some CPU
-                        */
-                       schedule();
+               if (((count + 1) & 0x1f) == 0)
+                       cond_resched();
 
                charlcd_write_char(lcd, *tmp);
        }
index 9179825..e2cf3b2 100644 (file)
@@ -97,6 +97,9 @@ static void deferred_probe_work_func(struct work_struct *work)
 
                get_device(dev);
 
+               kfree(dev->p->deferred_probe_reason);
+               dev->p->deferred_probe_reason = NULL;
+
                /*
                 * Drop the mutex while probing each device; the probe path may
                 * manipulate the deferred list
index 18b8242..fe1dad6 100644 (file)
@@ -305,7 +305,7 @@ static int rpm_get_suppliers(struct device *dev)
        return 0;
 }
 
-static void rpm_put_suppliers(struct device *dev)
+static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend)
 {
        struct device_link *link;
 
@@ -313,10 +313,30 @@ static void rpm_put_suppliers(struct device *dev)
                                device_links_read_lock_held()) {
 
                while (refcount_dec_not_one(&link->rpm_active))
-                       pm_runtime_put(link->supplier);
+                       pm_runtime_put_noidle(link->supplier);
+
+               if (try_to_suspend)
+                       pm_request_idle(link->supplier);
        }
 }
 
+/*
+ * Wrapper around __rpm_put_suppliers() with try_to_suspend set, so that
+ * pm_request_idle() is issued for each supplier after its references have
+ * been dropped.
+ */
+static void rpm_put_suppliers(struct device *dev)
+{
+       __rpm_put_suppliers(dev, true);
+}
+
+/*
+ * Ask every supplier of @dev to idle.
+ *
+ * Walks dev->links.suppliers under the device-links read lock and issues
+ * pm_request_idle() for the supplier of each link.
+ */
+static void rpm_suspend_suppliers(struct device *dev)
+{
+       struct device_link *supplier_link;
+       int lock_idx;
+
+       lock_idx = device_links_read_lock();
+
+       list_for_each_entry_rcu(supplier_link, &dev->links.suppliers, c_node,
+                               device_links_read_lock_held()) {
+               pm_request_idle(supplier_link->supplier);
+       }
+
+       device_links_read_unlock(lock_idx);
+}
+
 /**
  * __rpm_callback - Run a given runtime PM callback for a given device.
  * @cb: Runtime PM callback to run.
@@ -325,27 +345,29 @@ static void rpm_put_suppliers(struct device *dev)
 static int __rpm_callback(int (*cb)(struct device *), struct device *dev)
        __releases(&dev->power.lock) __acquires(&dev->power.lock)
 {
-       bool use_links = dev->power.links_count > 0;
-       bool get = false;
        int retval, idx;
-       bool put;
+       bool use_links = dev->power.links_count > 0;
 
        if (dev->power.irq_safe) {
                spin_unlock(&dev->power.lock);
-       } else if (!use_links) {
-               spin_unlock_irq(&dev->power.lock);
        } else {
-               get = dev->power.runtime_status == RPM_RESUMING;
-
                spin_unlock_irq(&dev->power.lock);
 
-               /* Resume suppliers if necessary. */
-               if (get) {
+               /*
+                * Resume suppliers if necessary.
+                *
+                * The device's runtime PM status cannot change until this
+                * routine returns, so it is safe to read the status outside of
+                * the lock.
+                */
+               if (use_links && dev->power.runtime_status == RPM_RESUMING) {
                        idx = device_links_read_lock();
 
                        retval = rpm_get_suppliers(dev);
-                       if (retval)
+                       if (retval) {
+                               rpm_put_suppliers(dev);
                                goto fail;
+                       }
 
                        device_links_read_unlock(idx);
                }
@@ -355,36 +377,24 @@ static int __rpm_callback(int (*cb)(struct device *), struct device *dev)
 
        if (dev->power.irq_safe) {
                spin_lock(&dev->power.lock);
-               return retval;
-       }
-
-       spin_lock_irq(&dev->power.lock);
-
-       if (!use_links)
-               return retval;
-
-       /*
-        * If the device is suspending and the callback has returned success,
-        * drop the usage counters of the suppliers that have been reference
-        * counted on its resume.
-        *
-        * Do that if the resume fails too.
-        */
-       put = dev->power.runtime_status == RPM_SUSPENDING && !retval;
-       if (put)
-               __update_runtime_status(dev, RPM_SUSPENDED);
-       else
-               put = get && retval;
-
-       if (put) {
-               spin_unlock_irq(&dev->power.lock);
+       } else {
+               /*
+                * If the device is suspending and the callback has returned
+                * success, drop the usage counters of the suppliers that have
+                * been reference counted on its resume.
+                *
+                * Do that if resume fails too.
+                */
+               if (use_links
+                   && ((dev->power.runtime_status == RPM_SUSPENDING && !retval)
+                   || (dev->power.runtime_status == RPM_RESUMING && retval))) {
+                       idx = device_links_read_lock();
 
-               idx = device_links_read_lock();
+                       __rpm_put_suppliers(dev, false);
 
 fail:
-               rpm_put_suppliers(dev);
-
-               device_links_read_unlock(idx);
+                       device_links_read_unlock(idx);
+               }
 
                spin_lock_irq(&dev->power.lock);
        }
@@ -654,8 +664,11 @@ static int rpm_suspend(struct device *dev, int rpmflags)
                goto out;
        }
 
+       if (dev->power.irq_safe)
+               goto out;
+
        /* Maybe the parent is now able to suspend. */
-       if (parent && !parent->power.ignore_children && !dev->power.irq_safe) {
+       if (parent && !parent->power.ignore_children) {
                spin_unlock(&dev->power.lock);
 
                spin_lock(&parent->power.lock);
@@ -664,6 +677,14 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 
                spin_lock(&dev->power.lock);
        }
+       /* Maybe the suppliers are now able to suspend. */
+       if (dev->power.links_count > 0) {
+               spin_unlock_irq(&dev->power.lock);
+
+               rpm_suspend_suppliers(dev);
+
+               spin_lock_irq(&dev->power.lock);
+       }
 
  out:
        trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval);
@@ -1669,8 +1690,8 @@ void pm_runtime_get_suppliers(struct device *dev)
                                device_links_read_lock_held())
                if (link->flags & DL_FLAG_PM_RUNTIME) {
                        link->supplier_preactivated = true;
-                       refcount_inc(&link->rpm_active);
                        pm_runtime_get_sync(link->supplier);
+                       refcount_inc(&link->rpm_active);
                }
 
        device_links_read_unlock(idx);
@@ -1683,6 +1704,8 @@ void pm_runtime_get_suppliers(struct device *dev)
 void pm_runtime_put_suppliers(struct device *dev)
 {
        struct device_link *link;
+       unsigned long flags;
+       bool put;
        int idx;
 
        idx = device_links_read_lock();
@@ -1691,7 +1714,11 @@ void pm_runtime_put_suppliers(struct device *dev)
                                device_links_read_lock_held())
                if (link->supplier_preactivated) {
                        link->supplier_preactivated = false;
-                       if (refcount_dec_not_one(&link->rpm_active))
+                       spin_lock_irqsave(&dev->power.lock, flags);
+                       put = pm_runtime_status_suspended(dev) &&
+                             refcount_dec_not_one(&link->rpm_active);
+                       spin_unlock_irqrestore(&dev->power.lock, flags);
+                       if (put)
                                pm_runtime_put(link->supplier);
                }
 
index 0b71292..4aa9683 100644 (file)
@@ -5091,7 +5091,6 @@ module_param(floppy, charp, 0);
 module_param(FLOPPY_IRQ, int, 0);
 module_param(FLOPPY_DMA, int, 0);
 MODULE_AUTHOR("Alain L. Knaff");
-MODULE_SUPPORTED_DEVICE("fd");
 MODULE_LICENSE("GPL");
 
 /* This doesn't actually get used other than for module information */
index d6c821d..51bfd77 100644 (file)
@@ -1369,10 +1369,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
        }
 
        if (dev->zoned)
-               cmd->error = null_process_zoned_cmd(cmd, op,
-                                                   sector, nr_sectors);
+               sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors);
        else
-               cmd->error = null_process_cmd(cmd, op, sector, nr_sectors);
+               sts = null_process_cmd(cmd, op, sector, nr_sectors);
+
+       /* Do not overwrite errors (e.g. timeout errors) */
+       if (cmd->error == BLK_STS_OK)
+               cmd->error = sts;
 
 out:
        nullb_complete_cmd(cmd);
@@ -1451,8 +1454,20 @@ static bool should_requeue_request(struct request *rq)
 
 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
 {
+       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
        pr_info("rq %p timed out\n", rq);
-       blk_mq_complete_request(rq);
+
+       /*
+        * If the device is marked as blocking (i.e. memory backed or zoned
+        * device), the submission path may be blocked waiting for resources
+        * and cause real timeouts. For these real timeouts, the submission
+        * path will complete the request using blk_mq_complete_request().
+        * Only fake timeouts need to execute blk_mq_complete_request() here.
+        */
+       cmd->error = BLK_STS_TIMEOUT;
+       if (cmd->fake_timeout)
+               blk_mq_complete_request(rq);
        return BLK_EH_DONE;
 }
 
@@ -1473,6 +1488,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
        cmd->rq = bd->rq;
        cmd->error = BLK_STS_OK;
        cmd->nq = nq;
+       cmd->fake_timeout = should_timeout_request(bd->rq);
 
        blk_mq_start_request(bd->rq);
 
@@ -1489,7 +1505,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
                        return BLK_STS_OK;
                }
        }
-       if (should_timeout_request(bd->rq))
+       if (cmd->fake_timeout)
                return BLK_STS_OK;
 
        return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
index 83504f3..4876d5a 100644 (file)
@@ -22,6 +22,7 @@ struct nullb_cmd {
        blk_status_t error;
        struct nullb_queue *nq;
        struct hrtimer timer;
+       bool fake_timeout;
 };
 
 struct nullb_queue {
index 1cdf09f..14e4528 100644 (file)
@@ -891,7 +891,7 @@ next:
 out:
        for (i = last_map; i < num; i++) {
                /* Don't zap current batch's valid persistent grants. */
-               if(i >= last_map + segs_to_map)
+               if(i >= map_until)
                        pages[i]->persistent_gnt = NULL;
                pages[i]->handle = BLKBACK_INVALID_HANDLE;
        }
index 3951f7b..bea1595 100644 (file)
@@ -194,5 +194,4 @@ module_init(rsi_91x_bt_module_init);
 module_exit(rsi_91x_bt_module_exit);
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("RSI BT driver");
-MODULE_SUPPORTED_DEVICE("RSI-BT");
 MODULE_LICENSE("Dual BSD/GPL");
index b040447..dcfb32e 100644 (file)
@@ -285,7 +285,7 @@ static int omap_l3_probe(struct platform_device *pdev)
         */
        l3->debug_irq = platform_get_irq(pdev, 0);
        ret = devm_request_irq(l3->dev, l3->debug_irq, l3_interrupt_handler,
-                              0x0, "l3-dbg-irq", l3);
+                              IRQF_NO_THREAD, "l3-dbg-irq", l3);
        if (ret) {
                dev_err(l3->dev, "request_irq failed for %d\n",
                        l3->debug_irq);
@@ -294,7 +294,7 @@ static int omap_l3_probe(struct platform_device *pdev)
 
        l3->app_irq = platform_get_irq(pdev, 1);
        ret = devm_request_irq(l3->dev, l3->app_irq, l3_interrupt_handler,
-                              0x0, "l3-app-irq", l3);
+                              IRQF_NO_THREAD, "l3-app-irq", l3);
        if (ret)
                dev_err(l3->dev, "request_irq failed for %d\n", l3->app_irq);
 
index a27d751..3d74f23 100644 (file)
@@ -3053,7 +3053,9 @@ static int sysc_remove(struct platform_device *pdev)
 
        pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
-       reset_control_assert(ddata->rsts);
+
+       if (!reset_control_status(ddata->rsts))
+               reset_control_assert(ddata->rsts);
 
 unprepare:
        sysc_unprepare(ddata);
index 14b2d80..45ac7ab 100644 (file)
@@ -81,9 +81,6 @@ MODULE_DESCRIPTION("Driver for Applicom Profibus card");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(AC_MINOR);
 
-MODULE_SUPPORTED_DEVICE("ac");
-
-
 static struct applicom_board {
        unsigned long PhysIO;
        void __iomem *RamIO;
index aff0a8e..776abbf 100644 (file)
@@ -64,7 +64,6 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jonathan Buzzard <jonathan@buzzard.org.uk>");
 MODULE_DESCRIPTION("Toshiba laptop SMM driver");
-MODULE_SUPPORTED_DEVICE("toshiba");
 
 static DEFINE_MUTEX(tosh_mutex);
 static int tosh_fn;
index 42f13a2..05ff3b0 100644 (file)
@@ -730,7 +730,8 @@ static int clk_gfx3d_determine_rate(struct clk_hw *hw,
        struct clk_rate_request parent_req = { };
        struct clk_rcg2_gfx3d *cgfx = to_clk_rcg2_gfx3d(hw);
        struct clk_hw *xo, *p0, *p1, *p2;
-       unsigned long request, p0_rate;
+       unsigned long p0_rate;
+       u8 mux_div = cgfx->div;
        int ret;
 
        p0 = cgfx->hws[0];
@@ -750,14 +751,15 @@ static int clk_gfx3d_determine_rate(struct clk_hw *hw,
                return 0;
        }
 
-       request = req->rate;
-       if (cgfx->div > 1)
-               parent_req.rate = request = request * cgfx->div;
+       if (mux_div == 0)
+               mux_div = 1;
+
+       parent_req.rate = req->rate * mux_div;
 
        /* This has to be a fixed rate PLL */
        p0_rate = clk_hw_get_rate(p0);
 
-       if (request == p0_rate) {
+       if (parent_req.rate == p0_rate) {
                req->rate = req->best_parent_rate = p0_rate;
                req->best_parent_hw = p0;
                return 0;
@@ -765,7 +767,7 @@ static int clk_gfx3d_determine_rate(struct clk_hw *hw,
 
        if (req->best_parent_hw == p0) {
                /* Are we going back to a previously used rate? */
-               if (clk_hw_get_rate(p2) == request)
+               if (clk_hw_get_rate(p2) == parent_req.rate)
                        req->best_parent_hw = p2;
                else
                        req->best_parent_hw = p1;
@@ -780,8 +782,7 @@ static int clk_gfx3d_determine_rate(struct clk_hw *hw,
                return ret;
 
        req->rate = req->best_parent_rate = parent_req.rate;
-       if (cgfx->div > 1)
-               req->rate /= cgfx->div;
+       req->rate /= mux_div;
 
        return 0;
 }
index 91dc390..c623ce9 100644 (file)
@@ -510,9 +510,12 @@ static const struct clk_rpmh_desc clk_rpmh_sm8350 = {
        .num_clks = ARRAY_SIZE(sm8350_rpmh_clocks),
 };
 
+/* Resource name must match resource id present in cmd-db */
+DEFINE_CLK_RPMH_ARC(sc7280, bi_tcxo, bi_tcxo_ao, "xo.lvl", 0x3, 4);
+
 static struct clk_hw *sc7280_rpmh_clocks[] = {
-       [RPMH_CXO_CLK]      = &sdm845_bi_tcxo.hw,
-       [RPMH_CXO_CLK_A]    = &sdm845_bi_tcxo_ao.hw,
+       [RPMH_CXO_CLK]      = &sc7280_bi_tcxo.hw,
+       [RPMH_CXO_CLK_A]    = &sc7280_bi_tcxo_ao.hw,
        [RPMH_LN_BB_CLK2]   = &sdm845_ln_bb_clk2.hw,
        [RPMH_LN_BB_CLK2_A] = &sdm845_ln_bb_clk2_ao.hw,
        [RPMH_RF_CLK1]      = &sdm845_rf_clk1.hw,
index 88e896a..da8b627 100644 (file)
@@ -620,7 +620,7 @@ static struct clk_rcg2 gcc_sdcc1_apps_clk_src = {
                .name = "gcc_sdcc1_apps_clk_src",
                .parent_data = gcc_parent_data_1,
                .num_parents = 5,
-               .ops = &clk_rcg2_ops,
+               .ops = &clk_rcg2_floor_ops,
        },
 };
 
@@ -642,7 +642,7 @@ static struct clk_rcg2 gcc_sdcc1_ice_core_clk_src = {
                .name = "gcc_sdcc1_ice_core_clk_src",
                .parent_data = gcc_parent_data_0,
                .num_parents = 4,
-               .ops = &clk_rcg2_floor_ops,
+               .ops = &clk_rcg2_ops,
        },
 };
 
index ef2a974..75bc401 100644 (file)
@@ -31,7 +31,7 @@ struct stm32_timer_cnt {
        struct counter_device counter;
        struct regmap *regmap;
        struct clk *clk;
-       u32 ceiling;
+       u32 max_arr;
        bool enabled;
        struct stm32_timer_regs bak;
 };
@@ -44,13 +44,14 @@ struct stm32_timer_cnt {
  * @STM32_COUNT_ENCODER_MODE_3: counts on both TI1FP1 and TI2FP2 edges
  */
 enum stm32_count_function {
-       STM32_COUNT_SLAVE_MODE_DISABLED = -1,
+       STM32_COUNT_SLAVE_MODE_DISABLED,
        STM32_COUNT_ENCODER_MODE_1,
        STM32_COUNT_ENCODER_MODE_2,
        STM32_COUNT_ENCODER_MODE_3,
 };
 
 static enum counter_count_function stm32_count_functions[] = {
+       [STM32_COUNT_SLAVE_MODE_DISABLED] = COUNTER_COUNT_FUNCTION_INCREASE,
        [STM32_COUNT_ENCODER_MODE_1] = COUNTER_COUNT_FUNCTION_QUADRATURE_X2_A,
        [STM32_COUNT_ENCODER_MODE_2] = COUNTER_COUNT_FUNCTION_QUADRATURE_X2_B,
        [STM32_COUNT_ENCODER_MODE_3] = COUNTER_COUNT_FUNCTION_QUADRATURE_X4,
@@ -73,8 +74,10 @@ static int stm32_count_write(struct counter_device *counter,
                             const unsigned long val)
 {
        struct stm32_timer_cnt *const priv = counter->priv;
+       u32 ceiling;
 
-       if (val > priv->ceiling)
+       regmap_read(priv->regmap, TIM_ARR, &ceiling);
+       if (val > ceiling)
                return -EINVAL;
 
        return regmap_write(priv->regmap, TIM_CNT, val);
@@ -90,6 +93,9 @@ static int stm32_count_function_get(struct counter_device *counter,
        regmap_read(priv->regmap, TIM_SMCR, &smcr);
 
        switch (smcr & TIM_SMCR_SMS) {
+       case 0:
+               *function = STM32_COUNT_SLAVE_MODE_DISABLED;
+               return 0;
        case 1:
                *function = STM32_COUNT_ENCODER_MODE_1;
                return 0;
@@ -99,9 +105,9 @@ static int stm32_count_function_get(struct counter_device *counter,
        case 3:
                *function = STM32_COUNT_ENCODER_MODE_3;
                return 0;
+       default:
+               return -EINVAL;
        }
-
-       return -EINVAL;
 }
 
 static int stm32_count_function_set(struct counter_device *counter,
@@ -112,6 +118,9 @@ static int stm32_count_function_set(struct counter_device *counter,
        u32 cr1, sms;
 
        switch (function) {
+       case STM32_COUNT_SLAVE_MODE_DISABLED:
+               sms = 0;
+               break;
        case STM32_COUNT_ENCODER_MODE_1:
                sms = 1;
                break;
@@ -122,8 +131,7 @@ static int stm32_count_function_set(struct counter_device *counter,
                sms = 3;
                break;
        default:
-               sms = 0;
-               break;
+               return -EINVAL;
        }
 
        /* Store enable status */
@@ -131,10 +139,6 @@ static int stm32_count_function_set(struct counter_device *counter,
 
        regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, 0);
 
-       /* TIMx_ARR register shouldn't be buffered (ARPE=0) */
-       regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, 0);
-       regmap_write(priv->regmap, TIM_ARR, priv->ceiling);
-
        regmap_update_bits(priv->regmap, TIM_SMCR, TIM_SMCR_SMS, sms);
 
        /* Make sure that registers are updated */
@@ -185,11 +189,13 @@ static ssize_t stm32_count_ceiling_write(struct counter_device *counter,
        if (ret)
                return ret;
 
+       if (ceiling > priv->max_arr)
+               return -ERANGE;
+
        /* TIMx_ARR register shouldn't be buffered (ARPE=0) */
        regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, 0);
        regmap_write(priv->regmap, TIM_ARR, ceiling);
 
-       priv->ceiling = ceiling;
        return len;
 }
 
@@ -274,31 +280,36 @@ static int stm32_action_get(struct counter_device *counter,
        size_t function;
        int err;
 
-       /* Default action mode (e.g. STM32_COUNT_SLAVE_MODE_DISABLED) */
-       *action = STM32_SYNAPSE_ACTION_NONE;
-
        err = stm32_count_function_get(counter, count, &function);
        if (err)
-               return 0;
+               return err;
 
        switch (function) {
+       case STM32_COUNT_SLAVE_MODE_DISABLED:
+               /* counts on internal clock when CEN=1 */
+               *action = STM32_SYNAPSE_ACTION_NONE;
+               return 0;
        case STM32_COUNT_ENCODER_MODE_1:
                /* counts up/down on TI1FP1 edge depending on TI2FP2 level */
                if (synapse->signal->id == count->synapses[0].signal->id)
                        *action = STM32_SYNAPSE_ACTION_BOTH_EDGES;
-               break;
+               else
+                       *action = STM32_SYNAPSE_ACTION_NONE;
+               return 0;
        case STM32_COUNT_ENCODER_MODE_2:
                /* counts up/down on TI2FP2 edge depending on TI1FP1 level */
                if (synapse->signal->id == count->synapses[1].signal->id)
                        *action = STM32_SYNAPSE_ACTION_BOTH_EDGES;
-               break;
+               else
+                       *action = STM32_SYNAPSE_ACTION_NONE;
+               return 0;
        case STM32_COUNT_ENCODER_MODE_3:
                /* counts up/down on both TI1FP1 and TI2FP2 edges */
                *action = STM32_SYNAPSE_ACTION_BOTH_EDGES;
-               break;
+               return 0;
+       default:
+               return -EINVAL;
        }
-
-       return 0;
 }
 
 static const struct counter_ops stm32_timer_cnt_ops = {
@@ -359,7 +370,7 @@ static int stm32_timer_cnt_probe(struct platform_device *pdev)
 
        priv->regmap = ddata->regmap;
        priv->clk = ddata->clk;
-       priv->ceiling = ddata->max_arr;
+       priv->max_arr = ddata->max_arr;
 
        priv->counter.name = dev_name(dev);
        priv->counter.parent = dev;
index d3f756f..67e56cf 100644 (file)
@@ -267,7 +267,7 @@ struct freq_attr cpufreq_freq_attr_##_name##_freqs =     \
 __ATTR_RO(_name##_frequencies)
 
 /*
- * show_scaling_available_frequencies - show available normal frequencies for
+ * scaling_available_frequencies_show - show available normal frequencies for
  * the specified CPU
  */
 static ssize_t scaling_available_frequencies_show(struct cpufreq_policy *policy,
@@ -279,7 +279,7 @@ cpufreq_attr_available_freq(scaling_available);
 EXPORT_SYMBOL_GPL(cpufreq_freq_attr_scaling_available_freqs);
 
 /*
- * show_available_boost_freqs - show available boost frequencies for
+ * scaling_boost_frequencies_show - show available boost frequencies for
  * the specified CPU
  */
 static ssize_t scaling_boost_frequencies_show(struct cpufreq_policy *policy,
index cb9b4c4..6ee7031 100644 (file)
@@ -129,6 +129,7 @@ static int sev_cmd_buffer_len(int cmd)
        case SEV_CMD_DOWNLOAD_FIRMWARE:         return sizeof(struct sev_data_download_firmware);
        case SEV_CMD_GET_ID:                    return sizeof(struct sev_data_get_id);
        case SEV_CMD_ATTESTATION_REPORT:        return sizeof(struct sev_data_attestation_report);
+       case SEV_CMD_SEND_CANCEL:                       return sizeof(struct sev_data_send_cancel);
        default:                                return 0;
        }
 
@@ -141,6 +142,7 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        struct sev_device *sev;
        unsigned int phys_lsb, phys_msb;
        unsigned int reg, ret = 0;
+       int buf_len;
 
        if (!psp || !psp->sev_data)
                return -ENODEV;
@@ -150,15 +152,27 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
 
        sev = psp->sev_data;
 
+       buf_len = sev_cmd_buffer_len(cmd);
+       if (WARN_ON_ONCE(!data != !buf_len))
+               return -EINVAL;
+
+       /*
+        * Copy the incoming data to driver's scratch buffer as __pa() will not
+        * work for some memory, e.g. vmalloc'd addresses, and @data may not be
+        * physically contiguous.
+        */
+       if (data)
+               memcpy(sev->cmd_buf, data, buf_len);
+
        /* Get the physical address of the command buffer */
-       phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
-       phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
+       phys_lsb = data ? lower_32_bits(__psp_pa(sev->cmd_buf)) : 0;
+       phys_msb = data ? upper_32_bits(__psp_pa(sev->cmd_buf)) : 0;
 
        dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n",
                cmd, phys_msb, phys_lsb, psp_timeout);
 
        print_hex_dump_debug("(in):  ", DUMP_PREFIX_OFFSET, 16, 2, data,
-                            sev_cmd_buffer_len(cmd), false);
+                            buf_len, false);
 
        iowrite32(phys_lsb, sev->io_regs + sev->vdata->cmdbuff_addr_lo_reg);
        iowrite32(phys_msb, sev->io_regs + sev->vdata->cmdbuff_addr_hi_reg);
@@ -194,7 +208,14 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        }
 
        print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
-                            sev_cmd_buffer_len(cmd), false);
+                            buf_len, false);
+
+       /*
+        * Copy potential output from the PSP back to data.  Do this even on
+        * failure in case the caller wants to glean something from the error.
+        */
+       if (data)
+               memcpy(data, sev->cmd_buf, buf_len);
 
        return ret;
 }
@@ -213,6 +234,7 @@ static int sev_do_cmd(int cmd, void *data, int *psp_ret)
 static int __sev_platform_init_locked(int *error)
 {
        struct psp_device *psp = psp_master;
+       struct sev_data_init data;
        struct sev_device *sev;
        int rc = 0;
 
@@ -224,6 +246,7 @@ static int __sev_platform_init_locked(int *error)
        if (sev->state == SEV_STATE_INIT)
                return 0;
 
+       memset(&data, 0, sizeof(data));
        if (sev_es_tmr) {
                u64 tmr_pa;
 
@@ -233,12 +256,12 @@ static int __sev_platform_init_locked(int *error)
                 */
                tmr_pa = __pa(sev_es_tmr);
 
-               sev->init_cmd_buf.flags |= SEV_INIT_FLAGS_SEV_ES;
-               sev->init_cmd_buf.tmr_address = tmr_pa;
-               sev->init_cmd_buf.tmr_len = SEV_ES_TMR_SIZE;
+               data.flags |= SEV_INIT_FLAGS_SEV_ES;
+               data.tmr_address = tmr_pa;
+               data.tmr_len = SEV_ES_TMR_SIZE;
        }
 
-       rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error);
+       rc = __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
        if (rc)
                return rc;
 
@@ -295,15 +318,14 @@ static int sev_platform_shutdown(int *error)
 
 static int sev_get_platform_state(int *state, int *error)
 {
-       struct sev_device *sev = psp_master->sev_data;
+       struct sev_user_data_status data;
        int rc;
 
-       rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS,
-                                &sev->status_cmd_buf, error);
+       rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, error);
        if (rc)
                return rc;
 
-       *state = sev->status_cmd_buf.state;
+       *state = data.state;
        return rc;
 }
 
@@ -341,15 +363,14 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp, bool writable)
 
 static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
 {
-       struct sev_device *sev = psp_master->sev_data;
-       struct sev_user_data_status *data = &sev->status_cmd_buf;
+       struct sev_user_data_status data;
        int ret;
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, &data, &argp->error);
        if (ret)
                return ret;
 
-       if (copy_to_user((void __user *)argp->data, data, sizeof(*data)))
+       if (copy_to_user((void __user *)argp->data, &data, sizeof(data)))
                ret = -EFAULT;
 
        return ret;
@@ -376,7 +397,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
 {
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pek_csr input;
-       struct sev_data_pek_csr *data;
+       struct sev_data_pek_csr data;
        void __user *input_address;
        void *blob = NULL;
        int ret;
@@ -387,9 +408,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* userspace wants to query CSR length */
        if (!input.address || !input.length)
@@ -397,19 +416,15 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp, bool writable)
 
        /* allocate a physically contiguous buffer to store the CSR blob */
        input_address = (void __user *)input.address;
-       if (input.length > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.length > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        blob = kmalloc(input.length, GFP_KERNEL);
-       if (!blob) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (!blob)
+               return -ENOMEM;
 
-       data->address = __psp_pa(blob);
-       data->len = input.length;
+       data.address = __psp_pa(blob);
+       data.len = input.length;
 
 cmd:
        if (sev->state == SEV_STATE_UNINIT) {
@@ -418,10 +433,10 @@ cmd:
                        goto e_free_blob;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, &data, &argp->error);
 
         /* If we query the CSR length, FW responded with expected data. */
-       input.length = data->len;
+       input.length = data.len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -435,8 +450,6 @@ cmd:
 
 e_free_blob:
        kfree(blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -456,21 +469,20 @@ EXPORT_SYMBOL_GPL(psp_copy_user_blob);
 static int sev_get_api_version(void)
 {
        struct sev_device *sev = psp_master->sev_data;
-       struct sev_user_data_status *status;
+       struct sev_user_data_status status;
        int error = 0, ret;
 
-       status = &sev->status_cmd_buf;
-       ret = sev_platform_status(status, &error);
+       ret = sev_platform_status(&status, &error);
        if (ret) {
                dev_err(sev->dev,
                        "SEV: failed to get status. Error: %#x\n", error);
                return 1;
        }
 
-       sev->api_major = status->api_major;
-       sev->api_minor = status->api_minor;
-       sev->build = status->build;
-       sev->state = status->state;
+       sev->api_major = status.api_major;
+       sev->api_minor = status.api_minor;
+       sev->build = status.build;
+       sev->state = status.state;
 
        return 0;
 }
@@ -568,7 +580,7 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
 {
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pek_cert_import input;
-       struct sev_data_pek_cert_import *data;
+       struct sev_data_pek_cert_import data;
        void *pek_blob, *oca_blob;
        int ret;
 
@@ -578,19 +590,14 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
        /* copy PEK certificate blobs from userspace */
        pek_blob = psp_copy_user_blob(input.pek_cert_address, input.pek_cert_len);
-       if (IS_ERR(pek_blob)) {
-               ret = PTR_ERR(pek_blob);
-               goto e_free;
-       }
+       if (IS_ERR(pek_blob))
+               return PTR_ERR(pek_blob);
 
-       data->pek_cert_address = __psp_pa(pek_blob);
-       data->pek_cert_len = input.pek_cert_len;
+       data.reserved = 0;
+       data.pek_cert_address = __psp_pa(pek_blob);
+       data.pek_cert_len = input.pek_cert_len;
 
        /* copy PEK certificate blobs from userspace */
        oca_blob = psp_copy_user_blob(input.oca_cert_address, input.oca_cert_len);
@@ -599,8 +606,8 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
                goto e_free_pek;
        }
 
-       data->oca_cert_address = __psp_pa(oca_blob);
-       data->oca_cert_len = input.oca_cert_len;
+       data.oca_cert_address = __psp_pa(oca_blob);
+       data.oca_cert_len = input.oca_cert_len;
 
        /* If platform is not in INIT state then transition it to INIT */
        if (sev->state != SEV_STATE_INIT) {
@@ -609,21 +616,19 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
                        goto e_free_oca;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, &data, &argp->error);
 
 e_free_oca:
        kfree(oca_blob);
 e_free_pek:
        kfree(pek_blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
 static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 {
        struct sev_user_data_get_id2 input;
-       struct sev_data_get_id *data;
+       struct sev_data_get_id data;
        void __user *input_address;
        void *id_blob = NULL;
        int ret;
@@ -637,28 +642,25 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 
        input_address = (void __user *)input.address;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
-
        if (input.address && input.length) {
                id_blob = kmalloc(input.length, GFP_KERNEL);
-               if (!id_blob) {
-                       kfree(data);
+               if (!id_blob)
                        return -ENOMEM;
-               }
 
-               data->address = __psp_pa(id_blob);
-               data->len = input.length;
+               data.address = __psp_pa(id_blob);
+               data.len = input.length;
+       } else {
+               data.address = 0;
+               data.len = 0;
        }
 
-       ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_GET_ID, &data, &argp->error);
 
        /*
         * Firmware will return the length of the ID value (either the minimum
         * required length or the actual length written), return it to the user.
         */
-       input.length = data->len;
+       input.length = data.len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -666,7 +668,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
        }
 
        if (id_blob) {
-               if (copy_to_user(input_address, id_blob, data->len)) {
+               if (copy_to_user(input_address, id_blob, data.len)) {
                        ret = -EFAULT;
                        goto e_free;
                }
@@ -674,7 +676,6 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 
 e_free:
        kfree(id_blob);
-       kfree(data);
 
        return ret;
 }
@@ -724,7 +725,7 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        struct sev_device *sev = psp_master->sev_data;
        struct sev_user_data_pdh_cert_export input;
        void *pdh_blob = NULL, *cert_blob = NULL;
-       struct sev_data_pdh_cert_export *data;
+       struct sev_data_pdh_cert_export data;
        void __user *input_cert_chain_address;
        void __user *input_pdh_cert_address;
        int ret;
@@ -742,9 +743,7 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
                return -EFAULT;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return -ENOMEM;
+       memset(&data, 0, sizeof(data));
 
        /* Userspace wants to query the certificate length. */
        if (!input.pdh_cert_address ||
@@ -756,25 +755,19 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
        input_cert_chain_address = (void __user *)input.cert_chain_address;
 
        /* Allocate a physically contiguous buffer to store the PDH blob. */
-       if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        /* Allocate a physically contiguous buffer to store the cert chain blob. */
-       if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) {
-               ret = -EFAULT;
-               goto e_free;
-       }
+       if (input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE)
+               return -EFAULT;
 
        pdh_blob = kmalloc(input.pdh_cert_len, GFP_KERNEL);
-       if (!pdh_blob) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (!pdh_blob)
+               return -ENOMEM;
 
-       data->pdh_cert_address = __psp_pa(pdh_blob);
-       data->pdh_cert_len = input.pdh_cert_len;
+       data.pdh_cert_address = __psp_pa(pdh_blob);
+       data.pdh_cert_len = input.pdh_cert_len;
 
        cert_blob = kmalloc(input.cert_chain_len, GFP_KERNEL);
        if (!cert_blob) {
@@ -782,15 +775,15 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp, bool writable)
                goto e_free_pdh;
        }
 
-       data->cert_chain_address = __psp_pa(cert_blob);
-       data->cert_chain_len = input.cert_chain_len;
+       data.cert_chain_address = __psp_pa(cert_blob);
+       data.cert_chain_len = input.cert_chain_len;
 
 cmd:
-       ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error);
+       ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, &data, &argp->error);
 
        /* If we query the length, FW responded with expected data. */
-       input.cert_chain_len = data->cert_chain_len;
-       input.pdh_cert_len = data->pdh_cert_len;
+       input.cert_chain_len = data.cert_chain_len;
+       input.pdh_cert_len = data.pdh_cert_len;
 
        if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
                ret = -EFAULT;
@@ -815,8 +808,6 @@ e_free_cert:
        kfree(cert_blob);
 e_free_pdh:
        kfree(pdh_blob);
-e_free:
-       kfree(data);
        return ret;
 }
 
@@ -976,6 +967,10 @@ int sev_dev_init(struct psp_device *psp)
        if (!sev)
                goto e_err;
 
+       sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 0);
+       if (!sev->cmd_buf)
+               goto e_sev;
+
        psp->sev_data = sev;
 
        sev->dev = dev;
@@ -987,7 +982,7 @@ int sev_dev_init(struct psp_device *psp)
        if (!sev->vdata) {
                ret = -ENODEV;
                dev_err(dev, "sev: missing driver data\n");
-               goto e_err;
+               goto e_buf;
        }
 
        psp_set_sev_irq_handler(psp, sev_irq_handler, sev);
@@ -1002,6 +997,10 @@ int sev_dev_init(struct psp_device *psp)
 
 e_irq:
        psp_clear_sev_irq_handler(psp);
+e_buf:
+       devm_free_pages(dev, (unsigned long)sev->cmd_buf);
+e_sev:
+       devm_kfree(dev, sev);
 e_err:
        psp->sev_data = NULL;
 
index dd5c4fe..666c21e 100644 (file)
@@ -46,12 +46,12 @@ struct sev_device {
        unsigned int int_rcvd;
        wait_queue_head_t int_queue;
        struct sev_misc_dev *misc;
-       struct sev_user_data_status status_cmd_buf;
-       struct sev_data_init init_cmd_buf;
 
        u8 api_major;
        u8 api_minor;
        u8 build;
+
+       void *cmd_buf;
 };
 
 int sev_dev_init(struct psp_device *psp);
index 0a6438c..e7a9561 100644 (file)
@@ -1241,6 +1241,7 @@ int extcon_dev_register(struct extcon_dev *edev)
                                sizeof(*edev->nh), GFP_KERNEL);
        if (!edev->nh) {
                ret = -ENOMEM;
+               device_unregister(&edev->dev);
                goto err_dev;
        }
 
index df3f9bc..4b7ee3f 100644 (file)
@@ -927,7 +927,7 @@ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
        }
 
        /* first try to find a slot in an existing linked list entry */
-       for (prsv = efi_memreserve_root->next; prsv; prsv = rsv->next) {
+       for (prsv = efi_memreserve_root->next; prsv; ) {
                rsv = memremap(prsv, sizeof(*rsv), MEMREMAP_WB);
                index = atomic_fetch_add_unless(&rsv->count, 1, rsv->size);
                if (index < rsv->size) {
@@ -937,6 +937,7 @@ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
                        memunmap(rsv);
                        return efi_mem_reserve_iomem(addr, size);
                }
+               prsv = rsv->next;
                memunmap(rsv);
        }
 
index 41c1d00..abdc8a6 100644 (file)
@@ -485,6 +485,10 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
                        }
 
                        break;
+               case EFI_UNSUPPORTED:
+                       err = -EOPNOTSUPP;
+                       status = EFI_NOT_FOUND;
+                       break;
                case EFI_NOT_FOUND:
                        break;
                default:
index 7ec0822..6367646 100644 (file)
@@ -571,6 +571,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
                               struct lock_class_key *lock_key,
                               struct lock_class_key *request_key)
 {
+       struct fwnode_handle *fwnode = gc->parent ? dev_fwnode(gc->parent) : NULL;
        unsigned long   flags;
        int             ret = 0;
        unsigned        i;
@@ -594,6 +595,12 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
 
        of_gpio_dev_init(gc, gdev);
 
+       /*
+        * Assign fwnode depending on the result of the previous calls,
+        * if none of them succeed, assign it to the parent's one.
+        */
+       gdev->dev.fwnode = dev_fwnode(&gdev->dev) ?: fwnode;
+
        gdev->id = ida_alloc(&gpio_ida, GFP_KERNEL);
        if (gdev->id < 0) {
                ret = gdev->id;
@@ -4256,7 +4263,8 @@ static int __init gpiolib_dev_init(void)
                return ret;
        }
 
-       if (driver_register(&gpio_stub_drv) < 0) {
+       ret = driver_register(&gpio_stub_drv);
+       if (ret < 0) {
                pr_err("gpiolib: could not register GPIO stub driver\n");
                bus_unregister(&gpio_bus_type);
                return ret;
index 49267eb..29885fe 100644 (file)
@@ -1007,13 +1007,9 @@ struct amdgpu_device {
 
        /* s3/s4 mask */
        bool                            in_suspend;
-       bool                            in_hibernate;
-
-       /*
-        * The combination flag in_poweroff_reboot_com used to identify the poweroff
-        * and reboot opt in the s0i3 system-wide suspend.
-        */
-       bool                            in_poweroff_reboot_com;
+       bool                            in_s3;
+       bool                            in_s4;
+       bool                            in_s0ix;
 
        atomic_t                        in_gpu_reset;
        enum pp_mp1_state               mp1_state;
index 6447cd6..8a5a8ff 100644 (file)
@@ -2371,6 +2371,10 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
                i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
                if (!adev->ip_blocks[i].status.late_initialized)
                        continue;
+               /* skip CG for GFX on S0ix */
+               if (adev->in_s0ix &&
+                   adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
+                       continue;
                /* skip CG for VCE/UVD, it's handled specially */
                if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
@@ -2402,6 +2406,10 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power
                i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
                if (!adev->ip_blocks[i].status.late_initialized)
                        continue;
+               /* skip PG for GFX on S0ix */
+               if (adev->in_s0ix &&
+                   adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
+                       continue;
                /* skip CG for VCE/UVD, it's handled specially */
                if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
@@ -2678,11 +2686,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
 {
        int i, r;
 
-       if (adev->in_poweroff_reboot_com ||
-           !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
-               amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
-               amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
-       }
+       amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
+       amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 
        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
@@ -2722,6 +2727,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 {
        int i, r;
 
+       if (adev->in_s0ix)
+               amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
+
        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
@@ -2734,6 +2742,17 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
                        adev->ip_blocks[i].status.hw = false;
                        continue;
                }
+
+               /* skip suspend of gfx and psp for S0ix
+                * gfx is in gfxoff state, so on resume it will exit gfxoff just
+                * like at runtime. PSP is also part of the always on hardware
+                * so no need to suspend it.
+                */
+               if (adev->in_s0ix &&
+                   (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
+                    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
+                       continue;
+
                /* XXX handle errors */
                r = adev->ip_blocks[i].version->funcs->suspend(adev);
                /* XXX handle errors */
@@ -3673,14 +3692,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
  */
 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 {
-       struct amdgpu_device *adev;
-       struct drm_crtc *crtc;
-       struct drm_connector *connector;
-       struct drm_connector_list_iter iter;
+       struct amdgpu_device *adev = drm_to_adev(dev);
        int r;
 
-       adev = drm_to_adev(dev);
-
        if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
                return 0;
 
@@ -3692,61 +3706,19 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
        cancel_delayed_work_sync(&adev->delayed_init_work);
 
-       if (!amdgpu_device_has_dc_support(adev)) {
-               /* turn off display hw */
-               drm_modeset_lock_all(dev);
-               drm_connector_list_iter_begin(dev, &iter);
-               drm_for_each_connector_iter(connector, &iter)
-                       drm_helper_connector_dpms(connector,
-                                                 DRM_MODE_DPMS_OFF);
-               drm_connector_list_iter_end(&iter);
-               drm_modeset_unlock_all(dev);
-                       /* unpin the front buffers and cursors */
-               list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
-                       struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
-                       struct drm_framebuffer *fb = crtc->primary->fb;
-                       struct amdgpu_bo *robj;
-
-                       if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
-                               struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
-                               r = amdgpu_bo_reserve(aobj, true);
-                               if (r == 0) {
-                                       amdgpu_bo_unpin(aobj);
-                                       amdgpu_bo_unreserve(aobj);
-                               }
-                       }
-
-                       if (fb == NULL || fb->obj[0] == NULL) {
-                               continue;
-                       }
-                       robj = gem_to_amdgpu_bo(fb->obj[0]);
-                       /* don't unpin kernel fb objects */
-                       if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
-                               r = amdgpu_bo_reserve(robj, true);
-                               if (r == 0) {
-                                       amdgpu_bo_unpin(robj);
-                                       amdgpu_bo_unreserve(robj);
-                               }
-                       }
-               }
-       }
-
        amdgpu_ras_suspend(adev);
 
        r = amdgpu_device_ip_suspend_phase1(adev);
 
-       amdgpu_amdkfd_suspend(adev, adev->in_runpm);
+       if (!adev->in_s0ix)
+               amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
        /* evict vram memory */
        amdgpu_bo_evict_vram(adev);
 
        amdgpu_fence_driver_suspend(adev);
 
-       if (adev->in_poweroff_reboot_com ||
-           !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
-               r = amdgpu_device_ip_suspend_phase2(adev);
-       else
-               amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
+       r = amdgpu_device_ip_suspend_phase2(adev);
        /* evict remaining vram memory
         * This second call to evict vram is to evict the gart page table
         * using the CPU.
@@ -3768,16 +3740,13 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
  */
 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 {
-       struct drm_connector *connector;
-       struct drm_connector_list_iter iter;
        struct amdgpu_device *adev = drm_to_adev(dev);
-       struct drm_crtc *crtc;
        int r = 0;
 
        if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
                return 0;
 
-       if (amdgpu_acpi_is_s0ix_supported(adev))
+       if (adev->in_s0ix)
                amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
 
        /* post card */
@@ -3802,50 +3771,17 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
        queue_delayed_work(system_wq, &adev->delayed_init_work,
                           msecs_to_jiffies(AMDGPU_RESUME_MS));
 
-       if (!amdgpu_device_has_dc_support(adev)) {
-               /* pin cursors */
-               list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
-                       struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
-
-                       if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
-                               struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
-                               r = amdgpu_bo_reserve(aobj, true);
-                               if (r == 0) {
-                                       r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
-                                       if (r != 0)
-                                               dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
-                                       amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
-                                       amdgpu_bo_unreserve(aobj);
-                               }
-                       }
-               }
+       if (!adev->in_s0ix) {
+               r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
+               if (r)
+                       return r;
        }
-       r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
-       if (r)
-               return r;
 
        /* Make sure IB tests flushed */
        flush_delayed_work(&adev->delayed_init_work);
 
-       /* blat the mode back in */
-       if (fbcon) {
-               if (!amdgpu_device_has_dc_support(adev)) {
-                       /* pre DCE11 */
-                       drm_helper_resume_force_mode(dev);
-
-                       /* turn on display hw */
-                       drm_modeset_lock_all(dev);
-
-                       drm_connector_list_iter_begin(dev, &iter);
-                       drm_for_each_connector_iter(connector, &iter)
-                               drm_helper_connector_dpms(connector,
-                                                         DRM_MODE_DPMS_ON);
-                       drm_connector_list_iter_end(&iter);
-
-                       drm_modeset_unlock_all(dev);
-               }
+       if (fbcon)
                amdgpu_fbdev_set_suspend(adev, 0);
-       }
 
        drm_kms_helper_poll_enable(dev);
 
index 48cb33e..f753e04 100644 (file)
@@ -1310,3 +1310,92 @@ bool amdgpu_crtc_get_scanout_position(struct drm_crtc *crtc,
        return amdgpu_display_get_crtc_scanoutpos(dev, pipe, 0, vpos, hpos,
                                                  stime, etime, mode);
 }
+
+/* Suspend-side display teardown: set every connector to DPMS off, then
+ * unpin each CRTC's cursor BO and scanout BO (kernel fb objects excepted).
+ */
+int amdgpu_display_suspend_helper(struct amdgpu_device *adev)
+{
+       struct drm_device *dev = adev_to_drm(adev);
+       struct drm_crtc *crtc;
+       struct drm_connector *connector;
+       struct drm_connector_list_iter iter;
+
+       /* turn off display hw */
+       drm_modeset_lock_all(dev);
+       drm_connector_list_iter_begin(dev, &iter);
+       drm_for_each_connector_iter(connector, &iter)
+               drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF);
+       drm_connector_list_iter_end(&iter);
+       drm_modeset_unlock_all(dev);
+       /* unpin the front buffers and cursors */
+       list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
+               struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
+               struct drm_framebuffer *fb = crtc->primary->fb;
+               struct amdgpu_bo *robj;
+
+               if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
+                       struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
+                       if (amdgpu_bo_reserve(aobj, true) == 0) {
+                               amdgpu_bo_unpin(aobj);
+                               amdgpu_bo_unreserve(aobj);
+                       }
+               }
+
+               if (fb == NULL || fb->obj[0] == NULL)
+                       continue;
+               robj = gem_to_amdgpu_bo(fb->obj[0]);
+               /* don't unpin kernel fb objects */
+               if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
+                       if (amdgpu_bo_reserve(robj, true) == 0) {
+                               amdgpu_bo_unpin(robj);
+                               amdgpu_bo_unreserve(robj);
+                       }
+               }
+       }
+
+       /* A failed reserve only skips that BO's unpin; it is not reported. */
+       return 0;
+}
+
+/* Re-pin cursor BOs in VRAM and set every connector to DPMS on; returns 0. */
+int amdgpu_display_resume_helper(struct amdgpu_device *adev)
+{
+       struct drm_device *ddev = adev_to_drm(adev);
+       struct drm_connector_list_iter conn_iter;
+       struct drm_connector *conn;
+       struct drm_crtc *crtc;
+
+       /* pin cursors */
+       list_for_each_entry(crtc, &ddev->mode_config.crtc_list, head) {
+               struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc);
+               struct amdgpu_bo *cursor;
+               int ret;
+
+               if (!acrtc->cursor_bo || adev->enable_virtual_display)
+                       continue;
+
+               cursor = gem_to_amdgpu_bo(acrtc->cursor_bo);
+               if (amdgpu_bo_reserve(cursor, true) != 0)
+                       continue;
+               ret = amdgpu_bo_pin(cursor, AMDGPU_GEM_DOMAIN_VRAM);
+               if (ret != 0)
+                       dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", ret);
+               /* the cursor address is refreshed even when the pin failed */
+               acrtc->cursor_addr = amdgpu_bo_gpu_offset(cursor);
+               amdgpu_bo_unreserve(cursor);
+       }
+
+       drm_helper_resume_force_mode(ddev);
+
+       /* turn on display hw */
+       drm_modeset_lock_all(ddev);
+       drm_connector_list_iter_begin(ddev, &conn_iter);
+       drm_for_each_connector_iter(conn, &conn_iter)
+               drm_helper_connector_dpms(conn, DRM_MODE_DPMS_ON);
+       drm_connector_list_iter_end(&conn_iter);
+       drm_modeset_unlock_all(ddev);
+
+       return 0;
+}
+
index dc7b7d1..7b6d83e 100644 (file)
@@ -47,4 +47,7 @@ amdgpu_display_user_framebuffer_create(struct drm_device *dev,
 const struct drm_format_info *
 amdgpu_lookup_format_info(u32 format, uint64_t modifier);
 
+int amdgpu_display_suspend_helper(struct amdgpu_device *adev);
+int amdgpu_display_resume_helper(struct amdgpu_device *adev);
+
 #endif
index b26e2fd..e92e7de 100644 (file)
@@ -1107,6 +1107,7 @@ static const struct pci_device_id pciidlist[] = {
        {0x1002, 0x73A3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73AB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73AE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
+       {0x1002, 0x73AF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
        {0x1002, 0x73BF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SIENNA_CICHLID},
 
        /* Van Gogh */
@@ -1274,24 +1275,35 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
         */
        if (!amdgpu_passthrough(adev))
                adev->mp1_state = PP_MP1_STATE_UNLOAD;
-       adev->in_poweroff_reboot_com = true;
        amdgpu_device_ip_suspend(adev);
-       adev->in_poweroff_reboot_com = false;
        adev->mp1_state = PP_MP1_STATE_NONE;
 }
 
 static int amdgpu_pmops_suspend(struct device *dev)
 {
        struct drm_device *drm_dev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = drm_to_adev(drm_dev);
+       int r;
 
-       return amdgpu_device_suspend(drm_dev, true);
+       if (amdgpu_acpi_is_s0ix_supported(adev))
+               adev->in_s0ix = true;
+       adev->in_s3 = true;
+       r = amdgpu_device_suspend(drm_dev, true);
+       adev->in_s3 = false;
+
+       return r;
 }
 
 static int amdgpu_pmops_resume(struct device *dev)
 {
        struct drm_device *drm_dev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = drm_to_adev(drm_dev);
+       int r;
 
-       return amdgpu_device_resume(drm_dev, true);
+       r = amdgpu_device_resume(drm_dev, true);
+       if (amdgpu_acpi_is_s0ix_supported(adev))
+               adev->in_s0ix = false;
+       return r;
 }
 
 static int amdgpu_pmops_freeze(struct device *dev)
@@ -1300,9 +1312,9 @@ static int amdgpu_pmops_freeze(struct device *dev)
        struct amdgpu_device *adev = drm_to_adev(drm_dev);
        int r;
 
-       adev->in_hibernate = true;
+       adev->in_s4 = true;
        r = amdgpu_device_suspend(drm_dev, true);
-       adev->in_hibernate = false;
+       adev->in_s4 = false;
        if (r)
                return r;
        return amdgpu_asic_reset(adev);
@@ -1318,13 +1330,8 @@ static int amdgpu_pmops_thaw(struct device *dev)
 static int amdgpu_pmops_poweroff(struct device *dev)
 {
        struct drm_device *drm_dev = dev_get_drvdata(dev);
-       struct amdgpu_device *adev = drm_to_adev(drm_dev);
-       int r;
 
-       adev->in_poweroff_reboot_com = true;
-       r =  amdgpu_device_suspend(drm_dev, true);
-       adev->in_poweroff_reboot_com = false;
-       return r;
+       return amdgpu_device_suspend(drm_dev, true);
 }
 
 static int amdgpu_pmops_restore(struct device *dev)
index 64beb33..a4e2cf7 100644 (file)
@@ -778,9 +778,9 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
                        dev_info->high_va_offset = AMDGPU_GMC_HOLE_END;
                        dev_info->high_va_max = AMDGPU_GMC_HOLE_END | vm_size;
                }
-               dev_info->virtual_address_alignment = max((int)PAGE_SIZE, AMDGPU_GPU_PAGE_SIZE);
+               dev_info->virtual_address_alignment = max_t(u32, PAGE_SIZE, AMDGPU_GPU_PAGE_SIZE);
                dev_info->pte_fragment_size = (1 << adev->vm_manager.fragment_size) * AMDGPU_GPU_PAGE_SIZE;
-               dev_info->gart_page_size = AMDGPU_GPU_PAGE_SIZE;
+               dev_info->gart_page_size = max_t(u32, PAGE_SIZE, AMDGPU_GPU_PAGE_SIZE);
                dev_info->cu_active_number = adev->gfx.cu_info.number;
                dev_info->cu_ao_mask = adev->gfx.cu_info.ao_cu_mask;
                dev_info->ce_ram_size = adev->gfx.ce_ram_size;
index 4b29b82..0720504 100644 (file)
@@ -1028,13 +1028,10 @@ int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
 {
        struct ttm_resource_manager *man;
 
-       /* late 2.6.33 fix IGP hibernate - we need pm ops to do this correct */
-#ifndef CONFIG_HIBERNATION
-       if (adev->flags & AMD_IS_APU) {
-               /* Useless to evict on IGP chips */
+       if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
+               /* No need to evict vram on APUs for suspend to ram */
                return 0;
        }
-#endif
 
        man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
        return ttm_resource_manager_evict_all(&adev->mman.bdev, man);
index ad91c0c..7d2c8b1 100644 (file)
@@ -2197,8 +2197,8 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
        uint64_t eaddr;
 
        /* validate the parameters */
-       if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
-           size == 0 || size & AMDGPU_GPU_PAGE_MASK)
+       if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK ||
+           size == 0 || size & ~PAGE_MASK)
                return -EINVAL;
 
        /* make sure object fit at this offset */
@@ -2263,8 +2263,8 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
        int r;
 
        /* validate the parameters */
-       if (saddr & AMDGPU_GPU_PAGE_MASK || offset & AMDGPU_GPU_PAGE_MASK ||
-           size == 0 || size & AMDGPU_GPU_PAGE_MASK)
+       if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK ||
+           size == 0 || size & ~PAGE_MASK)
                return -EINVAL;
 
        /* make sure object fit at this offset */
@@ -2409,7 +2409,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
                        after->start = eaddr + 1;
                        after->last = tmp->last;
                        after->offset = tmp->offset;
-                       after->offset += after->start - tmp->start;
+                       after->offset += (after->start - tmp->start) << PAGE_SHIFT;
                        after->flags = tmp->flags;
                        after->bo_va = tmp->bo_va;
                        list_add(&after->list, &tmp->bo_va->invalids);
index 7944781..19abb74 100644 (file)
@@ -2897,6 +2897,11 @@ static int dce_v10_0_hw_fini(void *handle)
 static int dce_v10_0_suspend(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
+
+       r = amdgpu_display_suspend_helper(adev);
+       if (r)
+               return r;
 
        adev->mode_info.bl_level =
                amdgpu_atombios_encoder_get_backlight_level_from_reg(adev);
@@ -2921,8 +2926,10 @@ static int dce_v10_0_resume(void *handle)
                amdgpu_display_backlight_set_level(adev, adev->mode_info.bl_encoder,
                                                    bl_level);
        }
+       if (ret)
+               return ret;
 
-       return ret;
+       return amdgpu_display_resume_helper(adev);
 }
 
 static bool dce_v10_0_is_idle(void *handle)
index 1b6ff04..320ec35 100644 (file)
@@ -3027,6 +3027,11 @@ static int dce_v11_0_hw_fini(void *handle)
 static int dce_v11_0_suspend(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
+
+       r = amdgpu_display_suspend_helper(adev);
+       if (r)
+               return r;
 
        adev->mode_info.bl_level =
                amdgpu_atombios_encoder_get_backlight_level_from_reg(adev);
@@ -3051,8 +3056,10 @@ static int dce_v11_0_resume(void *handle)
                amdgpu_display_backlight_set_level(adev, adev->mode_info.bl_encoder,
                                                    bl_level);
        }
+       if (ret)
+               return ret;
 
-       return ret;
+       return amdgpu_display_resume_helper(adev);
 }
 
 static bool dce_v11_0_is_idle(void *handle)
index 83a8838..1332200 100644 (file)
@@ -2770,7 +2770,11 @@ static int dce_v6_0_hw_fini(void *handle)
 static int dce_v6_0_suspend(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
 
+       r = amdgpu_display_suspend_helper(adev);
+       if (r)
+               return r;
        adev->mode_info.bl_level =
                amdgpu_atombios_encoder_get_backlight_level_from_reg(adev);
 
@@ -2794,8 +2798,10 @@ static int dce_v6_0_resume(void *handle)
                amdgpu_display_backlight_set_level(adev, adev->mode_info.bl_encoder,
                                                    bl_level);
        }
+       if (ret)
+               return ret;
 
-       return ret;
+       return amdgpu_display_resume_helper(adev);
 }
 
 static bool dce_v6_0_is_idle(void *handle)
index 224b302..04ebf02 100644 (file)
@@ -2796,6 +2796,11 @@ static int dce_v8_0_hw_fini(void *handle)
 static int dce_v8_0_suspend(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
+
+       r = amdgpu_display_suspend_helper(adev);
+       if (r)
+               return r;
 
        adev->mode_info.bl_level =
                amdgpu_atombios_encoder_get_backlight_level_from_reg(adev);
@@ -2820,8 +2825,10 @@ static int dce_v8_0_resume(void *handle)
                amdgpu_display_backlight_set_level(adev, adev->mode_info.bl_encoder,
                                                    bl_level);
        }
+       if (ret)
+               return ret;
 
-       return ret;
+       return amdgpu_display_resume_helper(adev);
 }
 
 static bool dce_v8_0_is_idle(void *handle)
index 9810af7..5c11144 100644 (file)
@@ -39,6 +39,7 @@
 #include "dce_v11_0.h"
 #include "dce_virtual.h"
 #include "ivsrcid/ivsrcid_vislands30.h"
+#include "amdgpu_display.h"
 
 #define DCE_VIRTUAL_VBLANK_PERIOD 16666666
 
@@ -491,12 +492,24 @@ static int dce_virtual_hw_fini(void *handle)
 
 static int dce_virtual_suspend(void *handle)
 {
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
+
+       r = amdgpu_display_suspend_helper(adev);
+       if (r)
+               return r;
        return dce_virtual_hw_fini(handle);
 }
 
 static int dce_virtual_resume(void *handle)
 {
-       return dce_virtual_hw_init(handle);
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+       int r;
+
+       r = dce_virtual_hw_init(handle);
+       if (r)
+               return r;
+       return amdgpu_display_resume_helper(adev);
 }
 
 static bool dce_virtual_is_idle(void *handle)
index b258a3d..159add0 100644 (file)
@@ -155,7 +155,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
 
        /* Wait till CP writes sync code: */
        status = amdkfd_fence_wait_timeout(
-                       (unsigned int *) rm_state,
+                       rm_state,
                        QUEUESTATE__ACTIVE, 1500);
 
        kfd_gtt_sa_free(dbgdev->dev, mem_obj);
index e686ce2..4598a9a 100644 (file)
@@ -1167,7 +1167,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
        if (retval)
                goto fail_allocate_vidmem;
 
-       dqm->fence_addr = dqm->fence_mem->cpu_ptr;
+       dqm->fence_addr = (uint64_t *)dqm->fence_mem->cpu_ptr;
        dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr;
 
        init_interrupts(dqm);
@@ -1340,8 +1340,8 @@ out:
        return retval;
 }
 
-int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
-                               unsigned int fence_value,
+int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
+                               uint64_t fence_value,
                                unsigned int timeout_ms)
 {
        unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
index 7351dd1..45f8159 100644 (file)
@@ -192,7 +192,7 @@ struct device_queue_manager {
        uint16_t                vmid_pasid[VMID_NUM];
        uint64_t                pipelines_addr;
        uint64_t                fence_gpu_addr;
-       unsigned int            *fence_addr;
+       uint64_t                *fence_addr;
        struct kfd_mem_obj      *fence_mem;
        bool                    active_runlist;
        int                     sched_policy;
index 5d541e0..f71a7fa 100644 (file)
@@ -347,7 +347,7 @@ fail_create_runlist_ib:
 }
 
 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
-                       uint32_t fence_value)
+                       uint64_t fence_value)
 {
        uint32_t *buffer, size;
        int retval = 0;
index dfaf771..e3ba0cd 100644 (file)
@@ -283,7 +283,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 }
 
 static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer,
-                       uint64_t fence_address, uint32_t fence_value)
+                       uint64_t fence_address, uint64_t fence_value)
 {
        struct pm4_mes_query_status *packet;
 
index a852e0d..08442e7 100644 (file)
@@ -263,7 +263,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 }
 
 static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
-                       uint64_t fence_address, uint32_t fence_value)
+                       uint64_t fence_address, uint64_t fence_value)
 {
        struct pm4_mes_query_status *packet;
 
index 09599ef..f304d1f 100644 (file)
@@ -1003,8 +1003,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
                       u32 *ctl_stack_used_size,
                       u32 *save_area_used_size);
 
-int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
-                             unsigned int fence_value,
+int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
+                             uint64_t fence_value,
                              unsigned int timeout_ms);
 
 /* Packet Manager */
@@ -1040,7 +1040,7 @@ struct packet_manager_funcs {
                        uint32_t filter_param, bool reset,
                        unsigned int sdma_engine);
        int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
-                       uint64_t fence_address, uint32_t fence_value);
+                       uint64_t fence_address, uint64_t fence_value);
        int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
 
        /* Packet sizes */
@@ -1062,7 +1062,7 @@ int pm_send_set_resources(struct packet_manager *pm,
                                struct scheduling_resources *res);
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
-                               uint32_t fence_value);
+                               uint64_t fence_value);
 
 int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
                        enum kfd_unmap_queues_filter mode,
index 5342c30..aece110 100644 (file)
@@ -1507,38 +1507,8 @@ static void dcn20_update_dchubp_dpp(
        if (pipe_ctx->update_flags.bits.enable || pipe_ctx->update_flags.bits.opp_changed
                        || pipe_ctx->stream->update_flags.bits.gamut_remap
                        || pipe_ctx->stream->update_flags.bits.out_csc) {
-               struct mpc *mpc = pipe_ctx->stream_res.opp->ctx->dc->res_pool->mpc;
-
-               if (mpc->funcs->set_gamut_remap) {
-                       int i;
-                       int mpcc_id = hubp->inst;
-                       struct mpc_grph_gamut_adjustment adjust;
-                       bool enable_remap_dpp = false;
-
-                       memset(&adjust, 0, sizeof(adjust));
-                       adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS;
-
-                       /* save the enablement of gamut remap for dpp */
-                       enable_remap_dpp = pipe_ctx->stream->gamut_remap_matrix.enable_remap;
-
-                       /* force bypass gamut remap for dpp/cm */
-                       pipe_ctx->stream->gamut_remap_matrix.enable_remap = false;
-                       dc->hwss.program_gamut_remap(pipe_ctx);
-
-                       /* restore gamut remap flag and use this remap into mpc */
-                       pipe_ctx->stream->gamut_remap_matrix.enable_remap = enable_remap_dpp;
-
-                       /* build remap matrix for top plane if enabled */
-                       if (enable_remap_dpp && pipe_ctx->top_pipe == NULL) {
-                                       adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW;
-                                       for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
-                                               adjust.temperature_matrix[i] =
-                                                               pipe_ctx->stream->gamut_remap_matrix.matrix[i];
-                       }
-                       mpc->funcs->set_gamut_remap(mpc, mpcc_id, &adjust);
-               } else
-                       /* dpp/cm gamut remap*/
-                       dc->hwss.program_gamut_remap(pipe_ctx);
+               /* dpp/cm gamut remap*/
+               dc->hwss.program_gamut_remap(pipe_ctx);
 
                /*call the dcn2 method which uses mpc csc*/
                dc->hwss.program_output_csc(dc,
index fa01349..2f9bfae 100644 (file)
@@ -341,8 +341,7 @@ void enc2_hw_init(struct link_encoder *enc)
        } else {
                AUX_REG_WRITE(AUX_DPHY_RX_CONTROL0, 0x103d1110);
 
-               AUX_REG_WRITE(AUX_DPHY_TX_CONTROL, 0x21c4d);
-
+               AUX_REG_WRITE(AUX_DPHY_TX_CONTROL, 0x21c7a);
        }
 
        //AUX_DPHY_TX_REF_CONTROL'AUX_TX_REF_DIV HW default is 0x32;
index 173488a..4a3df13 100644 (file)
@@ -1595,6 +1595,11 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
        dcn2_1_soc.num_chans = bw_params->num_channels;
 
        ASSERT(clk_table->num_entries);
+       /* Copy dcn2_1_soc.clock_limits to clock_limits to avoid copying over null states later */
+       for (i = 0; i < dcn2_1_soc.num_states + 1; i++) {
+               clock_limits[i] = dcn2_1_soc.clock_limits[i];
+       }
+
        for (i = 0; i < clk_table->num_entries; i++) {
                /* loop backwards*/
                for (closest_clk_lvl = 0, j = dcn2_1_soc.num_states - 1; j >= 0; j--) {
index 41a1d0e..e0df9b0 100644 (file)
@@ -113,6 +113,7 @@ bool cm3_helper_translate_curve_to_hw_format(
        struct pwl_result_data *rgb_resulted;
        struct pwl_result_data *rgb;
        struct pwl_result_data *rgb_plus_1;
+       struct pwl_result_data *rgb_minus_1;
        struct fixed31_32 end_value;
 
        int32_t region_start, region_end;
@@ -140,7 +141,7 @@ bool cm3_helper_translate_curve_to_hw_format(
                region_start = -MAX_LOW_POINT;
                region_end   = NUMBER_REGIONS - MAX_LOW_POINT;
        } else {
-               /* 10 segments
+               /* 11 segments
                 * segment is from 2^-10 to 2^0
                 * There are less than 256 points, for optimization
                 */
@@ -154,9 +155,10 @@ bool cm3_helper_translate_curve_to_hw_format(
                seg_distr[7] = 4;
                seg_distr[8] = 4;
                seg_distr[9] = 4;
+               seg_distr[10] = 1;
 
                region_start = -10;
-               region_end = 0;
+               region_end = 1;
        }
 
        for (i = region_end - region_start; i < MAX_REGIONS_NUMBER ; i++)
@@ -189,6 +191,10 @@ bool cm3_helper_translate_curve_to_hw_format(
        rgb_resulted[hw_points - 1].green = output_tf->tf_pts.green[start_index];
        rgb_resulted[hw_points - 1].blue = output_tf->tf_pts.blue[start_index];
 
+       rgb_resulted[hw_points].red = rgb_resulted[hw_points - 1].red;
+       rgb_resulted[hw_points].green = rgb_resulted[hw_points - 1].green;
+       rgb_resulted[hw_points].blue = rgb_resulted[hw_points - 1].blue;
+
        // All 3 color channels have same x
        corner_points[0].red.x = dc_fixpt_pow(dc_fixpt_from_int(2),
                                             dc_fixpt_from_int(region_start));
@@ -259,15 +265,18 @@ bool cm3_helper_translate_curve_to_hw_format(
 
        rgb = rgb_resulted;
        rgb_plus_1 = rgb_resulted + 1;
+       rgb_minus_1 = rgb;
 
        i = 1;
        while (i != hw_points + 1) {
-               if (dc_fixpt_lt(rgb_plus_1->red, rgb->red))
-                       rgb_plus_1->red = rgb->red;
-               if (dc_fixpt_lt(rgb_plus_1->green, rgb->green))
-                       rgb_plus_1->green = rgb->green;
-               if (dc_fixpt_lt(rgb_plus_1->blue, rgb->blue))
-                       rgb_plus_1->blue = rgb->blue;
+               if (i >= hw_points - 1) {
+                       if (dc_fixpt_lt(rgb_plus_1->red, rgb->red))
+                               rgb_plus_1->red = dc_fixpt_add(rgb->red, rgb_minus_1->delta_red);
+                       if (dc_fixpt_lt(rgb_plus_1->green, rgb->green))
+                               rgb_plus_1->green = dc_fixpt_add(rgb->green, rgb_minus_1->delta_green);
+                       if (dc_fixpt_lt(rgb_plus_1->blue, rgb->blue))
+                               rgb_plus_1->blue = dc_fixpt_add(rgb->blue, rgb_minus_1->delta_blue);
+               }
 
                rgb->delta_red   = dc_fixpt_sub(rgb_plus_1->red,   rgb->red);
                rgb->delta_green = dc_fixpt_sub(rgb_plus_1->green, rgb->green);
@@ -283,6 +292,7 @@ bool cm3_helper_translate_curve_to_hw_format(
                }
 
                ++rgb_plus_1;
+               rgb_minus_1 = rgb;
                ++rgb;
                ++i;
        }
index a2681fe..d0ec838 100644 (file)
@@ -587,6 +587,48 @@ static int smu7_force_switch_to_arbf0(struct pp_hwmgr *hwmgr)
                        tmp, MC_CG_ARB_FREQ_F0);
 }
 
+/*
+ * Pick the PCIe generation index used to override the PCIe speed table.
+ *
+ * Tests adev->pm.pcie_gen_mask from Gen4 down to Gen1 and selects the first
+ * generation for which both the CAIL_PCIE_LINK_SPEED_SUPPORT_GENn and the
+ * CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GENn bits are set.
+ *
+ * Returns 3 for Gen4, 2 for Gen3, 1 for Gen2 and 0 for Gen1. 0 is also
+ * returned when no generation matches both masks.
+ */
+static uint16_t smu7_override_pcie_speed(struct pp_hwmgr *hwmgr)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev);
+       uint16_t pcie_gen = 0;
+
+       if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 &&
+           adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4)
+               pcie_gen = 3;
+       else if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 &&
+               adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3)
+               pcie_gen = 2;
+       else if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 &&
+               adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2)
+               pcie_gen = 1;
+       else if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 &&
+               adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1)
+               pcie_gen = 0; /* same value as the default above */
+
+       return pcie_gen;
+}
+
+/*
+ * Pick the PCIe lane count used to override the PCIe speed table.
+ *
+ * Tests adev->pm.pcie_mlw_mask from x16 down to x1 and returns the lane
+ * count (16, 12, 8, 4, 2 or 1) of the first CAIL_PCIE_LINK_WIDTH_SUPPORT_Xn
+ * bit that is set, or 0 when none of them is set.
+ */
+static uint16_t smu7_override_pcie_width(struct pp_hwmgr *hwmgr)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev);
+       uint16_t pcie_width = 0;
+
+       if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X16)
+               pcie_width = 16;
+       else if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X12)
+               pcie_width = 12;
+       else if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X8)
+               pcie_width = 8;
+       else if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X4)
+               pcie_width = 4;
+       else if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X2)
+               pcie_width = 2;
+       else if (adev->pm.pcie_mlw_mask & CAIL_PCIE_LINK_WIDTH_SUPPORT_X1)
+               pcie_width = 1;
+
+       return pcie_width;
+}
+
 static int smu7_setup_default_pcie_table(struct pp_hwmgr *hwmgr)
 {
        struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend);
@@ -683,6 +725,11 @@ static int smu7_setup_default_pcie_table(struct pp_hwmgr *hwmgr)
                                        PP_Min_PCIEGen),
                        get_pcie_lane_support(data->pcie_lane_cap,
                                        PP_Max_PCIELane));
+
+               if (data->pcie_dpm_key_disabled)
+                       phm_setup_pcie_table_entry(&data->dpm_table.pcie_speed_table,
+                               data->dpm_table.pcie_speed_table.count,
+                               smu7_override_pcie_speed(hwmgr), smu7_override_pcie_width(hwmgr));
        }
        return 0;
 }
@@ -1248,6 +1295,13 @@ static int smu7_start_dpm(struct pp_hwmgr *hwmgr)
                                                NULL)),
                                "Failed to enable pcie DPM during DPM Start Function!",
                                return -EINVAL);
+       } else {
+               PP_ASSERT_WITH_CODE(
+                               (0 == smum_send_msg_to_smc(hwmgr,
+                                               PPSMC_MSG_PCIeDPM_Disable,
+                                               NULL)),
+                               "Failed to disble pcie DPM during DPM Start Function!",
+                               return -EINVAL);
        }
 
        if (phm_cap_enabled(hwmgr->platform_descriptor.platformCaps,
@@ -3276,7 +3330,8 @@ static int smu7_apply_state_adjust_rules(struct pp_hwmgr *hwmgr,
 
        disable_mclk_switching_for_display = ((1 < hwmgr->display_config->num_display) &&
                                                !hwmgr->display_config->multi_monitor_in_sync) ||
-                                               smu7_vblank_too_short(hwmgr, hwmgr->display_config->min_vblank_time);
+                                               (hwmgr->display_config->num_display &&
+                                               smu7_vblank_too_short(hwmgr, hwmgr->display_config->min_vblank_time));
 
        disable_mclk_switching = disable_mclk_switching_for_frame_lock ||
                                         disable_mclk_switching_for_display;
index 22b636e..599ec97 100644 (file)
@@ -54,6 +54,9 @@
 #include "smuio/smuio_9_0_offset.h"
 #include "smuio/smuio_9_0_sh_mask.h"
 
+#define smnPCIE_LC_SPEED_CNTL                  0x11140290
+#define smnPCIE_LC_LINK_WIDTH_CNTL             0x11140288
+
 #define HBM_MEMORY_CHANNEL_WIDTH    128
 
 static const uint32_t channel_number[] = {1, 2, 0, 4, 0, 8, 0, 16, 2};
@@ -443,8 +446,7 @@ static void vega10_init_dpm_defaults(struct pp_hwmgr *hwmgr)
        if (PP_CAP(PHM_PlatformCaps_VCEDPM))
                data->smu_features[GNLD_DPM_VCE].supported = true;
 
-       if (!data->registry_data.pcie_dpm_key_disabled)
-               data->smu_features[GNLD_DPM_LINK].supported = true;
+       data->smu_features[GNLD_DPM_LINK].supported = true;
 
        if (!data->registry_data.dcefclk_dpm_key_disabled)
                data->smu_features[GNLD_DPM_DCEFCLK].supported = true;
@@ -1544,6 +1546,13 @@ static int vega10_override_pcie_parameters(struct pp_hwmgr *hwmgr)
                        pp_table->PcieLaneCount[i] = pcie_width;
        }
 
+       if (data->registry_data.pcie_dpm_key_disabled) {
+               for (i = 0; i < NUM_LINK_LEVELS; i++) {
+                       pp_table->PcieGenSpeed[i] = pcie_gen;
+                       pp_table->PcieLaneCount[i] = pcie_width;
+               }
+       }
+
        return 0;
 }
 
@@ -2966,6 +2975,14 @@ static int vega10_start_dpm(struct pp_hwmgr *hwmgr, uint32_t bitmap)
                }
        }
 
+       if (data->registry_data.pcie_dpm_key_disabled) {
+               PP_ASSERT_WITH_CODE(!vega10_enable_smc_features(hwmgr,
+                               false, data->smu_features[GNLD_DPM_LINK].smu_feature_bitmap),
+               "Attempt to Disable Link DPM feature Failed!", return -EINVAL);
+               data->smu_features[GNLD_DPM_LINK].enabled = false;
+               data->smu_features[GNLD_DPM_LINK].supported = false;
+       }
+
        return 0;
 }
 
@@ -4584,6 +4601,24 @@ static int vega10_set_ppfeature_status(struct pp_hwmgr *hwmgr, uint64_t new_ppfe
        return 0;
 }
 
+/*
+ * Read the LC_LINK_WIDTH_RD field of the smnPCIE_LC_LINK_WIDTH_CNTL register.
+ *
+ * Returns the raw field value: the register contents masked with
+ * PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD_MASK and shifted down by
+ * PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD__SHIFT. This is an encoded width
+ * level rather than a lane count - presumably the same encoding as
+ * PPTable_t::PcieLaneCount, which vega10_print_clock_levels() compares it
+ * against (TODO confirm).
+ */
+static int vega10_get_current_pcie_link_width_level(struct pp_hwmgr *hwmgr)
+{
+       /* not referenced by name below; presumably used by RREG32_PCIE() */
+       struct amdgpu_device *adev = hwmgr->adev;
+
+       return (RREG32_PCIE(smnPCIE_LC_LINK_WIDTH_CNTL) &
+               PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD_MASK)
+               >> PCIE_LC_LINK_WIDTH_CNTL__LC_LINK_WIDTH_RD__SHIFT;
+}
+
+/*
+ * Read the LC_CURRENT_DATA_RATE field of the smnPCIE_LC_SPEED_CNTL register.
+ *
+ * Returns the raw field value: the register contents masked with
+ * PSWUSP0_PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE_MASK and shifted down by
+ * PSWUSP0_PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT.
+ * vega10_print_clock_levels() compares the result against
+ * PPTable_t::PcieGenSpeed, so the two are presumably encoded alike
+ * (TODO confirm).
+ */
+static int vega10_get_current_pcie_link_speed_level(struct pp_hwmgr *hwmgr)
+{
+       /* not referenced by name below; presumably used by RREG32_PCIE() */
+       struct amdgpu_device *adev = hwmgr->adev;
+
+       return (RREG32_PCIE(smnPCIE_LC_SPEED_CNTL) &
+               PSWUSP0_PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE_MASK)
+               >> PSWUSP0_PCIE_LC_SPEED_CNTL__LC_CURRENT_DATA_RATE__SHIFT;
+}
+
 static int vega10_print_clock_levels(struct pp_hwmgr *hwmgr,
                enum pp_clock_type type, char *buf)
 {
@@ -4592,8 +4627,9 @@ static int vega10_print_clock_levels(struct pp_hwmgr *hwmgr,
        struct vega10_single_dpm_table *mclk_table = &(data->dpm_table.mem_table);
        struct vega10_single_dpm_table *soc_table = &(data->dpm_table.soc_table);
        struct vega10_single_dpm_table *dcef_table = &(data->dpm_table.dcef_table);
-       struct vega10_pcie_table *pcie_table = &(data->dpm_table.pcie_table);
        struct vega10_odn_clock_voltage_dependency_table *podn_vdd_dep = NULL;
+       uint32_t gen_speed, lane_width, current_gen_speed, current_lane_width;
+       PPTable_t *pptable = &(data->smc_state_table.pp_table);
 
        int i, now, size = 0, count = 0;
 
@@ -4650,15 +4686,31 @@ static int vega10_print_clock_levels(struct pp_hwmgr *hwmgr,
                                        "*" : "");
                break;
        case PP_PCIE:
-               smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetCurrentLinkIndex, &now);
-
-               for (i = 0; i < pcie_table->count; i++)
-                       size += sprintf(buf + size, "%d: %s %s\n", i,
-                                       (pcie_table->pcie_gen[i] == 0) ? "2.5GT/s, x1" :
-                                       (pcie_table->pcie_gen[i] == 1) ? "5.0GT/s, x16" :
-                                       (pcie_table->pcie_gen[i] == 2) ? "8.0GT/s, x16" : "",
-                                       (i == now) ? "*" : "");
+               current_gen_speed =
+                       vega10_get_current_pcie_link_speed_level(hwmgr);
+               current_lane_width =
+                       vega10_get_current_pcie_link_width_level(hwmgr);
+               for (i = 0; i < NUM_LINK_LEVELS; i++) {
+                       gen_speed = pptable->PcieGenSpeed[i];
+                       lane_width = pptable->PcieLaneCount[i];
+
+                       size += sprintf(buf + size, "%d: %s %s %s\n", i,
+                                       (gen_speed == 0) ? "2.5GT/s," :
+                                       (gen_speed == 1) ? "5.0GT/s," :
+                                       (gen_speed == 2) ? "8.0GT/s," :
+                                       (gen_speed == 3) ? "16.0GT/s," : "",
+                                       (lane_width == 1) ? "x1" :
+                                       (lane_width == 2) ? "x2" :
+                                       (lane_width == 3) ? "x4" :
+                                       (lane_width == 4) ? "x8" :
+                                       (lane_width == 5) ? "x12" :
+                                       (lane_width == 6) ? "x16" : "",
+                                       (current_gen_speed == gen_speed) &&
+                                       (current_lane_width == lane_width) ?
+                                       "*" : "");
+               }
                break;
+
        case OD_SCLK:
                if (hwmgr->od_enabled) {
                        size = sprintf(buf, "%s:\n", "OD_SCLK");
index 43e01d8..4f6da11 100644 (file)
@@ -133,6 +133,7 @@ static void vega12_set_default_registry_data(struct pp_hwmgr *hwmgr)
        data->registry_data.auto_wattman_debug = 0;
        data->registry_data.auto_wattman_sample_period = 100;
        data->registry_data.auto_wattman_threshold = 50;
+       data->registry_data.pcie_dpm_key_disabled = !(hwmgr->feature_mask & PP_PCIE_DPM_MASK);
 }
 
 static int vega12_set_features_platform_caps(struct pp_hwmgr *hwmgr)
@@ -539,6 +540,29 @@ static int vega12_override_pcie_parameters(struct pp_hwmgr *hwmgr)
                pp_table->PcieLaneCount[i] = pcie_width_arg;
        }
 
+       /* override to the highest if it's disabled from ppfeaturemask */
+       if (data->registry_data.pcie_dpm_key_disabled) {
+               for (i = 0; i < NUM_LINK_LEVELS; i++) {
+                       smu_pcie_arg = (i << 16) | (pcie_gen << 8) | pcie_width;
+                       ret = smum_send_msg_to_smc_with_parameter(hwmgr,
+                               PPSMC_MSG_OverridePcieParameters, smu_pcie_arg,
+                               NULL);
+                       PP_ASSERT_WITH_CODE(!ret,
+                               "[OverridePcieParameters] Attempt to override pcie params failed!",
+                               return ret);
+
+                       pp_table->PcieGenSpeed[i] = pcie_gen;
+                       pp_table->PcieLaneCount[i] = pcie_width;
+               }
+               ret = vega12_enable_smc_features(hwmgr,
+                               false,
+                               data->smu_features[GNLD_DPM_LINK].smu_feature_bitmap);
+               PP_ASSERT_WITH_CODE(!ret,
+                               "Attempt to Disable DPM LINK Failed!",
+                               return ret);
+               data->smu_features[GNLD_DPM_LINK].enabled = false;
+               data->smu_features[GNLD_DPM_LINK].supported = false;
+       }
        return 0;
 }
 
index f19964c..b6ee3a2 100644 (file)
@@ -171,6 +171,7 @@ static void vega20_set_default_registry_data(struct pp_hwmgr *hwmgr)
        data->registry_data.gfxoff_controlled_by_driver = 1;
        data->gfxoff_allowed = false;
        data->counter_gfxoff = 0;
+       data->registry_data.pcie_dpm_key_disabled = !(hwmgr->feature_mask & PP_PCIE_DPM_MASK);
 }
 
 static int vega20_set_features_platform_caps(struct pp_hwmgr *hwmgr)
@@ -884,6 +885,30 @@ static int vega20_override_pcie_parameters(struct pp_hwmgr *hwmgr)
                pp_table->PcieLaneCount[i] = pcie_width_arg;
        }
 
+       /* override to the highest if it's disabled from ppfeaturemask */
+       if (data->registry_data.pcie_dpm_key_disabled) {
+               for (i = 0; i < NUM_LINK_LEVELS; i++) {
+                       smu_pcie_arg = (i << 16) | (pcie_gen << 8) | pcie_width;
+                       ret = smum_send_msg_to_smc_with_parameter(hwmgr,
+                               PPSMC_MSG_OverridePcieParameters, smu_pcie_arg,
+                               NULL);
+                       PP_ASSERT_WITH_CODE(!ret,
+                               "[OverridePcieParameters] Attempt to override pcie params failed!",
+                               return ret);
+
+                       pp_table->PcieGenSpeed[i] = pcie_gen;
+                       pp_table->PcieLaneCount[i] = pcie_width;
+               }
+               ret = vega20_enable_smc_features(hwmgr,
+                               false,
+                               data->smu_features[GNLD_DPM_LINK].smu_feature_bitmap);
+               PP_ASSERT_WITH_CODE(!ret,
+                               "Attempt to Disable DPM LINK Failed!",
+                               return ret);
+               data->smu_features[GNLD_DPM_LINK].enabled = false;
+               data->smu_features[GNLD_DPM_LINK].supported = false;
+       }
+
        return 0;
 }
 
index d143ef1..cd905e4 100644 (file)
@@ -1294,7 +1294,7 @@ static int smu_disable_dpms(struct smu_context *smu)
        bool use_baco = !smu->is_apu &&
                ((amdgpu_in_reset(adev) &&
                  (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) ||
-                ((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev)));
+                ((adev->in_runpm || adev->in_s4) && amdgpu_asic_supports_baco(adev)));
 
        /*
         * For custom pptable uploading, skip the DPM features
@@ -1431,7 +1431,8 @@ static int smu_suspend(void *handle)
 
        smu->watermarks_bitmap &= ~(WATERMARKS_LOADED);
 
-       if (smu->is_apu)
+       /* skip CGPG when in S0ix */
+       if (smu->is_apu && !adev->in_s0ix)
                smu_set_gfx_cgpg(&adev->smu, false);
 
        return 0;
index 7ddbaec..101eaa2 100644 (file)
@@ -384,10 +384,15 @@ static int vangogh_dpm_set_jpeg_enable(struct smu_context *smu, bool enable)
 
 static bool vangogh_is_dpm_running(struct smu_context *smu)
 {
+       struct amdgpu_device *adev = smu->adev;
        int ret = 0;
        uint32_t feature_mask[2];
        uint64_t feature_enabled;
 
+       /* we need to re-init after suspend so return false */
+       if (adev->in_suspend)
+               return false;
+
        ret = smu_cmn_get_enabled_32_bits_mask(smu, feature_mask, 2);
 
        if (ret)
index 6d38c5c..db69f19 100644 (file)
@@ -689,7 +689,8 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj)
                struct page **pages = pvec + pinned;
 
                ret = pin_user_pages_fast(ptr, num_pages,
-                                         !userptr->ro ? FOLL_WRITE : 0, pages);
+                                         FOLL_WRITE | FOLL_FORCE | FOLL_LONGTERM,
+                                         pages);
                if (ret < 0) {
                        unpin_user_pages(pvec, pinned);
                        kvfree(pvec);
index 1f79bc2..1510e4e 100644 (file)
@@ -13,7 +13,6 @@
 #include <linux/irq.h>
 #include <linux/mfd/syscon.h>
 #include <linux/of_device.h>
-#include <linux/of_gpio.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/regmap.h>
index 4683f98..c3f2962 100644 (file)
@@ -317,12 +317,13 @@ int intel_plane_atomic_check_with_state(const struct intel_crtc_state *old_crtc_
        if (!new_plane_state->hw.crtc && !old_plane_state->hw.crtc)
                return 0;
 
-       new_crtc_state->enabled_planes |= BIT(plane->id);
-
        ret = plane->check_plane(new_crtc_state, new_plane_state);
        if (ret)
                return ret;
 
+       if (fb)
+               new_crtc_state->enabled_planes |= BIT(plane->id);
+
        /* FIXME pre-g4x don't work like this */
        if (new_plane_state->uapi.visible)
                new_crtc_state->active_planes |= BIT(plane->id);
index 8c12d53..775d89b 100644 (file)
@@ -3619,9 +3619,7 @@ intel_dp_get_dpcd(struct intel_dp *intel_dp)
 {
        int ret;
 
-       intel_dp_lttpr_init(intel_dp);
-
-       if (drm_dp_read_dpcd_caps(&intel_dp->aux, intel_dp->dpcd))
+       if (intel_dp_init_lttpr_and_dprx_caps(intel_dp) < 0)
                return false;
 
        /*
index eaebf12..10fe17b 100644 (file)
@@ -133,6 +133,7 @@ static u32 g4x_get_aux_send_ctl(struct intel_dp *intel_dp,
        else
                precharge = 5;
 
+       /* Max timeout value on G4x-BDW: 1.6ms */
        if (IS_BROADWELL(dev_priv))
                timeout = DP_AUX_CH_CTL_TIME_OUT_600us;
        else
@@ -159,6 +160,12 @@ static u32 skl_get_aux_send_ctl(struct intel_dp *intel_dp,
        enum phy phy = intel_port_to_phy(i915, dig_port->base.port);
        u32 ret;
 
+       /*
+        * Max timeout values:
+        * SKL-GLK: 1.6ms
+        * CNL: 3.2ms
+        * ICL+: 4ms
+        */
        ret = DP_AUX_CH_CTL_SEND_BUSY |
              DP_AUX_CH_CTL_DONE |
              DP_AUX_CH_CTL_INTERRUPT |
index 892d7db..be6ac0d 100644 (file)
@@ -34,6 +34,11 @@ intel_dp_dump_link_status(const u8 link_status[DP_LINK_STATUS_SIZE])
                      link_status[3], link_status[4], link_status[5]);
 }
 
+/* Discard the cached LTTPR common capabilities by zeroing all of them. */
+static void intel_dp_reset_lttpr_common_caps(struct intel_dp *intel_dp)
+{
+       memset(&intel_dp->lttpr_common_caps, 0, sizeof(intel_dp->lttpr_common_caps));
+}
+
 static void intel_dp_reset_lttpr_count(struct intel_dp *intel_dp)
 {
        intel_dp->lttpr_common_caps[DP_PHY_REPEATER_CNT -
@@ -81,19 +86,36 @@ static void intel_dp_read_lttpr_phy_caps(struct intel_dp *intel_dp,
 
 static bool intel_dp_read_lttpr_common_caps(struct intel_dp *intel_dp)
 {
-       if (drm_dp_read_lttpr_common_caps(&intel_dp->aux,
-                                         intel_dp->lttpr_common_caps) < 0) {
-               memset(intel_dp->lttpr_common_caps, 0,
-                      sizeof(intel_dp->lttpr_common_caps));
+       struct drm_i915_private *i915 = dp_to_i915(intel_dp);
+
+       if (intel_dp_is_edp(intel_dp))
                return false;
-       }
+
+       /*
+        * Detecting LTTPRs must be avoided on platforms with an AUX timeout
+        * period < 3.2ms. (see DP Standard v2.0, 2.11.2, 3.6.6.1).
+        */
+       if (INTEL_GEN(i915) < 10)
+               return false;
+
+       if (drm_dp_read_lttpr_common_caps(&intel_dp->aux,
+                                         intel_dp->lttpr_common_caps) < 0)
+               goto reset_caps;
 
        drm_dbg_kms(&dp_to_i915(intel_dp)->drm,
                    "LTTPR common capabilities: %*ph\n",
                    (int)sizeof(intel_dp->lttpr_common_caps),
                    intel_dp->lttpr_common_caps);
 
+       /* The minimum value of LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV is 1.4 */
+       if (intel_dp->lttpr_common_caps[0] < 0x14)
+               goto reset_caps;
+
        return true;
+
+reset_caps:
+       intel_dp_reset_lttpr_common_caps(intel_dp);
+       return false;
 }
 
 static bool
@@ -106,33 +128,49 @@ intel_dp_set_lttpr_transparent_mode(struct intel_dp *intel_dp, bool enable)
 }
 
 /**
- * intel_dp_lttpr_init - detect LTTPRs and init the LTTPR link training mode
+ * intel_dp_init_lttpr_and_dprx_caps - detect LTTPR and DPRX caps, init the LTTPR link training mode
  * @intel_dp: Intel DP struct
  *
- * Read the LTTPR common capabilities, switch to non-transparent link training
- * mode if any is detected and read the PHY capabilities for all detected
- * LTTPRs. In case of an LTTPR detection error or if the number of
+ * Read the LTTPR common and DPRX capabilities and switch to non-transparent
+ * link training mode if any is detected and read the PHY capabilities for all
+ * detected LTTPRs. In case of an LTTPR detection error or if the number of
  * LTTPRs is more than is supported (8), fall back to the no-LTTPR,
  * transparent mode link training mode.
  *
  * Returns:
- *   >0  if LTTPRs were detected and the non-transparent LT mode was set
+ *   >0  if LTTPRs were detected and the non-transparent LT mode was set. The
+ *       DPRX capabilities are read out.
  *    0  if no LTTPRs or more than 8 LTTPRs were detected or in case of a
- *       detection failure and the transparent LT mode was set
+ *       detection failure and the transparent LT mode was set. The DPRX
+ *       capabilities are read out.
+ *   <0  Reading out the DPRX capabilities failed.
  */
-int intel_dp_lttpr_init(struct intel_dp *intel_dp)
+int intel_dp_init_lttpr_and_dprx_caps(struct intel_dp *intel_dp)
 {
        int lttpr_count;
        bool ret;
        int i;
 
-       if (intel_dp_is_edp(intel_dp))
-               return 0;
-
        ret = intel_dp_read_lttpr_common_caps(intel_dp);
+
+       /* The DPTX shall read the DPRX caps after LTTPR detection. */
+       if (drm_dp_read_dpcd_caps(&intel_dp->aux, intel_dp->dpcd)) {
+               intel_dp_reset_lttpr_common_caps(intel_dp);
+               return -EIO;
+       }
+
        if (!ret)
                return 0;
 
+       /*
+        * The 0xF0000-0xF02FF range is only valid if the DPCD revision is
+        * at least 1.4.
+        */
+       if (intel_dp->dpcd[DP_DPCD_REV] < 0x14) {
+               intel_dp_reset_lttpr_common_caps(intel_dp);
+               return 0;
+       }
+
        lttpr_count = drm_dp_lttpr_count(intel_dp->lttpr_common_caps);
        /*
         * Prevent setting LTTPR transparent mode explicitly if no LTTPRs are
@@ -172,7 +210,7 @@ int intel_dp_lttpr_init(struct intel_dp *intel_dp)
 
        return lttpr_count;
 }
-EXPORT_SYMBOL(intel_dp_lttpr_init);
+EXPORT_SYMBOL(intel_dp_init_lttpr_and_dprx_caps);
 
 static u8 dp_voltage_max(u8 preemph)
 {
@@ -807,7 +845,10 @@ void intel_dp_start_link_train(struct intel_dp *intel_dp,
         * TODO: Reiniting LTTPRs here won't be needed once proper connector
         * HW state readout is added.
         */
-       int lttpr_count = intel_dp_lttpr_init(intel_dp);
+       int lttpr_count = intel_dp_init_lttpr_and_dprx_caps(intel_dp);
+
+       if (lttpr_count < 0)
+               return;
 
        if (!intel_dp_link_train_all_phys(intel_dp, crtc_state, lttpr_count))
                intel_dp_schedule_fallback_link_training(intel_dp, crtc_state);
index 6a1f76b..9cb7c28 100644 (file)
@@ -11,7 +11,7 @@
 struct intel_crtc_state;
 struct intel_dp;
 
-int intel_dp_lttpr_init(struct intel_dp *intel_dp);
+int intel_dp_init_lttpr_and_dprx_caps(struct intel_dp *intel_dp);
 
 void intel_dp_get_adjust_train(struct intel_dp *intel_dp,
                               const struct intel_crtc_state *crtc_state,
index f58cc57..a86c57d 100644 (file)
@@ -1014,20 +1014,14 @@ static i915_reg_t dss_ctl1_reg(const struct intel_crtc_state *crtc_state)
 {
        enum pipe pipe = to_intel_crtc(crtc_state->uapi.crtc)->pipe;
 
-       if (crtc_state->cpu_transcoder == TRANSCODER_EDP)
-               return DSS_CTL1;
-
-       return ICL_PIPE_DSS_CTL1(pipe);
+       return is_pipe_dsc(crtc_state) ? ICL_PIPE_DSS_CTL1(pipe) : DSS_CTL1;
 }
 
 static i915_reg_t dss_ctl2_reg(const struct intel_crtc_state *crtc_state)
 {
        enum pipe pipe = to_intel_crtc(crtc_state->uapi.crtc)->pipe;
 
-       if (crtc_state->cpu_transcoder == TRANSCODER_EDP)
-               return DSS_CTL2;
-
-       return ICL_PIPE_DSS_CTL2(pipe);
+       return is_pipe_dsc(crtc_state) ? ICL_PIPE_DSS_CTL2(pipe) : DSS_CTL2;
 }
 
 void intel_dsc_enable(struct intel_encoder *encoder,
index a357bb4..67de2b1 100644 (file)
@@ -316,7 +316,18 @@ void i915_vma_revoke_fence(struct i915_vma *vma)
        WRITE_ONCE(fence->vma, NULL);
        vma->fence = NULL;
 
-       with_intel_runtime_pm_if_in_use(fence_to_uncore(fence)->rpm, wakeref)
+       /*
+        * Skip the write to HW if and only if the device is currently
+        * suspended.
+        *
+        * If the driver does not currently hold a wakeref (if_in_use == 0),
+        * the device may currently be runtime suspended, or it may be woken
+        * up before the suspend takes place. If the device is not suspended
+        * (powered down) and we skip clearing the fence register, the HW is
+        * left in an undefined state where we may end up with multiple
+        * registers overlapping.
+        */
+       with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref)
                fence_write(fence);
 }
 
index 112ba5f..e62ad69 100644 (file)
@@ -603,7 +603,6 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 {
        int report_size = stream->oa_buffer.format_size;
        struct drm_i915_perf_record_header header;
-       u32 sample_flags = stream->sample_flags;
 
        header.type = DRM_I915_PERF_RECORD_SAMPLE;
        header.pad = 0;
@@ -617,10 +616,8 @@ static int append_oa_sample(struct i915_perf_stream *stream,
                return -EFAULT;
        buf += sizeof(header);
 
-       if (sample_flags & SAMPLE_OA_REPORT) {
-               if (copy_to_user(buf, report, report_size))
-                       return -EFAULT;
-       }
+       if (copy_to_user(buf, report, report_size))
+               return -EFAULT;
 
        (*offset) += header.size;
 
@@ -2682,7 +2679,7 @@ static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 
        stream->perf->ops.oa_enable(stream);
 
-       if (stream->periodic)
+       if (stream->sample_flags & SAMPLE_OA_REPORT)
                hrtimer_start(&stream->poll_check_timer,
                              ns_to_ktime(stream->poll_oa_period),
                              HRTIMER_MODE_REL_PINNED);
@@ -2745,7 +2742,7 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 {
        stream->perf->ops.oa_disable(stream);
 
-       if (stream->periodic)
+       if (stream->sample_flags & SAMPLE_OA_REPORT)
                hrtimer_cancel(&stream->poll_check_timer);
 }
 
@@ -3028,7 +3025,7 @@ static ssize_t i915_perf_read(struct file *file,
         * disabled stream as an error. In particular it might otherwise lead
         * to a deadlock for blocking file descriptors...
         */
-       if (!stream->enabled)
+       if (!stream->enabled || !(stream->sample_flags & SAMPLE_OA_REPORT))
                return -EIO;
 
        if (!(file->f_flags & O_NONBLOCK)) {
index 7146cd0..aaf1f00 100644 (file)
@@ -3316,7 +3316,18 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 
 #define ILK_DISPLAY_CHICKEN1   _MMIO(0x42000)
 #define   ILK_FBCQ_DIS         (1 << 22)
-#define          ILK_PABSTRETCH_DIS    (1 << 21)
+#define   ILK_PABSTRETCH_DIS   REG_BIT(21)
+#define   ILK_SABSTRETCH_DIS   REG_BIT(20)
+#define   IVB_PRI_STRETCH_MAX_MASK     REG_GENMASK(21, 20)
+#define   IVB_PRI_STRETCH_MAX_X8       REG_FIELD_PREP(IVB_PRI_STRETCH_MAX_MASK, 0)
+#define   IVB_PRI_STRETCH_MAX_X4       REG_FIELD_PREP(IVB_PRI_STRETCH_MAX_MASK, 1)
+#define   IVB_PRI_STRETCH_MAX_X2       REG_FIELD_PREP(IVB_PRI_STRETCH_MAX_MASK, 2)
+#define   IVB_PRI_STRETCH_MAX_X1       REG_FIELD_PREP(IVB_PRI_STRETCH_MAX_MASK, 3)
+#define   IVB_SPR_STRETCH_MAX_MASK     REG_GENMASK(19, 18)
+#define   IVB_SPR_STRETCH_MAX_X8       REG_FIELD_PREP(IVB_SPR_STRETCH_MAX_MASK, 0)
+#define   IVB_SPR_STRETCH_MAX_X4       REG_FIELD_PREP(IVB_SPR_STRETCH_MAX_MASK, 1)
+#define   IVB_SPR_STRETCH_MAX_X2       REG_FIELD_PREP(IVB_SPR_STRETCH_MAX_MASK, 2)
+#define   IVB_SPR_STRETCH_MAX_X1       REG_FIELD_PREP(IVB_SPR_STRETCH_MAX_MASK, 3)
 
 
 /*
@@ -8039,6 +8050,16 @@ enum {
 
 #define _CHICKEN_PIPESL_1_A    0x420b0
 #define _CHICKEN_PIPESL_1_B    0x420b4
+#define  HSW_PRI_STRETCH_MAX_MASK      REG_GENMASK(28, 27)
+#define  HSW_PRI_STRETCH_MAX_X8                REG_FIELD_PREP(HSW_PRI_STRETCH_MAX_MASK, 0)
+#define  HSW_PRI_STRETCH_MAX_X4                REG_FIELD_PREP(HSW_PRI_STRETCH_MAX_MASK, 1)
+#define  HSW_PRI_STRETCH_MAX_X2                REG_FIELD_PREP(HSW_PRI_STRETCH_MAX_MASK, 2)
+#define  HSW_PRI_STRETCH_MAX_X1                REG_FIELD_PREP(HSW_PRI_STRETCH_MAX_MASK, 3)
+#define  HSW_SPR_STRETCH_MAX_MASK      REG_GENMASK(26, 25)
+#define  HSW_SPR_STRETCH_MAX_X8                REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 0)
+#define  HSW_SPR_STRETCH_MAX_X4                REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 1)
+#define  HSW_SPR_STRETCH_MAX_X2                REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 2)
+#define  HSW_SPR_STRETCH_MAX_X1                REG_FIELD_PREP(HSW_SPR_STRETCH_MAX_MASK, 3)
 #define  HSW_FBCQ_DIS                  (1 << 22)
 #define  BDW_DPRS_MASK_VBLANK_SRD      (1 << 0)
 #define CHICKEN_PIPESL_1(pipe) _MMIO_PIPE(pipe, _CHICKEN_PIPESL_1_A, _CHICKEN_PIPESL_1_B)
index 0c3e63f..97b57ac 100644 (file)
@@ -7245,11 +7245,16 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv)
        intel_uncore_write(&dev_priv->uncore, CHICKEN_PAR1_1,
                   intel_uncore_read(&dev_priv->uncore, CHICKEN_PAR1_1) | DPA_MASK_VBLANK_SRD);
 
-       /* WaPsrDPRSUnmaskVBlankInSRD:bdw */
        for_each_pipe(dev_priv, pipe) {
+               /* WaPsrDPRSUnmaskVBlankInSRD:bdw */
                intel_uncore_write(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe),
                           intel_uncore_read(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe)) |
                           BDW_DPRS_MASK_VBLANK_SRD);
+
+               /* Undocumented but fixes async flip + VT-d corruption */
+               if (intel_vtd_active())
+                       intel_uncore_rmw(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe),
+                                        HSW_PRI_STRETCH_MAX_MASK, HSW_PRI_STRETCH_MAX_X1);
        }
 
        /* WaVSRefCountFullforceMissDisable:bdw */
@@ -7285,11 +7290,20 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv)
 
 static void hsw_init_clock_gating(struct drm_i915_private *dev_priv)
 {
+       enum pipe pipe;
+
        /* WaFbcAsynchFlipDisableFbcQueue:hsw,bdw */
        intel_uncore_write(&dev_priv->uncore, CHICKEN_PIPESL_1(PIPE_A),
                   intel_uncore_read(&dev_priv->uncore, CHICKEN_PIPESL_1(PIPE_A)) |
                   HSW_FBCQ_DIS);
 
+       for_each_pipe(dev_priv, pipe) {
+               /* Undocumented but fixes async flip + VT-d corruption */
+               if (intel_vtd_active())
+                       intel_uncore_rmw(&dev_priv->uncore, CHICKEN_PIPESL_1(pipe),
+                                        HSW_PRI_STRETCH_MAX_MASK, HSW_PRI_STRETCH_MAX_X1);
+       }
+
        /* This is required by WaCatErrorRejectionIssue:hsw */
        intel_uncore_write(&dev_priv->uncore, GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
                   intel_uncore_read(&dev_priv->uncore, GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
index 153ca9e..8b725ef 100644 (file)
@@ -412,12 +412,20 @@ intel_wakeref_t intel_runtime_pm_get(struct intel_runtime_pm *rpm)
 }
 
 /**
- * intel_runtime_pm_get_if_in_use - grab a runtime pm reference if device in use
+ * __intel_runtime_pm_get_if_active - grab a runtime pm reference if device is active
  * @rpm: the intel_runtime_pm structure
+ * @ignore_usecount: get a ref even if dev->power.usage_count is 0
  *
  * This function grabs a device-level runtime pm reference if the device is
- * already in use and ensures that it is powered up. It is illegal to try
- * and access the HW should intel_runtime_pm_get_if_in_use() report failure.
+ * already active and ensures that it is powered up. It is illegal to try
+ * and access the HW should intel_runtime_pm_get_if_active() report failure.
+ *
+ * If @ignore_usecount=true, a reference will be acquired even if there is no
+ * user requiring the device to be powered up (dev->power.usage_count == 0).
+ * If the function returns false in this case then it's guaranteed that the
+ * device's runtime suspend hook has been called already or that it will be
+ * called (and hence it's also guaranteed that the device's runtime resume
+ * hook will be called eventually).
  *
  * Any runtime pm reference obtained by this function must have a symmetric
  * call to intel_runtime_pm_put() to release the reference again.
@@ -425,7 +433,8 @@ intel_wakeref_t intel_runtime_pm_get(struct intel_runtime_pm *rpm)
  * Returns: the wakeref cookie to pass to intel_runtime_pm_put(), evaluates
  * as True if the wakeref was acquired, or False otherwise.
  */
-intel_wakeref_t intel_runtime_pm_get_if_in_use(struct intel_runtime_pm *rpm)
+static intel_wakeref_t __intel_runtime_pm_get_if_active(struct intel_runtime_pm *rpm,
+                                                       bool ignore_usecount)
 {
        if (IS_ENABLED(CONFIG_PM)) {
                /*
@@ -434,7 +443,7 @@ intel_wakeref_t intel_runtime_pm_get_if_in_use(struct intel_runtime_pm *rpm)
                 * function, since the power state is undefined. This applies
                 * atm to the late/early system suspend/resume handlers.
                 */
-               if (pm_runtime_get_if_in_use(rpm->kdev) <= 0)
+               if (pm_runtime_get_if_active(rpm->kdev, ignore_usecount) <= 0)
                        return 0;
        }
 
@@ -443,6 +452,16 @@ intel_wakeref_t intel_runtime_pm_get_if_in_use(struct intel_runtime_pm *rpm)
        return track_intel_runtime_pm_wakeref(rpm);
 }
 
+intel_wakeref_t intel_runtime_pm_get_if_in_use(struct intel_runtime_pm *rpm)
+{
+       return __intel_runtime_pm_get_if_active(rpm, false);
+}
+
+intel_wakeref_t intel_runtime_pm_get_if_active(struct intel_runtime_pm *rpm)
+{
+       return __intel_runtime_pm_get_if_active(rpm, true);
+}
+
 /**
  * intel_runtime_pm_get_noresume - grab a runtime pm reference
  * @rpm: the intel_runtime_pm structure
index ae64ff1..1e4ddd1 100644 (file)
@@ -177,6 +177,7 @@ void intel_runtime_pm_driver_release(struct intel_runtime_pm *rpm);
 
 intel_wakeref_t intel_runtime_pm_get(struct intel_runtime_pm *rpm);
 intel_wakeref_t intel_runtime_pm_get_if_in_use(struct intel_runtime_pm *rpm);
+intel_wakeref_t intel_runtime_pm_get_if_active(struct intel_runtime_pm *rpm);
 intel_wakeref_t intel_runtime_pm_get_noresume(struct intel_runtime_pm *rpm);
 intel_wakeref_t intel_runtime_pm_get_raw(struct intel_runtime_pm *rpm);
 
@@ -188,6 +189,10 @@ intel_wakeref_t intel_runtime_pm_get_raw(struct intel_runtime_pm *rpm);
        for ((wf) = intel_runtime_pm_get_if_in_use(rpm); (wf); \
             intel_runtime_pm_put((rpm), (wf)), (wf) = 0)
 
+#define with_intel_runtime_pm_if_active(rpm, wf) \
+       for ((wf) = intel_runtime_pm_get_if_active(rpm); (wf); \
+            intel_runtime_pm_put((rpm), (wf)), (wf) = 0)
+
 void intel_runtime_pm_put_unchecked(struct intel_runtime_pm *rpm);
 #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_RUNTIME_PM)
 void intel_runtime_pm_put(struct intel_runtime_pm *rpm, intel_wakeref_t wref);
index d1a9841..e6a88c8 100644 (file)
@@ -215,7 +215,7 @@ static int imx_drm_bind(struct device *dev)
 
        ret = drmm_mode_config_init(drm);
        if (ret)
-               return ret;
+               goto err_kms;
 
        ret = drm_vblank_init(drm, MAX_CRTC);
        if (ret)
index dbfe39e..ffdc492 100644 (file)
@@ -197,6 +197,11 @@ static void imx_ldb_encoder_enable(struct drm_encoder *encoder)
        int dual = ldb->ldb_ctrl & LDB_SPLIT_MODE_EN;
        int mux = drm_of_encoder_active_port_id(imx_ldb_ch->child, encoder);
 
+       if (mux < 0 || mux >= ARRAY_SIZE(ldb->clk_sel)) {
+               dev_warn(ldb->dev, "%s: invalid mux %d\n", __func__, mux);
+               return;
+       }
+
        drm_panel_prepare(imx_ldb_ch->panel);
 
        if (dual) {
@@ -255,6 +260,11 @@ imx_ldb_encoder_atomic_mode_set(struct drm_encoder *encoder,
        int mux = drm_of_encoder_active_port_id(imx_ldb_ch->child, encoder);
        u32 bus_format = imx_ldb_ch->bus_format;
 
+       if (mux < 0 || mux >= ARRAY_SIZE(ldb->clk_sel)) {
+               dev_warn(ldb->dev, "%s: invalid mux %d\n", __func__, mux);
+               return;
+       }
+
        if (mode->clock > 170000) {
                dev_warn(ldb->dev,
                         "%s: mode exceeds 170 MHz pixel clock\n", __func__);
@@ -583,7 +593,7 @@ static int imx_ldb_bind(struct device *dev, struct device *master, void *data)
                struct imx_ldb_channel *channel = &imx_ldb->channel[i];
 
                if (!channel->ldb)
-                       break;
+                       continue;
 
                ret = imx_ldb_register(drm, channel);
                if (ret)
index 5ccc9da..c35b06b 100644 (file)
@@ -304,7 +304,7 @@ int a5xx_power_init(struct msm_gpu *gpu)
        /* Set up the limits management */
        if (adreno_is_a530(adreno_gpu))
                a530_lm_setup(gpu);
-       else
+       else if (adreno_is_a540(adreno_gpu))
                a540_lm_setup(gpu);
 
        /* Set up SP/TP power collapse */
index 71c917f..91cf46f 100644 (file)
@@ -339,7 +339,7 @@ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
        else
                bit = a6xx_gmu_oob_bits[state].ack_new;
 
-       gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, bit);
+       gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << bit);
 }
 
 /* Enable CPU control of SPTP power collapse */
index ba8e9d3..690409c 100644 (file)
@@ -522,28 +522,73 @@ static int a6xx_cp_init(struct msm_gpu *gpu)
        return a6xx_idle(gpu, ring) ? 0 : -EINVAL;
 }
 
-static void a6xx_ucode_check_version(struct a6xx_gpu *a6xx_gpu,
+/*
+ * Check that the microcode version is new enough to include several key
+ * security fixes. Return true if the ucode is safe.
+ */
+static bool a6xx_ucode_check_version(struct a6xx_gpu *a6xx_gpu,
                struct drm_gem_object *obj)
 {
+       struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+       struct msm_gpu *gpu = &adreno_gpu->base;
        u32 *buf = msm_gem_get_vaddr(obj);
+       bool ret = false;
 
        if (IS_ERR(buf))
-               return;
+               return false;
 
        /*
-        * If the lowest nibble is 0xa that is an indication that this microcode
-        * has been patched. The actual version is in dword [3] but we only care
-        * about the patchlevel which is the lowest nibble of dword [3]
-        *
-        * Otherwise check that the firmware is greater than or equal to 1.90
-        * which was the first version that had this fix built in
+        * Targets up to a640 (a618, a630 and a640) need to check for a
+        * microcode version that is patched to support the whereami opcode or
+        * one that is new enough to include it by default.
         */
-       if (((buf[0] & 0xf) == 0xa) && (buf[2] & 0xf) >= 1)
-               a6xx_gpu->has_whereami = true;
-       else if ((buf[0] & 0xfff) > 0x190)
-               a6xx_gpu->has_whereami = true;
+       if (adreno_is_a618(adreno_gpu) || adreno_is_a630(adreno_gpu) ||
+               adreno_is_a640(adreno_gpu)) {
+               /*
+                * If the lowest nibble is 0xa that is an indication that this
+                * microcode has been patched. The actual version is in dword
+                * [3] but we only care about the patchlevel which is the lowest
+                * nibble of dword [3]
+                *
+                * Otherwise check that the firmware is greater than or equal
+                * to 1.90 which was the first version that had this fix built
+                * in
+                */
+               if ((((buf[0] & 0xf) == 0xa) && (buf[2] & 0xf) >= 1) ||
+                       (buf[0] & 0xfff) >= 0x190) {
+                       a6xx_gpu->has_whereami = true;
+                       ret = true;
+                       goto out;
+               }
+
+               DRM_DEV_ERROR(&gpu->pdev->dev,
+                       "a630 SQE ucode is too old. Have version %x need at least %x\n",
+                       buf[0] & 0xfff, 0x190);
+       }  else {
+               /*
+                * a650 tier targets don't need whereami but still need to be
+                * equal to or newer than 1.95 for other security fixes
+                */
+               if (adreno_is_a650(adreno_gpu)) {
+                       if ((buf[0] & 0xfff) >= 0x195) {
+                               ret = true;
+                               goto out;
+                       }
+
+                       DRM_DEV_ERROR(&gpu->pdev->dev,
+                               "a650 SQE ucode is too old. Have version %x need at least %x\n",
+                               buf[0] & 0xfff, 0x195);
+               }
 
+               /*
+                * When a660 is added those targets should return true here
+                * since those have all the critical security fixes built in
+                * from the start
+                */
+       }
+out:
        msm_gem_put_vaddr(obj);
+       return ret;
 }
 
 static int a6xx_ucode_init(struct msm_gpu *gpu)
@@ -566,7 +611,13 @@ static int a6xx_ucode_init(struct msm_gpu *gpu)
                }
 
                msm_gem_object_set_name(a6xx_gpu->sqe_bo, "sqefw");
-               a6xx_ucode_check_version(a6xx_gpu, a6xx_gpu->sqe_bo);
+               if (!a6xx_ucode_check_version(a6xx_gpu, a6xx_gpu->sqe_bo)) {
+                       msm_gem_unpin_iova(a6xx_gpu->sqe_bo, gpu->aspace);
+                       drm_gem_object_put(a6xx_gpu->sqe_bo);
+
+                       a6xx_gpu->sqe_bo = NULL;
+                       return -EPERM;
+               }
        }
 
        gpu_write64(gpu, REG_A6XX_CP_SQE_INSTR_BASE_LO,
@@ -1350,35 +1401,20 @@ static int a6xx_set_supported_hw(struct device *dev, struct a6xx_gpu *a6xx_gpu,
                u32 revn)
 {
        struct opp_table *opp_table;
-       struct nvmem_cell *cell;
        u32 supp_hw = UINT_MAX;
-       void *buf;
-
-       cell = nvmem_cell_get(dev, "speed_bin");
-       /*
-        * -ENOENT means that the platform doesn't support speedbin which is
-        * fine
-        */
-       if (PTR_ERR(cell) == -ENOENT)
-               return 0;
-       else if (IS_ERR(cell)) {
-               DRM_DEV_ERROR(dev,
-                               "failed to read speed-bin. Some OPPs may not be supported by hardware");
-               goto done;
-       }
+       u16 speedbin;
+       int ret;
 
-       buf = nvmem_cell_read(cell, NULL);
-       if (IS_ERR(buf)) {
-               nvmem_cell_put(cell);
+       ret = nvmem_cell_read_u16(dev, "speed_bin", &speedbin);
+       if (ret) {
                DRM_DEV_ERROR(dev,
-                               "failed to read speed-bin. Some OPPs may not be supported by hardware");
+                             "failed to read speed-bin (%d). Some OPPs may not be supported by hardware",
+                             ret);
                goto done;
        }
+       speedbin = le16_to_cpu(speedbin);
 
-       supp_hw = fuse_to_supp_hw(dev, revn, *((u32 *) buf));
-
-       kfree(buf);
-       nvmem_cell_put(cell);
+       supp_hw = fuse_to_supp_hw(dev, revn, speedbin);
 
 done:
        opp_table = dev_pm_opp_set_supported_hw(dev, &supp_hw, 1);
index 5a8e3e1..85f2c35 100644 (file)
@@ -43,6 +43,8 @@
 #define DPU_DEBUGFS_DIR "msm_dpu"
 #define DPU_DEBUGFS_HWMASKNAME "hw_log_mask"
 
+#define MIN_IB_BW      400000000ULL /* Min ib vote 400MB */
+
 static int dpu_kms_hw_init(struct msm_kms *kms);
 static void _dpu_kms_mmu_destroy(struct dpu_kms *dpu_kms);
 
@@ -931,6 +933,9 @@ static int dpu_kms_hw_init(struct msm_kms *kms)
                DPU_DEBUG("REG_DMA is not defined");
        }
 
+       if (of_device_is_compatible(dev->dev->of_node, "qcom,sc7180-mdss"))
+               dpu_kms_parse_data_bus_icc_path(dpu_kms);
+
        pm_runtime_get_sync(&dpu_kms->pdev->dev);
 
        dpu_kms->core_rev = readl_relaxed(dpu_kms->mmio + 0x0);
@@ -1032,9 +1037,6 @@ static int dpu_kms_hw_init(struct msm_kms *kms)
 
        dpu_vbif_init_memtypes(dpu_kms);
 
-       if (of_device_is_compatible(dev->dev->of_node, "qcom,sc7180-mdss"))
-               dpu_kms_parse_data_bus_icc_path(dpu_kms);
-
        pm_runtime_put_sync(&dpu_kms->pdev->dev);
 
        return 0;
@@ -1191,10 +1193,10 @@ static int __maybe_unused dpu_runtime_resume(struct device *dev)
 
        ddev = dpu_kms->dev;
 
+       WARN_ON(!(dpu_kms->num_paths));
        /* Min vote of BW is required before turning on AXI clk */
        for (i = 0; i < dpu_kms->num_paths; i++)
-               icc_set_bw(dpu_kms->path[i], 0,
-                       dpu_kms->catalog->perf.min_dram_ib);
+               icc_set_bw(dpu_kms->path[i], 0, Bps_to_icc(MIN_IB_BW));
 
        rc = msm_dss_enable_clk(mp->clk_config, mp->num_clk, true);
        if (rc) {
index 1c6e1d2..7c22bfe 100644 (file)
@@ -32,6 +32,8 @@ struct dp_aux_private {
        struct drm_dp_aux dp_aux;
 };
 
+#define MAX_AUX_RETRIES                        5
+
 static const char *dp_aux_get_error(u32 aux_error)
 {
        switch (aux_error) {
@@ -377,6 +379,11 @@ static ssize_t dp_aux_transfer(struct drm_dp_aux *dp_aux,
        ret = dp_aux_cmd_fifo_tx(aux, msg);
 
        if (ret < 0) {
+               if (aux->native) {
+                       aux->retry_cnt++;
+                       if (!(aux->retry_cnt % MAX_AUX_RETRIES))
+                               dp_catalog_aux_update_cfg(aux->catalog);
+               }
                usleep_range(400, 500); /* at least 400us to next try */
                goto unlock_exit;
        }
index a45fe95..3dc6587 100644 (file)
@@ -163,7 +163,7 @@ struct msm_dsi_pll *msm_dsi_pll_init(struct platform_device *pdev,
                break;
        case MSM_DSI_PHY_7NM:
        case MSM_DSI_PHY_7NM_V4_1:
-               pll = msm_dsi_pll_7nm_init(pdev, id);
+               pll = msm_dsi_pll_7nm_init(pdev, type, id);
                break;
        default:
                pll = ERR_PTR(-ENXIO);
index 3405982..bbecb1d 100644 (file)
@@ -117,10 +117,12 @@ msm_dsi_pll_10nm_init(struct platform_device *pdev, int id)
 }
 #endif
 #ifdef CONFIG_DRM_MSM_DSI_7NM_PHY
-struct msm_dsi_pll *msm_dsi_pll_7nm_init(struct platform_device *pdev, int id);
+struct msm_dsi_pll *msm_dsi_pll_7nm_init(struct platform_device *pdev,
+                                       enum msm_dsi_phy_type type, int id);
 #else
 static inline struct msm_dsi_pll *
-msm_dsi_pll_7nm_init(struct platform_device *pdev, int id)
+msm_dsi_pll_7nm_init(struct platform_device *pdev,
+                                       enum msm_dsi_phy_type type, int id)
 {
        return ERR_PTR(-ENODEV);
 }
index 93bf142..e29b3bf 100644 (file)
@@ -325,7 +325,7 @@ static void dsi_pll_commit(struct dsi_pll_7nm *pll)
        pll_write(base + REG_DSI_7nm_PHY_PLL_FRAC_DIV_START_LOW_1, reg->frac_div_start_low);
        pll_write(base + REG_DSI_7nm_PHY_PLL_FRAC_DIV_START_MID_1, reg->frac_div_start_mid);
        pll_write(base + REG_DSI_7nm_PHY_PLL_FRAC_DIV_START_HIGH_1, reg->frac_div_start_high);
-       pll_write(base + REG_DSI_7nm_PHY_PLL_PLL_LOCKDET_RATE_1, 0x40);
+       pll_write(base + REG_DSI_7nm_PHY_PLL_PLL_LOCKDET_RATE_1, reg->pll_lockdet_rate);
        pll_write(base + REG_DSI_7nm_PHY_PLL_PLL_LOCK_DELAY, 0x06);
        pll_write(base + REG_DSI_7nm_PHY_PLL_CMODE_1, 0x10); /* TODO: 0x00 for CPHY */
        pll_write(base + REG_DSI_7nm_PHY_PLL_CLOCK_INVERTERS, reg->pll_clock_inverters);
@@ -509,6 +509,7 @@ static unsigned long dsi_pll_7nm_vco_recalc_rate(struct clk_hw *hw,
 {
        struct msm_dsi_pll *pll = hw_clk_to_pll(hw);
        struct dsi_pll_7nm *pll_7nm = to_pll_7nm(pll);
+       struct dsi_pll_config *config = &pll_7nm->pll_configuration;
        void __iomem *base = pll_7nm->mmio;
        u64 ref_clk = pll_7nm->vco_ref_clk_rate;
        u64 vco_rate = 0x0;
@@ -529,9 +530,8 @@ static unsigned long dsi_pll_7nm_vco_recalc_rate(struct clk_hw *hw,
        /*
         * TODO:
         *      1. Assumes prescaler is disabled
-        *      2. Multiplier is 2^18. it should be 2^(num_of_frac_bits)
         */
-       multiplier = 1 << 18;
+       multiplier = 1 << config->frac_bits;
        pll_freq = dec * (ref_clk * 2);
        tmp64 = (ref_clk * 2 * frac);
        pll_freq += div_u64(tmp64, multiplier);
@@ -852,7 +852,8 @@ err_base_clk_hw:
        return ret;
 }
 
-struct msm_dsi_pll *msm_dsi_pll_7nm_init(struct platform_device *pdev, int id)
+struct msm_dsi_pll *msm_dsi_pll_7nm_init(struct platform_device *pdev,
+                                       enum msm_dsi_phy_type type, int id)
 {
        struct dsi_pll_7nm *pll_7nm;
        struct msm_dsi_pll *pll;
@@ -885,7 +886,7 @@ struct msm_dsi_pll *msm_dsi_pll_7nm_init(struct platform_device *pdev, int id)
        pll = &pll_7nm->base;
        pll->min_rate = 1000000000UL;
        pll->max_rate = 3500000000UL;
-       if (pll->type == MSM_DSI_PHY_7NM_V4_1) {
+       if (type == MSM_DSI_PHY_7NM_V4_1) {
                pll->min_rate = 600000000UL;
                pll->max_rate = (unsigned long)5000000000ULL;
                /* workaround for max rate overflowing on 32-bit builds: */
index 6a32676..edcacca 100644 (file)
@@ -57,10 +57,13 @@ static void vblank_put(struct msm_kms *kms, unsigned crtc_mask)
 
 static void lock_crtcs(struct msm_kms *kms, unsigned int crtc_mask)
 {
+       int crtc_index;
        struct drm_crtc *crtc;
 
-       for_each_crtc_mask(kms->dev, crtc, crtc_mask)
-               mutex_lock(&kms->commit_lock[drm_crtc_index(crtc)]);
+       for_each_crtc_mask(kms->dev, crtc, crtc_mask) {
+               crtc_index = drm_crtc_index(crtc);
+               mutex_lock_nested(&kms->commit_lock[crtc_index], crtc_index);
+       }
 }
 
 static void unlock_crtcs(struct msm_kms *kms, unsigned int crtc_mask)
index 94525ac..a5c6b8c 100644 (file)
@@ -1072,6 +1072,10 @@ static int __maybe_unused msm_pm_resume(struct device *dev)
 static int __maybe_unused msm_pm_prepare(struct device *dev)
 {
        struct drm_device *ddev = dev_get_drvdata(dev);
+       struct msm_drm_private *priv = ddev ? ddev->dev_private : NULL;
+
+       if (!priv || !priv->kms)
+               return 0;
 
        return drm_mode_config_helper_suspend(ddev);
 }
@@ -1079,6 +1083,10 @@ static int __maybe_unused msm_pm_prepare(struct device *dev)
 static void __maybe_unused msm_pm_complete(struct device *dev)
 {
        struct drm_device *ddev = dev_get_drvdata(dev);
+       struct msm_drm_private *priv = ddev ? ddev->dev_private : NULL;
+
+       if (!priv || !priv->kms)
+               return;
 
        drm_mode_config_helper_resume(ddev);
 }
@@ -1311,6 +1319,10 @@ static int msm_pdev_remove(struct platform_device *pdev)
 static void msm_pdev_shutdown(struct platform_device *pdev)
 {
        struct drm_device *drm = platform_get_drvdata(pdev);
+       struct msm_drm_private *priv = drm ? drm->dev_private : NULL;
+
+       if (!priv || !priv->kms)
+               return;
 
        drm_atomic_helper_shutdown(drm);
 }
index ad27036..cd59a59 100644 (file)
@@ -45,7 +45,7 @@ int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence,
        int ret;
 
        if (fence > fctx->last_fence) {
-               DRM_ERROR("%s: waiting on invalid fence: %u (of %u)\n",
+               DRM_ERROR_RATELIMITED("%s: waiting on invalid fence: %u (of %u)\n",
                                fctx->name, fence, fctx->last_fence);
                return -EINVAL;
        }
index 4735251..d8151a8 100644 (file)
@@ -157,7 +157,6 @@ struct msm_kms {
         * from the crtc's pending_timer close to end of the frame:
         */
        struct mutex commit_lock[MAX_CRTCS];
-       struct lock_class_key commit_lock_keys[MAX_CRTCS];
        unsigned pending_crtc_mask;
        struct msm_pending_timer pending_timers[MAX_CRTCS];
 };
@@ -167,11 +166,8 @@ static inline int msm_kms_init(struct msm_kms *kms,
 {
        unsigned i, ret;
 
-       for (i = 0; i < ARRAY_SIZE(kms->commit_lock); i++) {
-               lockdep_register_key(&kms->commit_lock_keys[i]);
-               __mutex_init(&kms->commit_lock[i], "&kms->commit_lock[i]",
-                            &kms->commit_lock_keys[i]);
-       }
+       for (i = 0; i < ARRAY_SIZE(kms->commit_lock); i++)
+               mutex_init(&kms->commit_lock[i]);
 
        kms->funcs = funcs;
 
index 196612a..1c9c0cd 100644 (file)
@@ -2693,9 +2693,20 @@ nv50_display_create(struct drm_device *dev)
        else
                nouveau_display(dev)->format_modifiers = disp50xx_modifiers;
 
-       if (disp->disp->object.oclass >= GK104_DISP) {
+       /* FIXME: 256x256 cursors are supported on Kepler, however unlike Maxwell and later
+        * generations Kepler requires that we use small pages (4K) for cursor scanout surfaces. The
+        * proper fix for this is to teach nouveau to migrate fbs being used for the cursor plane to
+        * small page allocations in prepare_fb(). When this is implemented, we should also force
+        * large pages (128K) for ovly fbs in order to fix Kepler ovlys.
+        * But until then, just limit cursors to 128x128 - which is small enough to avoid ever using
+        * large pages.
+        */
+       if (disp->disp->object.oclass >= GM107_DISP) {
                dev->mode_config.cursor_width = 256;
                dev->mode_config.cursor_height = 256;
+       } else if (disp->disp->object.oclass >= GK104_DISP) {
+               dev->mode_config.cursor_width = 128;
+               dev->mode_config.cursor_height = 128;
        } else {
                dev->mode_config.cursor_width = 64;
                dev->mode_config.cursor_height = 64;
index fabb314..f2720a0 100644 (file)
@@ -551,6 +551,10 @@ nouveau_bo_sync_for_device(struct nouveau_bo *nvbo)
 
        if (!ttm_dma)
                return;
+       if (!ttm_dma->pages) {
+               NV_DEBUG(drm, "ttm_dma 0x%p: pages NULL\n", ttm_dma);
+               return;
+       }
 
        /* Don't waste time looping if the object is coherent */
        if (nvbo->force_coherent)
@@ -583,6 +587,10 @@ nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo)
 
        if (!ttm_dma)
                return;
+       if (!ttm_dma->pages) {
+               NV_DEBUG(drm, "ttm_dma 0x%p: pages NULL\n", ttm_dma);
+               return;
+       }
 
        /* Don't waste time looping if the object is coherent */
        if (nvbo->force_coherent)
index 8e11612..b31d750 100644 (file)
@@ -2149,11 +2149,12 @@ static int dsi_vc_send_short(struct dsi_data *dsi, int vc,
                             const struct mipi_dsi_msg *msg)
 {
        struct mipi_dsi_packet pkt;
+       int ret;
        u32 r;
 
-       r = mipi_dsi_create_packet(&pkt, msg);
-       if (r < 0)
-               return r;
+       ret = mipi_dsi_create_packet(&pkt, msg);
+       if (ret < 0)
+               return ret;
 
        WARN_ON(!dsi_bus_is_locked(dsi));
 
index ba8c603..ca37617 100644 (file)
@@ -48,21 +48,12 @@ static unsigned int rcar_du_encoder_count_ports(struct device_node *node)
 static const struct drm_encoder_funcs rcar_du_encoder_funcs = {
 };
 
-static void rcar_du_encoder_release(struct drm_device *dev, void *res)
-{
-       struct rcar_du_encoder *renc = res;
-
-       drm_encoder_cleanup(&renc->base);
-       kfree(renc);
-}
-
 int rcar_du_encoder_init(struct rcar_du_device *rcdu,
                         enum rcar_du_output output,
                         struct device_node *enc_node)
 {
        struct rcar_du_encoder *renc;
        struct drm_bridge *bridge;
-       int ret;
 
        /*
         * Locate the DRM bridge from the DT node. For the DPAD outputs, if the
@@ -101,26 +92,16 @@ int rcar_du_encoder_init(struct rcar_du_device *rcdu,
                        return -ENOLINK;
        }
 
-       renc = kzalloc(sizeof(*renc), GFP_KERNEL);
-       if (renc == NULL)
-               return -ENOMEM;
-
-       renc->output = output;
-
        dev_dbg(rcdu->dev, "initializing encoder %pOF for output %u\n",
                enc_node, output);
 
-       ret = drm_encoder_init(&rcdu->ddev, &renc->base, &rcar_du_encoder_funcs,
-                              DRM_MODE_ENCODER_NONE, NULL);
-       if (ret < 0) {
-               kfree(renc);
-               return ret;
-       }
+       renc = drmm_encoder_alloc(&rcdu->ddev, struct rcar_du_encoder, base,
+                                 &rcar_du_encoder_funcs, DRM_MODE_ENCODER_NONE,
+                                 NULL);
+       if (!renc)
+               return -ENOMEM;
 
-       ret = drmm_add_action_or_reset(&rcdu->ddev, rcar_du_encoder_release,
-                                      renc);
-       if (ret)
-               return ret;
+       renc->output = output;
 
        /*
         * Attach the bridge to the encoder. The bridge will create the
index 0ae3a02..134986d 100644 (file)
@@ -1688,6 +1688,11 @@ static void tegra_dc_commit_state(struct tegra_dc *dc,
                        dev_err(dc->dev,
                                "failed to set clock rate to %lu Hz\n",
                                state->pclk);
+
+               err = clk_set_rate(dc->clk, state->pclk);
+               if (err < 0)
+                       dev_err(dc->dev, "failed to set clock %pC to %lu Hz: %d\n",
+                               dc->clk, state->pclk, err);
        }
 
        DRM_DEBUG_KMS("rate: %lu, div: %u\n", clk_get_rate(dc->clk),
@@ -1698,11 +1703,6 @@ static void tegra_dc_commit_state(struct tegra_dc *dc,
                value = SHIFT_CLK_DIVIDER(state->div) | PIXEL_CLK_DIVIDER_PCD1;
                tegra_dc_writel(dc, value, DC_DISP_DISP_CLOCK_CONTROL);
        }
-
-       err = clk_set_rate(dc->clk, state->pclk);
-       if (err < 0)
-               dev_err(dc->dev, "failed to set clock %pC to %lu Hz: %d\n",
-                       dc->clk, state->pclk, err);
 }
 
 static void tegra_dc_stop(struct tegra_dc *dc)
@@ -2501,22 +2501,18 @@ static int tegra_dc_couple(struct tegra_dc *dc)
         * POWER_CONTROL registers during CRTC enabling.
         */
        if (dc->soc->coupled_pm && dc->pipe == 1) {
-               u32 flags = DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_CONSUMER;
-               struct device_link *link;
-               struct device *partner;
+               struct device *companion;
+               struct tegra_dc *parent;
 
-               partner = driver_find_device(dc->dev->driver, NULL, NULL,
-                                            tegra_dc_match_by_pipe);
-               if (!partner)
+               companion = driver_find_device(dc->dev->driver, NULL, (const void *)0,
+                                              tegra_dc_match_by_pipe);
+               if (!companion)
                        return -EPROBE_DEFER;
 
-               link = device_link_add(dc->dev, partner, flags);
-               if (!link) {
-                       dev_err(dc->dev, "failed to link controllers\n");
-                       return -EINVAL;
-               }
+               parent = dev_get_drvdata(companion);
+               dc->client.parent = &parent->client;
 
-               dev_dbg(dc->dev, "coupled to %s\n", dev_name(partner));
+               dev_dbg(dc->dev, "coupled to %s\n", dev_name(companion));
        }
 
        return 0;
index f02a035..7b88261 100644 (file)
@@ -3115,6 +3115,12 @@ static int tegra_sor_init(struct host1x_client *client)
         * kernel is possible.
         */
        if (sor->rst) {
+               err = pm_runtime_resume_and_get(sor->dev);
+               if (err < 0) {
+                       dev_err(sor->dev, "failed to get runtime PM: %d\n", err);
+                       return err;
+               }
+
                err = reset_control_acquire(sor->rst);
                if (err < 0) {
                        dev_err(sor->dev, "failed to acquire SOR reset: %d\n",
@@ -3148,6 +3154,7 @@ static int tegra_sor_init(struct host1x_client *client)
                }
 
                reset_control_release(sor->rst);
+               pm_runtime_put(sor->dev);
        }
 
        err = clk_prepare_enable(sor->clk_safe);
index 347fb96..68a766f 100644 (file)
@@ -705,8 +705,9 @@ void host1x_driver_unregister(struct host1x_driver *driver)
 EXPORT_SYMBOL(host1x_driver_unregister);
 
 /**
- * host1x_client_register() - register a host1x client
+ * __host1x_client_register() - register a host1x client
  * @client: host1x client
+ * @key: lock class key for the client-specific mutex
  *
  * Registers a host1x client with each host1x controller instance. Note that
  * each client will only match their parent host1x controller and will only be
@@ -715,13 +716,14 @@ EXPORT_SYMBOL(host1x_driver_unregister);
  * device and call host1x_device_init(), which will in turn call each client's
  * &host1x_client_ops.init implementation.
  */
-int host1x_client_register(struct host1x_client *client)
+int __host1x_client_register(struct host1x_client *client,
+                            struct lock_class_key *key)
 {
        struct host1x *host1x;
        int err;
 
        INIT_LIST_HEAD(&client->list);
-       mutex_init(&client->lock);
+       __mutex_init(&client->lock, "host1x client lock", key);
        client->usecount = 0;
 
        mutex_lock(&devices_lock);
@@ -742,7 +744,7 @@ int host1x_client_register(struct host1x_client *client)
 
        return 0;
 }
-EXPORT_SYMBOL(host1x_client_register);
+EXPORT_SYMBOL(__host1x_client_register);
 
 /**
  * host1x_client_unregister() - unregister a host1x client
index bf7d22f..e0667c4 100644 (file)
@@ -266,6 +266,8 @@ config ADI_AXI_ADC
        select IIO_BUFFER
        select IIO_BUFFER_HW_CONSUMER
        select IIO_BUFFER_DMAENGINE
+       depends on HAS_IOMEM
+       depends on OF
        help
          Say yes here to build support for Analog Devices Generic
          AXI ADC IP core. The IP core is used for interfacing with
@@ -923,6 +925,7 @@ config STM32_ADC_CORE
        depends on ARCH_STM32 || COMPILE_TEST
        depends on OF
        depends on REGULATOR
+       depends on HAS_IOMEM
        select IIO_BUFFER
        select MFD_STM32_TIMERS
        select IIO_STM32_TIMER_TRIGGER
index 6f9a3e2..7b5212b 100644 (file)
@@ -918,7 +918,7 @@ static int ab8500_gpadc_read_raw(struct iio_dev *indio_dev,
                        return processed;
 
                /* Return millivolt or milliamps or millicentigrades */
-               *val = processed * 1000;
+               *val = processed;
                return IIO_VAL_INT;
        }
 
index 5d597e5..1b4b320 100644 (file)
@@ -91,7 +91,7 @@ static int ad7949_spi_read_channel(struct ad7949_adc_chip *ad7949_adc, int *val,
        int ret;
        int i;
        int bits_per_word = ad7949_adc->resolution;
-       int mask = GENMASK(ad7949_adc->resolution, 0);
+       int mask = GENMASK(ad7949_adc->resolution - 1, 0);
        struct spi_message msg;
        struct spi_transfer tx[] = {
                {
index 05ff948..07b1a99 100644 (file)
@@ -597,7 +597,7 @@ static const struct vadc_channels vadc_chans[] = {
        VADC_CHAN_NO_SCALE(P_MUX16_1_3, 1)
 
        VADC_CHAN_NO_SCALE(LR_MUX1_BAT_THERM, 0)
-       VADC_CHAN_NO_SCALE(LR_MUX2_BAT_ID, 0)
+       VADC_CHAN_VOLT(LR_MUX2_BAT_ID, 0, SCALE_DEFAULT)
        VADC_CHAN_NO_SCALE(LR_MUX3_XO_THERM, 0)
        VADC_CHAN_NO_SCALE(LR_MUX4_AMUX_THM1, 0)
        VADC_CHAN_NO_SCALE(LR_MUX5_AMUX_THM2, 0)
index dfa31a2..ac90be0 100644 (file)
@@ -551,6 +551,8 @@ static irqreturn_t mpu3050_trigger_handler(int irq, void *p)
                                               MPU3050_FIFO_R,
                                               &fifo_values[offset],
                                               toread);
+                       if (ret)
+                               goto out_trigger_unlock;
 
                        dev_dbg(mpu3050->dev,
                                "%04x %04x %04x %04x %04x\n",
index 52f6051..d627054 100644 (file)
 struct hid_humidity_state {
        struct hid_sensor_common common_attributes;
        struct hid_sensor_hub_attribute_info humidity_attr;
-       s32 humidity_data;
+       struct {
+               s32 humidity_data;
+               u64 timestamp __aligned(8);
+       } scan;
        int scale_pre_decml;
        int scale_post_decml;
        int scale_precision;
@@ -125,9 +128,8 @@ static int humidity_proc_event(struct hid_sensor_hub_device *hsdev,
        struct hid_humidity_state *humid_st = iio_priv(indio_dev);
 
        if (atomic_read(&humid_st->common_attributes.data_ready))
-               iio_push_to_buffers_with_timestamp(indio_dev,
-                                       &humid_st->humidity_data,
-                                       iio_get_time_ns(indio_dev));
+               iio_push_to_buffers_with_timestamp(indio_dev, &humid_st->scan,
+                                                  iio_get_time_ns(indio_dev));
 
        return 0;
 }
@@ -142,7 +144,7 @@ static int humidity_capture_sample(struct hid_sensor_hub_device *hsdev,
 
        switch (usage_id) {
        case HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY:
-               humid_st->humidity_data = *(s32 *)raw_data;
+               humid_st->scan.humidity_data = *(s32 *)raw_data;
 
                return 0;
        default:
index 54af2ed..785a4ce 100644 (file)
@@ -462,8 +462,7 @@ static int adis16400_initial_setup(struct iio_dev *indio_dev)
                if (ret)
                        goto err_ret;
 
-               ret = sscanf(indio_dev->name, "adis%u\n", &device_id);
-               if (ret != 1) {
+               if (sscanf(indio_dev->name, "adis%u\n", &device_id) != 1) {
                        ret = -EINVAL;
                        goto err_ret;
                }
index 330cf35..e9e00ce 100644 (file)
@@ -23,6 +23,9 @@ struct prox_state {
        struct hid_sensor_common common_attributes;
        struct hid_sensor_hub_attribute_info prox_attr;
        u32 human_presence;
+       int scale_pre_decml;
+       int scale_post_decml;
+       int scale_precision;
 };
 
 /* Channel definitions */
@@ -93,8 +96,9 @@ static int prox_read_raw(struct iio_dev *indio_dev,
                ret_type = IIO_VAL_INT;
                break;
        case IIO_CHAN_INFO_SCALE:
-               *val = prox_state->prox_attr.units;
-               ret_type = IIO_VAL_INT;
+               *val = prox_state->scale_pre_decml;
+               *val2 = prox_state->scale_post_decml;
+               ret_type = prox_state->scale_precision;
                break;
        case IIO_CHAN_INFO_OFFSET:
                *val = hid_sensor_convert_exponent(
@@ -234,6 +238,11 @@ static int prox_parse_report(struct platform_device *pdev,
                        HID_USAGE_SENSOR_HUMAN_PRESENCE,
                        &st->common_attributes.sensitivity);
 
+       st->scale_precision = hid_sensor_format_scale(
+                               hsdev->usage,
+                               &st->prox_attr,
+                               &st->scale_pre_decml, &st->scale_post_decml);
+
        return ret;
 }
 
index 81688f1..da9a247 100644 (file)
 struct temperature_state {
        struct hid_sensor_common common_attributes;
        struct hid_sensor_hub_attribute_info temperature_attr;
-       s32 temperature_data;
+       struct {
+               s32 temperature_data;
+               u64 timestamp __aligned(8);
+       } scan;
        int scale_pre_decml;
        int scale_post_decml;
        int scale_precision;
@@ -32,7 +35,7 @@ static const struct iio_chan_spec temperature_channels[] = {
                        BIT(IIO_CHAN_INFO_SAMP_FREQ) |
                        BIT(IIO_CHAN_INFO_HYSTERESIS),
        },
-       IIO_CHAN_SOFT_TIMESTAMP(3),
+       IIO_CHAN_SOFT_TIMESTAMP(1),
 };
 
 /* Adjust channel real bits based on report descriptor */
@@ -123,9 +126,8 @@ static int temperature_proc_event(struct hid_sensor_hub_device *hsdev,
        struct temperature_state *temp_st = iio_priv(indio_dev);
 
        if (atomic_read(&temp_st->common_attributes.data_ready))
-               iio_push_to_buffers_with_timestamp(indio_dev,
-                               &temp_st->temperature_data,
-                               iio_get_time_ns(indio_dev));
+               iio_push_to_buffers_with_timestamp(indio_dev, &temp_st->scan,
+                                                  iio_get_time_ns(indio_dev));
 
        return 0;
 }
@@ -140,7 +142,7 @@ static int temperature_capture_sample(struct hid_sensor_hub_device *hsdev,
 
        switch (usage_id) {
        case HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE:
-               temp_st->temperature_data = *(s32 *)raw_data;
+               temp_st->scan.temperature_data = *(s32 *)raw_data;
                return 0;
        default:
                return -EINVAL;
index 8769e7a..8190374 100644 (file)
@@ -3610,13 +3610,13 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
            ep->com.local_addr.ss_family == AF_INET) {
                err = cxgb4_remove_server_filter(
                        ep->com.dev->rdev.lldi.ports[0], ep->stid,
-                       ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+                       ep->com.dev->rdev.lldi.rxq_ids[0], false);
        } else {
                struct sockaddr_in6 *sin6;
                c4iw_init_wr_wait(ep->com.wr_waitp);
                err = cxgb4_remove_server(
                                ep->com.dev->rdev.lldi.ports[0], ep->stid,
-                               ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+                               ep->com.dev->rdev.lldi.rxq_ids[0], true);
                if (err)
                        goto done;
                err = c4iw_wait_for_reply(&ep->com.dev->rdev, ep->com.wr_waitp,
index c3934ab..ce26f97 100644 (file)
@@ -1194,8 +1194,10 @@ static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type)
                           upper_32_bits(dma));
                roce_write(hr_dev, ROCEE_TX_CMQ_DEPTH_REG,
                           (u32)ring->desc_num >> HNS_ROCE_CMQ_DESC_NUM_S);
-               roce_write(hr_dev, ROCEE_TX_CMQ_HEAD_REG, 0);
+
+               /* Make sure to write tail first and then head */
                roce_write(hr_dev, ROCEE_TX_CMQ_TAIL_REG, 0);
+               roce_write(hr_dev, ROCEE_TX_CMQ_HEAD_REG, 0);
        } else {
                roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_L_REG, (u32)dma);
                roce_write(hr_dev, ROCEE_RX_CMQ_BASEADDR_H_REG,
index de3c2fc..07b8350 100644 (file)
@@ -1116,7 +1116,7 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din,
        case MLX5_CMD_OP_CREATE_MKEY:
                MLX5_SET(destroy_mkey_in, din, opcode,
                         MLX5_CMD_OP_DESTROY_MKEY);
-               MLX5_SET(destroy_mkey_in, in, mkey_index, *obj_id);
+               MLX5_SET(destroy_mkey_in, din, mkey_index, *obj_id);
                break;
        case MLX5_CMD_OP_CREATE_CQ:
                MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ);
index ec4b3f6..f5a52a6 100644 (file)
@@ -1078,7 +1078,7 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev,
 
        qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc);
        MLX5_SET(qpc, qpc, uar_page, uar_index);
-       MLX5_SET(qpc, qpc, ts_format, MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT);
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev));
        MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
        /* Set "fast registration enabled" for all kernel QPs */
@@ -1188,7 +1188,8 @@ static int get_rq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq)
                }
                return MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING;
        }
-       return MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT;
+       return fr_supported ? MLX5_RQC_TIMESTAMP_FORMAT_FREE_RUNNING :
+                             MLX5_RQC_TIMESTAMP_FORMAT_DEFAULT;
 }
 
 static int get_sq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq)
@@ -1206,7 +1207,8 @@ static int get_sq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq)
                }
                return MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING;
        }
-       return MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT;
+       return fr_supported ? MLX5_SQC_TIMESTAMP_FORMAT_FREE_RUNNING :
+                             MLX5_SQC_TIMESTAMP_FORMAT_DEFAULT;
 }
 
 static int get_qp_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq,
@@ -1217,7 +1219,8 @@ static int get_qp_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq,
                        MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING ||
                MLX5_CAP_ROCE(dev->mdev, qp_ts_format) ==
                        MLX5_QP_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
-       int ts_format = MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT;
+       int ts_format = fr_supported ? MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING :
+                                      MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT;
 
        if (recv_cq &&
            recv_cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)
@@ -1930,6 +1933,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
        if (qp->flags & IB_QP_CREATE_MANAGED_RECV)
                MLX5_SET(qpc, qpc, cd_slave_receive, 1);
 
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev));
        MLX5_SET(qpc, qpc, rq_type, MLX5_SRQ_RQ);
        MLX5_SET(qpc, qpc, no_sq, 1);
        MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
@@ -4873,6 +4877,7 @@ static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
        struct mlx5_ib_dev *dev;
        int has_net_offloads;
        __be64 *rq_pas0;
+       int ts_format;
        void *in;
        void *rqc;
        void *wq;
@@ -4881,6 +4886,10 @@ static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
 
        dev = to_mdev(pd->device);
 
+       ts_format = get_rq_ts_format(dev, to_mcq(init_attr->cq));
+       if (ts_format < 0)
+               return ts_format;
+
        inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas;
        in = kvzalloc(inlen, GFP_KERNEL);
        if (!in)
@@ -4890,6 +4899,7 @@ static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
        rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
        MLX5_SET(rqc,  rqc, mem_rq_type,
                 MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
+       MLX5_SET(rqc, rqc, ts_format, ts_format);
        MLX5_SET(rqc, rqc, user_index, rwq->user_index);
        MLX5_SET(rqc,  rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn);
        MLX5_SET(rqc,  rqc, state, MLX5_RQC_STATE_RST);
index 430dc69..da8963a 100644 (file)
@@ -26,7 +26,6 @@
 
 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
 MODULE_DESCRIPTION("Joystick device interfaces");
-MODULE_SUPPORTED_DEVICE("input/js");
 MODULE_LICENSE("GPL");
 
 #define JOYDEV_MINOR_BASE      0
index 73e2c8d..448cc53 100644 (file)
@@ -53,7 +53,7 @@ void icc_bulk_put(int num_paths, struct icc_bulk_data *paths)
 EXPORT_SYMBOL_GPL(icc_bulk_put);
 
 /**
- * icc_bulk_set() - set bandwidth to a set of paths
+ * icc_bulk_set_bw() - set bandwidth to a set of paths
  * @num_paths: the number of icc_bulk_data
  * @paths: the icc_bulk_data table containing the paths and bandwidth
  *
index 5ad519c..8a1e70e 100644 (file)
@@ -942,6 +942,8 @@ int icc_link_destroy(struct icc_node *src, struct icc_node *dst)
                       GFP_KERNEL);
        if (new)
                src->links = new;
+       else
+               ret = -ENOMEM;
 
 out:
        mutex_unlock(&icc_lock);
index dfbec30..20f31a1 100644 (file)
@@ -131,7 +131,7 @@ DEFINE_QNODE(mas_pcnoc_sdcc_1, MSM8939_MASTER_SDCC_1, 8, -1, -1, MSM8939_PNOC_IN
 DEFINE_QNODE(mas_pcnoc_sdcc_2, MSM8939_MASTER_SDCC_2, 8, -1, -1, MSM8939_PNOC_INT_1);
 DEFINE_QNODE(mas_qdss_bam, MSM8939_MASTER_QDSS_BAM, 8, -1, -1, MSM8939_SNOC_QDSS_INT);
 DEFINE_QNODE(mas_qdss_etr, MSM8939_MASTER_QDSS_ETR, 8, -1, -1, MSM8939_SNOC_QDSS_INT);
-DEFINE_QNODE(mas_snoc_cfg, MSM8939_MASTER_SNOC_CFG, 4, 20, -1, MSM8939_SLAVE_SRVC_SNOC);
+DEFINE_QNODE(mas_snoc_cfg, MSM8939_MASTER_SNOC_CFG, 4, -1, -1, MSM8939_SLAVE_SRVC_SNOC);
 DEFINE_QNODE(mas_spdm, MSM8939_MASTER_SPDM, 4, -1, -1, MSM8939_PNOC_MAS_0);
 DEFINE_QNODE(mas_tcu0, MSM8939_MASTER_TCU0, 16, -1, -1, MSM8939_SLAVE_EBI_CH0, MSM8939_BIMC_SNOC_MAS, MSM8939_SLAVE_AMPSS_L2);
 DEFINE_QNODE(mas_usb_hs1, MSM8939_MASTER_USB_HS1, 4, -1, -1, MSM8939_PNOC_MAS_1);
@@ -156,14 +156,14 @@ DEFINE_QNODE(pcnoc_snoc_mas, MSM8939_PNOC_SNOC_MAS, 8, 29, -1, MSM8939_PNOC_SNOC
 DEFINE_QNODE(pcnoc_snoc_slv, MSM8939_PNOC_SNOC_SLV, 8, -1, 45, MSM8939_SNOC_INT_0, MSM8939_SNOC_INT_BIMC, MSM8939_SNOC_INT_1);
 DEFINE_QNODE(qdss_int, MSM8939_SNOC_QDSS_INT, 8, -1, -1, MSM8939_SNOC_INT_0, MSM8939_SNOC_INT_BIMC);
 DEFINE_QNODE(slv_apps_l2, MSM8939_SLAVE_AMPSS_L2, 16, -1, -1, 0);
-DEFINE_QNODE(slv_apss, MSM8939_SLAVE_APSS, 4, -1, 20, 0);
+DEFINE_QNODE(slv_apss, MSM8939_SLAVE_APSS, 4, -1, -1, 0);
 DEFINE_QNODE(slv_audio, MSM8939_SLAVE_LPASS, 4, -1, -1, 0);
 DEFINE_QNODE(slv_bimc_cfg, MSM8939_SLAVE_BIMC_CFG, 4, -1, -1, 0);
 DEFINE_QNODE(slv_blsp_1, MSM8939_SLAVE_BLSP_1, 4, -1, -1, 0);
 DEFINE_QNODE(slv_boot_rom, MSM8939_SLAVE_BOOT_ROM, 4, -1, -1, 0);
 DEFINE_QNODE(slv_camera_cfg, MSM8939_SLAVE_CAMERA_CFG, 4, -1, -1, 0);
-DEFINE_QNODE(slv_cats_0, MSM8939_SLAVE_CATS_128, 16, -1, 106, 0);
-DEFINE_QNODE(slv_cats_1, MSM8939_SLAVE_OCMEM_64, 8, -1, 107, 0);
+DEFINE_QNODE(slv_cats_0, MSM8939_SLAVE_CATS_128, 16, -1, -1, 0);
+DEFINE_QNODE(slv_cats_1, MSM8939_SLAVE_OCMEM_64, 8, -1, -1, 0);
 DEFINE_QNODE(slv_clk_ctl, MSM8939_SLAVE_CLK_CTL, 4, -1, -1, 0);
 DEFINE_QNODE(slv_crypto_0_cfg, MSM8939_SLAVE_CRYPTO_0_CFG, 4, -1, -1, 0);
 DEFINE_QNODE(slv_dehr_cfg, MSM8939_SLAVE_DEHR_CFG, 4, -1, -1, 0);
@@ -187,20 +187,20 @@ DEFINE_QNODE(slv_sdcc_2, MSM8939_SLAVE_SDCC_2, 4, -1, -1, 0);
 DEFINE_QNODE(slv_security, MSM8939_SLAVE_SECURITY, 4, -1, -1, 0);
 DEFINE_QNODE(slv_snoc_cfg, MSM8939_SLAVE_SNOC_CFG, 4, -1, -1, 0);
 DEFINE_QNODE(slv_spdm, MSM8939_SLAVE_SPDM, 4, -1, -1, 0);
-DEFINE_QNODE(slv_srvc_snoc, MSM8939_SLAVE_SRVC_SNOC, 8, -1, 29, 0);
+DEFINE_QNODE(slv_srvc_snoc, MSM8939_SLAVE_SRVC_SNOC, 8, -1, -1, 0);
 DEFINE_QNODE(slv_tcsr, MSM8939_SLAVE_TCSR, 4, -1, -1, 0);
 DEFINE_QNODE(slv_tlmm, MSM8939_SLAVE_TLMM, 4, -1, -1, 0);
 DEFINE_QNODE(slv_usb_hs1, MSM8939_SLAVE_USB_HS1, 4, -1, -1, 0);
 DEFINE_QNODE(slv_usb_hs2, MSM8939_SLAVE_USB_HS2, 4, -1, -1, 0);
 DEFINE_QNODE(slv_venus_cfg, MSM8939_SLAVE_VENUS_CFG, 4, -1, -1, 0);
-DEFINE_QNODE(snoc_bimc_0_mas, MSM8939_SNOC_BIMC_0_MAS, 16, 3, -1, MSM8939_SNOC_BIMC_0_SLV);
-DEFINE_QNODE(snoc_bimc_0_slv, MSM8939_SNOC_BIMC_0_SLV, 16, -1, 24, MSM8939_SLAVE_EBI_CH0);
+DEFINE_QNODE(snoc_bimc_0_mas, MSM8939_SNOC_BIMC_0_MAS, 16, -1, -1, MSM8939_SNOC_BIMC_0_SLV);
+DEFINE_QNODE(snoc_bimc_0_slv, MSM8939_SNOC_BIMC_0_SLV, 16, -1, -1, MSM8939_SLAVE_EBI_CH0);
 DEFINE_QNODE(snoc_bimc_1_mas, MSM8939_SNOC_BIMC_1_MAS, 16, 76, -1, MSM8939_SNOC_BIMC_1_SLV);
 DEFINE_QNODE(snoc_bimc_1_slv, MSM8939_SNOC_BIMC_1_SLV, 16, -1, 104, MSM8939_SLAVE_EBI_CH0);
 DEFINE_QNODE(snoc_bimc_2_mas, MSM8939_SNOC_BIMC_2_MAS, 16, -1, -1, MSM8939_SNOC_BIMC_2_SLV);
 DEFINE_QNODE(snoc_bimc_2_slv, MSM8939_SNOC_BIMC_2_SLV, 16, -1, -1, MSM8939_SLAVE_EBI_CH0);
 DEFINE_QNODE(snoc_int_0, MSM8939_SNOC_INT_0, 8, 99, 130, MSM8939_SLAVE_QDSS_STM, MSM8939_SLAVE_IMEM, MSM8939_SNOC_PNOC_MAS);
-DEFINE_QNODE(snoc_int_1, MSM8939_SNOC_INT_1, 8, 100, 131, MSM8939_SLAVE_APSS, MSM8939_SLAVE_CATS_128, MSM8939_SLAVE_OCMEM_64);
+DEFINE_QNODE(snoc_int_1, MSM8939_SNOC_INT_1, 8, -1, -1, MSM8939_SLAVE_APSS, MSM8939_SLAVE_CATS_128, MSM8939_SLAVE_OCMEM_64);
 DEFINE_QNODE(snoc_int_bimc, MSM8939_SNOC_INT_BIMC, 8, 101, 132, MSM8939_SNOC_BIMC_1_MAS);
 DEFINE_QNODE(snoc_pcnoc_mas, MSM8939_SNOC_PNOC_MAS, 8, -1, -1, MSM8939_SNOC_PNOC_SLV);
 DEFINE_QNODE(snoc_pcnoc_slv, MSM8939_SNOC_PNOC_SLV, 8, -1, -1, MSM8939_PNOC_INT_0);
index 9126efc..321f590 100644 (file)
@@ -2714,7 +2714,6 @@ static int __init early_amd_iommu_init(void)
        struct acpi_table_header *ivrs_base;
        int i, remap_cache_sz, ret;
        acpi_status status;
-       u32 pci_id;
 
        if (!amd_iommu_detected)
                return -ENODEV;
@@ -2804,16 +2803,6 @@ static int __init early_amd_iommu_init(void)
        if (ret)
                goto out;
 
-       /* Disable IOMMU if there's Stoney Ridge graphics */
-       for (i = 0; i < 32; i++) {
-               pci_id = read_pci_config(0, i, 0, 0);
-               if ((pci_id & 0xffff) == 0x1002 && (pci_id >> 16) == 0x98e4) {
-                       pr_info("Disable IOMMU on Stoney Ridge\n");
-                       amd_iommu_disabled = true;
-                       break;
-               }
-       }
-
        /* Disable any previously enabled IOMMUs */
        if (!is_kdump_kernel() || amd_iommu_disabled)
                disable_iommus();
@@ -2880,6 +2869,7 @@ static bool detect_ivrs(void)
 {
        struct acpi_table_header *ivrs_base;
        acpi_status status;
+       int i;
 
        status = acpi_get_table("IVRS", 0, &ivrs_base);
        if (status == AE_NOT_FOUND)
@@ -2892,6 +2882,17 @@ static bool detect_ivrs(void)
 
        acpi_put_table(ivrs_base);
 
+       /* Don't use IOMMU if there is Stoney Ridge graphics */
+       for (i = 0; i < 32; i++) {
+               u32 pci_id;
+
+               pci_id = read_pci_config(0, i, 0, 0);
+               if ((pci_id & 0xffff) == 0x1002 && (pci_id >> 16) == 0x98e4) {
+                       pr_info("Disable IOMMU on Stoney Ridge\n");
+                       return false;
+               }
+       }
+
        /* Make sure ACS will be enabled during PCI probe */
        pci_request_acs();
 
@@ -2918,12 +2919,12 @@ static int __init state_next(void)
                }
                break;
        case IOMMU_IVRS_DETECTED:
-               ret = early_amd_iommu_init();
-               init_state = ret ? IOMMU_INIT_ERROR : IOMMU_ACPI_FINISHED;
-               if (init_state == IOMMU_ACPI_FINISHED && amd_iommu_disabled) {
-                       pr_info("AMD IOMMU disabled\n");
+               if (amd_iommu_disabled) {
                        init_state = IOMMU_CMDLINE_DISABLED;
                        ret = -EINVAL;
+               } else {
+                       ret = early_amd_iommu_init();
+                       init_state = ret ? IOMMU_INIT_ERROR : IOMMU_ACPI_FINISHED;
                }
                break;
        case IOMMU_ACPI_FINISHED:
@@ -3001,8 +3002,11 @@ int __init amd_iommu_prepare(void)
        amd_iommu_irq_remap = true;
 
        ret = iommu_go_to_state(IOMMU_ACPI_FINISHED);
-       if (ret)
+       if (ret) {
+               amd_iommu_irq_remap = false;
                return ret;
+       }
+
        return amd_iommu_irq_remap ? 0 : -ENODEV;
 }
 
index 97eb62f..602aab9 100644 (file)
@@ -849,12 +849,11 @@ static struct iommu_device *tegra_smmu_probe_device(struct device *dev)
                smmu = tegra_smmu_find(args.np);
                if (smmu) {
                        err = tegra_smmu_configure(smmu, dev, &args);
-                       of_node_put(args.np);
 
-                       if (err < 0)
+                       if (err < 0) {
+                               of_node_put(args.np);
                                return ERR_PTR(err);
-
-                       break;
+                       }
                }
 
                of_node_put(args.np);
index 7168778..cb0afe8 100644 (file)
@@ -721,7 +721,7 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb)
  * Return value: CAPI result code
  */
 
-u16 capi20_get_manufacturer(u32 contr, u8 *buf)
+u16 capi20_get_manufacturer(u32 contr, u8 buf[CAPI_MANUFACTURER_LEN])
 {
        struct capi_ctr *ctr;
        u16 ret;
@@ -787,7 +787,7 @@ u16 capi20_get_version(u32 contr, struct capi_version *verp)
  * Return value: CAPI result code
  */
 
-u16 capi20_get_serial(u32 contr, u8 *serial)
+u16 capi20_get_serial(u32 contr, u8 serial[CAPI_SERIAL_LEN])
 {
        struct capi_ctr *ctr;
        u16 ret;
index ec47508..39f841b 100644 (file)
@@ -694,7 +694,7 @@ isac_release(struct isac_hw *isac)
 {
        if (isac->type & IPAC_TYPE_ISACX)
                WriteISAC(isac, ISACX_MASK, 0xff);
-       else
+       else if (isac->type != 0)
                WriteISAC(isac, ISAC_MASK, 0xff);
        if (isac->dch.timer.function != NULL) {
                del_timer(&isac->dch.timer);
index 5e306bb..1ca65b4 100644 (file)
@@ -529,7 +529,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
         * Grab our output buffer.
         */
        nl = orig_nl = get_result_buffer(param, param_size, &len);
-       if (len < needed) {
+       if (len < needed || len < sizeof(nl->dev)) {
                param->flags |= DM_BUFFER_FULL_FLAG;
                goto out;
        }
index 95391f7..e5f0f17 100644 (file)
@@ -1594,6 +1594,13 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
        return blk_queue_zoned_model(q) != *zoned_model;
 }
 
+/*
+ * Check the device zoned model based on the target feature flag. If the target
+ * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are
+ * also accepted but all devices must have the same zoned model. If the target
+ * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
+ * zoned model with all zoned devices having the same zone size.
+ */
 static bool dm_table_supports_zoned_model(struct dm_table *t,
                                          enum blk_zoned_model zoned_model)
 {
@@ -1603,13 +1610,15 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
        for (i = 0; i < dm_table_get_num_targets(t); i++) {
                ti = dm_table_get_target(t, i);
 
-               if (zoned_model == BLK_ZONED_HM &&
-                   !dm_target_supports_zoned_hm(ti->type))
-                       return false;
-
-               if (!ti->type->iterate_devices ||
-                   ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model))
-                       return false;
+               if (dm_target_supports_zoned_hm(ti->type)) {
+                       if (!ti->type->iterate_devices ||
+                           ti->type->iterate_devices(ti, device_not_zoned_model,
+                                                     &zoned_model))
+                               return false;
+               } else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
+                       if (zoned_model == BLK_ZONED_HM)
+                               return false;
+               }
        }
 
        return true;
@@ -1621,9 +1630,17 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *
        struct request_queue *q = bdev_get_queue(dev->bdev);
        unsigned int *zone_sectors = data;
 
+       if (!blk_queue_is_zoned(q))
+               return 0;
+
        return blk_queue_zone_sectors(q) != *zone_sectors;
 }
 
+/*
+ * Check consistency of zoned model and zone sectors across all targets. For
+ * zone sectors, if the destination device is a zoned block device, it shall
+ * have the specified zone_sectors.
+ */
 static int validate_hardware_zoned_model(struct dm_table *table,
                                         enum blk_zoned_model zoned_model,
                                         unsigned int zone_sectors)
@@ -1642,7 +1659,7 @@ static int validate_hardware_zoned_model(struct dm_table *table,
                return -EINVAL;
 
        if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) {
-               DMERR("%s: zone sectors is not consistent across all devices",
+               DMERR("%s: zone sectors is not consistent across all zoned devices",
                      dm_device_name(table->md));
                return -EINVAL;
        }
index 6b8e5bd..808a98e 100644 (file)
@@ -34,7 +34,7 @@
 #define DM_VERITY_OPT_IGN_ZEROES       "ignore_zero_blocks"
 #define DM_VERITY_OPT_AT_MOST_ONCE     "check_at_most_once"
 
-#define DM_VERITY_OPTS_MAX             (2 + DM_VERITY_OPTS_FEC + \
+#define DM_VERITY_OPTS_MAX             (3 + DM_VERITY_OPTS_FEC + \
                                         DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
 
 static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
index 697f9de..7e88df6 100644 (file)
@@ -1143,7 +1143,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
 static struct target_type dmz_type = {
        .name            = "zoned",
        .version         = {2, 0, 0},
-       .features        = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
+       .features        = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
        .module          = THIS_MODULE,
        .ctr             = dmz_ctr,
        .dtr             = dmz_dtr,
index 50b693d..3f3be94 100644 (file)
@@ -2036,7 +2036,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
        if (size != dm_get_size(md))
                memset(&md->geometry, 0, sizeof(md->geometry));
 
-       set_capacity_and_notify(md->disk, size);
+       if (!get_capacity(md->disk))
+               set_capacity(md->disk, size);
+       else
+               set_capacity_and_notify(md->disk, size);
 
        dm_table_event_callback(t, event_callback, md);
 
index 8a85852..5f6e97a 100644 (file)
@@ -430,4 +430,3 @@ MODULE_AUTHOR("Andreas Monitzer <andy@monitzer.com>");
 MODULE_AUTHOR("Ben Backx <ben@bbackx.com>");
 MODULE_DESCRIPTION("FireDTV DVB Driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("FireDTV DVB");
index 692b95a..9a82e68 100644 (file)
@@ -41,7 +41,6 @@ MODULE_PARM_DESC(debug,
 
 MODULE_AUTHOR("Andy Walls");
 MODULE_DESCRIPTION("CX23418 ALSA Interface");
-MODULE_SUPPORTED_DEVICE("CX23418 MPEG2 encoder");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(CX18_VERSION);
index 95aed00..f2440eb 100644 (file)
@@ -232,7 +232,6 @@ MODULE_PARM_DESC(cx18_first_minor,
 
 MODULE_AUTHOR("Hans Verkuil");
 MODULE_DESCRIPTION("CX23418 driver");
-MODULE_SUPPORTED_DEVICE("CX23418 MPEG2 encoder");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(CX18_VERSION);
index 608fbaf..8797d85 100644 (file)
@@ -104,7 +104,6 @@ MODULE_PARM_DESC(index, "Index value for cx25821 capture interface(s).");
 MODULE_DESCRIPTION("ALSA driver module for cx25821 based capture cards");
 MODULE_AUTHOR("Hiep Huynh");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Conexant,25821}");  /* "{{Conexant,23881}," */
 
 static unsigned int debug;
 module_param(debug, int, 0644);
index 95e0cbb..c83814c 100644 (file)
@@ -98,7 +98,6 @@ MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@kernel.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_VERSION(CX88_VERSION);
 
-MODULE_SUPPORTED_DEVICE("{{Conexant,23881},{{Conexant,23882},{{Conexant,23883}");
 static unsigned int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
index 39029b8..4cefdb2 100644 (file)
@@ -38,7 +38,6 @@ MODULE_PARM_DESC(index,
 
 MODULE_AUTHOR("Andy Walls");
 MODULE_DESCRIPTION("CX23415/CX23416 ALSA Interface");
-MODULE_SUPPORTED_DEVICE("CX23415/CX23416 MPEG2 encoder");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(IVTV_VERSION);
index 6e448cb..942b8c2 100644 (file)
@@ -275,9 +275,6 @@ MODULE_PARM_DESC(ivtv_first_minor, "Set device node number assigned to first car
 
 MODULE_AUTHOR("Kevin Thayer, Chris Kennedy, Hans Verkuil");
 MODULE_DESCRIPTION("CX23415/CX23416 driver");
-MODULE_SUPPORTED_DEVICE
-    ("CX23415/CX23416 MPEG2 encoder (WinTV PVR-150/250/350/500,\n"
-               "\t\t\tYuan MPG series and similar)");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(IVTV_VERSION);
index 336df65..524912f 100644 (file)
@@ -1269,6 +1269,5 @@ late_initcall_sync(sta2x11_vip_init_module);
 MODULE_DESCRIPTION("STA2X11 Video Input Port driver");
 MODULE_AUTHOR("Wind River");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("sta2x11 video input");
 MODULE_VERSION(DRV_VERSION);
 MODULE_DEVICE_TABLE(pci, sta2x11_vip_pci_tbl);
index 0514be6..e392b3e 100644 (file)
@@ -1363,4 +1363,3 @@ module_platform_driver(atmel_isi_driver);
 MODULE_AUTHOR("Josh Wu <josh.wu@atmel.com>");
 MODULE_DESCRIPTION("The V4L2 driver for Atmel Linux");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("video");
index 0b78fec..61d9885 100644 (file)
@@ -330,4 +330,3 @@ module_platform_driver(atmel_isc_driver);
 MODULE_AUTHOR("Songjun Wu");
 MODULE_DESCRIPTION("The V4L2 driver for Atmel-ISC");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("video");
index 9c94a8b..baac86f 100644 (file)
 MODULE_AUTHOR("Jonathan Corbet <corbet@lwn.net>");
 MODULE_DESCRIPTION("Marvell 88ALP01 CMOS Camera Controller driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Video");
-
-
-
 
 struct cafe_camera {
        int registered;                 /* Fully initialized? */
index bbcc225..d9b4ad0 100644 (file)
@@ -2149,4 +2149,3 @@ MODULE_AUTHOR("Yannick Fertre <yannick.fertre@st.com>");
 MODULE_AUTHOR("Hugues Fruchet <hugues.fruchet@st.com>");
 MODULE_DESCRIPTION("STMicroelectronics STM32 Digital Camera Memory Interface driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("video");
index e488e78..69d5c62 100644 (file)
@@ -56,7 +56,6 @@ MODULE_PARM_DESC(flicker_mode, "Flicker frequency (0 (disabled), " __stringify(5
 
 MODULE_AUTHOR("Steve Miller (STMicroelectronics) <steve.miller@st.com>");
 MODULE_DESCRIPTION("V4L-driver for STMicroelectronics CPiA2 based cameras");
-MODULE_SUPPORTED_DEVICE("video");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(CPIA_VERSION);
 
index 3a2df36..a19a467 100644 (file)
@@ -51,7 +51,6 @@ MODULE_PARM_DESC(index, "Index value for tm6000x capture interface(s).");
 MODULE_DESCRIPTION("ALSA driver module for tm5600/tm6000/tm6010 based TV cards");
 MODULE_AUTHOR("Mauro Carvalho Chehab");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Trident,tm5600},{{Trident,tm6000},{{Trident,tm6010}");
 static unsigned int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
index 293a460..4990fa8 100644 (file)
@@ -23,8 +23,6 @@ MODULE_DESCRIPTION("DVB driver extension module for tm5600/6000/6010 based TV ca
 MODULE_AUTHOR("Mauro Carvalho Chehab");
 MODULE_LICENSE("GPL");
 
-MODULE_SUPPORTED_DEVICE("{{Trident, tm5600},{{Trident, tm6000},{{Trident, tm6010}");
-
 static int debug;
 
 module_param(debug, int, 0644);
index fe8ca94..b67cb0a 100644 (file)
@@ -72,7 +72,8 @@ static const struct dmi_system_id dmi_platform_info[] = {
        {}
 };
 
-static const struct resource intel_quark_i2c_res[] = {
+/* This is used as a place holder and will be modified at run-time */
+static struct resource intel_quark_i2c_res[] = {
        [INTEL_QUARK_IORES_MEM] = {
                .flags = IORESOURCE_MEM,
        },
@@ -85,7 +86,8 @@ static struct mfd_cell_acpi_match intel_quark_acpi_match_i2c = {
        .adr = MFD_ACPI_MATCH_I2C,
 };
 
-static const struct resource intel_quark_gpio_res[] = {
+/* This is used as a place holder and will be modified at run-time */
+static struct resource intel_quark_gpio_res[] = {
        [INTEL_QUARK_IORES_MEM] = {
                .flags = IORESOURCE_MEM,
        },
index 4378a9b..2cc370a 100644 (file)
@@ -2286,8 +2286,8 @@ int mei_cl_dma_alloc_and_map(struct mei_cl *cl, const struct file *fp,
        if (buffer_id == 0)
                return -EINVAL;
 
-       if (!mei_cl_is_connected(cl))
-               return -ENODEV;
+       if (mei_cl_is_connected(cl))
+               return -EPROTO;
 
        if (cl->dma_mapped)
                return -EPROTO;
@@ -2327,9 +2327,7 @@ int mei_cl_dma_alloc_and_map(struct mei_cl *cl, const struct file *fp,
 
        mutex_unlock(&dev->device_lock);
        wait_event_timeout(cl->wait,
-                          cl->dma_mapped ||
-                          cl->status ||
-                          !mei_cl_is_connected(cl),
+                          cl->dma_mapped || cl->status,
                           mei_secs_to_jiffies(MEI_CL_CONNECT_TIMEOUT));
        mutex_lock(&dev->device_lock);
 
@@ -2376,8 +2374,9 @@ int mei_cl_dma_unmap(struct mei_cl *cl, const struct file *fp)
                return -EOPNOTSUPP;
        }
 
-       if (!mei_cl_is_connected(cl))
-               return -ENODEV;
+       /* do not allow unmap for connected client */
+       if (mei_cl_is_connected(cl))
+               return -EPROTO;
 
        if (!cl->dma_mapped)
                return -EPROTO;
@@ -2405,9 +2404,7 @@ int mei_cl_dma_unmap(struct mei_cl *cl, const struct file *fp)
 
        mutex_unlock(&dev->device_lock);
        wait_event_timeout(cl->wait,
-                          !cl->dma_mapped ||
-                          cl->status ||
-                          !mei_cl_is_connected(cl),
+                          !cl->dma_mapped || cl->status,
                           mei_secs_to_jiffies(MEI_CL_CONNECT_TIMEOUT));
        mutex_lock(&dev->device_lock);
 
index eb72582..f9cfb08 100644 (file)
@@ -32,7 +32,6 @@
 
 MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
 MODULE_DESCRIPTION("User-programmable flash device on Sun Microsystems boardsets");
-MODULE_SUPPORTED_DEVICE(DRIVER_NAME);
 MODULE_LICENSE("GPL");
 MODULE_VERSION("2.1");
 
index 8bdc44b..3c8f665 100644 (file)
@@ -127,6 +127,8 @@ static int com20020pci_probe(struct pci_dev *pdev,
        int i, ioaddr, ret;
        struct resource *r;
 
+       ret = 0;
+
        if (pci_enable_device(pdev))
                return -EIO;
 
@@ -139,6 +141,8 @@ static int com20020pci_probe(struct pci_dev *pdev,
        priv->ci = ci;
        mm = &ci->misc_map;
 
+       pci_set_drvdata(pdev, priv);
+
        INIT_LIST_HEAD(&priv->list_dev);
 
        if (mm->size) {
@@ -161,7 +165,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
                dev = alloc_arcdev(device);
                if (!dev) {
                        ret = -ENOMEM;
-                       goto out_port;
+                       break;
                }
                dev->dev_port = i;
 
@@ -178,7 +182,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
                        pr_err("IO region %xh-%xh already allocated\n",
                               ioaddr, ioaddr + cm->size - 1);
                        ret = -EBUSY;
-                       goto out_port;
+                       goto err_free_arcdev;
                }
 
                /* Dummy access after Reset
@@ -216,18 +220,18 @@ static int com20020pci_probe(struct pci_dev *pdev,
                if (arcnet_inb(ioaddr, COM20020_REG_R_STATUS) == 0xFF) {
                        pr_err("IO address %Xh is empty!\n", ioaddr);
                        ret = -EIO;
-                       goto out_port;
+                       goto err_free_arcdev;
                }
                if (com20020_check(dev)) {
                        ret = -EIO;
-                       goto out_port;
+                       goto err_free_arcdev;
                }
 
                card = devm_kzalloc(&pdev->dev, sizeof(struct com20020_dev),
                                    GFP_KERNEL);
                if (!card) {
                        ret = -ENOMEM;
-                       goto out_port;
+                       goto err_free_arcdev;
                }
 
                card->index = i;
@@ -253,29 +257,29 @@ static int com20020pci_probe(struct pci_dev *pdev,
 
                ret = devm_led_classdev_register(&pdev->dev, &card->tx_led);
                if (ret)
-                       goto out_port;
+                       goto err_free_arcdev;
 
                ret = devm_led_classdev_register(&pdev->dev, &card->recon_led);
                if (ret)
-                       goto out_port;
+                       goto err_free_arcdev;
 
                dev_set_drvdata(&dev->dev, card);
 
                ret = com20020_found(dev, IRQF_SHARED);
                if (ret)
-                       goto out_port;
+                       goto err_free_arcdev;
 
                devm_arcnet_led_init(dev, dev->dev_id, i);
 
                list_add(&card->list, &priv->list_dev);
-       }
+               continue;
 
-       pci_set_drvdata(pdev, priv);
-
-       return 0;
-
-out_port:
-       com20020pci_remove(pdev);
+err_free_arcdev:
+               free_arcdev(dev);
+               break;
+       }
+       if (ret)
+               com20020pci_remove(pdev);
        return ret;
 }
 
index 456315b..74cbbb2 100644 (file)
@@ -3978,15 +3978,11 @@ static int bond_neigh_init(struct neighbour *n)
 
        rcu_read_lock();
        slave = bond_first_slave_rcu(bond);
-       if (!slave) {
-               ret = -EINVAL;
+       if (!slave)
                goto out;
-       }
        slave_ops = slave->dev->netdev_ops;
-       if (!slave_ops->ndo_neigh_setup) {
-               ret = -EINVAL;
+       if (!slave_ops->ndo_neigh_setup)
                goto out;
-       }
 
        /* TODO: find another way [1] to implement this.
         * Passing a zeroed structure is fragile,
index ef474ba..6958830 100644 (file)
@@ -212,18 +212,6 @@ static const struct can_bittiming_const c_can_bittiming_const = {
        .brp_inc = 1,
 };
 
-static inline void c_can_pm_runtime_enable(const struct c_can_priv *priv)
-{
-       if (priv->device)
-               pm_runtime_enable(priv->device);
-}
-
-static inline void c_can_pm_runtime_disable(const struct c_can_priv *priv)
-{
-       if (priv->device)
-               pm_runtime_disable(priv->device);
-}
-
 static inline void c_can_pm_runtime_get_sync(const struct c_can_priv *priv)
 {
        if (priv->device)
@@ -1335,7 +1323,6 @@ static const struct net_device_ops c_can_netdev_ops = {
 
 int register_c_can_dev(struct net_device *dev)
 {
-       struct c_can_priv *priv = netdev_priv(dev);
        int err;
 
        /* Deactivate pins to prevent DRA7 DCAN IP from being
@@ -1345,28 +1332,19 @@ int register_c_can_dev(struct net_device *dev)
         */
        pinctrl_pm_select_sleep_state(dev->dev.parent);
 
-       c_can_pm_runtime_enable(priv);
-
        dev->flags |= IFF_ECHO; /* we support local echo */
        dev->netdev_ops = &c_can_netdev_ops;
 
        err = register_candev(dev);
-       if (err)
-               c_can_pm_runtime_disable(priv);
-       else
+       if (!err)
                devm_can_led_init(dev);
-
        return err;
 }
 EXPORT_SYMBOL_GPL(register_c_can_dev);
 
 void unregister_c_can_dev(struct net_device *dev)
 {
-       struct c_can_priv *priv = netdev_priv(dev);
-
        unregister_candev(dev);
-
-       c_can_pm_runtime_disable(priv);
 }
 EXPORT_SYMBOL_GPL(unregister_c_can_dev);
 
index 406b484..7efb60b 100644 (file)
@@ -239,12 +239,13 @@ static void c_can_pci_remove(struct pci_dev *pdev)
 {
        struct net_device *dev = pci_get_drvdata(pdev);
        struct c_can_priv *priv = netdev_priv(dev);
+       void __iomem *addr = priv->base;
 
        unregister_c_can_dev(dev);
 
        free_c_can_dev(dev);
 
-       pci_iounmap(pdev, priv->base);
+       pci_iounmap(pdev, addr);
        pci_disable_msi(pdev);
        pci_clear_master(pdev);
        pci_release_regions(pdev);
index 05f425c..47b251b 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/clk.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -386,6 +387,7 @@ static int c_can_plat_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, dev);
        SET_NETDEV_DEV(dev, &pdev->dev);
 
+       pm_runtime_enable(priv->device);
        ret = register_c_can_dev(dev);
        if (ret) {
                dev_err(&pdev->dev, "registering %s failed (err=%d)\n",
@@ -398,6 +400,7 @@ static int c_can_plat_probe(struct platform_device *pdev)
        return 0;
 
 exit_free_device:
+       pm_runtime_disable(priv->device);
        free_c_can_dev(dev);
 exit:
        dev_err(&pdev->dev, "probe failed\n");
@@ -408,9 +411,10 @@ exit:
 static int c_can_plat_remove(struct platform_device *pdev)
 {
        struct net_device *dev = platform_get_drvdata(pdev);
+       struct c_can_priv *priv = netdev_priv(dev);
 
        unregister_c_can_dev(dev);
-
+       pm_runtime_disable(priv->device);
        free_c_can_dev(dev);
 
        return 0;
index 867f6be..f5d79e6 100644 (file)
@@ -355,6 +355,7 @@ static void can_dellink(struct net_device *dev, struct list_head *head)
 
 struct rtnl_link_ops can_link_ops __read_mostly = {
        .kind           = "can",
+       .netns_refund   = true,
        .maxtype        = IFLA_CAN_MAX,
        .policy         = can_policy,
        .setup          = can_setup,
index 134c057..57f3635 100644 (file)
@@ -697,9 +697,15 @@ static int flexcan_chip_disable(struct flexcan_priv *priv)
 static int flexcan_chip_freeze(struct flexcan_priv *priv)
 {
        struct flexcan_regs __iomem *regs = priv->regs;
-       unsigned int timeout = 1000 * 1000 * 10 / priv->can.bittiming.bitrate;
+       unsigned int timeout;
+       u32 bitrate = priv->can.bittiming.bitrate;
        u32 reg;
 
+       if (bitrate)
+               timeout = 1000 * 1000 * 10 / bitrate;
+       else
+               timeout = FLEXCAN_TIMEOUT_US / 10;
+
        reg = priv->read(&regs->mcr);
        reg |= FLEXCAN_MCR_FRZ | FLEXCAN_MCR_HALT;
        priv->write(reg, &regs->mcr);
index 37e0501..74d9899 100644 (file)
@@ -57,6 +57,7 @@ MODULE_DESCRIPTION("CAN driver for Kvaser CAN/PCIe devices");
 #define KVASER_PCIEFD_KCAN_STAT_REG 0x418
 #define KVASER_PCIEFD_KCAN_MODE_REG 0x41c
 #define KVASER_PCIEFD_KCAN_BTRN_REG 0x420
+#define KVASER_PCIEFD_KCAN_BUS_LOAD_REG 0x424
 #define KVASER_PCIEFD_KCAN_BTRD_REG 0x428
 #define KVASER_PCIEFD_KCAN_PWM_REG 0x430
 /* Loopback control register */
@@ -949,6 +950,9 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie)
                timer_setup(&can->bec_poll_timer, kvaser_pciefd_bec_poll_timer,
                            0);
 
+               /* Disable Bus load reporting */
+               iowrite32(0, can->reg_base + KVASER_PCIEFD_KCAN_BUS_LOAD_REG);
+
                tx_npackets = ioread32(can->reg_base +
                                       KVASER_PCIEFD_KCAN_TX_NPACKETS_REG);
                if (((tx_npackets >> KVASER_PCIEFD_KCAN_TX_NPACKETS_MAX_SHIFT) &
index 3752520..0c8d36b 100644 (file)
@@ -501,9 +501,6 @@ static int m_can_do_rx_poll(struct net_device *dev, int quota)
        }
 
        while ((rxfs & RXFS_FFL_MASK) && (quota > 0)) {
-               if (rxfs & RXFS_RFL)
-                       netdev_warn(dev, "Rx FIFO 0 Message Lost\n");
-
                m_can_read_fifo(dev, rxfs);
 
                quota--;
@@ -876,7 +873,7 @@ static int m_can_rx_peripheral(struct net_device *dev)
 {
        struct m_can_classdev *cdev = netdev_priv(dev);
 
-       m_can_rx_handler(dev, 1);
+       m_can_rx_handler(dev, M_CAN_NAPI_WEIGHT);
 
        m_can_enable_all_interrupts(cdev);
 
index 0df1cdf..1df3c4b 100644 (file)
@@ -21,7 +21,6 @@
 
 MODULE_AUTHOR("Stephane Grosjean <s.grosjean@peak-system.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCIe/M.2 FD family cards");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN PCIe/M.2 FD CAN cards");
 MODULE_LICENSE("GPL v2");
 
 #define PCIEFD_DRV_NAME                "peak_pciefd"
index 6f88c99..4ab9175 100644 (file)
@@ -21,7 +21,6 @@
 
 MODULE_AUTHOR("Sebastian Haas <haas@ems-wuenche.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for EMS CPC-PCI/PCIe/104P CAN cards");
-MODULE_SUPPORTED_DEVICE("EMS CPC-PCI/PCIe/104P CAN card");
 MODULE_LICENSE("GPL v2");
 
 #define EMS_PCI_V1_MAX_CHAN 2
index 770304e..e21b169 100644 (file)
@@ -21,7 +21,6 @@
 
 MODULE_AUTHOR("Markus Plessing <plessing@ems-wuensche.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for EMS CPC-CARD cards");
-MODULE_SUPPORTED_DEVICE("EMS CPC-CARD CAN card");
 MODULE_LICENSE("GPL v2");
 
 #define EMS_PCMCIA_MAX_CHAN 2
index 0ea6b71..95fe9ee 100644 (file)
@@ -33,7 +33,6 @@
 
 MODULE_AUTHOR("Per Dalen <per.dalen@cnw.se>");
 MODULE_DESCRIPTION("Socket-CAN driver for KVASER PCAN PCI cards");
-MODULE_SUPPORTED_DEVICE("KVASER PCAN PCI CAN card");
 MODULE_LICENSE("GPL v2");
 
 #define MAX_NO_OF_CHANNELS        4 /* max no of channels on a single card */
index 4713921..84eac8c 100644 (file)
@@ -24,8 +24,6 @@
 
 MODULE_AUTHOR("Stephane Grosjean <s.grosjean@peak-system.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCI family cards");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN PCI/PCIe/PCIeC miniPCI CAN cards");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN miniPCIe/cPCI PC/104+ PCI/104e CAN Cards");
 MODULE_LICENSE("GPL v2");
 
 #define DRV_NAME  "peak_pci"
index cf951a7..131a084 100644 (file)
@@ -22,7 +22,6 @@
 MODULE_AUTHOR("Stephane Grosjean <s.grosjean@peak-system.com>");
 MODULE_DESCRIPTION("CAN driver for PEAK-System PCAN-PC Cards");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN-PC Card");
 
 /* PEAK-System PCMCIA driver name */
 #define PCC_NAME               "peak_pcmcia"
index 8567958..5de1ebb 100644 (file)
 MODULE_AUTHOR("Pavel Cheblakov <P.B.Cheblakov@inp.nsk.su>");
 MODULE_DESCRIPTION("Socket-CAN driver for PLX90xx PCI-bridge cards with "
                   "the SJA1000 chips");
-MODULE_SUPPORTED_DEVICE("Adlink PCI-7841/cPCI-7841, "
-                       "Adlink PCI-7841/cPCI-7841 SE, "
-                       "Marathon CAN-bus-PCI, "
-                       "Marathon CAN-bus-PCIe, "
-                       "TEWS TECHNOLOGIES TPMC810, "
-                       "esd CAN-PCI/CPCI/PCI104/200, "
-                       "esd CAN-PCI/PMC/266, "
-                       "esd CAN-PCIe/2000, "
-                       "Connect Tech Inc. CANpro/104-Plus Opto (CRG001), "
-                       "IXXAT PC-I 04/PCI, "
-                       "ELCUS CAN-200-PCI, "
-                       "ASEM DUAL CAN-RAW")
 MODULE_LICENSE("GPL v2");
 
 #define PLX_PCI_MAX_CHAN 2
index c1e5d5b..538f4d9 100644 (file)
@@ -73,6 +73,7 @@ config CAN_KVASER_USB
            - Kvaser Memorator Pro 5xHS
            - Kvaser USBcan Light 4xHS
            - Kvaser USBcan Pro 2xHS v2
+           - Kvaser USBcan Pro 4xHS
            - Kvaser USBcan Pro 5xHS
            - Kvaser U100
            - Kvaser U100P
index 2b7efd2..4e97da8 100644 (file)
@@ -86,8 +86,9 @@
 #define USB_U100_PRODUCT_ID                    273
 #define USB_U100P_PRODUCT_ID                   274
 #define USB_U100S_PRODUCT_ID                   275
+#define USB_USBCAN_PRO_4HS_PRODUCT_ID          276
 #define USB_HYDRA_PRODUCT_ID_END \
-       USB_U100S_PRODUCT_ID
+       USB_USBCAN_PRO_4HS_PRODUCT_ID
 
 static inline bool kvaser_is_leaf(const struct usb_device_id *id)
 {
@@ -193,6 +194,7 @@ static const struct usb_device_id kvaser_usb_table[] = {
        { USB_DEVICE(KVASER_VENDOR_ID, USB_U100_PRODUCT_ID) },
        { USB_DEVICE(KVASER_VENDOR_ID, USB_U100P_PRODUCT_ID) },
        { USB_DEVICE(KVASER_VENDOR_ID, USB_U100S_PRODUCT_ID) },
+       { USB_DEVICE(KVASER_VENDOR_ID, USB_USBCAN_PRO_4HS_PRODUCT_ID) },
        { }
 };
 MODULE_DEVICE_TABLE(usb, kvaser_usb_table);
index e6c1e5d..e393e84 100644 (file)
@@ -18,8 +18,6 @@
 
 #include "pcan_usb_core.h"
 
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB adapter");
-
 /* PCAN-USB Endpoints */
 #define PCAN_USB_EP_CMDOUT             1
 #define PCAN_USB_EP_CMDIN              (PCAN_USB_EP_CMDOUT | USB_DIR_IN)
index f347ecc..bae0785 100644 (file)
@@ -16,9 +16,6 @@
 #include "pcan_usb_core.h"
 #include "pcan_usb_pro.h"
 
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB FD adapter");
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro FD adapter");
-
 #define PCAN_USBPROFD_CHANNEL_COUNT    2
 #define PCAN_USBFD_CHANNEL_COUNT       1
 
index 275087c..18fa180 100644 (file)
@@ -17,8 +17,6 @@
 #include "pcan_usb_core.h"
 #include "pcan_usb_pro.h"
 
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro adapter");
-
 #define PCAN_USBPRO_CHANNEL_COUNT      2
 
 /* PCAN-USB Pro adapter internal clock (MHz) */
index a162499..eb44372 100644 (file)
@@ -1105,13 +1105,6 @@ static int b53_setup(struct dsa_switch *ds)
                        b53_disable_port(ds, port);
        }
 
-       /* Let DSA handle the case were multiple bridges span the same switch
-        * device and different VLAN awareness settings are requested, which
-        * would be breaking filtering semantics for any of the other bridge
-        * devices. (not hardware supported)
-        */
-       ds->vlan_filtering_is_global = true;
-
        return b53_setup_devlink_resources(ds);
 }
 
@@ -2664,6 +2657,13 @@ struct b53_device *b53_switch_alloc(struct device *base,
        ds->ops = &b53_switch_ops;
        ds->untag_bridge_pvid = true;
        dev->vlan_enabled = true;
+       /* Let DSA handle the case where multiple bridges span the same switch
+        * device and different VLAN awareness settings are requested, which
+        * would be breaking filtering semantics for any of the other bridge
+        * devices. (not hardware supported)
+        */
+       ds->vlan_filtering_is_global = true;
+
        mutex_init(&dev->reg_mutex);
        mutex_init(&dev->stats_mutex);
 
index f277df9..ba5d546 100644 (file)
@@ -114,7 +114,10 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
                /* Force link status for IMP port */
                reg = core_readl(priv, offset);
                reg |= (MII_SW_OR | LINK_STS);
-               reg &= ~GMII_SPEED_UP_2G;
+               if (priv->type == BCM4908_DEVICE_ID)
+                       reg |= GMII_SPEED_UP_2G;
+               else
+                       reg &= ~GMII_SPEED_UP_2G;
                core_writel(priv, reg, offset);
 
                /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
@@ -585,8 +588,10 @@ static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port)
         * in bits 15:8 and the patch level in bits 7:0 which is exactly what
         * the REG_PHY_REVISION register layout is.
         */
-
-       return priv->hw_params.gphy_rev;
+       if (priv->int_phy_mask & BIT(port))
+               return priv->hw_params.gphy_rev;
+       else
+               return 0;
 }
 
 static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
index f06f5fa..9871d7c 100644 (file)
@@ -436,34 +436,32 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface)
                             TD_DM_DRVP(8) | TD_DM_DRVN(8));
 
        /* Setup core clock for MT7530 */
-       if (!trgint) {
-               /* Disable MT7530 core clock */
-               core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-
-               /* Disable PLL, since phy_device has not yet been created
-                * provided for phy_[read,write]_mmd_indirect is called, we
-                * provide our own core_write_mmd_indirect to complete this
-                * function.
-                */
-               core_write_mmd_indirect(priv,
-                                       CORE_GSWPLL_GRP1,
-                                       MDIO_MMD_VEND2,
-                                       0);
-
-               /* Set core clock into 500Mhz */
-               core_write(priv, CORE_GSWPLL_GRP2,
-                          RG_GSWPLL_POSDIV_500M(1) |
-                          RG_GSWPLL_FBKDIV_500M(25));
+       /* Disable MT7530 core clock */
+       core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
 
-               /* Enable PLL */
-               core_write(priv, CORE_GSWPLL_GRP1,
-                          RG_GSWPLL_EN_PRE |
-                          RG_GSWPLL_POSDIV_200M(2) |
-                          RG_GSWPLL_FBKDIV_200M(32));
-
-               /* Enable MT7530 core clock */
-               core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-       }
+       /* Disable PLL, since phy_device has not yet been created
+        * provided for phy_[read,write]_mmd_indirect is called, we
+        * provide our own core_write_mmd_indirect to complete this
+        * function.
+        */
+       core_write_mmd_indirect(priv,
+                               CORE_GSWPLL_GRP1,
+                               MDIO_MMD_VEND2,
+                               0);
+
+       /* Set core clock into 500Mhz */
+       core_write(priv, CORE_GSWPLL_GRP2,
+                  RG_GSWPLL_POSDIV_500M(1) |
+                  RG_GSWPLL_FBKDIV_500M(25));
+
+       /* Enable PLL */
+       core_write(priv, CORE_GSWPLL_GRP1,
+                  RG_GSWPLL_EN_PRE |
+                  RG_GSWPLL_POSDIV_200M(2) |
+                  RG_GSWPLL_FBKDIV_200M(32));
+
+       /* Enable MT7530 core clock */
+       core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
 
        /* Setup the MT7530 TRGMII Tx Clock */
        core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
index f8a168b..cb88ffb 100644 (file)
@@ -54,7 +54,7 @@ config B44_PCI
 config BCM4908_ENET
        tristate "Broadcom BCM4908 internal mac support"
        depends on ARCH_BCM4908 || COMPILE_TEST
-       default y
+       default y if ARCH_BCM4908
        help
          This driver supports Ethernet controller integrated into Broadcom
          BCM4908 family SoCs.
index 169e10c..1115b8f 100644 (file)
@@ -722,7 +722,7 @@ static int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input)
                kvfree(tx_info);
                return 0;
        }
-       tx_info->open_state = false;
+       tx_info->open_state = CH_KTLS_OPEN_SUCCESS;
        spin_unlock(&tx_info->lock);
 
        complete(&tx_info->completion);
index 88bfe21..04421ae 100644 (file)
@@ -1337,6 +1337,7 @@ static int ftgmac100_poll(struct napi_struct *napi, int budget)
         */
        if (unlikely(priv->need_mac_restart)) {
                ftgmac100_start_hw(priv);
+               priv->need_mac_restart = false;
 
                /* Re-enable "bad" interrupts */
                iowrite32(FTGMAC100_INT_BAD,
index 88faf05..0b1e890 100644 (file)
@@ -899,6 +899,8 @@ static s32 e1000_set_d0_lplu_state_82571(struct e1000_hw *hw, bool active)
        } else {
                data &= ~IGP02E1000_PM_D0_LPLU;
                ret_val = e1e_wphy(hw, IGP02E1000_PHY_POWER_MGMT, data);
+               if (ret_val)
+                       return ret_val;
                /* LPLU and SmartSpeed are mutually exclusive.  LPLU is used
                 * during Dx states where the power conservation is most
                 * important.  During driver activity we should enable
index 69a2329..db79c4e 100644 (file)
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright(c) 1999 - 2018 Intel Corporation. */
 
-#ifndef _E1000_HW_H_
-#define _E1000_HW_H_
+#ifndef _E1000E_HW_H_
+#define _E1000E_HW_H_
 
 #include "regs.h"
 #include "defines.h"
@@ -714,4 +714,4 @@ struct e1000_hw {
 #include "80003es2lan.h"
 #include "ich8lan.h"
 
-#endif
+#endif /* _E1000E_HW_H_ */
index e9b82c2..a094800 100644 (file)
@@ -5974,15 +5974,19 @@ static void e1000_reset_task(struct work_struct *work)
        struct e1000_adapter *adapter;
        adapter = container_of(work, struct e1000_adapter, reset_task);
 
+       rtnl_lock();
        /* don't run the task if already down */
-       if (test_bit(__E1000_DOWN, &adapter->state))
+       if (test_bit(__E1000_DOWN, &adapter->state)) {
+               rtnl_unlock();
                return;
+       }
 
        if (!(adapter->flags & FLAG_RESTART_NOW)) {
                e1000e_dump(adapter);
                e_err("Reset adapter unexpectedly\n");
        }
        e1000e_reinit_locked(adapter);
+       rtnl_unlock();
 }
 
 /**
index 353deae..17f3b80 100644 (file)
@@ -3259,6 +3259,17 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
 }
 
 /**
+ * i40e_rx_offset - Return expected offset into page to access data
+ * @rx_ring: Ring we are requesting offset of
+ *
+ * Returns the offset value for ring into the data buffer.
+ */
+static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
+{
+       return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
+}
+
+/**
  * i40e_configure_rx_ring - Configure a receive ring context
  * @ring: The Rx ring to configure
  *
@@ -3369,6 +3380,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        else
                set_ring_build_skb_enabled(ring);
 
+       ring->rx_offset = i40e_rx_offset(ring);
+
        /* cache tail for quicker writes, and clear the reg before use */
        ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
        writel(0, ring->tail);
index 627794b..5747a99 100644 (file)
@@ -1570,17 +1570,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
 }
 
 /**
- * i40e_rx_offset - Return expected offset into page to access data
- * @rx_ring: Ring we are requesting offset of
- *
- * Returns the offset value for ring into the data buffer.
- */
-static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
-{
-       return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
-}
-
-/**
  * i40e_setup_rx_descriptors - Allocate Rx descriptors
  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
  *
@@ -1608,7 +1597,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
        rx_ring->next_to_alloc = 0;
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
-       rx_ring->rx_offset = i40e_rx_offset(rx_ring);
 
        /* XDP RX-queue info only needed for RX rings exposed to XDP */
        if (rx_ring->vsi->type == I40E_VSI_MAIN) {
index 3124a3b..1148d76 100644 (file)
@@ -275,6 +275,22 @@ ice_setup_tx_ctx(struct ice_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q)
 }
 
 /**
+ * ice_rx_offset - Return expected offset into page to access data
+ * @rx_ring: Ring we are requesting offset of
+ *
+ * Returns the offset value for ring into the data buffer.
+ */
+static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
+{
+       if (ice_ring_uses_build_skb(rx_ring))
+               return ICE_SKB_PAD;
+       else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
+               return XDP_PACKET_HEADROOM;
+
+       return 0;
+}
+
+/**
  * ice_setup_rx_ctx - Configure a receive ring context
  * @ring: The Rx ring to configure
  *
@@ -413,11 +429,15 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
        else
                ice_set_ring_build_skb_ena(ring);
 
+       ring->rx_offset = ice_rx_offset(ring);
+
        /* init queue specific tail register */
        ring->tail = hw->hw_addr + QRX_TAIL(pf_q);
        writel(0, ring->tail);
 
        if (ring->xsk_pool) {
+               bool ok;
+
                if (!xsk_buff_can_alloc(ring->xsk_pool, num_bufs)) {
                        dev_warn(dev, "XSK buffer pool does not provide enough addresses to fill %d buffers on Rx ring %d\n",
                                 num_bufs, ring->q_index);
@@ -426,8 +446,8 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
                        return 0;
                }
 
-               err = ice_alloc_rx_bufs_zc(ring, num_bufs);
-               if (err)
+               ok = ice_alloc_rx_bufs_zc(ring, num_bufs);
+               if (!ok)
                        dev_info(dev, "Failed to allocate some buffers on XSK buffer pool enabled Rx ring %d (pf_q %d)\n",
                                 ring->q_index, pf_q);
                return 0;
index b7dc25d..b91dcfd 100644 (file)
@@ -444,22 +444,6 @@ void ice_free_rx_ring(struct ice_ring *rx_ring)
 }
 
 /**
- * ice_rx_offset - Return expected offset into page to access data
- * @rx_ring: Ring we are requesting offset of
- *
- * Returns the offset value for ring into the data buffer.
- */
-static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
-{
-       if (ice_ring_uses_build_skb(rx_ring))
-               return ICE_SKB_PAD;
-       else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
-               return XDP_PACKET_HEADROOM;
-
-       return 0;
-}
-
-/**
  * ice_setup_rx_ring - Allocate the Rx descriptors
  * @rx_ring: the Rx ring to set up
  *
@@ -493,7 +477,6 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring)
 
        rx_ring->next_to_use = 0;
        rx_ring->next_to_clean = 0;
-       rx_ring->rx_offset = ice_rx_offset(rx_ring);
 
        if (ice_is_xdp_ena_vsi(rx_ring->vsi))
                WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
index 83f3c95..9f94d91 100644 (file)
@@ -358,18 +358,18 @@ xsk_pool_if_up:
  * This function allocates a number of Rx buffers from the fill ring
  * or the internal recycle mechanism and places them on the Rx ring.
  *
- * Returns false if all allocations were successful, true if any fail.
+ * Returns true if all allocations were successful, false if any fail.
  */
 bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
 {
        union ice_32b_rx_flex_desc *rx_desc;
        u16 ntu = rx_ring->next_to_use;
        struct ice_rx_buf *rx_buf;
-       bool ret = false;
+       bool ok = true;
        dma_addr_t dma;
 
        if (!count)
-               return false;
+               return true;
 
        rx_desc = ICE_RX_DESC(rx_ring, ntu);
        rx_buf = &rx_ring->rx_buf[ntu];
@@ -377,7 +377,7 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
        do {
                rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool);
                if (!rx_buf->xdp) {
-                       ret = true;
+                       ok = false;
                        break;
                }
 
@@ -402,7 +402,7 @@ bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
                ice_release_rx_desc(rx_ring, ntu);
        }
 
-       return ret;
+       return ok;
 }
 
 /**
index 5d87957..44111f6 100644 (file)
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright(c) 2007 - 2018 Intel Corporation. */
 
-#ifndef _E1000_HW_H_
-#define _E1000_HW_H_
+#ifndef _E1000_IGB_HW_H_
+#define _E1000_IGB_HW_H_
 
 #include <linux/types.h>
 #include <linux/delay.h>
@@ -551,4 +551,4 @@ s32 igb_write_pcie_cap_reg(struct e1000_hw *hw, u32 reg, u16 *value);
 
 void igb_read_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value);
 void igb_write_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value);
-#endif /* _E1000_HW_H_ */
+#endif /* _E1000_IGB_HW_H_ */
index aaa954a..7bda8c5 100644 (file)
@@ -748,8 +748,8 @@ void igb_ptp_suspend(struct igb_adapter *adapter);
 void igb_ptp_rx_hang(struct igb_adapter *adapter);
 void igb_ptp_tx_hang(struct igb_adapter *adapter);
 void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb);
-void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
-                        struct sk_buff *skb);
+int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+                       struct sk_buff *skb);
 int igb_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr);
 int igb_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr);
 void igb_set_flag_queue_pairs(struct igb_adapter *, const u32);
index 878b31d..a45cd2b 100644 (file)
@@ -8214,7 +8214,8 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
        new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
 }
 
-static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+                                 int rx_buf_pgcnt)
 {
        unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
        struct page *page = rx_buffer->page;
@@ -8225,7 +8226,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
 
 #if (PAGE_SIZE < 8192)
        /* if we are only owner of page we can reuse it */
-       if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+       if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
                return false;
 #else
 #define IGB_LAST_OFFSET \
@@ -8301,9 +8302,10 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring,
                return NULL;
 
        if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
-               igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
-               xdp->data += IGB_TS_HDR_LEN;
-               size -= IGB_TS_HDR_LEN;
+               if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb)) {
+                       xdp->data += IGB_TS_HDR_LEN;
+                       size -= IGB_TS_HDR_LEN;
+               }
        }
 
        /* Determine available headroom for copy */
@@ -8364,8 +8366,8 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring,
 
        /* pull timestamp out of packet data */
        if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
-               igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb);
-               __skb_pull(skb, IGB_TS_HDR_LEN);
+               if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb))
+                       __skb_pull(skb, IGB_TS_HDR_LEN);
        }
 
        /* update buffer offset */
@@ -8614,11 +8616,17 @@ static unsigned int igb_rx_offset(struct igb_ring *rx_ring)
 }
 
 static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
-                                              const unsigned int size)
+                                              const unsigned int size, int *rx_buf_pgcnt)
 {
        struct igb_rx_buffer *rx_buffer;
 
        rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+       *rx_buf_pgcnt =
+#if (PAGE_SIZE < 8192)
+               page_count(rx_buffer->page);
+#else
+               0;
+#endif
        prefetchw(rx_buffer->page);
 
        /* we are reusing so sync this buffer for CPU use */
@@ -8634,9 +8642,9 @@ static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
 }
 
 static void igb_put_rx_buffer(struct igb_ring *rx_ring,
-                             struct igb_rx_buffer *rx_buffer)
+                             struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
 {
-       if (igb_can_reuse_rx_page(rx_buffer)) {
+       if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
                /* hand second half of page back to the ring */
                igb_reuse_rx_page(rx_ring, rx_buffer);
        } else {
@@ -8664,6 +8672,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
        unsigned int xdp_xmit = 0;
        struct xdp_buff xdp;
        u32 frame_sz = 0;
+       int rx_buf_pgcnt;
 
        /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
 #if (PAGE_SIZE < 8192)
@@ -8693,7 +8702,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
                 */
                dma_rmb();
 
-               rx_buffer = igb_get_rx_buffer(rx_ring, size);
+               rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
 
                /* retrieve a buffer from the ring */
                if (!skb) {
@@ -8736,7 +8745,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
                        break;
                }
 
-               igb_put_rx_buffer(rx_ring, rx_buffer);
+               igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
                cleaned_count++;
 
                /* fetch next buffer in frame if non-eop */
index 7cc5428..86a5762 100644 (file)
@@ -856,6 +856,9 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter)
        dev_kfree_skb_any(skb);
 }
 
+#define IGB_RET_PTP_DISABLED 1
+#define IGB_RET_PTP_INVALID 2
+
 /**
  * igb_ptp_rx_pktstamp - retrieve Rx per packet timestamp
  * @q_vector: Pointer to interrupt specific structure
@@ -864,19 +867,29 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter)
  *
  * This function is meant to retrieve a timestamp from the first buffer of an
  * incoming frame.  The value is stored in little endian format starting on
- * byte 8.
+ * byte 8
+ *
+ * Returns: 0 on success, nonzero (IGB_RET_PTP_*) on failure
  **/
-void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
-                        struct sk_buff *skb)
+int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+                       struct sk_buff *skb)
 {
-       __le64 *regval = (__le64 *)va;
        struct igb_adapter *adapter = q_vector->adapter;
+       __le64 *regval = (__le64 *)va;
        int adjust = 0;
 
+       if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+               return IGB_RET_PTP_DISABLED;
+
        /* The timestamp is recorded in little endian format.
         * DWORD: 0        1        2        3
         * Field: Reserved Reserved SYSTIML  SYSTIMH
         */
+
+       /* check reserved dwords are zero, be/le doesn't matter for zero */
+       if (regval[0])
+               return IGB_RET_PTP_INVALID;
+
        igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb),
                                   le64_to_cpu(regval[1]));
 
@@ -896,6 +909,8 @@ void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
        }
        skb_hwtstamps(skb)->hwtstamp =
                ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust);
+
+       return 0;
 }
 
 /**
@@ -906,13 +921,15 @@ void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
  * This function is meant to retrieve a timestamp from the internal registers
  * of the adapter and store it in the skb.
  **/
-void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
-                        struct sk_buff *skb)
+void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb)
 {
        struct igb_adapter *adapter = q_vector->adapter;
        struct e1000_hw *hw = &adapter->hw;
-       u64 regval;
        int adjust = 0;
+       u64 regval;
+
+       if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+               return;
 
        /* If this bit is set, then the RX registers contain the time stamp. No
         * other packet will be time stamped until we read these registers, so
index 5d2809d..1b08a7d 100644 (file)
@@ -547,7 +547,7 @@ void igc_ptp_init(struct igc_adapter *adapter);
 void igc_ptp_reset(struct igc_adapter *adapter);
 void igc_ptp_suspend(struct igc_adapter *adapter);
 void igc_ptp_stop(struct igc_adapter *adapter);
-void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, void *va,
+void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, __le32 *va,
                         struct sk_buff *skb);
 int igc_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr);
 int igc_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr);
index 824a6c4..8722294 100644 (file)
@@ -1711,6 +1711,9 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
                                                     Autoneg);
        }
 
+       /* Set pause flow control settings */
+       ethtool_link_ksettings_add_link_mode(cmd, supported, Pause);
+
        switch (hw->fc.requested_mode) {
        case igc_fc_full:
                ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause);
@@ -1725,9 +1728,7 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
                                                     Asym_Pause);
                break;
        default:
-               ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause);
-               ethtool_link_ksettings_add_link_mode(cmd, advertising,
-                                                    Asym_Pause);
+               break;
        }
 
        status = pm_runtime_suspended(&adapter->pdev->dev) ?
index 7ac9597..4d989eb 100644 (file)
@@ -3831,10 +3831,19 @@ static void igc_reset_task(struct work_struct *work)
 
        adapter = container_of(work, struct igc_adapter, reset_task);
 
+       rtnl_lock();
+       /* If we're already down or resetting, just bail */
+       if (test_bit(__IGC_DOWN, &adapter->state) ||
+           test_bit(__IGC_RESETTING, &adapter->state)) {
+               rtnl_unlock();
+               return;
+       }
+
        igc_rings_dump(adapter);
        igc_regs_dump(adapter);
        netdev_err(adapter->netdev, "Reset adapter\n");
        igc_reinit_locked(adapter);
+       rtnl_unlock();
 }
 
 /**
index ac0b9c8..545f4d0 100644 (file)
@@ -152,46 +152,54 @@ static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter,
 }
 
 /**
- * igc_ptp_rx_pktstamp - retrieve Rx per packet timestamp
+ * igc_ptp_rx_pktstamp - Retrieve timestamp from Rx packet buffer
  * @q_vector: Pointer to interrupt specific structure
  * @va: Pointer to address containing Rx buffer
  * @skb: Buffer containing timestamp and packet
  *
- * This function is meant to retrieve the first timestamp from the
- * first buffer of an incoming frame. The value is stored in little
- * endian format starting on byte 0. There's a second timestamp
- * starting on byte 8.
- **/
-void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, void *va,
+ * This function retrieves the timestamp saved in the beginning of packet
+ * buffer. While two timestamps are available, one in timer0 reference and the
+ * other in timer1 reference, this function considers only the timestamp in
+ * timer0 reference.
+ */
+void igc_ptp_rx_pktstamp(struct igc_q_vector *q_vector, __le32 *va,
                         struct sk_buff *skb)
 {
        struct igc_adapter *adapter = q_vector->adapter;
-       __le64 *regval = (__le64 *)va;
-       int adjust = 0;
-
-       /* The timestamp is recorded in little endian format.
-        * DWORD: | 0          | 1           | 2          | 3
-        * Field: | Timer0 Low | Timer0 High | Timer1 Low | Timer1 High
+       u64 regval;
+       int adjust;
+
+       /* Timestamps are saved in little endian at the beginning of the packet
+        * buffer following the layout:
+        *
+        * DWORD: | 0              | 1              | 2              | 3              |
+        * Field: | Timer1 SYSTIML | Timer1 SYSTIMH | Timer0 SYSTIML | Timer0 SYSTIMH |
+        *
+        * SYSTIML holds the nanoseconds part while SYSTIMH holds the seconds
+        * part of the timestamp.
         */
-       igc_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb),
-                                  le64_to_cpu(regval[0]));
-
-       /* adjust timestamp for the RX latency based on link speed */
-       if (adapter->hw.mac.type == igc_i225) {
-               switch (adapter->link_speed) {
-               case SPEED_10:
-                       adjust = IGC_I225_RX_LATENCY_10;
-                       break;
-               case SPEED_100:
-                       adjust = IGC_I225_RX_LATENCY_100;
-                       break;
-               case SPEED_1000:
-                       adjust = IGC_I225_RX_LATENCY_1000;
-                       break;
-               case SPEED_2500:
-                       adjust = IGC_I225_RX_LATENCY_2500;
-                       break;
-               }
+       regval = le32_to_cpu(va[2]);
+       regval |= (u64)le32_to_cpu(va[3]) << 32;
+       igc_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), regval);
+
+       /* Adjust timestamp for the RX latency based on link speed */
+       switch (adapter->link_speed) {
+       case SPEED_10:
+               adjust = IGC_I225_RX_LATENCY_10;
+               break;
+       case SPEED_100:
+               adjust = IGC_I225_RX_LATENCY_100;
+               break;
+       case SPEED_1000:
+               adjust = IGC_I225_RX_LATENCY_1000;
+               break;
+       case SPEED_2500:
+               adjust = IGC_I225_RX_LATENCY_2500;
+               break;
+       default:
+               adjust = 0;
+               netdev_warn_once(adapter->netdev, "Imprecise timestamp\n");
+               break;
        }
        skb_hwtstamps(skb)->hwtstamp =
                ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust);
index 9f3f12e..03d9aad 100644 (file)
@@ -4118,6 +4118,8 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 #endif
        }
 
+       ring->rx_offset = ixgbe_rx_offset(ring);
+
        if (ring->xsk_pool && hw->mac.type != ixgbe_mac_82599EB) {
                u32 xsk_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
 
@@ -6578,7 +6580,6 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
-       rx_ring->rx_offset = ixgbe_rx_offset(rx_ring);
 
        /* XDP RX-queue info */
        if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
index 7fe15a3..fe0989c 100644 (file)
@@ -6,7 +6,7 @@
 config NET_VENDOR_MARVELL
        bool "Marvell devices"
        default y
-       depends on PCI || CPU_PXA168 || MV64X60 || PPC32 || PLAT_ORION || INET || COMPILE_TEST
+       depends on PCI || CPU_PXA168 || PPC32 || PLAT_ORION || INET || COMPILE_TEST
        help
          If you have a network (Ethernet) card belonging to this class, say Y.
 
@@ -19,7 +19,7 @@ if NET_VENDOR_MARVELL
 
 config MV643XX_ETH
        tristate "Marvell Discovery (643XX) and Orion ethernet support"
-       depends on MV64X60 || PPC32 || PLAT_ORION || COMPILE_TEST
+       depends on PPC32 || PLAT_ORION || COMPILE_TEST
        depends on INET
        select PHYLIB
        select MVMDIO
index 90e6111..3bfb659 100644 (file)
@@ -2684,7 +2684,7 @@ static const struct of_device_id mv643xx_eth_shared_ids[] = {
 MODULE_DEVICE_TABLE(of, mv643xx_eth_shared_ids);
 #endif
 
-#if defined(CONFIG_OF_IRQ) && !defined(CONFIG_MV64X60)
+#ifdef CONFIG_OF_IRQ
 #define mv643xx_eth_property(_np, _name, _v)                           \
        do {                                                            \
                u32 tmp;                                                \
index b192692..5c372d2 100644 (file)
@@ -13499,8 +13499,6 @@ static struct npc_mcam_kex npc_mkex_default = {
                        [NPC_LT_LC_IP] = {
                                /* SIP+DIP: 8 bytes, KW2[63:0] */
                                KEX_LD_CFG(0x07, 0xc, 0x1, 0x0, 0x10),
-                               /* TOS: 1 byte, KW1[63:56] */
-                               KEX_LD_CFG(0x0, 0x1, 0x1, 0x0, 0xf),
                        },
                        /* Layer C: IPv6 */
                        [NPC_LT_LC_IP6] = {
index d9a1a71..ab24a5e 100644 (file)
@@ -2462,8 +2462,10 @@ static void rvu_unregister_interrupts(struct rvu *rvu)
                    INTR_MASK(rvu->hw->total_pfs) & ~1ULL);
 
        for (irq = 0; irq < rvu->num_vec; irq++) {
-               if (rvu->irq_allocated[irq])
+               if (rvu->irq_allocated[irq]) {
                        free_irq(pci_irq_vector(rvu->pdev, irq), rvu);
+                       rvu->irq_allocated[irq] = false;
+               }
        }
 
        pci_free_irq_vectors(rvu->pdev);
@@ -2975,8 +2977,8 @@ static void rvu_remove(struct pci_dev *pdev)
        struct rvu *rvu = pci_get_drvdata(pdev);
 
        rvu_dbg_exit(rvu);
-       rvu_unregister_interrupts(rvu);
        rvu_unregister_dl(rvu);
+       rvu_unregister_interrupts(rvu);
        rvu_flr_wq_destroy(rvu);
        rvu_cgx_exit(rvu);
        rvu_fwdata_exit(rvu);
index fa6e46e..76f3992 100644 (file)
@@ -678,6 +678,7 @@ void npc_read_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
                         u8 *intf, u8 *ena);
 bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature);
 u32  rvu_cgx_get_fifolen(struct rvu *rvu);
+void *rvu_first_cgx_pdata(struct rvu *rvu);
 
 /* CPT APIs */
 int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot);
index e668e48..6e2bf4f 100644 (file)
@@ -89,6 +89,21 @@ void *rvu_cgx_pdata(u8 cgx_id, struct rvu *rvu)
        return rvu->cgx_idmap[cgx_id];
 }
 
+/* Return the first enabled CGX instance, or NULL if none are enabled */
+void *rvu_first_cgx_pdata(struct rvu *rvu)
+{
+       int first_enabled_cgx = 0;
+       void *cgxd = NULL;
+
+       for (; first_enabled_cgx < rvu->cgx_cnt_max; first_enabled_cgx++) {
+               cgxd = rvu_cgx_pdata(first_enabled_cgx, rvu);
+               if (cgxd)
+                       break;
+       }
+
+       return cgxd;
+}
+
 /* Based on P2X connectivity find mapped NIX block for a PF */
 static void rvu_map_cgx_nix_block(struct rvu *rvu, int pf,
                                  int cgx_id, int lmac_id)
@@ -711,10 +726,9 @@ int rvu_mbox_handler_cgx_features_get(struct rvu *rvu,
 u32 rvu_cgx_get_fifolen(struct rvu *rvu)
 {
        struct mac_ops *mac_ops;
-       int rvu_def_cgx_id = 0;
        u32 fifo_len;
 
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
        fifo_len = mac_ops ? mac_ops->fifo_len : 0;
 
        return fifo_len;
index aa2ca87..de3968d 100644 (file)
@@ -234,12 +234,14 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                          char __user *buffer,
                                          size_t count, loff_t *ppos)
 {
-       int index, off = 0, flag = 0, go_back = 0, off_prev;
+       int index, off = 0, flag = 0, go_back = 0, len = 0;
        struct rvu *rvu = filp->private_data;
        int lf, pf, vf, pcifunc;
        struct rvu_block block;
        int bytes_not_copied;
+       int lf_str_size = 12;
        int buf_size = 2048;
+       char *lfs;
        char *buf;
 
        /* don't allow partial reads */
@@ -249,12 +251,20 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
        buf = kzalloc(buf_size, GFP_KERNEL);
        if (!buf)
                return -ENOSPC;
-       off +=  scnprintf(&buf[off], buf_size - 1 - off, "\npcifunc\t\t");
+
+       lfs = kzalloc(lf_str_size, GFP_KERNEL);
+       if (!lfs) {
+               kfree(buf);
+               return -ENOMEM;
+       }
+       off +=  scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size,
+                         "pcifunc");
        for (index = 0; index < BLK_COUNT; index++)
-               if (strlen(rvu->hw->block[index].name))
-                       off +=  scnprintf(&buf[off], buf_size - 1 - off,
-                                         "%*s\t", (index - 1) * 2,
-                                         rvu->hw->block[index].name);
+               if (strlen(rvu->hw->block[index].name)) {
+                       off += scnprintf(&buf[off], buf_size - 1 - off,
+                                        "%-*s", lf_str_size,
+                                        rvu->hw->block[index].name);
+               }
        off += scnprintf(&buf[off], buf_size - 1 - off, "\n");
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
                for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
@@ -263,14 +273,15 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                continue;
 
                        if (vf) {
+                               sprintf(lfs, "PF%d:VF%d", pf, vf - 1);
                                go_back = scnprintf(&buf[off],
                                                    buf_size - 1 - off,
-                                                   "PF%d:VF%d\t\t", pf,
-                                                   vf - 1);
+                                                   "%-*s", lf_str_size, lfs);
                        } else {
+                               sprintf(lfs, "PF%d", pf);
                                go_back = scnprintf(&buf[off],
                                                    buf_size - 1 - off,
-                                                   "PF%d\t\t", pf);
+                                                   "%-*s", lf_str_size, lfs);
                        }
 
                        off += go_back;
@@ -278,20 +289,22 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
                                block = rvu->hw->block[index];
                                if (!strlen(block.name))
                                        continue;
-                               off_prev = off;
+                               len = 0;
+                               lfs[len] = '\0';
                                for (lf = 0; lf < block.lf.max; lf++) {
                                        if (block.fn_map[lf] != pcifunc)
                                                continue;
                                        flag = 1;
-                                       off += scnprintf(&buf[off], buf_size - 1
-                                                       - off, "%3d,", lf);
+                                       len += sprintf(&lfs[len], "%d,", lf);
                                }
-                               if (flag && off_prev != off)
-                                       off--;
-                               else
-                                       go_back++;
+
+                               if (flag)
+                                       len--;
+                               lfs[len] = '\0';
                                off += scnprintf(&buf[off], buf_size - 1 - off,
-                                               "\t");
+                                                "%-*s", lf_str_size, lfs);
+                               if (!strlen(lfs))
+                                       go_back += lf_str_size;
                        }
                        if (!flag)
                                off -= go_back;
@@ -303,6 +316,7 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
        }
 
        bytes_not_copied = copy_to_user(buffer, buf, off);
+       kfree(lfs);
        kfree(buf);
 
        if (bytes_not_copied)
@@ -319,7 +333,6 @@ static int rvu_dbg_rvu_pf_cgx_map_display(struct seq_file *filp, void *unused)
        struct rvu *rvu = filp->private;
        struct pci_dev *pdev = NULL;
        struct mac_ops *mac_ops;
-       int rvu_def_cgx_id = 0;
        char cgx[10], lmac[10];
        struct rvu_pfvf *pfvf;
        int pf, domain, blkid;
@@ -327,7 +340,10 @@ static int rvu_dbg_rvu_pf_cgx_map_display(struct seq_file *filp, void *unused)
        u16 pcifunc;
 
        domain = 2;
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
+       /* There can be no CGX devices at all */
+       if (!mac_ops)
+               return 0;
        seq_printf(filp, "PCI dev\t\tRVU PF Func\tNIX block\t%s\tLMAC\n",
                   mac_ops->name);
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
@@ -1818,7 +1834,6 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
 {
        struct mac_ops *mac_ops;
        unsigned long lmac_bmap;
-       int rvu_def_cgx_id = 0;
        int i, lmac_id;
        char dname[20];
        void *cgx;
@@ -1826,7 +1841,7 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
        if (!cgx_get_cgxcnt_max())
                return;
 
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
        if (!mac_ops)
                return;
 
index d300019..3d068b7 100644 (file)
@@ -2629,7 +2629,7 @@ static int set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg)
        struct nix_rx_flowkey_alg *field;
        struct nix_rx_flowkey_alg tmp;
        u32 key_type, valid_key;
-       int l4_key_offset;
+       int l4_key_offset = 0;
 
        if (!alg)
                return -EINVAL;
index 04bb080..0bd49c7 100644 (file)
@@ -2490,10 +2490,10 @@ int rvu_mbox_handler_npc_mcam_free_counter(struct rvu *rvu,
                index = find_next_bit(mcam->bmap, mcam->bmap_entries, entry);
                if (index >= mcam->bmap_entries)
                        break;
+               entry = index + 1;
                if (mcam->entry2cntr_map[index] != req->cntr)
                        continue;
 
-               entry = index + 1;
                npc_unmap_mcam_entry_and_cntr(rvu, mcam, blkaddr,
                                              index, req->cntr);
        }
index 0dbbf38..dc17784 100644 (file)
@@ -257,17 +257,19 @@ int otx2_get_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
 int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
                       u32 *rule_locs)
 {
+       u32 rule_cnt = nfc->rule_cnt;
        u32 location = 0;
        int idx = 0;
        int err = 0;
 
        nfc->data = pfvf->flow_cfg->ntuple_max_flows;
-       while ((!err || err == -ENOENT) && idx < nfc->rule_cnt) {
+       while ((!err || err == -ENOENT) && idx < rule_cnt) {
                err = otx2_get_flow(pfvf, nfc, location);
                if (!err)
                        rule_locs[idx++] = location;
                location++;
        }
+       nfc->rule_cnt = rule_cnt;
 
        return err;
 }
index 53ab181..2fd3d23 100644 (file)
@@ -1672,6 +1672,7 @@ int otx2_stop(struct net_device *netdev)
        struct otx2_nic *pf = netdev_priv(netdev);
        struct otx2_cq_poll *cq_poll = NULL;
        struct otx2_qset *qset = &pf->qset;
+       struct otx2_rss_info *rss;
        int qidx, vec, wrk;
 
        netif_carrier_off(netdev);
@@ -1684,6 +1685,10 @@ int otx2_stop(struct net_device *netdev)
        /* First stop packet Rx/Tx */
        otx2_rxtx_enable(pf, false);
 
+       /* Clear RSS enable flag */
+       rss = &pf->hw.rss_info;
+       rss->enable = false;
+
        /* Cleanup Queue IRQ */
        vec = pci_irq_vector(pf->pdev,
                             pf->hw.nix_msixoff + NIX_LF_QINT_VEC_START);
index d1e4d42..3712e17 100644 (file)
@@ -1544,8 +1544,8 @@ static int pxa168_eth_remove(struct platform_device *pdev)
        clk_disable_unprepare(pep->clk);
        mdiobus_unregister(pep->smi_bus);
        mdiobus_free(pep->smi_bus);
-       unregister_netdev(dev);
        cancel_work_sync(&pep->tx_timeout_task);
+       unregister_netdev(dev);
        free_netdev(dev);
        return 0;
 }
index 7435fe6..304b296 100644 (file)
@@ -92,14 +92,15 @@ struct page_pool;
                                    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
 #define MLX5_MPWRQ_PAGES_PER_WQE               BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
 
-#define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
+#define MLX5_ALIGN_MTTS(mtts)          (ALIGN(mtts, 8))
+#define MLX5_ALIGNED_MTTS_OCTW(mtts)   ((mtts) / 2)
+#define MLX5_MTT_OCTW(mtts)            (MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts)))
 /* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between
  * WQEs, This page will absorb write overflow by the hardware, when
  * receiving packets larger than MTU. These oversize packets are
  * dropped by the driver at a later stage.
  */
-#define MLX5E_REQUIRED_WQE_MTTS                (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8))
-#define MLX5E_LOG_ALIGNED_MPWQE_PPW    (ilog2(MLX5E_REQUIRED_WQE_MTTS))
+#define MLX5E_REQUIRED_WQE_MTTS                (MLX5_ALIGN_MTTS(MLX5_MPWRQ_PAGES_PER_WQE + 1))
 #define MLX5E_REQUIRED_MTTS(wqes)      (wqes * MLX5E_REQUIRED_WQE_MTTS)
 #define MLX5E_MAX_RQ_NUM_MTTS  \
        ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
index f3f6eb0..b2cd298 100644 (file)
@@ -1181,7 +1181,8 @@ int mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec)
 
        mlx5e_tc_match_to_reg_get_match(spec, CTSTATE_TO_REG,
                                        &ctstate, &ctstate_mask);
-       if (ctstate_mask)
+
+       if ((ctstate & ctstate_mask) == MLX5_CT_STATE_TRK_BIT)
                return -EOPNOTSUPP;
 
        ctstate_mask |= MLX5_CT_STATE_TRK_BIT;
index f8075a6..172e047 100644 (file)
@@ -685,14 +685,14 @@ int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv,
        u16 vport_num;
        int err = 0;
 
-       if (flow_attr->ip_version == 4) {
+       if (flow_attr->tun_ip_version == 4) {
                /* Addresses are swapped for decap */
                attr.fl.fl4.saddr = esw_attr->rx_tun_attr->dst_ip.v4;
                attr.fl.fl4.daddr = esw_attr->rx_tun_attr->src_ip.v4;
                err = mlx5e_route_lookup_ipv4_get(priv, priv->netdev, &attr);
        }
 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
-       else if (flow_attr->ip_version == 6) {
+       else if (flow_attr->tun_ip_version == 6) {
                /* Addresses are swapped for decap */
                attr.fl.fl6.saddr = esw_attr->rx_tun_attr->dst_ip.v6;
                attr.fl.fl6.daddr = esw_attr->rx_tun_attr->src_ip.v6;
@@ -718,10 +718,10 @@ int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv,
        esw_attr->rx_tun_attr->decap_vport = vport_num;
 
 out:
-       if (flow_attr->ip_version == 4)
+       if (flow_attr->tun_ip_version == 4)
                mlx5e_route_lookup_ipv4_put(&attr);
 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
-       else if (flow_attr->ip_version == 6)
+       else if (flow_attr->tun_ip_version == 6)
                mlx5e_route_lookup_ipv6_put(&attr);
 #endif
        return err;
index 6a11633..7f7b0f6 100644 (file)
@@ -89,6 +89,7 @@ int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
         * required to establish routing.
         */
        flow_flag_set(flow, TUN_RX);
+       flow->attr->tun_ip_version = ip_version;
        return 0;
 }
 
@@ -1091,7 +1092,7 @@ int mlx5e_attach_decap_route(struct mlx5e_priv *priv,
        if (err || !esw_attr->rx_tun_attr->decap_vport)
                goto out;
 
-       key.ip_version = attr->ip_version;
+       key.ip_version = attr->tun_ip_version;
        if (key.ip_version == 4)
                key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
        else
index e472ed0..7ed3f9f 100644 (file)
@@ -227,6 +227,10 @@ static int mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv,
        option_key = (struct geneve_opt *)&enc_opts.key->data[0];
        option_mask = (struct geneve_opt *)&enc_opts.mask->data[0];
 
+       if (option_mask->opt_class == 0 && option_mask->type == 0 &&
+           !memchr_inv(option_mask->opt_data, 0, option_mask->length * 4))
+               return 0;
+
        if (option_key->length > max_tlv_option_data_len) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Matching on GENEVE options: unsupported option len");
index abdf721..f5f2a8f 100644 (file)
@@ -1887,6 +1887,7 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev,
 {
        struct mlx5e_priv *priv = netdev_priv(netdev);
        struct mlx5_core_dev *mdev = priv->mdev;
+       int err;
 
        if (!MLX5_CAP_GEN(mdev, cqe_compression))
                return -EOPNOTSUPP;
@@ -1896,7 +1897,10 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev,
                return -EINVAL;
        }
 
-       mlx5e_modify_rx_cqe_compression_locked(priv, enable);
+       err = mlx5e_modify_rx_cqe_compression_locked(priv, enable);
+       if (err)
+               return err;
+
        priv->channels.params.rx_cqe_compress_def = enable;
 
        return 0;
@@ -2014,8 +2018,13 @@ static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable)
         */
 
        if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+               struct mlx5e_params old_params;
+
+               old_params = priv->channels.params;
                priv->channels.params = new_channels.params;
                err = mlx5e_num_channels_changed(priv);
+               if (err)
+                       priv->channels.params = old_params;
                goto out;
        }
 
index ec2fcb2..158f947 100644 (file)
@@ -334,9 +334,9 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
                                     rq->wqe_overflow.addr);
 }
 
-static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
+static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix)
 {
-       return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
+       return MLX5E_REQUIRED_MTTS(wqe_ix) << PAGE_SHIFT;
 }
 
 static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
@@ -577,7 +577,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                                mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
                        u32 byte_count =
                                rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
-                       u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
+                       u64 dma_offset = mlx5e_get_mpwqe_offset(i);
 
                        wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
                        wqe->data[0].byte_count = cpu_to_be32(byte_count);
@@ -2368,8 +2368,9 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5e_params *params,
 {
        switch (params->rq_wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-               return order_base_2(MLX5E_UMR_WQEBBS) +
-                       mlx5e_get_rq_log_wq_sz(rqp->rqc);
+               return max_t(u8, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE,
+                            order_base_2(MLX5E_UMR_WQEBBS) +
+                            mlx5e_get_rq_log_wq_sz(rqp->rqc));
        default: /* MLX5_WQ_TYPE_CYCLIC */
                return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
        }
@@ -2502,8 +2503,10 @@ void mlx5e_close_channels(struct mlx5e_channels *chs)
 {
        int i;
 
-       if (chs->port_ptp)
+       if (chs->port_ptp) {
                mlx5e_port_ptp_close(chs->port_ptp);
+               chs->port_ptp = NULL;
+       }
 
        for (i = 0; i < chs->num; i++)
                mlx5e_close_channel(chs->c[i]);
@@ -3815,6 +3818,15 @@ void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s)
                        s->tx_dropped    += sq_stats->dropped;
                }
        }
+       if (priv->port_ptp_opened) {
+               for (i = 0; i < priv->max_opened_tc; i++) {
+                       struct mlx5e_sq_stats *sq_stats = &priv->port_ptp_stats.sq[i];
+
+                       s->tx_packets    += sq_stats->packets;
+                       s->tx_bytes      += sq_stats->bytes;
+                       s->tx_dropped    += sq_stats->dropped;
+               }
+       }
 }
 
 void
@@ -3834,10 +3846,17 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
        }
 
        if (mlx5e_is_uplink_rep(priv)) {
+               struct mlx5e_vport_stats *vstats = &priv->stats.vport;
+
                stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok);
                stats->rx_bytes   = PPORT_802_3_GET(pstats, a_octets_received_ok);
                stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok);
                stats->tx_bytes   = PPORT_802_3_GET(pstats, a_octets_transmitted_ok);
+
+               /* vport multicast also counts packets that are dropped due to steering
+                * or rx out of buffer
+                */
+               stats->multicast = VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
        } else {
                mlx5e_fold_sw_stats64(priv, stats);
        }
@@ -4683,8 +4702,10 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
                struct mlx5e_channel *c = priv->channels.c[i];
 
                mlx5e_rq_replace_xdp_prog(&c->rq, prog);
-               if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+               if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) {
+                       bpf_prog_inc(prog);
                        mlx5e_rq_replace_xdp_prog(&c->xskrq, prog);
+               }
        }
 
 unlock:
@@ -4958,6 +4979,11 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16
                                     priv->max_nch);
        params->num_tc       = 1;
 
+       /* Set an initial non-zero value, so that mlx5e_select_queue won't
+        * divide by zero if called before first activating channels.
+        */
+       priv->num_tc_x_num_ch = params->num_channels * params->num_tc;
+
        /* SQ */
        params->log_sq_size = is_kdump_kernel() ?
                MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
@@ -5474,8 +5500,6 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
                    struct net_device *netdev,
                    struct mlx5_core_dev *mdev)
 {
-       memset(priv, 0, sizeof(*priv));
-
        /* priv init */
        priv->mdev        = mdev;
        priv->netdev      = netdev;
@@ -5508,12 +5532,18 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
 {
        int i;
 
+       /* bail if change profile failed and also rollback failed */
+       if (!priv->mdev)
+               return;
+
        destroy_workqueue(priv->wq);
        free_cpumask_var(priv->scratchpad.cpumask);
 
        for (i = 0; i < priv->htb.max_qos_sqs; i++)
                kfree(priv->htb.qos_sq_stats[i]);
        kvfree(priv->htb.qos_sq_stats);
+
+       memset(priv, 0, sizeof(*priv));
 }
 
 struct net_device *
@@ -5630,11 +5660,10 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv)
 }
 
 static int
-mlx5e_netdev_attach_profile(struct mlx5e_priv *priv,
+mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev,
                            const struct mlx5e_profile *new_profile, void *new_ppriv)
 {
-       struct net_device *netdev = priv->netdev;
-       struct mlx5_core_dev *mdev = priv->mdev;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
        int err;
 
        err = mlx5e_priv_init(priv, netdev, mdev);
@@ -5647,10 +5676,16 @@ mlx5e_netdev_attach_profile(struct mlx5e_priv *priv,
        priv->ppriv = new_ppriv;
        err = new_profile->init(priv->mdev, priv->netdev);
        if (err)
-               return err;
+               goto priv_cleanup;
        err = mlx5e_attach_netdev(priv);
        if (err)
-               new_profile->cleanup(priv);
+               goto profile_cleanup;
+       return err;
+
+profile_cleanup:
+       new_profile->cleanup(priv);
+priv_cleanup:
+       mlx5e_priv_cleanup(priv);
        return err;
 }
 
@@ -5659,13 +5694,14 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv,
 {
        unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile);
        const struct mlx5e_profile *orig_profile = priv->profile;
+       struct net_device *netdev = priv->netdev;
+       struct mlx5_core_dev *mdev = priv->mdev;
        void *orig_ppriv = priv->ppriv;
        int err, rollback_err;
 
        /* sanity */
        if (new_max_nch != priv->max_nch) {
-               netdev_warn(priv->netdev,
-                           "%s: Replacing profile with different max channels\n",
+               netdev_warn(netdev, "%s: Replacing profile with different max channels\n",
                            __func__);
                return -EINVAL;
        }
@@ -5675,22 +5711,19 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv,
        priv->profile->cleanup(priv);
        mlx5e_priv_cleanup(priv);
 
-       err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv);
+       err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv);
        if (err) { /* roll back to original profile */
-               netdev_warn(priv->netdev, "%s: new profile init failed, %d\n",
-                           __func__, err);
+               netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err);
                goto rollback;
        }
 
        return 0;
 
 rollback:
-       rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv);
-       if (rollback_err) {
-               netdev_err(priv->netdev,
-                          "%s: failed to rollback to orig profile, %d\n",
+       rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv);
+       if (rollback_err)
+               netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n",
                           __func__, rollback_err);
-       }
        return err;
 }
 
index 1b6ad94..249d890 100644 (file)
@@ -500,7 +500,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
        struct mlx5e_icosq *sq = rq->icosq;
        struct mlx5_wq_cyc *wq = &sq->wq;
        struct mlx5e_umr_wqe *umr_wqe;
-       u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
        u16 pi;
        int err;
        int i;
@@ -531,7 +530,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
        umr_wqe->ctrl.opmod_idx_opcode =
                cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
                            MLX5_OPCODE_UMR);
-       umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
+       umr_wqe->uctrl.xlt_offset =
+               cpu_to_be16(MLX5_ALIGNED_MTTS_OCTW(MLX5E_REQUIRED_MTTS(ix)));
 
        sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
                .wqe_type   = MLX5E_ICOSQ_WQE_UMR_RX,
index 0da69b9..df2a0af 100644 (file)
@@ -2296,6 +2296,16 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
                        *match_level = MLX5_MATCH_L4;
        }
 
+       /* Currently supported only for MPLS over UDP */
+       if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) &&
+           !netif_is_bareudp(filter_dev)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Matching on MPLS is supported only for MPLS over UDP");
+               netdev_err(priv->netdev,
+                          "Matching on MPLS is supported only for MPLS over UDP\n");
+               return -EOPNOTSUPP;
+       }
+
        return 0;
 }
 
@@ -2899,6 +2909,37 @@ static int is_action_keys_supported(const struct flow_action_entry *act,
        return 0;
 }
 
+static bool modify_tuple_supported(bool modify_tuple, bool ct_clear,
+                                  bool ct_flow, struct netlink_ext_ack *extack,
+                                  struct mlx5e_priv *priv,
+                                  struct mlx5_flow_spec *spec)
+{
+       if (!modify_tuple || ct_clear)
+               return true;
+
+       if (ct_flow) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "can't offload tuple modification with non-clear ct()");
+               netdev_info(priv->netdev,
+                           "can't offload tuple modification with non-clear ct()");
+               return false;
+       }
+
+       /* Add ct_state=-trk match so it will be offloaded for non ct flows
+        * (or after clear action), as otherwise, since the tuple is changed,
+        * we can't restore ct state
+        */
+       if (mlx5_tc_ct_add_no_trk_match(spec)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "can't offload tuple modification with ct matches and no ct(clear) action");
+               netdev_info(priv->netdev,
+                           "can't offload tuple modification with ct matches and no ct(clear) action");
+               return false;
+       }
+
+       return true;
+}
+
 static bool modify_header_match_supported(struct mlx5e_priv *priv,
                                          struct mlx5_flow_spec *spec,
                                          struct flow_action *flow_action,
@@ -2937,18 +2978,9 @@ static bool modify_header_match_supported(struct mlx5e_priv *priv,
                        return err;
        }
 
-       /* Add ct_state=-trk match so it will be offloaded for non ct flows
-        * (or after clear action), as otherwise, since the tuple is changed,
-        *  we can't restore ct state
-        */
-       if (!ct_clear && modify_tuple &&
-           mlx5_tc_ct_add_no_trk_match(spec)) {
-               NL_SET_ERR_MSG_MOD(extack,
-                                  "can't offload tuple modify header with ct matches");
-               netdev_info(priv->netdev,
-                           "can't offload tuple modify header with ct matches");
+       if (!modify_tuple_supported(modify_tuple, ct_clear, ct_flow, extack,
+                                   priv, spec))
                return false;
-       }
 
        ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol);
        if (modify_ip_header && ip_proto != IPPROTO_TCP &&
@@ -4445,7 +4477,8 @@ static int apply_police_params(struct mlx5e_priv *priv, u64 rate,
         */
        if (rate) {
                rate = (rate * BITS_PER_BYTE) + 500000;
-               rate_mbps = max_t(u64, do_div(rate, 1000000), 1);
+               do_div(rate, 1000000);
+               rate_mbps = max_t(u32, rate, 1);
        }
 
        err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps);
index 89003ae..25c0917 100644 (file)
@@ -79,6 +79,7 @@ struct mlx5_flow_attr {
        u8 inner_match_level;
        u8 outer_match_level;
        u8 ip_version;
+       u8 tun_ip_version;
        u32 flags;
        union {
                struct mlx5_esw_flow_attr esw_attr[0];
index 94cb021..8694b83 100644 (file)
@@ -551,7 +551,8 @@ esw_setup_dests(struct mlx5_flow_destination *dest,
 
        if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) &&
            MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) &&
-           mlx5_eswitch_vport_match_metadata_enabled(esw))
+           mlx5_eswitch_vport_match_metadata_enabled(esw) &&
+           MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level))
                attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
 
        if (attr->dest_ft) {
index 80da50e..bd66ab2 100644 (file)
@@ -575,6 +575,7 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn,
        MLX5_SET(qpc, qpc, log_sq_size, ilog2(conn->qp.sq.size));
        MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn);
        MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn);
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
        MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma);
        if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
                MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
index 1eeca45..6f7cef4 100644 (file)
@@ -233,6 +233,7 @@ int mlx5i_create_underlay_qp(struct mlx5e_priv *priv)
        }
 
        qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev));
        MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
        MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
        MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
@@ -694,6 +695,7 @@ static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev)
 static void mlx5_rdma_netdev_free(struct net_device *netdev)
 {
        struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+       struct mlx5_core_dev *mdev = priv->mdev;
        struct mlx5i_priv *ipriv = priv->ppriv;
        const struct mlx5e_profile *profile = priv->profile;
 
@@ -702,7 +704,7 @@ static void mlx5_rdma_netdev_free(struct net_device *netdev)
 
        if (!ipriv->sub_interface) {
                mlx5i_pkey_qpn_ht_cleanup(netdev);
-               mlx5e_destroy_mdev_resources(priv->mdev);
+               mlx5e_destroy_mdev_resources(mdev);
        }
 }
 
index b0e129d..1e7f26b 100644 (file)
@@ -495,15 +495,15 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp,
                return -EINVAL;
 
        field_select = MLX5_MTPPS_FS_ENABLE;
+       pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index);
+       if (pin < 0)
+               return -EBUSY;
+
        if (on) {
                bool rt_mode = mlx5_real_time_mode(mdev);
                u32 nsec;
                s64 sec;
 
-               pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index);
-               if (pin < 0)
-                       return -EBUSY;
-
                pin_mode = MLX5_PIN_MODE_OUT;
                pattern = MLX5_OUT_PATTERN_PERIODIC;
                ts.tv_sec = rq->perout.period.sec;
index b265f27..90b524c 100644 (file)
@@ -181,15 +181,13 @@ static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table)
        u16 max_functions;
        u16 function_id;
        int err = 0;
-       bool ecpu;
        int i;
 
        max_functions = mlx5_sf_max_functions(dev);
        function_id = MLX5_CAP_GEN(dev, sf_base_id);
-       ecpu = mlx5_read_embedded_cpu(dev);
        /* Arm the vhca context as the vhca event notifier */
        for (i = 0; i < max_functions; i++) {
-               err = mlx5_vhca_event_arm(dev, function_id, ecpu);
+               err = mlx5_vhca_event_arm(dev, function_id);
                if (err)
                        return err;
 
index 58b6be0..a5a0f60 100644 (file)
@@ -6,7 +6,7 @@
 #include "sf.h"
 #include "mlx5_ifc_vhca_event.h"
 #include "vhca_event.h"
-#include "ecpf.h"
+#include "mlx5_core.h"
 
 struct mlx5_sf_hw {
        u32 usr_sfnum;
@@ -18,7 +18,6 @@ struct mlx5_sf_hw_table {
        struct mlx5_core_dev *dev;
        struct mlx5_sf_hw *sfs;
        int max_local_functions;
-       u8 ecpu: 1;
        struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */
        struct notifier_block vhca_nb;
 };
@@ -64,7 +63,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum)
        }
        if (sw_id == -ENOSPC) {
                err = -ENOSPC;
-               goto err;
+               goto exist_err;
        }
 
        hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id);
@@ -72,7 +71,7 @@ int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 usr_sfnum)
        if (err)
                goto err;
 
-       err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum);
+       err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, usr_sfnum);
        if (err)
                goto vhca_err;
 
@@ -118,7 +117,7 @@ void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u16 id)
 
        hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id);
        mutex_lock(&table->table_lock);
-       err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, table->ecpu, out, sizeof(out));
+       err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out));
        if (err)
                goto err;
        state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state);
@@ -164,7 +163,6 @@ int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev)
        table->dev = dev;
        table->sfs = sfs;
        table->max_local_functions = max_functions;
-       table->ecpu = mlx5_read_embedded_cpu(dev);
        dev->priv.sf_hw_table = table;
        mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions);
        return 0;
index 1daf5a1..4fc8701 100644 (file)
@@ -20,7 +20,7 @@ struct mlx5_ifc_vhca_state_context_bits {
 
        u8         sw_function_id[0x20];
 
-       u8         reserved_at_40[0x80];
+       u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_query_vhca_state_out_bits {
index af2f2dd..28b14b0 100644 (file)
@@ -19,52 +19,51 @@ struct mlx5_vhca_event_work {
        struct mlx5_vhca_state_event event;
 };
 
-int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id,
-                             bool ecpu, u32 *out, u32 outlen)
+int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, u32 *out, u32 outlen)
 {
        u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {};
 
        MLX5_SET(query_vhca_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_STATE);
        MLX5_SET(query_vhca_state_in, in, function_id, function_id);
-       MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, ecpu);
+       MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, 0);
 
        return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
 }
 
 static int mlx5_cmd_modify_vhca_state(struct mlx5_core_dev *dev, u16 function_id,
-                                     bool ecpu, u32 *in, u32 inlen)
+                                     u32 *in, u32 inlen)
 {
        u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {};
 
        MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE);
        MLX5_SET(modify_vhca_state_in, in, function_id, function_id);
-       MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu);
+       MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, 0);
 
        return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
 }
 
-int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id)
+int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, u32 sw_fn_id)
 {
        u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {};
        u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {};
 
        MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE);
        MLX5_SET(modify_vhca_state_in, in, function_id, function_id);
-       MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, ecpu);
+       MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, 0);
        MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.sw_function_id, 1);
        MLX5_SET(modify_vhca_state_in, in, vhca_state_context.sw_function_id, sw_fn_id);
 
        return mlx5_cmd_exec_inout(dev, modify_vhca_state, in, out);
 }
 
-int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu)
+int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id)
 {
        u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {};
 
        MLX5_SET(modify_vhca_state_in, in, vhca_state_context.arm_change_event, 1);
        MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.arm_change_event, 1);
 
-       return mlx5_cmd_modify_vhca_state(dev, function_id, ecpu, in, sizeof(in));
+       return mlx5_cmd_modify_vhca_state(dev, function_id, in, sizeof(in));
 }
 
 static void
@@ -73,7 +72,7 @@ mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event *
        u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {};
        int err;
 
-       err = mlx5_cmd_query_vhca_state(dev, event->function_id, event->ecpu, out, sizeof(out));
+       err = mlx5_cmd_query_vhca_state(dev, event->function_id, out, sizeof(out));
        if (err)
                return;
 
@@ -82,7 +81,7 @@ mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event *
        event->new_vhca_state = MLX5_GET(query_vhca_state_out, out,
                                         vhca_state_context.vhca_state);
 
-       mlx5_vhca_event_arm(dev, event->function_id, event->ecpu);
+       mlx5_vhca_event_arm(dev, event->function_id);
 
        blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event);
 }
@@ -94,6 +93,7 @@ static void mlx5_vhca_state_work_handler(struct work_struct *_work)
        struct mlx5_core_dev *dev = notifier->dev;
 
        mlx5_vhca_event_notify(dev, &work->event);
+       kfree(work);
 }
 
 static int
@@ -110,7 +110,6 @@ mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, v
        INIT_WORK(&work->work, &mlx5_vhca_state_work_handler);
        work->notifier = notifier;
        work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id);
-       work->event.ecpu = be16_to_cpu(eqe->data.vhca_state.ec_function);
        mlx5_events_work_enqueue(notifier->dev, &work->work);
        return NOTIFY_OK;
 }
index 1fe1ec6..013cdfe 100644 (file)
@@ -10,7 +10,6 @@ struct mlx5_vhca_state_event {
        u16 function_id;
        u16 sw_function_id;
        u8 new_vhca_state;
-       bool ecpu;
 };
 
 static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev)
@@ -25,10 +24,10 @@ void mlx5_vhca_event_start(struct mlx5_core_dev *dev);
 void mlx5_vhca_event_stop(struct mlx5_core_dev *dev);
 int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
 void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
-int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, bool ecpu, u32 sw_fn_id);
-int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id, bool ecpu);
+int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, u32 sw_fn_id);
+int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id);
 int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id,
-                             bool ecpu, u32 *out, u32 outlen);
+                             u32 *out, u32 outlen);
 #else
 
 static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap)
index 83c4c87..8a6a56f 100644 (file)
@@ -169,6 +169,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
        MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
        MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
        MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
        MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
        if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
                MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
index 4088d6e..9143ec3 100644 (file)
@@ -264,8 +264,8 @@ static void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr)
 static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p)
 {
        u64 index =
-               (MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
-                MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32) << 26);
+               ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
+                ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32)) << 26);
 
        return index << 6;
 }
index 5defd31..aa06fcb 100644 (file)
@@ -327,8 +327,14 @@ int nfp_compile_flow_metadata(struct nfp_app *app,
                goto err_free_ctx_entry;
        }
 
+       /* Do not allocate a mask-id for pre_tun_rules. These flows are used to
+        * configure the pre_tun table and are never actually sent to the
+        * firmware as an add-flow message. This causes the mask-id allocation
+        * on the firmware to get out of sync if allocated here.
+        */
        new_mask_id = 0;
-       if (!nfp_check_mask_add(app, nfp_flow->mask_data,
+       if (!nfp_flow->pre_tun_rule.dev &&
+           !nfp_check_mask_add(app, nfp_flow->mask_data,
                                nfp_flow->meta.mask_len,
                                &nfp_flow->meta.flags, &new_mask_id)) {
                NL_SET_ERR_MSG_MOD(extack, "invalid entry: cannot allocate a new mask id");
@@ -359,7 +365,8 @@ int nfp_compile_flow_metadata(struct nfp_app *app,
                        goto err_remove_mask;
                }
 
-               if (!nfp_check_mask_remove(app, nfp_flow->mask_data,
+               if (!nfp_flow->pre_tun_rule.dev &&
+                   !nfp_check_mask_remove(app, nfp_flow->mask_data,
                                           nfp_flow->meta.mask_len,
                                           NULL, &new_mask_id)) {
                        NL_SET_ERR_MSG_MOD(extack, "invalid entry: cannot release mask id");
@@ -374,8 +381,10 @@ int nfp_compile_flow_metadata(struct nfp_app *app,
        return 0;
 
 err_remove_mask:
-       nfp_check_mask_remove(app, nfp_flow->mask_data, nfp_flow->meta.mask_len,
-                             NULL, &new_mask_id);
+       if (!nfp_flow->pre_tun_rule.dev)
+               nfp_check_mask_remove(app, nfp_flow->mask_data,
+                                     nfp_flow->meta.mask_len,
+                                     NULL, &new_mask_id);
 err_remove_rhash:
        WARN_ON_ONCE(rhashtable_remove_fast(&priv->stats_ctx_table,
                                            &ctx_entry->ht_node,
@@ -406,9 +415,10 @@ int nfp_modify_flow_metadata(struct nfp_app *app,
 
        __nfp_modify_flow_metadata(priv, nfp_flow);
 
-       nfp_check_mask_remove(app, nfp_flow->mask_data,
-                             nfp_flow->meta.mask_len, &nfp_flow->meta.flags,
-                             &new_mask_id);
+       if (!nfp_flow->pre_tun_rule.dev)
+               nfp_check_mask_remove(app, nfp_flow->mask_data,
+                                     nfp_flow->meta.mask_len, &nfp_flow->meta.flags,
+                                     &new_mask_id);
 
        /* Update flow payload with mask ids. */
        nfp_flow->unmasked_data[NFP_FL_MASK_ID_LOCATION] = new_mask_id;
index 1c59aff..d72225d 100644 (file)
@@ -1142,6 +1142,12 @@ nfp_flower_validate_pre_tun_rule(struct nfp_app *app,
                return -EOPNOTSUPP;
        }
 
+       if (!(key_layer & NFP_FLOWER_LAYER_IPV4) &&
+           !(key_layer & NFP_FLOWER_LAYER_IPV6)) {
+               NL_SET_ERR_MSG_MOD(extack, "unsupported pre-tunnel rule: match on ipv4/ipv6 eth_type must be present");
+               return -EOPNOTSUPP;
+       }
+
        /* Skip fields known to exist. */
        mask += sizeof(struct nfp_flower_meta_tci);
        ext += sizeof(struct nfp_flower_meta_tci);
@@ -1152,6 +1158,13 @@ nfp_flower_validate_pre_tun_rule(struct nfp_app *app,
        mask += sizeof(struct nfp_flower_in_port);
        ext += sizeof(struct nfp_flower_in_port);
 
+       /* Ensure destination MAC address matches pre_tun_dev. */
+       mac = (struct nfp_flower_mac_mpls *)ext;
+       if (memcmp(&mac->mac_dst[0], flow->pre_tun_rule.dev->dev_addr, 6)) {
+               NL_SET_ERR_MSG_MOD(extack, "unsupported pre-tunnel rule: dest MAC must match output dev MAC");
+               return -EOPNOTSUPP;
+       }
+
        /* Ensure destination MAC address is fully matched. */
        mac = (struct nfp_flower_mac_mpls *)mask;
        if (!is_broadcast_ether_addr(&mac->mac_dst[0])) {
@@ -1159,6 +1172,11 @@ nfp_flower_validate_pre_tun_rule(struct nfp_app *app,
                return -EOPNOTSUPP;
        }
 
+       if (mac->mpls_lse) {
+               NL_SET_ERR_MSG_MOD(extack, "unsupported pre-tunnel rule: MPLS not supported");
+               return -EOPNOTSUPP;
+       }
+
        mask += sizeof(struct nfp_flower_mac_mpls);
        ext += sizeof(struct nfp_flower_mac_mpls);
        if (key_layer & NFP_FLOWER_LAYER_IPV4 ||
index 7248d24..d19c02e 100644 (file)
@@ -16,8 +16,9 @@
 #define NFP_FL_MAX_ROUTES               32
 
 #define NFP_TUN_PRE_TUN_RULE_LIMIT     32
-#define NFP_TUN_PRE_TUN_RULE_DEL       0x1
-#define NFP_TUN_PRE_TUN_IDX_BIT                0x8
+#define NFP_TUN_PRE_TUN_RULE_DEL       BIT(0)
+#define NFP_TUN_PRE_TUN_IDX_BIT                BIT(3)
+#define NFP_TUN_PRE_TUN_IPV6_BIT       BIT(7)
 
 /**
  * struct nfp_tun_pre_run_rule - rule matched before decap
@@ -1268,6 +1269,7 @@ int nfp_flower_xmit_pre_tun_flow(struct nfp_app *app,
 {
        struct nfp_flower_priv *app_priv = app->priv;
        struct nfp_tun_offloaded_mac *mac_entry;
+       struct nfp_flower_meta_tci *key_meta;
        struct nfp_tun_pre_tun_rule payload;
        struct net_device *internal_dev;
        int err;
@@ -1290,6 +1292,15 @@ int nfp_flower_xmit_pre_tun_flow(struct nfp_app *app,
        if (!mac_entry)
                return -ENOENT;
 
+       /* Set/clear IPV6 bit. cpu_to_be16() swap will lead to MSB being
+        * set/clear for port_idx.
+        */
+       key_meta = (struct nfp_flower_meta_tci *)flow->unmasked_data;
+       if (key_meta->nfp_flow_key_layer & NFP_FLOWER_LAYER_IPV6)
+               mac_entry->index |= NFP_TUN_PRE_TUN_IPV6_BIT;
+       else
+               mac_entry->index &= ~NFP_TUN_PRE_TUN_IPV6_BIT;
+
        payload.port_idx = cpu_to_be16(mac_entry->index);
 
        /* Copy mac id and vlan to flow - dev may not exist at delete time. */
index 162a1ff..4087311 100644 (file)
@@ -1079,15 +1079,17 @@ static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb)
 {
        int sg_elems = q->lif->qtype_info[IONIC_QTYPE_TXQ].max_sg_elems;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
+       int ndescs;
        int err;
 
-       /* If TSO, need roundup(skb->len/mss) descs */
+       /* Each desc is mss long max, so a descriptor for each gso_seg */
        if (skb_is_gso(skb))
-               return (skb->len / skb_shinfo(skb)->gso_size) + 1;
+               ndescs = skb_shinfo(skb)->gso_segs;
+       else
+               ndescs = 1;
 
-       /* If non-TSO, just need 1 desc and nr_frags sg elems */
        if (skb_shinfo(skb)->nr_frags <= sg_elems)
-               return 1;
+               return ndescs;
 
        /* Too many frags, so linearize */
        err = skb_linearize(skb);
@@ -1096,8 +1098,7 @@ static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb)
 
        stats->linearize++;
 
-       /* Need 1 desc and zero sg elems */
-       return 1;
+       return ndescs;
 }
 
 static int ionic_maybe_stop_tx(struct ionic_queue *q, int ndescs)
index 7760a33..7ecb3df 100644 (file)
@@ -1425,6 +1425,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter)
 
        if (fw_dump->tmpl_hdr == NULL || current_version > prev_version) {
                vfree(fw_dump->tmpl_hdr);
+               fw_dump->tmpl_hdr = NULL;
 
                if (qlcnic_83xx_md_check_extended_dump_capability(adapter))
                        extended = !qlcnic_83xx_extend_md_capab(adapter);
@@ -1443,6 +1444,8 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter)
                        struct qlcnic_83xx_dump_template_hdr *hdr;
 
                        hdr = fw_dump->tmpl_hdr;
+                       if (!hdr)
+                               return;
                        hdr->drv_cap_mask = 0x1f;
                        fw_dump->cap_mask = 0x1f;
                        dev_info(&pdev->dev,
index 7aad0ba..581a92f 100644 (file)
@@ -4646,6 +4646,9 @@ static void rtl8169_down(struct rtl8169_private *tp)
 
        rtl8169_update_counters(tp);
 
+       pci_clear_master(tp->pci_dev);
+       rtl_pci_commit(tp);
+
        rtl8169_cleanup(tp, true);
 
        rtl_prepare_power_down(tp);
@@ -4653,6 +4656,7 @@ static void rtl8169_down(struct rtl8169_private *tp)
 
 static void rtl8169_up(struct rtl8169_private *tp)
 {
+       pci_set_master(tp->pci_dev);
        phy_resume(tp->phydev);
        rtl8169_init_phy(tp);
        napi_enable(&tp->napi);
@@ -5307,8 +5311,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        rtl_hw_reset(tp);
 
-       pci_set_master(pdev);
-
        rc = rtl_alloc_irq(tp);
        if (rc < 0) {
                dev_err(&pdev->dev, "Can't allocate interrupt\n");
index 3c53051..200785e 100644 (file)
@@ -1715,14 +1715,17 @@ static int netsec_netdev_init(struct net_device *ndev)
                goto err1;
 
        /* set phy power down */
-       data = netsec_phy_read(priv->mii_bus, priv->phy_addr, MII_BMCR) |
-               BMCR_PDOWN;
-       netsec_phy_write(priv->mii_bus, priv->phy_addr, MII_BMCR, data);
+       data = netsec_phy_read(priv->mii_bus, priv->phy_addr, MII_BMCR);
+       netsec_phy_write(priv->mii_bus, priv->phy_addr, MII_BMCR,
+                        data | BMCR_PDOWN);
 
        ret = netsec_reset_hardware(priv, true);
        if (ret)
                goto err2;
 
+       /* Restore phy power state */
+       netsec_phy_write(priv->mii_bus, priv->phy_addr, MII_BMCR, data);
+
        spin_lock_init(&priv->desc_ring[NETSEC_RING_TX].lock);
        spin_lock_init(&priv->desc_ring[NETSEC_RING_RX].lock);
 
index 6b75cf2..e62efd1 100644 (file)
@@ -1214,6 +1214,8 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
        plat_dat->init = sun8i_dwmac_init;
        plat_dat->exit = sun8i_dwmac_exit;
        plat_dat->setup = sun8i_dwmac_setup;
+       plat_dat->tx_fifo_size = 4096;
+       plat_dat->rx_fifo_size = 16384;
 
        ret = sun8i_dwmac_set_syscon(&pdev->dev, plat_dat);
        if (ret)
index 3a8775e..5d677db 100644 (file)
@@ -1880,7 +1880,7 @@ static int axienet_probe(struct platform_device *pdev)
        if (IS_ERR(lp->regs)) {
                dev_err(&pdev->dev, "could not map Axi Ethernet regs.\n");
                ret = PTR_ERR(lp->regs);
-               goto free_netdev;
+               goto cleanup_clk;
        }
        lp->regs_start = ethres->start;
 
@@ -1958,18 +1958,18 @@ static int axienet_probe(struct platform_device *pdev)
                        break;
                default:
                        ret = -EINVAL;
-                       goto free_netdev;
+                       goto cleanup_clk;
                }
        } else {
                ret = of_get_phy_mode(pdev->dev.of_node, &lp->phy_mode);
                if (ret)
-                       goto free_netdev;
+                       goto cleanup_clk;
        }
        if (lp->switch_x_sgmii && lp->phy_mode != PHY_INTERFACE_MODE_SGMII &&
            lp->phy_mode != PHY_INTERFACE_MODE_1000BASEX) {
                dev_err(&pdev->dev, "xlnx,switch-x-sgmii only supported with SGMII or 1000BaseX\n");
                ret = -EINVAL;
-               goto free_netdev;
+               goto cleanup_clk;
        }
 
        /* Find the DMA node, map the DMA registers, and decode the DMA IRQs */
@@ -1982,7 +1982,7 @@ static int axienet_probe(struct platform_device *pdev)
                        dev_err(&pdev->dev,
                                "unable to get DMA resource\n");
                        of_node_put(np);
-                       goto free_netdev;
+                       goto cleanup_clk;
                }
                lp->dma_regs = devm_ioremap_resource(&pdev->dev,
                                                     &dmares);
@@ -2002,12 +2002,12 @@ static int axienet_probe(struct platform_device *pdev)
        if (IS_ERR(lp->dma_regs)) {
                dev_err(&pdev->dev, "could not map DMA regs\n");
                ret = PTR_ERR(lp->dma_regs);
-               goto free_netdev;
+               goto cleanup_clk;
        }
        if ((lp->rx_irq <= 0) || (lp->tx_irq <= 0)) {
                dev_err(&pdev->dev, "could not determine irqs\n");
                ret = -ENOMEM;
-               goto free_netdev;
+               goto cleanup_clk;
        }
 
        /* Autodetect the need for 64-bit DMA pointers.
@@ -2037,7 +2037,7 @@ static int axienet_probe(struct platform_device *pdev)
        ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(addr_width));
        if (ret) {
                dev_err(&pdev->dev, "No suitable DMA available\n");
-               goto free_netdev;
+               goto cleanup_clk;
        }
 
        /* Check for Ethernet core IRQ (optional) */
@@ -2068,12 +2068,12 @@ static int axienet_probe(struct platform_device *pdev)
                if (!lp->phy_node) {
                        dev_err(&pdev->dev, "phy-handle required for 1000BaseX/SGMII\n");
                        ret = -EINVAL;
-                       goto free_netdev;
+                       goto cleanup_mdio;
                }
                lp->pcs_phy = of_mdio_find_device(lp->phy_node);
                if (!lp->pcs_phy) {
                        ret = -EPROBE_DEFER;
-                       goto free_netdev;
+                       goto cleanup_mdio;
                }
                lp->phylink_config.pcs_poll = true;
        }
@@ -2087,17 +2087,30 @@ static int axienet_probe(struct platform_device *pdev)
        if (IS_ERR(lp->phylink)) {
                ret = PTR_ERR(lp->phylink);
                dev_err(&pdev->dev, "phylink_create error (%i)\n", ret);
-               goto free_netdev;
+               goto cleanup_mdio;
        }
 
        ret = register_netdev(lp->ndev);
        if (ret) {
                dev_err(lp->dev, "register_netdev() error (%i)\n", ret);
-               goto free_netdev;
+               goto cleanup_phylink;
        }
 
        return 0;
 
+cleanup_phylink:
+       phylink_destroy(lp->phylink);
+
+cleanup_mdio:
+       if (lp->pcs_phy)
+               put_device(&lp->pcs_phy->dev);
+       if (lp->mii_bus)
+               axienet_mdio_teardown(lp);
+       of_node_put(lp->phy_node);
+
+cleanup_clk:
+       clk_disable_unprepare(lp->clk);
+
 free_netdev:
        free_netdev(ndev);
 
index 36eeb80..4690c6a 100644 (file)
@@ -2167,7 +2167,6 @@ static void __exit scc_cleanup_driver(void)
 
 MODULE_AUTHOR("Joerg Reuter <jreuter@yaina.de>");
 MODULE_DESCRIPTION("AX.25 Device Driver for Z8530 based HDLC cards");
-MODULE_SUPPORTED_DEVICE("Z8530 based SCC cards for Amateur Radio");
 MODULE_LICENSE("GPL");
 module_init(scc_init_driver);
 module_exit(scc_cleanup_driver);
index 35e3585..d73b03a 100644 (file)
@@ -175,21 +175,23 @@ bool ipa_cmd_table_valid(struct ipa *ipa, const struct ipa_mem *mem,
                            : field_max(IP_FLTRT_FLAGS_NHASH_ADDR_FMASK);
        if (mem->offset > offset_max ||
            ipa->mem_offset > offset_max - mem->offset) {
-               dev_err(dev, "IPv%c %s%s table region offset too large "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipv6 ? '6' : '4', hashed ? "hashed " : "",
-                             route ? "route" : "filter",
-                             ipa->mem_offset, mem->offset, offset_max);
+               dev_err(dev, "IPv%c %s%s table region offset too large\n",
+                       ipv6 ? '6' : '4', hashed ? "hashed " : "",
+                       route ? "route" : "filter");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       ipa->mem_offset, mem->offset, offset_max);
+
                return false;
        }
 
        if (mem->offset > ipa->mem_size ||
            mem->size > ipa->mem_size - mem->offset) {
-               dev_err(dev, "IPv%c %s%s table region out of range "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipv6 ? '6' : '4', hashed ? "hashed " : "",
-                             route ? "route" : "filter",
-                             mem->offset, mem->size, ipa->mem_size);
+               dev_err(dev, "IPv%c %s%s table region out of range\n",
+                       ipv6 ? '6' : '4', hashed ? "hashed " : "",
+                       route ? "route" : "filter");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       mem->offset, mem->size, ipa->mem_size);
+
                return false;
        }
 
@@ -205,22 +207,36 @@ static bool ipa_cmd_header_valid(struct ipa *ipa)
        u32 size_max;
        u32 size;
 
+       /* In ipa_cmd_hdr_init_local_add() we record the offset and size
+        * of the header table memory area.  Make sure the offset and size
+        * fit in the fields that need to hold them, and that the entire
+        * range is within the overall IPA memory range.
+        */
        offset_max = field_max(HDR_INIT_LOCAL_FLAGS_HDR_ADDR_FMASK);
        if (mem->offset > offset_max ||
            ipa->mem_offset > offset_max - mem->offset) {
-               dev_err(dev, "header table region offset too large "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipa->mem_offset + mem->offset, offset_max);
+               dev_err(dev, "header table region offset too large\n");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       ipa->mem_offset, mem->offset, offset_max);
+
                return false;
        }
 
        size_max = field_max(HDR_INIT_LOCAL_FLAGS_TABLE_SIZE_FMASK);
        size = ipa->mem[IPA_MEM_MODEM_HEADER].size;
        size += ipa->mem[IPA_MEM_AP_HEADER].size;
-       if (mem->offset > ipa->mem_size || size > ipa->mem_size - mem->offset) {
-               dev_err(dev, "header table region out of range "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             mem->offset, size, ipa->mem_size);
+
+       if (size > size_max) {
+               dev_err(dev, "header table region size too large\n");
+               dev_err(dev, "    (0x%04x > 0x%08x)\n", size, size_max);
+
+               return false;
+       }
+       if (size > ipa->mem_size || mem->offset > ipa->mem_size - size) {
+               dev_err(dev, "header table region out of range\n");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       mem->offset, size, ipa->mem_size);
+
                return false;
        }
 
index 2fc6448..e594bf3 100644 (file)
@@ -249,6 +249,7 @@ static const struct qmi_msg_handler ipa_server_msg_handlers[] = {
                .decoded_size   = IPA_QMI_DRIVER_INIT_COMPLETE_REQ_SZ,
                .fn             = ipa_server_driver_init_complete,
        },
+       { },
 };
 
 /* Handle an INIT_DRIVER response message from the modem. */
@@ -269,6 +270,7 @@ static const struct qmi_msg_handler ipa_client_msg_handlers[] = {
                .decoded_size   = IPA_QMI_INIT_DRIVER_RSP_SZ,
                .fn             = ipa_client_init_driver,
        },
+       { },
 };
 
 /* Return a pointer to an init modem driver request structure, which contains
index fa0be59..82fe5f4 100644 (file)
@@ -342,6 +342,10 @@ static int bcm54xx_config_init(struct phy_device *phydev)
        bcm54xx_adjust_rxrefclk(phydev);
 
        switch (BRCM_PHY_MODEL(phydev)) {
+       case PHY_ID_BCM50610:
+       case PHY_ID_BCM50610M:
+               err = bcm54xx_config_clock_delay(phydev);
+               break;
        case PHY_ID_BCM54210E:
                err = bcm54210e_config_init(phydev);
                break;
@@ -399,6 +403,11 @@ static int bcm54xx_resume(struct phy_device *phydev)
        if (ret < 0)
                return ret;
 
+       /* Upon exiting power down, the PHY remains in an internal reset state
+        * for 40us
+        */
+       fsleep(40);
+
        return bcm54xx_config_init(phydev);
 }
 
index 053c92e..dc2800b 100644 (file)
@@ -476,7 +476,7 @@ static void phylink_major_config(struct phylink *pl, bool restart,
                err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode,
                                              state->interface);
                if (err < 0)
-                       phylink_err(pl, "mac_prepare failed: %pe\n",
+                       phylink_err(pl, "mac_finish failed: %pe\n",
                                    ERR_PTR(err));
        }
 }
index 02e6bbb..8d1f69d 100644 (file)
@@ -387,6 +387,8 @@ static int usbpn_probe(struct usb_interface *intf, const struct usb_device_id *i
 
        err = register_netdev(dev);
        if (err) {
+               /* Set disconnected flag so that disconnect() returns early. */
+               pnd->disconnected = 1;
                usb_driver_release_interface(&usbpn_driver, data_intf);
                goto out;
        }
index 90f1c02..20fb563 100644 (file)
@@ -6553,7 +6553,10 @@ static int rtl_ops_init(struct r8152 *tp)
                ops->in_nway            = rtl8153_in_nway;
                ops->hw_phy_cfg         = r8153_hw_phy_cfg;
                ops->autosuspend_en     = rtl8153_runtime_enable;
-               tp->rx_buf_sz           = 32 * 1024;
+               if (tp->udev->speed < USB_SPEED_SUPER)
+                       tp->rx_buf_sz   = 16 * 1024;
+               else
+                       tp->rx_buf_sz   = 32 * 1024;
                tp->eee_en              = true;
                tp->eee_adv             = MDIO_EEE_1000T | MDIO_EEE_100TX;
                break;
index aa1a66a..34e49c7 100644 (file)
@@ -302,8 +302,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
        if (rxq < rcv->real_num_rx_queues) {
                rq = &rcv_priv->rq[rxq];
                rcv_xdp = rcu_access_pointer(rq->xdp_prog);
-               if (rcv_xdp)
-                       skb_record_rx_queue(skb, rxq);
+               skb_record_rx_queue(skb, rxq);
        }
 
        skb_tx_timestamp(skb);
index 4aaa638..5a6a945 100644 (file)
@@ -23,6 +23,8 @@
 
 struct x25_state {
        x25_hdlc_proto settings;
+       bool up;
+       spinlock_t up_lock; /* Protects "up" */
 };
 
 static int x25_ioctl(struct net_device *dev, struct ifreq *ifr);
@@ -104,6 +106,8 @@ static void x25_data_transmit(struct net_device *dev, struct sk_buff *skb)
 
 static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+       hdlc_device *hdlc = dev_to_hdlc(dev);
+       struct x25_state *x25st = state(hdlc);
        int result;
 
        /* There should be a pseudo header of 1 byte added by upper layers.
@@ -114,11 +118,19 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev)
                return NETDEV_TX_OK;
        }
 
+       spin_lock_bh(&x25st->up_lock);
+       if (!x25st->up) {
+               spin_unlock_bh(&x25st->up_lock);
+               kfree_skb(skb);
+               return NETDEV_TX_OK;
+       }
+
        switch (skb->data[0]) {
        case X25_IFACE_DATA:    /* Data to be transmitted */
                skb_pull(skb, 1);
                if ((result = lapb_data_request(dev, skb)) != LAPB_OK)
                        dev_kfree_skb(skb);
+               spin_unlock_bh(&x25st->up_lock);
                return NETDEV_TX_OK;
 
        case X25_IFACE_CONNECT:
@@ -147,6 +159,7 @@ static netdev_tx_t x25_xmit(struct sk_buff *skb, struct net_device *dev)
                break;
        }
 
+       spin_unlock_bh(&x25st->up_lock);
        dev_kfree_skb(skb);
        return NETDEV_TX_OK;
 }
@@ -164,6 +177,7 @@ static int x25_open(struct net_device *dev)
                .data_transmit = x25_data_transmit,
        };
        hdlc_device *hdlc = dev_to_hdlc(dev);
+       struct x25_state *x25st = state(hdlc);
        struct lapb_parms_struct params;
        int result;
 
@@ -190,6 +204,10 @@ static int x25_open(struct net_device *dev)
        if (result != LAPB_OK)
                return -EINVAL;
 
+       spin_lock_bh(&x25st->up_lock);
+       x25st->up = true;
+       spin_unlock_bh(&x25st->up_lock);
+
        return 0;
 }
 
@@ -197,6 +215,13 @@ static int x25_open(struct net_device *dev)
 
 static void x25_close(struct net_device *dev)
 {
+       hdlc_device *hdlc = dev_to_hdlc(dev);
+       struct x25_state *x25st = state(hdlc);
+
+       spin_lock_bh(&x25st->up_lock);
+       x25st->up = false;
+       spin_unlock_bh(&x25st->up_lock);
+
        lapb_unregister(dev);
 }
 
@@ -205,15 +230,28 @@ static void x25_close(struct net_device *dev)
 static int x25_rx(struct sk_buff *skb)
 {
        struct net_device *dev = skb->dev;
+       hdlc_device *hdlc = dev_to_hdlc(dev);
+       struct x25_state *x25st = state(hdlc);
 
        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
                dev->stats.rx_dropped++;
                return NET_RX_DROP;
        }
 
-       if (lapb_data_received(dev, skb) == LAPB_OK)
+       spin_lock_bh(&x25st->up_lock);
+       if (!x25st->up) {
+               spin_unlock_bh(&x25st->up_lock);
+               kfree_skb(skb);
+               dev->stats.rx_dropped++;
+               return NET_RX_DROP;
+       }
+
+       if (lapb_data_received(dev, skb) == LAPB_OK) {
+               spin_unlock_bh(&x25st->up_lock);
                return NET_RX_SUCCESS;
+       }
 
+       spin_unlock_bh(&x25st->up_lock);
        dev->stats.rx_errors++;
        dev_kfree_skb_any(skb);
        return NET_RX_DROP;
@@ -298,6 +336,8 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr)
                        return result;
 
                memcpy(&state(hdlc)->settings, &new_settings, size);
+               state(hdlc)->up = false;
+               spin_lock_init(&state(hdlc)->up_lock);
 
                /* There's no header_ops so hard_header_len should be 0. */
                dev->hard_header_len = 0;
index c41e725..2db9c94 100644 (file)
@@ -28,7 +28,6 @@
 MODULE_AUTHOR("Michael Wu <flamingice@sourmilk.net>");
 MODULE_AUTHOR("Jouni Malinen <j@w1.fi>");
 MODULE_DESCRIPTION("Driver for IEEE 802.11b wireless cards based on ADMtek ADM8211");
-MODULE_SUPPORTED_DEVICE("ADM8211");
 MODULE_LICENSE("GPL");
 
 static unsigned int tx_ring_size __read_mostly = 16;
index 4c6e57f..cef17f3 100644 (file)
@@ -90,7 +90,6 @@ MODULE_PARM_DESC(no_hw_rfkill_switch, "Ignore the GPIO RFKill switch state");
 MODULE_AUTHOR("Jiri Slaby");
 MODULE_AUTHOR("Nick Kossifidis");
 MODULE_DESCRIPTION("Support for 5xxx series of Atheros 802.11 wireless LAN cards.");
-MODULE_SUPPORTED_DEVICE("Atheros 5xxx WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static int ath5k_init(struct ieee80211_hw *hw);
index b66eeb5..5abc2a5 100644 (file)
@@ -34,7 +34,6 @@ static bool ath9k_hw_set_reset_reg(struct ath_hw *ah, u32 type);
 
 MODULE_AUTHOR("Atheros Communications");
 MODULE_DESCRIPTION("Support for Atheros 802.11n wireless LAN cards.");
-MODULE_SUPPORTED_DEVICE("Atheros 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static void ath9k_hw_set_clockrate(struct ath_hw *ah)
index 42a2087..01f9c26 100644 (file)
@@ -37,7 +37,6 @@ static char *dev_info = "ath9k";
 
 MODULE_AUTHOR("Atheros Communications");
 MODULE_DESCRIPTION("Support for Atheros 802.11n wireless LAN cards.");
-MODULE_SUPPORTED_DEVICE("Atheros 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static unsigned int ath9k_debug = ATH_DBG_DEFAULT;
index 707fe66..febce4e 100644 (file)
@@ -75,7 +75,6 @@
 MODULE_AUTHOR("Simon Kelley");
 MODULE_DESCRIPTION("Support for Atmel at76c50x 802.11 wireless ethernet cards.");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Atmel at76c50x wireless cards");
 
 /* The name of the firmware file to be loaded
    over-rides any automatic selection */
index 368eebe..453bb84 100644 (file)
@@ -57,7 +57,6 @@
 MODULE_AUTHOR("Simon Kelley");
 MODULE_DESCRIPTION("Support for Atmel at76c50x 802.11 wireless ethernet cards.");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Atmel at76c50x PCMCIA cards");
 
 /*====================================================================*/
 
index 47f7ccb..f428dc7 100644 (file)
@@ -16,7 +16,6 @@
 MODULE_AUTHOR("Simon Kelley");
 MODULE_DESCRIPTION("Support for Atmel at76c50x 802.11 wireless ethernet cards.");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Atmel at76c506 PCI wireless cards");
 
 static const struct pci_device_id card_ids[] = {
        { 0x1114, 0x0506, PCI_ANY_ID, PCI_ANY_ID },
index 818e523..39f3af2 100644 (file)
@@ -87,7 +87,6 @@ static int n_adapters_found;
 
 MODULE_AUTHOR("Broadcom Corporation");
 MODULE_DESCRIPTION("Broadcom 802.11n wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Broadcom 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 /* This needs to be adjusted when brcms_firmwares changes */
 MODULE_FIRMWARE("brcm/bcm43xx-0.fw");
index 4c84c30..e87e68c 100644 (file)
@@ -12,7 +12,6 @@
 
 MODULE_AUTHOR("Broadcom Corporation");
 MODULE_DESCRIPTION("Broadcom 802.11n wireless LAN driver utilities.");
-MODULE_SUPPORTED_DEVICE("Broadcom 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 struct sk_buff *brcmu_pkt_buf_get_skb(uint len)
index e35e138..60db38c 100644 (file)
@@ -251,7 +251,6 @@ MODULE_AUTHOR("Benjamin Reed");
 MODULE_DESCRIPTION("Support for Cisco/Aironet 802.11 wireless ethernet cards.  "
                   "Direct support for ISA/PCI/MPI cards and support for PCMCIA when used with airo_cs.");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_SUPPORTED_DEVICE("Aironet 4500, 4800 and Cisco 340/350");
 module_param_hw_array(io, int, ioport, NULL, 0);
 module_param_hw_array(irq, int, irq, NULL, 0);
 module_param_array(rates, int, NULL, 0);
index 3718f95..fcfe4c6 100644 (file)
@@ -47,7 +47,6 @@ MODULE_DESCRIPTION("Support for Cisco/Aironet 802.11 wireless ethernet "
                   "cards.  This is the module that links the PCMCIA card "
                   "with the airo module.");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_SUPPORTED_DEVICE("Aironet 4500, 4800 and Cisco 340 PCMCIA cards");
 
 /*====================================================================*/
 
index 1a74867..ec7db2b 100644 (file)
@@ -26,7 +26,6 @@ static char *dev_info = "hostap_cs";
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Support for Intersil Prism2-based 802.11 wireless LAN "
                   "cards (PC Card).");
-MODULE_SUPPORTED_DEVICE("Intersil Prism2-based WLAN cards (PC Card)");
 MODULE_LICENSE("GPL");
 
 
index 101887e..52d7750 100644 (file)
@@ -27,7 +27,6 @@ static char *dev_info = "hostap_pci";
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Support for Intersil Prism2.5-based 802.11 wireless LAN "
                   "PCI cards.");
-MODULE_SUPPORTED_DEVICE("Intersil Prism2.5-based WLAN PCI cards");
 MODULE_LICENSE("GPL");
 
 
index 841cfc6..5824729 100644 (file)
@@ -30,7 +30,6 @@ static char *dev_info = "hostap_plx";
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Support for Intersil Prism2-based 802.11 wireless LAN "
                   "cards (PLX).");
-MODULE_SUPPORTED_DEVICE("Intersil Prism2-based WLAN cards (PLX)");
 MODULE_LICENSE("GPL");
 
 
index 8f860c1..dec6ffd 100644 (file)
@@ -1821,7 +1821,6 @@ static const struct pci_device_id rt2400pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2400 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2460 PCI & PCMCIA chipset based cards");
 MODULE_DEVICE_TABLE(pci, rt2400pci_device_table);
 MODULE_LICENSE("GPL");
 
index e940443..8faa0a8 100644 (file)
@@ -2119,7 +2119,6 @@ static const struct pci_device_id rt2500pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2500 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2560 PCI & PCMCIA chipset based cards");
 MODULE_DEVICE_TABLE(pci, rt2500pci_device_table);
 MODULE_LICENSE("GPL");
 
index fce05fc..bb5ed66 100644 (file)
@@ -1956,7 +1956,6 @@ static const struct usb_device_id rt2500usb_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2500 USB Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2570 USB chipset based cards");
 MODULE_DEVICE_TABLE(usb, rt2500usb_device_table);
 MODULE_LICENSE("GPL");
 
index 9a33baa..1fde0e7 100644 (file)
@@ -439,7 +439,6 @@ static const struct pci_device_id rt2800pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2800 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2860 PCI & PCMCIA chipset based cards");
 MODULE_FIRMWARE(FIRMWARE_RT2860);
 MODULE_DEVICE_TABLE(pci, rt2800pci_device_table);
 MODULE_LICENSE("GPL");
index 36ac18c..b5c67f6 100644 (file)
@@ -1248,7 +1248,6 @@ static const struct usb_device_id rt2800usb_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2800 USB Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2870 USB chipset based cards");
 MODULE_DEVICE_TABLE(usb, rt2800usb_device_table);
 MODULE_FIRMWARE(FIRMWARE_RT2870);
 MODULE_LICENSE("GPL");
index 02da5dd..82cfc2a 100644 (file)
@@ -2993,8 +2993,6 @@ static const struct pci_device_id rt61pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT61 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2561, RT2561s & RT2661 "
-                       "PCI & PCMCIA chipset based cards");
 MODULE_DEVICE_TABLE(pci, rt61pci_device_table);
 MODULE_FIRMWARE(FIRMWARE_RT2561);
 MODULE_FIRMWARE(FIRMWARE_RT2561s);
index e697937..5ff2c74 100644 (file)
@@ -2513,7 +2513,6 @@ static const struct usb_device_id rt73usb_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT73 USB Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2571W & RT2671 USB chipset based cards");
 MODULE_DEVICE_TABLE(usb, rt73usb_device_table);
 MODULE_FIRMWARE(FIRMWARE_RT2571);
 MODULE_LICENSE("GPL");
index 9a3d243..d984832 100644 (file)
@@ -441,6 +441,5 @@ module_init(rsi_91x_hal_module_init);
 module_exit(rsi_91x_hal_module_exit);
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("Station driver for RSI 91x devices");
-MODULE_SUPPORTED_DEVICE("RSI-91x");
 MODULE_VERSION("0.1");
 MODULE_LICENSE("Dual BSD/GPL");
index 592e9da..fe0287b 100644 (file)
@@ -1571,7 +1571,6 @@ module_exit(rsi_module_exit);
 
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("Common SDIO layer for RSI drivers");
-MODULE_SUPPORTED_DEVICE("RSI-91x");
 MODULE_DEVICE_TABLE(sdio, rsi_dev_table);
 MODULE_FIRMWARE(FIRMWARE_RSI9113);
 MODULE_VERSION("0.1");
index a4a533c..3fbe2a3 100644 (file)
@@ -928,7 +928,6 @@ module_usb_driver(rsi_driver);
 
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("Common USB layer for RSI drivers");
-MODULE_SUPPORTED_DEVICE("RSI-91x");
 MODULE_DEVICE_TABLE(usb, rsi_dev_table);
 MODULE_FIRMWARE(FIRMWARE_RSI9113);
 MODULE_VERSION("0.1");
index a565389..0896e21 100644 (file)
@@ -1226,28 +1226,12 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
                queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
 }
 
-static int nvme_keep_alive(struct nvme_ctrl *ctrl)
-{
-       struct request *rq;
-
-       rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd,
-                       BLK_MQ_REQ_RESERVED);
-       if (IS_ERR(rq))
-               return PTR_ERR(rq);
-
-       rq->timeout = ctrl->kato * HZ;
-       rq->end_io_data = ctrl;
-
-       blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io);
-
-       return 0;
-}
-
 static void nvme_keep_alive_work(struct work_struct *work)
 {
        struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
                        struct nvme_ctrl, ka_work);
        bool comp_seen = ctrl->comp_seen;
+       struct request *rq;
 
        if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
                dev_dbg(ctrl->device,
@@ -1257,12 +1241,18 @@ static void nvme_keep_alive_work(struct work_struct *work)
                return;
        }
 
-       if (nvme_keep_alive(ctrl)) {
+       rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd,
+                               BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+       if (IS_ERR(rq)) {
                /* allocation failure, reset the controller */
-               dev_err(ctrl->device, "keep-alive failed\n");
+               dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
                nvme_reset_ctrl(ctrl);
                return;
        }
+
+       rq->timeout = ctrl->kato * HZ;
+       rq->end_io_data = ctrl;
+       blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io);
 }
 
 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
@@ -1964,30 +1954,18 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
                blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
 }
 
-static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
+/*
+ * Even though NVMe spec explicitly states that MDTS is not applicable to the
+ * write-zeroes, we are cautious and limit the size to the controllers
+ * max_hw_sectors value, which is based on the MDTS field and possibly other
+ * limiting factors.
+ */
+static void nvme_config_write_zeroes(struct request_queue *q,
+               struct nvme_ctrl *ctrl)
 {
-       u64 max_blocks;
-
-       if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
-           (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
-               return;
-       /*
-        * Even though NVMe spec explicitly states that MDTS is not
-        * applicable to the write-zeroes:- "The restriction does not apply to
-        * commands that do not transfer data between the host and the
-        * controller (e.g., Write Uncorrectable ro Write Zeroes command).".
-        * In order to be more cautious use controller's max_hw_sectors value
-        * to configure the maximum sectors for the write-zeroes which is
-        * configured based on the controller's MDTS field in the
-        * nvme_init_identify() if available.
-        */
-       if (ns->ctrl->max_hw_sectors == UINT_MAX)
-               max_blocks = (u64)USHRT_MAX + 1;
-       else
-               max_blocks = ns->ctrl->max_hw_sectors + 1;
-
-       blk_queue_max_write_zeroes_sectors(disk->queue,
-                                          nvme_lba_to_sect(ns, max_blocks));
+       if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
+           !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
+               blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors);
 }
 
 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
@@ -2159,7 +2137,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
        set_capacity_and_notify(disk, capacity);
 
        nvme_config_discard(disk, ns);
-       nvme_config_write_zeroes(disk, ns);
+       nvme_config_write_zeroes(disk->queue, ns->ctrl);
 
        set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
                test_bit(NVME_NS_FORCE_RO, &ns->flags));
index 733010d..888b108 100644 (file)
 #define NVMF_DEF_FAIL_FAST_TMO         -1
 
 /*
+ * Reserved one command for internal usage.  This command is used for sending
+ * the connect command, as well as for the keep alive command on the admin
+ * queue once live.
+ */
+#define NVMF_RESERVED_TAGS     1
+
+/*
  * Define a host as seen by the target.  We allocate one at boot, but also
  * allow the override it when creating controllers.  This is both to provide
  * persistence of the Host NQN over multiple boots, and to allow using
index 73d0737..6ffa8de 100644 (file)
@@ -2863,7 +2863,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
        memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
        ctrl->tag_set.ops = &nvme_fc_mq_ops;
        ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
-       ctrl->tag_set.reserved_tags = 1; /* fabric connect */
+       ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS;
        ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
        ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ctrl->tag_set.cmd_size =
@@ -3485,7 +3485,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
        memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
        ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
        ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-       ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
+       ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS;
        ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
        ctrl->admin_tag_set.cmd_size =
                struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
index 53ac4d7..be905d4 100644 (file)
@@ -736,8 +736,11 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
                return ret;
 
        ctrl->ctrl.queue_count = nr_io_queues + 1;
-       if (ctrl->ctrl.queue_count < 2)
-               return 0;
+       if (ctrl->ctrl.queue_count < 2) {
+               dev_err(ctrl->ctrl.device,
+                       "unable to set any I/O queues\n");
+               return -ENOMEM;
+       }
 
        dev_info(ctrl->ctrl.device,
                "creating %d I/O queues.\n", nr_io_queues);
@@ -798,7 +801,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
                memset(set, 0, sizeof(*set));
                set->ops = &nvme_rdma_admin_mq_ops;
                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-               set->reserved_tags = 2; /* connect + keep-alive */
+               set->reserved_tags = NVMF_RESERVED_TAGS;
                set->numa_node = nctrl->numa_node;
                set->cmd_size = sizeof(struct nvme_rdma_request) +
                                NVME_RDMA_DATA_SGL_SIZE;
@@ -811,7 +814,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
                memset(set, 0, sizeof(*set));
                set->ops = &nvme_rdma_mq_ops;
                set->queue_depth = nctrl->sqsize + 1;
-               set->reserved_tags = 1; /* fabric connect */
+               set->reserved_tags = NVMF_RESERVED_TAGS;
                set->numa_node = nctrl->numa_node;
                set->flags = BLK_MQ_F_SHOULD_MERGE;
                set->cmd_size = sizeof(struct nvme_rdma_request) +
index 69f59d2..a0f00cb 100644 (file)
@@ -287,7 +287,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
         * directly, otherwise queue io_work. Also, only do that if we
         * are on the same cpu, so we don't introduce contention.
         */
-       if (queue->io_cpu == __smp_processor_id() &&
+       if (queue->io_cpu == raw_smp_processor_id() &&
            sync && empty && mutex_trylock(&queue->send_mutex)) {
                queue->more_requests = !last;
                nvme_tcp_send_all(queue);
@@ -568,6 +568,13 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
        req->pdu_len = le32_to_cpu(pdu->r2t_length);
        req->pdu_sent = 0;
 
+       if (unlikely(!req->pdu_len)) {
+               dev_err(queue->ctrl->ctrl.device,
+                       "req %d r2t len is %u, probably a bug...\n",
+                       rq->tag, req->pdu_len);
+               return -EPROTO;
+       }
+
        if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
                dev_err(queue->ctrl->ctrl.device,
                        "req %d r2t len %u exceeded data len %u (%zu sent)\n",
@@ -1575,7 +1582,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
                memset(set, 0, sizeof(*set));
                set->ops = &nvme_tcp_admin_mq_ops;
                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-               set->reserved_tags = 2; /* connect + keep-alive */
+               set->reserved_tags = NVMF_RESERVED_TAGS;
                set->numa_node = nctrl->numa_node;
                set->flags = BLK_MQ_F_BLOCKING;
                set->cmd_size = sizeof(struct nvme_tcp_request);
@@ -1587,7 +1594,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
                memset(set, 0, sizeof(*set));
                set->ops = &nvme_tcp_mq_ops;
                set->queue_depth = nctrl->sqsize + 1;
-               set->reserved_tags = 1; /* fabric connect */
+               set->reserved_tags = NVMF_RESERVED_TAGS;
                set->numa_node = nctrl->numa_node;
                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
                set->cmd_size = sizeof(struct nvme_tcp_request);
@@ -1745,8 +1752,11 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
                return ret;
 
        ctrl->queue_count = nr_io_queues + 1;
-       if (ctrl->queue_count < 2)
-               return 0;
+       if (ctrl->queue_count < 2) {
+               dev_err(ctrl->device,
+                       "unable to set any I/O queues\n");
+               return -ENOMEM;
+       }
 
        dev_info(ctrl->device,
                "creating %d I/O queues.\n", nr_io_queues);
index be6fcda..a027433 100644 (file)
@@ -1118,9 +1118,20 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 {
        lockdep_assert_held(&ctrl->lock);
 
-       if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
-           nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
-           nvmet_cc_mps(ctrl->cc) != 0 ||
+       /*
+        * Only I/O controllers should verify iosqes,iocqes.
+        * Strictly speaking, the spec says a discovery controller
+        * should verify iosqes,iocqes are zeroed, however that
+        * would break backwards compatibility, so don't enforce it.
+        */
+       if (ctrl->subsys->type != NVME_NQN_DISC &&
+           (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
+            nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
+               ctrl->csts = NVME_CSTS_CFS;
+               return;
+       }
+
+       if (nvmet_cc_mps(ctrl->cc) != 0 ||
            nvmet_cc_ams(ctrl->cc) != 0 ||
            nvmet_cc_css(ctrl->cc) != 0) {
                ctrl->csts = NVME_CSTS_CFS;
index cb6f865..3e189e7 100644 (file)
@@ -349,7 +349,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
        memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
        ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
        ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-       ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
+       ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS;
        ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
        ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
                NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
@@ -520,7 +520,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
        memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
        ctrl->tag_set.ops = &nvme_loop_mq_ops;
        ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
-       ctrl->tag_set.reserved_tags = 1; /* fabric connect */
+       ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS;
        ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
        ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
index 8b0485a..d658c6e 100644 (file)
@@ -1098,11 +1098,11 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
                cmd->rbytes_done += ret;
        }
 
+       nvmet_tcp_unmap_pdu_iovec(cmd);
        if (queue->data_digest) {
                nvmet_tcp_prep_recv_ddgst(cmd);
                return 0;
        }
-       nvmet_tcp_unmap_pdu_iovec(cmd);
 
        if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
            cmd->rbytes_done == cmd->req.transfer_len) {
index 1e88bcf..84d5701 100644 (file)
@@ -241,6 +241,5 @@ module_platform_driver_probe(amiga_parallel_driver, amiga_parallel_probe);
 
 MODULE_AUTHOR("Joerg Dorchain <joerg@dorchain.net>");
 MODULE_DESCRIPTION("Parport Driver for Amiga builtin Port");
-MODULE_SUPPORTED_DEVICE("Amiga builtin Parallel Port");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:amiga-parallel");
index 2ff0fe0..1623f01 100644 (file)
@@ -218,7 +218,6 @@ static void __exit parport_atari_exit(void)
 
 MODULE_AUTHOR("Andreas Schwab");
 MODULE_DESCRIPTION("Parport Driver for Atari builtin Port");
-MODULE_SUPPORTED_DEVICE("Atari builtin Parallel Port");
 MODULE_LICENSE("GPL");
 
 module_init(parport_atari_init)
index 9228e8f..1e43b3f 100644 (file)
@@ -41,7 +41,6 @@
 
 MODULE_AUTHOR("Helge Deller <deller@gmx.de>");
 MODULE_DESCRIPTION("HP-PARISC PC-style parallel port driver");
-MODULE_SUPPORTED_DEVICE("integrated PC-style parallel port");
 MODULE_LICENSE("GPL");
 
 
index d6bbe84..f4d0da7 100644 (file)
@@ -359,7 +359,6 @@ static void __exit parport_mfc3_exit(void)
 
 MODULE_AUTHOR("Joerg Dorchain <joerg@dorchain.net>");
 MODULE_DESCRIPTION("Parport Driver for Multiface 3 expansion cards Parallel Port");
-MODULE_SUPPORTED_DEVICE("Multiface 3 Parallel Port");
 MODULE_LICENSE("GPL");
 
 module_init(parport_mfc3_init)
index e840c1b..865fc41 100644 (file)
@@ -377,6 +377,5 @@ module_platform_driver(bpp_sbus_driver);
 
 MODULE_AUTHOR("Derrick J Brashear");
 MODULE_DESCRIPTION("Parport Driver for Sparc bidirectional Port");
-MODULE_SUPPORTED_DEVICE("Sparc Bidirectional Parallel Port");
 MODULE_VERSION("2.0");
 MODULE_LICENSE("GPL");
index cdbfa5d..dbfa0b5 100644 (file)
@@ -34,12 +34,11 @@ static ssize_t add_slot_store(struct kobject *kobj, struct kobj_attribute *attr,
        if (nbytes >= MAX_DRC_NAME_LEN)
                return 0;
 
-       memcpy(drc_name, buf, nbytes);
+       strscpy(drc_name, buf, nbytes + 1);
 
        end = strchr(drc_name, '\n');
-       if (!end)
-               end = &drc_name[nbytes];
-       *end = '\0';
+       if (end)
+               *end = '\0';
 
        rc = dlpar_add_slot(drc_name);
        if (rc)
@@ -65,12 +64,11 @@ static ssize_t remove_slot_store(struct kobject *kobj,
        if (nbytes >= MAX_DRC_NAME_LEN)
                return 0;
 
-       memcpy(drc_name, buf, nbytes);
+       strscpy(drc_name, buf, nbytes + 1);
 
        end = strchr(drc_name, '\n');
-       if (!end)
-               end = &drc_name[nbytes];
-       *end = '\0';
+       if (end)
+               *end = '\0';
 
        rc = dlpar_remove_slot(drc_name);
        if (rc)
index c9e790c..a047c42 100644 (file)
@@ -93,8 +93,9 @@ static int disable_slot(struct hotplug_slot *hotplug_slot)
                pci_dev_put(pdev);
                return -EBUSY;
        }
+       pci_dev_put(pdev);
 
-       zpci_remove_device(zdev);
+       zpci_remove_device(zdev, false);
 
        rc = zpci_disable_device(zdev);
        if (rc)
index 8085782..9f3361c 100644 (file)
@@ -1357,6 +1357,7 @@ static int intel_pinctrl_add_padgroups_by_gpps(struct intel_pinctrl *pctrl,
                                gpps[i].gpio_base = 0;
                                break;
                        case INTEL_GPIO_BASE_NOMAP:
+                               break;
                        default:
                                break;
                }
@@ -1393,6 +1394,7 @@ static int intel_pinctrl_add_padgroups_by_size(struct intel_pinctrl *pctrl,
                gpps[i].size = min(gpp_size, npins);
                npins -= gpps[i].size;
 
+               gpps[i].gpio_base = gpps[i].base;
                gpps[i].padown_num = padown_num;
 
                /*
@@ -1491,8 +1493,13 @@ static int intel_pinctrl_probe(struct platform_device *pdev,
                if (IS_ERR(regs))
                        return PTR_ERR(regs);
 
-               /* Determine community features based on the revision */
+               /*
+                * Determine community features based on the revision.
+                * A value of all ones means the device is not present.
+                */
                value = readl(regs + REVID);
+               if (value == ~0u)
+                       return -ENODEV;
                if (((value & REVID_MASK) >> REVID_SHIFT) >= 0x94) {
                        community->features |= PINCTRL_FEATURE_DEBOUNCE;
                        community->features |= PINCTRL_FEATURE_1K_PD;
index f35edb0..c12fa57 100644 (file)
@@ -572,7 +572,7 @@ static void microchip_sgpio_irq_settype(struct irq_data *data,
        /* Type value spread over 2 registers sets: low, high bit */
        sgpio_clrsetbits(bank->priv, REG_INT_TRIGGER, addr.bit,
                         BIT(addr.port), (!!(type & 0x1)) << addr.port);
-       sgpio_clrsetbits(bank->priv, REG_INT_TRIGGER + SGPIO_MAX_BITS, addr.bit,
+       sgpio_clrsetbits(bank->priv, REG_INT_TRIGGER, SGPIO_MAX_BITS + addr.bit,
                         BIT(addr.port), (!!(type & 0x2)) << addr.port);
 
        if (type == SGPIO_INT_TRG_LEVEL)
index aa1a1c8..53a0bad 100644 (file)
@@ -3727,12 +3727,15 @@ static int __maybe_unused rockchip_pinctrl_suspend(struct device *dev)
 static int __maybe_unused rockchip_pinctrl_resume(struct device *dev)
 {
        struct rockchip_pinctrl *info = dev_get_drvdata(dev);
-       int ret = regmap_write(info->regmap_base, RK3288_GRF_GPIO6C_IOMUX,
-                              rk3288_grf_gpio6c_iomux |
-                              GPIO6C6_SEL_WRITE_ENABLE);
+       int ret;
 
-       if (ret)
-               return ret;
+       if (info->ctrl->type == RK3288) {
+               ret = regmap_write(info->regmap_base, RK3288_GRF_GPIO6C_IOMUX,
+                                  rk3288_grf_gpio6c_iomux |
+                                  GPIO6C6_SEL_WRITE_ENABLE);
+               if (ret)
+                       return ret;
+       }
 
        return pinctrl_force_default(info->pctl_dev);
 }
index 369ee20..2f19ab4 100644 (file)
@@ -392,7 +392,7 @@ static int lpi_config_set(struct pinctrl_dev *pctldev, unsigned int group,
                          unsigned long *configs, unsigned int nconfs)
 {
        struct lpi_pinctrl *pctrl = dev_get_drvdata(pctldev->dev);
-       unsigned int param, arg, pullup, strength;
+       unsigned int param, arg, pullup = LPI_GPIO_BIAS_DISABLE, strength = 2;
        bool value, output_enabled = false;
        const struct lpi_pingroup *g;
        unsigned long sval;
index 8daccd5..9d41abf 100644 (file)
@@ -1439,14 +1439,14 @@ static const struct msm_pingroup sc7280_groups[] = {
        [172] = PINGROUP(172, qdss, _, _, _, _, _, _, _, _),
        [173] = PINGROUP(173, qdss, _, _, _, _, _, _, _, _),
        [174] = PINGROUP(174, qdss, _, _, _, _, _, _, _, _),
-       [175] = UFS_RESET(ufs_reset, 0x1be000),
-       [176] = SDC_QDSD_PINGROUP(sdc1_rclk, 0x1b3000, 15, 0),
-       [177] = SDC_QDSD_PINGROUP(sdc1_clk, 0x1b3000, 13, 6),
-       [178] = SDC_QDSD_PINGROUP(sdc1_cmd, 0x1b3000, 11, 3),
-       [179] = SDC_QDSD_PINGROUP(sdc1_data, 0x1b3000, 9, 0),
-       [180] = SDC_QDSD_PINGROUP(sdc2_clk, 0x1b4000, 14, 6),
-       [181] = SDC_QDSD_PINGROUP(sdc2_cmd, 0x1b4000, 11, 3),
-       [182] = SDC_QDSD_PINGROUP(sdc2_data, 0x1b4000, 9, 0),
+       [175] = UFS_RESET(ufs_reset, 0xbe000),
+       [176] = SDC_QDSD_PINGROUP(sdc1_rclk, 0xb3004, 0, 6),
+       [177] = SDC_QDSD_PINGROUP(sdc1_clk, 0xb3000, 13, 6),
+       [178] = SDC_QDSD_PINGROUP(sdc1_cmd, 0xb3000, 11, 3),
+       [179] = SDC_QDSD_PINGROUP(sdc1_data, 0xb3000, 9, 0),
+       [180] = SDC_QDSD_PINGROUP(sdc2_clk, 0xb4000, 14, 6),
+       [181] = SDC_QDSD_PINGROUP(sdc2_cmd, 0xb4000, 11, 3),
+       [182] = SDC_QDSD_PINGROUP(sdc2_data, 0xb4000, 9, 0),
 };
 
 static const struct msm_pinctrl_soc_data sc7280_pinctrl = {
index 2b5b0e2..5aaf57b 100644 (file)
@@ -423,7 +423,7 @@ static const char * const gpio_groups[] = {
 
 static const char * const qdss_stm_groups[] = {
        "gpio0", "gpio1", "gpio2", "gpio3", "gpio4", "gpio5", "gpio6", "gpio7", "gpio12", "gpio13",
-       "gpio14", "gpio15", "gpio16", "gpio17", "gpio18", "gpio19" "gpio20", "gpio21", "gpio22",
+       "gpio14", "gpio15", "gpio16", "gpio17", "gpio18", "gpio19", "gpio20", "gpio21", "gpio22",
        "gpio23", "gpio44", "gpio45", "gpio52", "gpio53", "gpio56", "gpio57", "gpio61", "gpio62",
        "gpio63", "gpio64", "gpio65", "gpio66",
 };
index ad4e630..461ec61 100644 (file)
@@ -1173,15 +1173,20 @@ config INTEL_PMC_CORE
        depends on PCI
        help
          The Intel Platform Controller Hub for Intel Core SoCs provides access
-         to Power Management Controller registers via a PCI interface. This
+         to Power Management Controller registers via various interfaces. This
          driver can utilize debugging capabilities and supported features as
-         exposed by the Power Management Controller.
+         exposed by the Power Management Controller. It also may perform some
+         tasks in the PMC in order to enable transition into the SLPS0 state.
+         It should be selected on all Intel platforms supported by the driver.
 
          Supported features:
                - SLP_S0_RESIDENCY counter
                - PCH IP Power Gating status
-               - LTR Ignore
+               - LTR Ignore / LTR Show
                - MPHY/PLL gating status (Sunrisepoint PCH only)
+               - SLPS0 Debug registers (Cannonlake/Icelake PCH)
+               - Low Power Mode registers (Tigerlake and beyond)
+               - PMC quirks as needed to enable SLPS0/S0ix
 
 config INTEL_PMT_CLASS
        tristate
index 80f4b77..091e48c 100644 (file)
@@ -185,5 +185,8 @@ void exit_enum_attributes(void)
                        sysfs_remove_group(wmi_priv.enumeration_data[instance_id].attr_name_kobj,
                                                                &enumeration_attr_group);
        }
+       wmi_priv.enumeration_instances_count = 0;
+
        kfree(wmi_priv.enumeration_data);
+       wmi_priv.enumeration_data = NULL;
 }
index 75aedbb..8a49ba6 100644 (file)
@@ -175,5 +175,8 @@ void exit_int_attributes(void)
                        sysfs_remove_group(wmi_priv.integer_data[instance_id].attr_name_kobj,
                                                                &integer_attr_group);
        }
+       wmi_priv.integer_instances_count = 0;
+
        kfree(wmi_priv.integer_data);
+       wmi_priv.integer_data = NULL;
 }
index 3abcd95..834b3e8 100644 (file)
@@ -183,5 +183,8 @@ void exit_po_attributes(void)
                        sysfs_remove_group(wmi_priv.po_data[instance_id].attr_name_kobj,
                                                                &po_attr_group);
        }
+       wmi_priv.po_instances_count = 0;
+
        kfree(wmi_priv.po_data);
+       wmi_priv.po_data = NULL;
 }
index ac75dce..5525378 100644 (file)
@@ -155,5 +155,8 @@ void exit_str_attributes(void)
                        sysfs_remove_group(wmi_priv.str_data[instance_id].attr_name_kobj,
                                                                &str_attr_group);
        }
+       wmi_priv.str_instances_count = 0;
+
        kfree(wmi_priv.str_data);
+       wmi_priv.str_data = NULL;
 }
index cb81010..7410cca 100644 (file)
@@ -210,25 +210,17 @@ static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot);
  */
 static int create_attributes_level_sysfs_files(void)
 {
-       int ret = sysfs_create_file(&wmi_priv.main_dir_kset->kobj, &reset_bios.attr);
+       int ret;
 
-       if (ret) {
-               pr_debug("could not create reset_bios file\n");
+       ret = sysfs_create_file(&wmi_priv.main_dir_kset->kobj, &reset_bios.attr);
+       if (ret)
                return ret;
-       }
 
        ret = sysfs_create_file(&wmi_priv.main_dir_kset->kobj, &pending_reboot.attr);
-       if (ret) {
-               pr_debug("could not create changing_pending_reboot file\n");
-               sysfs_remove_file(&wmi_priv.main_dir_kset->kobj, &reset_bios.attr);
-       }
-       return ret;
-}
+       if (ret)
+               return ret;
 
-static void release_reset_bios_data(void)
-{
-       sysfs_remove_file(&wmi_priv.main_dir_kset->kobj, &reset_bios.attr);
-       sysfs_remove_file(&wmi_priv.main_dir_kset->kobj, &pending_reboot.attr);
+       return 0;
 }
 
 static ssize_t wmi_sysman_attr_show(struct kobject *kobj, struct attribute *attr,
@@ -373,8 +365,6 @@ static void destroy_attribute_objs(struct kset *kset)
  */
 static void release_attributes_data(void)
 {
-       release_reset_bios_data();
-
        mutex_lock(&wmi_priv.mutex);
        exit_enum_attributes();
        exit_int_attributes();
@@ -386,11 +376,13 @@ static void release_attributes_data(void)
                wmi_priv.authentication_dir_kset = NULL;
        }
        if (wmi_priv.main_dir_kset) {
+               sysfs_remove_file(&wmi_priv.main_dir_kset->kobj, &reset_bios.attr);
+               sysfs_remove_file(&wmi_priv.main_dir_kset->kobj, &pending_reboot.attr);
                destroy_attribute_objs(wmi_priv.main_dir_kset);
                kset_unregister(wmi_priv.main_dir_kset);
+               wmi_priv.main_dir_kset = NULL;
        }
        mutex_unlock(&wmi_priv.mutex);
-
 }
 
 /**
@@ -497,7 +489,6 @@ nextobj:
 
 err_attr_init:
        mutex_unlock(&wmi_priv.mutex);
-       release_attributes_data();
        kfree(obj);
        return retval;
 }
@@ -513,102 +504,91 @@ static int __init sysman_init(void)
        }
 
        ret = init_bios_attr_set_interface();
-       if (ret || !wmi_priv.bios_attr_wdev) {
-               pr_debug("failed to initialize set interface\n");
-               goto fail_set_interface;
-       }
+       if (ret)
+               return ret;
 
        ret = init_bios_attr_pass_interface();
-       if (ret || !wmi_priv.password_attr_wdev) {
-               pr_debug("failed to initialize pass interface\n");
-               goto fail_pass_interface;
+       if (ret)
+               goto err_exit_bios_attr_set_interface;
+
+       if (!wmi_priv.bios_attr_wdev || !wmi_priv.password_attr_wdev) {
+               pr_debug("failed to find set or pass interface\n");
+               ret = -ENODEV;
+               goto err_exit_bios_attr_pass_interface;
        }
 
        ret = class_register(&firmware_attributes_class);
        if (ret)
-               goto fail_class;
+               goto err_exit_bios_attr_pass_interface;
 
        wmi_priv.class_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0),
                                  NULL, "%s", DRIVER_NAME);
        if (IS_ERR(wmi_priv.class_dev)) {
                ret = PTR_ERR(wmi_priv.class_dev);
-               goto fail_classdev;
+               goto err_unregister_class;
        }
 
        wmi_priv.main_dir_kset = kset_create_and_add("attributes", NULL,
                                                     &wmi_priv.class_dev->kobj);
        if (!wmi_priv.main_dir_kset) {
                ret = -ENOMEM;
-               goto fail_main_kset;
+               goto err_destroy_classdev;
        }
 
        wmi_priv.authentication_dir_kset = kset_create_and_add("authentication", NULL,
                                                                &wmi_priv.class_dev->kobj);
        if (!wmi_priv.authentication_dir_kset) {
                ret = -ENOMEM;
-               goto fail_authentication_kset;
+               goto err_release_attributes_data;
        }
 
        ret = create_attributes_level_sysfs_files();
        if (ret) {
                pr_debug("could not create reset BIOS attribute\n");
-               goto fail_reset_bios;
+               goto err_release_attributes_data;
        }
 
        ret = init_bios_attributes(ENUM, DELL_WMI_BIOS_ENUMERATION_ATTRIBUTE_GUID);
        if (ret) {
                pr_debug("failed to populate enumeration type attributes\n");
-               goto fail_create_group;
+               goto err_release_attributes_data;
        }
 
        ret = init_bios_attributes(INT, DELL_WMI_BIOS_INTEGER_ATTRIBUTE_GUID);
        if (ret) {
                pr_debug("failed to populate integer type attributes\n");
-               goto fail_create_group;
+               goto err_release_attributes_data;
        }
 
        ret = init_bios_attributes(STR, DELL_WMI_BIOS_STRING_ATTRIBUTE_GUID);
        if (ret) {
                pr_debug("failed to populate string type attributes\n");
-               goto fail_create_group;
+               goto err_release_attributes_data;
        }
 
        ret = init_bios_attributes(PO, DELL_WMI_BIOS_PASSOBJ_ATTRIBUTE_GUID);
        if (ret) {
                pr_debug("failed to populate pass object type attributes\n");
-               goto fail_create_group;
+               goto err_release_attributes_data;
        }
 
        return 0;
 
-fail_create_group:
+err_release_attributes_data:
        release_attributes_data();
 
-fail_reset_bios:
-       if (wmi_priv.authentication_dir_kset) {
-               kset_unregister(wmi_priv.authentication_dir_kset);
-               wmi_priv.authentication_dir_kset = NULL;
-       }
-
-fail_authentication_kset:
-       if (wmi_priv.main_dir_kset) {
-               kset_unregister(wmi_priv.main_dir_kset);
-               wmi_priv.main_dir_kset = NULL;
-       }
-
-fail_main_kset:
+err_destroy_classdev:
        device_destroy(&firmware_attributes_class, MKDEV(0, 0));
 
-fail_classdev:
+err_unregister_class:
        class_unregister(&firmware_attributes_class);
 
-fail_class:
+err_exit_bios_attr_pass_interface:
        exit_bios_attr_pass_interface();
 
-fail_pass_interface:
+err_exit_bios_attr_set_interface:
        exit_bios_attr_set_interface();
 
-fail_set_interface:
        return ret;
 }
 
index 2f5b8d0..57cc928 100644 (file)
@@ -90,6 +90,13 @@ static const struct dmi_system_id button_array_table[] = {
                        DMI_MATCH(DMI_PRODUCT_NAME, "HP Spectre x2 Detachable"),
                },
        },
+       {
+               .ident = "Lenovo ThinkPad X1 Tablet Gen 2",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+                       DMI_MATCH(DMI_PRODUCT_FAMILY, "ThinkPad X1 Tablet Gen 2"),
+               },
+       },
        { }
 };
 
index 8a8017f..3fdf4cb 100644 (file)
@@ -48,8 +48,16 @@ static const struct key_entry intel_vbtn_keymap[] = {
 };
 
 static const struct key_entry intel_vbtn_switchmap[] = {
-       { KE_SW,     0xCA, { .sw = { SW_DOCK, 1 } } },          /* Docked */
-       { KE_SW,     0xCB, { .sw = { SW_DOCK, 0 } } },          /* Undocked */
+       /*
+        * SW_DOCK should only be reported for docking stations, but DSDTs using the
+        * intel-vbtn code always seem to use this for 2-in-1s / convertibles and set
+        * SW_DOCK=1 when in laptop-mode (in tandem with setting SW_TABLET_MODE=0).
+        * This causes userspace to think the laptop is docked to a port-replicator
+        * and to disable suspend-on-lid-close, which is undesirable.
+        * Map the dock events to KE_IGNORE to avoid this broken SW_DOCK reporting.
+        */
+       { KE_IGNORE, 0xCA, { .sw = { SW_DOCK, 1 } } },          /* Docked */
+       { KE_IGNORE, 0xCB, { .sw = { SW_DOCK, 0 } } },          /* Undocked */
        { KE_SW,     0xCC, { .sw = { SW_TABLET_MODE, 1 } } },   /* Tablet */
        { KE_SW,     0xCD, { .sw = { SW_TABLET_MODE, 0 } } },   /* Laptop */
        { KE_END }
index ee2f757..b5888ae 100644 (file)
@@ -863,34 +863,45 @@ out_unlock:
 }
 DEFINE_SHOW_ATTRIBUTE(pmc_core_pll);
 
-static ssize_t pmc_core_ltr_ignore_write(struct file *file,
-                                        const char __user *userbuf,
-                                        size_t count, loff_t *ppos)
+static int pmc_core_send_ltr_ignore(u32 value)
 {
        struct pmc_dev *pmcdev = &pmc;
        const struct pmc_reg_map *map = pmcdev->map;
-       u32 val, buf_size, fd;
-       int err;
-
-       buf_size = count < 64 ? count : 64;
-
-       err = kstrtou32_from_user(userbuf, buf_size, 10, &val);
-       if (err)
-               return err;
+       u32 reg;
+       int err = 0;
 
        mutex_lock(&pmcdev->lock);
 
-       if (val > map->ltr_ignore_max) {
+       if (value > map->ltr_ignore_max) {
                err = -EINVAL;
                goto out_unlock;
        }
 
-       fd = pmc_core_reg_read(pmcdev, map->ltr_ignore_offset);
-       fd |= (1U << val);
-       pmc_core_reg_write(pmcdev, map->ltr_ignore_offset, fd);
+       reg = pmc_core_reg_read(pmcdev, map->ltr_ignore_offset);
+       reg |= BIT(value);
+       pmc_core_reg_write(pmcdev, map->ltr_ignore_offset, reg);
 
 out_unlock:
        mutex_unlock(&pmcdev->lock);
+
+       return err;
+}
+
+static ssize_t pmc_core_ltr_ignore_write(struct file *file,
+                                        const char __user *userbuf,
+                                        size_t count, loff_t *ppos)
+{
+       u32 buf_size, value;
+       int err;
+
+       buf_size = min_t(u32, count, 64);
+
+       err = kstrtou32_from_user(userbuf, buf_size, 10, &value);
+       if (err)
+               return err;
+
+       err = pmc_core_send_ltr_ignore(value);
+
        return err == 0 ? count : err;
 }
 
@@ -1244,6 +1255,15 @@ static int pmc_core_probe(struct platform_device *pdev)
        pmcdev->pmc_xram_read_bit = pmc_core_check_read_lock_bit();
        dmi_check_system(pmc_core_dmi_table);
 
+       /*
+        * On TGL, due to a hardware limitation, the GBE LTR blocks PC10 when
+        * a cable is attached. Tell the PMC to ignore it.
+        */
+       if (pmcdev->map == &tgl_reg_map) {
+               dev_dbg(&pdev->dev, "ignoring GBE LTR\n");
+               pmc_core_send_ltr_ignore(3);
+       }
+
        pmc_core_dbgfs_register(pmcdev);
 
        device_initialized = true;
index c8939fb..ee2b3bb 100644 (file)
@@ -173,7 +173,7 @@ static int intel_pmt_dev_register(struct intel_pmt_entry *entry,
                                  struct intel_pmt_namespace *ns,
                                  struct device *parent)
 {
-       struct resource res;
+       struct resource res = {0};
        struct device *dev;
        int ret;
 
index 97dd749..92d315a 100644 (file)
 #define CRASH_TYPE_OOBMSM      1
 
 /* Control Flags */
-#define CRASHLOG_FLAG_DISABLE          BIT(27)
+#define CRASHLOG_FLAG_DISABLE          BIT(28)
 
 /*
- * Bits 28 and 29 control the state of bit 31.
+ * Bits 29 and 30 control the state of bit 31.
  *
- * Bit 28 will clear bit 31, if set, allowing a new crashlog to be captured.
- * Bit 29 will immediately trigger a crashlog to be generated, setting bit 31.
- * Bit 30 is read-only and reserved as 0.
+ * Bit 29 will clear bit 31, if set, allowing a new crashlog to be captured.
+ * Bit 30 will immediately trigger a crashlog to be generated, setting bit 31.
  * Bit 31 is the read-only status with a 1 indicating log is complete.
  */
-#define CRASHLOG_FLAG_TRIGGER_CLEAR    BIT(28)
-#define CRASHLOG_FLAG_TRIGGER_EXECUTE  BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_CLEAR    BIT(29)
+#define CRASHLOG_FLAG_TRIGGER_EXECUTE  BIT(30)
 #define CRASHLOG_FLAG_TRIGGER_COMPLETE BIT(31)
 #define CRASHLOG_FLAG_TRIGGER_MASK     GENMASK(31, 28)
 
index b881044..0d9e2dd 100644 (file)
@@ -4081,13 +4081,19 @@ static bool hotkey_notify_6xxx(const u32 hkey,
 
        case TP_HKEY_EV_KEY_NUMLOCK:
        case TP_HKEY_EV_KEY_FN:
-       case TP_HKEY_EV_KEY_FN_ESC:
                /* key press events, we just ignore them as long as the EC
                 * is still reporting them in the normal keyboard stream */
                *send_acpi_ev = false;
                *ignore_acpi_ev = true;
                return true;
 
+       case TP_HKEY_EV_KEY_FN_ESC:
+               /* Get the media key status to force the status LED to update */
+               acpi_evalf(hkey_handle, NULL, "GMKS", "v");
+               *send_acpi_ev = false;
+               *ignore_acpi_ev = true;
+               return true;
+
        case TP_HKEY_EV_TABLET_CHANGED:
                tpacpi_input_send_tabletsw();
                hotkey_tablet_mode_notify_change();
@@ -9845,6 +9851,11 @@ static struct ibm_struct lcdshadow_driver_data = {
  * Thinkpad sensor interfaces
  */
 
+#define DYTC_CMD_QUERY        0 /* To get DYTC status - enable/revision */
+#define DYTC_QUERY_ENABLE_BIT 8  /* Bit        8 - 0 = disabled, 1 = enabled */
+#define DYTC_QUERY_SUBREV_BIT 16 /* Bits 16 - 27 - sub revision */
+#define DYTC_QUERY_REV_BIT    28 /* Bits 28 - 31 - revision */
+
 #define DYTC_CMD_GET          2 /* To get current IC function and mode */
 #define DYTC_GET_LAPMODE_BIT 17 /* Set when in lapmode */
 
@@ -9855,6 +9866,7 @@ static bool has_palmsensor;
 static bool has_lapsensor;
 static bool palm_state;
 static bool lap_state;
+static int dytc_version;
 
 static int dytc_command(int command, int *output)
 {
@@ -9869,6 +9881,33 @@ static int dytc_command(int command, int *output)
        return 0;
 }
 
+static int dytc_get_version(void)
+{
+       int err, output;
+
+       /* Check if we've been called before - and just return cached value */
+       if (dytc_version)
+               return dytc_version;
+
+       /* Otherwise query DYTC and extract version information */
+       err = dytc_command(DYTC_CMD_QUERY, &output);
+       /*
+        * If support isn't available (ENODEV) then don't return an error
+        * and don't create the sysfs group
+        */
+       if (err == -ENODEV)
+               return 0;
+       /* For all other errors we can flag the failure */
+       if (err)
+               return err;
+
+       /* Check DYTC is enabled and supports mode setting */
+       if (output & BIT(DYTC_QUERY_ENABLE_BIT))
+               dytc_version = (output >> DYTC_QUERY_REV_BIT) & 0xF;
+
+       return 0;
+}
+
 static int lapsensor_get(bool *present, bool *state)
 {
        int output, err;
@@ -9974,7 +10013,18 @@ static int tpacpi_proxsensor_init(struct ibm_init_struct *iibm)
                if (err)
                        return err;
        }
-       if (has_lapsensor) {
+
+       /* Check if we know the DYTC version, if we don't then get it */
+       if (!dytc_version) {
+               err = dytc_get_version();
+               if (err)
+                       return err;
+       }
+       /*
+        * Platforms before DYTC version 5 claim to have a lap sensor, but it doesn't work, so we
+        * ignore them
+        */
+       if (has_lapsensor && (dytc_version >= 5)) {
                err = sysfs_create_file(&tpacpi_pdev->dev.kobj, &dev_attr_dytc_lapmode.attr);
                if (err)
                        return err;
@@ -9999,14 +10049,9 @@ static struct ibm_struct proxsensor_driver_data = {
  * DYTC Platform Profile interface
  */
 
-#define DYTC_CMD_QUERY        0 /* To get DYTC status - enable/revision */
 #define DYTC_CMD_SET          1 /* To enable/disable IC function mode */
 #define DYTC_CMD_RESET    0x1ff /* To reset back to default */
 
-#define DYTC_QUERY_ENABLE_BIT 8  /* Bit        8 - 0 = disabled, 1 = enabled */
-#define DYTC_QUERY_SUBREV_BIT 16 /* Bits 16 - 27 - sub revision */
-#define DYTC_QUERY_REV_BIT    28 /* Bits 28 - 31 - revision */
-
 #define DYTC_GET_FUNCTION_BIT 8  /* Bits  8-11 - function setting */
 #define DYTC_GET_MODE_BIT     12 /* Bits 12-15 - mode setting */
 
@@ -10142,8 +10187,13 @@ static int dytc_profile_set(struct platform_profile_handler *pprof,
                return err;
 
        if (profile == PLATFORM_PROFILE_BALANCED) {
-               /* To get back to balanced mode we just issue a reset command */
-               err = dytc_command(DYTC_CMD_RESET, &output);
+               /*
+                * To get back to balanced mode we need to issue a reset command.
+                * Note we still need to disable CQL mode beforehand and re-enable
+                * it afterwards, otherwise dytc_lapmode gets reset to 0 and stays
+                * stuck at 0 for approx. 30 minutes.
+                */
+               err = dytc_cql_command(DYTC_CMD_RESET, &output);
                if (err)
                        goto unlock;
        } else {
@@ -10211,28 +10261,28 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm)
        if (err)
                return err;
 
+       /* Check if we know the DYTC version, if we don't then get it */
+       if (!dytc_version) {
+               err = dytc_get_version();
+               if (err)
+                       return err;
+       }
        /* Check DYTC is enabled and supports mode setting */
-       if (output & BIT(DYTC_QUERY_ENABLE_BIT)) {
-               /* Only DYTC v5.0 and later has this feature. */
-               int dytc_version;
-
-               dytc_version = (output >> DYTC_QUERY_REV_BIT) & 0xF;
-               if (dytc_version >= 5) {
-                       dbg_printk(TPACPI_DBG_INIT,
-                                  "DYTC version %d: thermal mode available\n", dytc_version);
-                       /* Create platform_profile structure and register */
-                       err = platform_profile_register(&dytc_profile);
-                       /*
-                        * If for some reason platform_profiles aren't enabled
-                        * don't quit terminally.
-                        */
-                       if (err)
-                               return 0;
+       if (dytc_version >= 5) {
+               dbg_printk(TPACPI_DBG_INIT,
+                               "DYTC version %d: thermal mode available\n", dytc_version);
+               /* Create platform_profile structure and register */
+               err = platform_profile_register(&dytc_profile);
+               /*
+                * If for some reason platform_profiles aren't enabled
+                * don't quit terminally.
+                */
+               if (err)
+                       return 0;
 
-                       dytc_profile_available = true;
-                       /* Ensure initial values are correct */
-                       dytc_profile_refresh();
-               }
+               dytc_profile_available = true;
+               /* Ensure initial values are correct */
+               dytc_profile_refresh();
        }
        return 0;
 }
index beb5f74..08f4cf0 100644 (file)
@@ -189,15 +189,16 @@ int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
        tmr_add = ptp_qoriq->tmr_add;
        adj = tmr_add;
 
-       /* calculate diff as adj*(scaled_ppm/65536)/1000000
-        * and round() to the nearest integer
+       /*
+        * Calculate diff and round() to the nearest integer
+        *
+        * diff = adj * (ppb / 1000000000)
+        *      = adj * scaled_ppm / 65536000000
         */
-       adj *= scaled_ppm;
-       diff = div_u64(adj, 8000000);
-       diff = (diff >> 13) + ((diff >> 12) & 1);
+       diff = mul_u64_u64_div_u64(adj, scaled_ppm, 32768000000);
+       diff = DIV64_U64_ROUND_UP(diff, 2);
 
        tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff;
-
        ptp_qoriq->write(&regs->ctrl_regs->tmr_add, tmr_add);
 
        return 0;
index ba9ce4e..3a945ab 100644 (file)
@@ -63,7 +63,6 @@ void dasd_int_handler(struct ccw_device *, unsigned long, struct irb *);
 MODULE_AUTHOR("Holger Smolinski <Holger.Smolinski@de.ibm.com>");
 MODULE_DESCRIPTION("Linux on S/390 DASD device driver,"
                   " Copyright IBM Corp. 2000");
-MODULE_SUPPORTED_DEVICE("dasd");
 MODULE_LICENSE("GPL");
 
 /*
index 00e72b9..d93595b 100644 (file)
@@ -50,7 +50,6 @@ MODULE_PARM_DESC(sol_compat,
 MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
 MODULE_DESCRIPTION("7-Segment Display driver for Sun Microsystems CP1400/1500");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("d7s");
 
 struct d7s {
        void __iomem    *regs;
index 3836976..f135a10 100644 (file)
@@ -80,7 +80,6 @@
 MODULE_AUTHOR("Hewlett-Packard Company");
 MODULE_DESCRIPTION("Driver for HP Smart Array Controller version " \
        HPSA_DRIVER_VERSION);
-MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
 MODULE_VERSION(HPSA_DRIVER_VERSION);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("cciss");
index 1b68734..61831f2 100644 (file)
@@ -2372,6 +2372,24 @@ static int ibmvfc_match_lun(struct ibmvfc_event *evt, void *device)
 }
 
 /**
+ * ibmvfc_event_is_free - Check if event is free or not
+ * @evt:       ibmvfc event struct
+ *
+ * Returns:
+ *     true / false
+ **/
+static bool ibmvfc_event_is_free(struct ibmvfc_event *evt)
+{
+       struct ibmvfc_event *loop_evt;
+
+       list_for_each_entry(loop_evt, &evt->queue->free, queue_list)
+               if (loop_evt == evt)
+                       return true;
+
+       return false;
+}
+
+/**
  * ibmvfc_wait_for_ops - Wait for ops to complete
  * @vhost:     ibmvfc host struct
  * @device:    device to match (starget or sdev)
@@ -2385,35 +2403,58 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device,
 {
        struct ibmvfc_event *evt;
        DECLARE_COMPLETION_ONSTACK(comp);
-       int wait;
+       int wait, i, q_index, q_size;
        unsigned long flags;
        signed long timeout = IBMVFC_ABORT_WAIT_TIMEOUT * HZ;
+       struct ibmvfc_queue *queues;
 
        ENTER;
+       if (vhost->mq_enabled && vhost->using_channels) {
+               queues = vhost->scsi_scrqs.scrqs;
+               q_size = vhost->scsi_scrqs.active_queues;
+       } else {
+               queues = &vhost->crq;
+               q_size = 1;
+       }
+
        do {
                wait = 0;
-               spin_lock_irqsave(&vhost->crq.l_lock, flags);
-               list_for_each_entry(evt, &vhost->crq.sent, queue_list) {
-                       if (match(evt, device)) {
-                               evt->eh_comp = &comp;
-                               wait++;
+               spin_lock_irqsave(vhost->host->host_lock, flags);
+               for (q_index = 0; q_index < q_size; q_index++) {
+                       spin_lock(&queues[q_index].l_lock);
+                       for (i = 0; i < queues[q_index].evt_pool.size; i++) {
+                               evt = &queues[q_index].evt_pool.events[i];
+                               if (!ibmvfc_event_is_free(evt)) {
+                                       if (match(evt, device)) {
+                                               evt->eh_comp = &comp;
+                                               wait++;
+                                       }
+                               }
                        }
+                       spin_unlock(&queues[q_index].l_lock);
                }
-               spin_unlock_irqrestore(&vhost->crq.l_lock, flags);
+               spin_unlock_irqrestore(vhost->host->host_lock, flags);
 
                if (wait) {
                        timeout = wait_for_completion_timeout(&comp, timeout);
 
                        if (!timeout) {
                                wait = 0;
-                               spin_lock_irqsave(&vhost->crq.l_lock, flags);
-                               list_for_each_entry(evt, &vhost->crq.sent, queue_list) {
-                                       if (match(evt, device)) {
-                                               evt->eh_comp = NULL;
-                                               wait++;
+                               spin_lock_irqsave(vhost->host->host_lock, flags);
+                               for (q_index = 0; q_index < q_size; q_index++) {
+                                       spin_lock(&queues[q_index].l_lock);
+                                       for (i = 0; i < queues[q_index].evt_pool.size; i++) {
+                                               evt = &queues[q_index].evt_pool.events[i];
+                                               if (!ibmvfc_event_is_free(evt)) {
+                                                       if (match(evt, device)) {
+                                                               evt->eh_comp = NULL;
+                                                               wait++;
+                                                       }
+                                               }
                                        }
+                                       spin_unlock(&queues[q_index].l_lock);
                                }
-                               spin_unlock_irqrestore(&vhost->crq.l_lock, flags);
+                               spin_unlock_irqrestore(vhost->host->host_lock, flags);
                                if (wait)
                                        dev_err(vhost->dev, "Timed out waiting for aborted commands\n");
                                LEAVE;
@@ -5784,6 +5825,8 @@ static void ibmvfc_free_mem(struct ibmvfc_host *vhost)
                          vhost->disc_buf_dma);
        dma_free_coherent(vhost->dev, sizeof(*vhost->login_buf),
                          vhost->login_buf, vhost->login_buf_dma);
+       dma_free_coherent(vhost->dev, sizeof(*vhost->channel_setup_buf),
+                         vhost->channel_setup_buf, vhost->channel_setup_dma);
        dma_pool_destroy(vhost->sg_pool);
        ibmvfc_free_queue(vhost, async_q);
        LEAVE;
index bc79a01..46a8f2d 100644 (file)
@@ -2421,7 +2421,7 @@ lpfc_debugfs_dif_err_write(struct file *file, const char __user *buf,
        memset(dstbuf, 0, 33);
        size = (nbytes < 32) ? nbytes : 32;
        if (copy_from_user(dstbuf, buf, size))
-               return 0;
+               return -EFAULT;
 
        if (dent == phba->debug_InjErrLBA) {
                if ((dstbuf[0] == 'o') && (dstbuf[1] == 'f') &&
@@ -2430,7 +2430,7 @@ lpfc_debugfs_dif_err_write(struct file *file, const char __user *buf,
        }
 
        if ((tmp == 0) && (kstrtoull(dstbuf, 0, &tmp)))
-               return 0;
+               return -EINVAL;
 
        if (dent == phba->debug_writeGuard)
                phba->lpfc_injerr_wgrd_cnt = (uint32_t)tmp;
index ac066f8..ac0eef9 100644 (file)
@@ -7806,14 +7806,18 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
                ioc->pend_os_device_add_sz++;
        ioc->pend_os_device_add = kzalloc(ioc->pend_os_device_add_sz,
            GFP_KERNEL);
-       if (!ioc->pend_os_device_add)
+       if (!ioc->pend_os_device_add) {
+               r = -ENOMEM;
                goto out_free_resources;
+       }
 
        ioc->device_remove_in_progress_sz = ioc->pend_os_device_add_sz;
        ioc->device_remove_in_progress =
                kzalloc(ioc->device_remove_in_progress_sz, GFP_KERNEL);
-       if (!ioc->device_remove_in_progress)
+       if (!ioc->device_remove_in_progress) {
+               r = -ENOMEM;
                goto out_free_resources;
+       }
 
        ioc->fwfault_debug = mpt3sas_fwfault_debug;
 
index ffca030..6aa6de7 100644 (file)
@@ -413,7 +413,7 @@ mpt3sas_get_port_by_id(struct MPT3SAS_ADAPTER *ioc,
         * And add this object to port_table_list.
         */
        if (!ioc->multipath_on_hba) {
-               port = kzalloc(sizeof(struct hba_port), GFP_KERNEL);
+               port = kzalloc(sizeof(struct hba_port), GFP_ATOMIC);
                if (!port)
                        return NULL;
 
index 4adf9de..329fd02 100644 (file)
@@ -2273,12 +2273,12 @@ static void myrs_cleanup(struct myrs_hba *cs)
        if (cs->mmio_base) {
                cs->disable_intr(cs);
                iounmap(cs->mmio_base);
+               cs->mmio_base = NULL;
        }
        if (cs->irq)
                free_irq(cs->irq, cs);
        if (cs->io_addr)
                release_region(cs->io_addr, 0x80);
-       iounmap(cs->mmio_base);
        pci_set_drvdata(pdev, NULL);
        pci_disable_device(pdev);
        scsi_host_put(cs->host);
index 5d5f50d..ac89002 100644 (file)
@@ -55,7 +55,6 @@
 
 MODULE_AUTHOR("YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp>");
 MODULE_DESCRIPTION("WorkBit NinjaSCSI-3 / NinjaSCSI-32Bi(16bit) PCMCIA SCSI host adapter module");
-MODULE_SUPPORTED_DEVICE("sd,sr,sg,st");
 MODULE_LICENSE("GPL");
 
 #include "nsp_io.h"
index 47ad64b..69c5b5e 100644 (file)
@@ -1675,6 +1675,7 @@ static int qedi_alloc_global_queues(struct qedi_ctx *qedi)
                if (!qedi->global_queues[i]) {
                        QEDI_ERR(&qedi->dbg_ctx,
                                 "Unable to allocation global queue %d.\n", i);
+                       status = -ENOMEM;
                        goto mem_alloc_failure;
                }
 
index c48daf5..480e7d2 100644 (file)
@@ -3222,8 +3222,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type,
        if (!qpair->fw_started || (cmd->reset_count != qpair->chip_reset) ||
            (cmd->sess && cmd->sess->deleted)) {
                cmd->state = QLA_TGT_STATE_PROCESSED;
-               res = 0;
-               goto free;
+               return 0;
        }
 
        ql_dbg_qp(ql_dbg_tgt, qpair, 0xe018,
@@ -3234,8 +3233,9 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type,
 
        res = qlt_pre_xmit_response(cmd, &prm, xmit_type, scsi_status,
            &full_req_cnt);
-       if (unlikely(res != 0))
-               goto free;
+       if (unlikely(res != 0)) {
+               return res;
+       }
 
        spin_lock_irqsave(qpair->qp_lock_ptr, flags);
 
@@ -3255,8 +3255,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type,
                        vha->flags.online, qla2x00_reset_active(vha),
                        cmd->reset_count, qpair->chip_reset);
                spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
-               res = 0;
-               goto free;
+               return 0;
        }
 
        /* Does F/W have an IOCBs for this request */
@@ -3359,8 +3358,6 @@ out_unmap_unlock:
        qlt_unmap_sg(vha, cmd);
        spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
 
-free:
-       vha->hw->tgt.tgt_ops->free_cmd(cmd);
        return res;
 }
 EXPORT_SYMBOL(qlt_xmit_response);
index 10e5e6c..01620f3 100644 (file)
        (min(1270, ((ql) > 0) ? (QLA_TGT_DATASEGS_PER_CMD_24XX + \
                QLA_TGT_DATASEGS_PER_CONT_24XX*((ql) - 1)) : 0))
 #endif
-#endif
 
 #define GET_TARGET_ID(ha, iocb) ((HAS_EXTENDED_IDS(ha))                        \
                         ? le16_to_cpu((iocb)->u.isp2x.target.extended) \
@@ -244,6 +243,7 @@ struct ctio_to_2xxx {
 #ifndef CTIO_RET_TYPE
 #define CTIO_RET_TYPE  0x17            /* CTIO return entry */
 #define ATIO_TYPE7 0x06 /* Accept target I/O entry for 24xx */
+#endif
 
 struct fcp_hdr {
        uint8_t  r_ctl;
index b55fc76..8b4890c 100644 (file)
@@ -644,7 +644,6 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd)
 {
        struct qla_tgt_cmd *cmd = container_of(se_cmd,
                                struct qla_tgt_cmd, se_cmd);
-       struct scsi_qla_host *vha = cmd->vha;
 
        if (cmd->aborted) {
                /* Cmd can loop during Q-full.  tcm_qla2xxx_aborted_task
@@ -657,7 +656,6 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd)
                        cmd->se_cmd.transport_state,
                        cmd->se_cmd.t_state,
                        cmd->se_cmd.se_cmd_flags);
-               vha->hw->tgt.tgt_ops->free_cmd(cmd);
                return 0;
        }
 
@@ -685,7 +683,6 @@ static int tcm_qla2xxx_queue_status(struct se_cmd *se_cmd)
 {
        struct qla_tgt_cmd *cmd = container_of(se_cmd,
                                struct qla_tgt_cmd, se_cmd);
-       struct scsi_qla_host *vha = cmd->vha;
        int xmit_type = QLA_TGT_XMIT_STATUS;
 
        if (cmd->aborted) {
@@ -699,7 +696,6 @@ static int tcm_qla2xxx_queue_status(struct se_cmd *se_cmd)
                    cmd, kref_read(&cmd->se_cmd.cmd_kref),
                    cmd->se_cmd.transport_state, cmd->se_cmd.t_state,
                    cmd->se_cmd.se_cmd_flags);
-               vha->hw->tgt.tgt_ops->free_cmd(cmd);
                return 0;
        }
        cmd->bufflen = se_cmd->data_length;
index 91074fd..f4bf62b 100644 (file)
@@ -2475,6 +2475,7 @@ static void iscsi_if_stop_conn(struct iscsi_cls_conn *conn, int flag)
         */
        mutex_lock(&conn_mutex);
        conn->transport->stop_conn(conn, flag);
+       conn->state = ISCSI_CONN_DOWN;
        mutex_unlock(&conn_mutex);
 
 }
@@ -2901,6 +2902,13 @@ iscsi_set_param(struct iscsi_transport *transport, struct iscsi_uevent *ev)
        default:
                err = transport->set_param(conn, ev->u.set_param.param,
                                           data, ev->u.set_param.len);
+               if ((conn->state == ISCSI_CONN_BOUND) ||
+                       (conn->state == ISCSI_CONN_UP)) {
+                       err = transport->set_param(conn, ev->u.set_param.param,
+                                       data, ev->u.set_param.len);
+               } else {
+                       return -ENOTCONN;
+               }
        }
 
        return err;
@@ -2960,6 +2968,7 @@ static int iscsi_if_ep_disconnect(struct iscsi_transport *transport,
                mutex_lock(&conn->ep_mutex);
                conn->ep = NULL;
                mutex_unlock(&conn->ep_mutex);
+               conn->state = ISCSI_CONN_DOWN;
        }
 
        transport->ep_disconnect(ep);
@@ -3727,6 +3736,8 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
                ev->r.retcode = transport->bind_conn(session, conn,
                                                ev->u.b_conn.transport_eph,
                                                ev->u.b_conn.is_leading);
+               if (!ev->r.retcode)
+                       conn->state = ISCSI_CONN_BOUND;
                mutex_unlock(&conn_mutex);
 
                if (ev->r.retcode || !transport->ep_connect)
@@ -3966,7 +3977,8 @@ iscsi_conn_attr(local_ipaddr, ISCSI_PARAM_LOCAL_IPADDR);
 static const char *const connection_state_names[] = {
        [ISCSI_CONN_UP] = "up",
        [ISCSI_CONN_DOWN] = "down",
-       [ISCSI_CONN_FAILED] = "failed"
+       [ISCSI_CONN_FAILED] = "failed",
+       [ISCSI_CONN_BOUND] = "bound"
 };
 
 static ssize_t show_conn_state(struct device *dev,
index ee55867..994f1b8 100644 (file)
@@ -280,27 +280,28 @@ static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
 static void sd_zbc_update_wp_offset_workfn(struct work_struct *work)
 {
        struct scsi_disk *sdkp;
+       unsigned long flags;
        unsigned int zno;
        int ret;
 
        sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work);
 
-       spin_lock_bh(&sdkp->zones_wp_offset_lock);
+       spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
        for (zno = 0; zno < sdkp->nr_zones; zno++) {
                if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
                        continue;
 
-               spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+               spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
                ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf,
                                             SD_BUF_SIZE,
                                             zno * sdkp->zone_blocks, true);
-               spin_lock_bh(&sdkp->zones_wp_offset_lock);
+               spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
                if (!ret)
                        sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64,
                                            zno, sd_zbc_update_wp_offset_cb,
                                            sdkp);
        }
-       spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+       spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
 
        scsi_device_put(sdkp->device);
 }
@@ -324,6 +325,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
        struct request *rq = cmd->request;
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        unsigned int wp_offset, zno = blk_rq_zone_no(rq);
+       unsigned long flags;
        blk_status_t ret;
 
        ret = sd_zbc_cmnd_checks(cmd);
@@ -337,7 +339,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
        if (!blk_req_zone_write_trylock(rq))
                return BLK_STS_ZONE_RESOURCE;
 
-       spin_lock_bh(&sdkp->zones_wp_offset_lock);
+       spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
        wp_offset = sdkp->zones_wp_offset[zno];
        switch (wp_offset) {
        case SD_ZBC_INVALID_WP_OFST:
@@ -366,7 +368,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
 
                *lba += wp_offset;
        }
-       spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+       spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
        if (ret)
                blk_req_zone_write_unlock(rq);
        return ret;
@@ -445,6 +447,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
        struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
        unsigned int zno = blk_rq_zone_no(rq);
        enum req_opf op = req_op(rq);
+       unsigned long flags;
 
        /*
         * If we got an error for a command that needs updating the write
@@ -452,7 +455,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
         * invalid to force an update from disk the next time a zone append
         * command is issued.
         */
-       spin_lock_bh(&sdkp->zones_wp_offset_lock);
+       spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
 
        if (result && op != REQ_OP_ZONE_RESET_ALL) {
                if (op == REQ_OP_ZONE_APPEND) {
@@ -496,7 +499,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
        }
 
 unlock_wp_offset:
-       spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+       spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
 
        return good_bytes;
 }
index c53f456..a1dacb6 100644 (file)
@@ -48,7 +48,6 @@
 MODULE_AUTHOR("Microsemi");
 MODULE_DESCRIPTION("Driver for Microsemi Smart Family Controller version "
        DRIVER_VERSION);
-MODULE_SUPPORTED_DEVICE("Microsemi Smart Family Controllers");
 MODULE_VERSION(DRIVER_VERSION);
 MODULE_LICENSE("GPL");
 
index 841ad2f..9ca536a 100644 (file)
@@ -1269,8 +1269,8 @@ static int st_open(struct inode *inode, struct file *filp)
        spin_lock(&st_use_lock);
        if (STp->in_use) {
                spin_unlock(&st_use_lock);
-               scsi_tape_put(STp);
                DEBC_printk(STp, "Device already in use.\n");
+               scsi_tape_put(STp);
                return (-EBUSY);
        }
 
index c55202b..a981f26 100644 (file)
@@ -911,7 +911,7 @@ static void ufs_mtk_vreg_set_lpm(struct ufs_hba *hba, bool lpm)
        if (!hba->vreg_info.vccq2 || !hba->vreg_info.vcc)
                return;
 
-       if (lpm & !hba->vreg_info.vcc->enabled)
+       if (lpm && !hba->vreg_info.vcc->enabled)
                regulator_set_mode(hba->vreg_info.vccq2->reg,
                                   REGULATOR_MODE_IDLE);
        else if (!lpm)
index e5d7fb8..bd0fbcd 100644 (file)
@@ -30,7 +30,6 @@
 MODULE_AUTHOR("Adrian McMenamin <adrian@mcmen.demon.co.uk>");
 MODULE_DESCRIPTION("Maple bus driver for Dreamcast");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{SEGA, Dreamcast/Maple}}");
 
 static void maple_dma_handler(struct work_struct *work);
 static void maple_vblank_handler(struct work_struct *work);
index 6268bfa..c3e379a 100644 (file)
@@ -13,7 +13,6 @@
 #include <linux/platform_device.h>
 #include <linux/printk.h>
 #include <linux/module.h>
-#include <linux/errno.h>
 #include <linux/io.h>
 #include <linux/reboot.h>
 
index f42954e..1fd29f9 100644 (file)
@@ -3,7 +3,6 @@
 
 #include <linux/acpi.h>
 #include <linux/clk.h>
-#include <linux/console.h>
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
@@ -92,14 +91,11 @@ struct geni_wrapper {
        struct device *dev;
        void __iomem *base;
        struct clk_bulk_data ahb_clks[NUM_AHB_CLKS];
-       struct geni_icc_path to_core;
 };
 
 static const char * const icc_path_names[] = {"qup-core", "qup-config",
                                                "qup-memory"};
 
-static struct geni_wrapper *earlycon_wrapper;
-
 #define QUP_HW_VER_REG                 0x4
 
 /* Common SE registers */
@@ -843,44 +839,11 @@ int geni_icc_disable(struct geni_se *se)
 }
 EXPORT_SYMBOL(geni_icc_disable);
 
-void geni_remove_earlycon_icc_vote(void)
-{
-       struct platform_device *pdev;
-       struct geni_wrapper *wrapper;
-       struct device_node *parent;
-       struct device_node *child;
-
-       if (!earlycon_wrapper)
-               return;
-
-       wrapper = earlycon_wrapper;
-       parent = of_get_next_parent(wrapper->dev->of_node);
-       for_each_child_of_node(parent, child) {
-               if (!of_device_is_compatible(child, "qcom,geni-se-qup"))
-                       continue;
-
-               pdev = of_find_device_by_node(child);
-               if (!pdev)
-                       continue;
-
-               wrapper = platform_get_drvdata(pdev);
-               icc_put(wrapper->to_core.path);
-               wrapper->to_core.path = NULL;
-
-       }
-       of_node_put(parent);
-
-       earlycon_wrapper = NULL;
-}
-EXPORT_SYMBOL(geni_remove_earlycon_icc_vote);
-
 static int geni_se_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        struct resource *res;
        struct geni_wrapper *wrapper;
-       struct console __maybe_unused *bcon;
-       bool __maybe_unused has_earlycon = false;
        int ret;
 
        wrapper = devm_kzalloc(dev, sizeof(*wrapper), GFP_KERNEL);
@@ -903,43 +866,6 @@ static int geni_se_probe(struct platform_device *pdev)
                }
        }
 
-#ifdef CONFIG_SERIAL_EARLYCON
-       for_each_console(bcon) {
-               if (!strcmp(bcon->name, "qcom_geni")) {
-                       has_earlycon = true;
-                       break;
-               }
-       }
-       if (!has_earlycon)
-               goto exit;
-
-       wrapper->to_core.path = devm_of_icc_get(dev, "qup-core");
-       if (IS_ERR(wrapper->to_core.path))
-               return PTR_ERR(wrapper->to_core.path);
-       /*
-        * Put minmal BW request on core clocks on behalf of early console.
-        * The vote will be removed earlycon exit function.
-        *
-        * Note: We are putting vote on each QUP wrapper instead only to which
-        * earlycon is connected because QUP core clock of different wrapper
-        * share same voltage domain. If core1 is put to 0, then core2 will
-        * also run at 0, if not voted. Default ICC vote will be removed ASA
-        * we touch any of the core clock.
-        * core1 = core2 = max(core1, core2)
-        */
-       ret = icc_set_bw(wrapper->to_core.path, GENI_DEFAULT_BW,
-                               GENI_DEFAULT_BW);
-       if (ret) {
-               dev_err(&pdev->dev, "%s: ICC BW voting failed for core: %d\n",
-                       __func__, ret);
-               return ret;
-       }
-
-       if (of_get_compatible_child(pdev->dev.of_node, "qcom,geni-debug-uart"))
-               earlycon_wrapper = wrapper;
-       of_node_put(pdev->dev.of_node);
-exit:
-#endif
        dev_set_drvdata(dev, wrapper);
        dev_dbg(dev, "GENI SE Driver probed\n");
        return devm_of_platform_populate(dev);
index bf1468e..51143a6 100644 (file)
@@ -332,7 +332,7 @@ static const struct omap_prm_data dra7_prm_data[] = {
        {
                .name = "l3init", .base = 0x4ae07300,
                .pwrstctrl = 0x0, .pwrstst = 0x4, .dmap = &omap_prm_alwon,
-               .rstctrl = 0x10, .rstst = 0x14, .rstmap = rst_map_012,
+               .rstctrl = 0x10, .rstst = 0x14, .rstmap = rst_map_01,
                .clkdm_name = "pcie"
        },
        {
@@ -830,8 +830,12 @@ static int omap_reset_deassert(struct reset_controller_dev *rcdev,
                       reset->prm->data->name, id);
 
 exit:
-       if (reset->clkdm)
+       if (reset->clkdm) {
+               /* At least dra7 iva needs a delay before clkdm idle */
+               if (has_rstst)
+                       udelay(1);
                pdata->clkdm_allow_idle(reset->clkdm);
+       }
 
        return ret;
 }
index 442cc7c..52ddb32 100644 (file)
@@ -1433,6 +1433,7 @@ static int cqspi_probe(struct platform_device *pdev)
        cqspi = spi_master_get_devdata(master);
 
        cqspi->pdev = pdev;
+       platform_set_drvdata(pdev, cqspi);
 
        /* Obtain configuration from OF. */
        ret = cqspi_of_get_pdata(cqspi);
index d740c47..2f20bd5 100644 (file)
@@ -1281,7 +1281,7 @@ static int cb_pcidas_auto_attach(struct comedi_device *dev,
             devpriv->amcc + AMCC_OP_REG_INTCSR);
 
        ret = request_irq(pcidev->irq, cb_pcidas_interrupt, IRQF_SHARED,
-                         dev->board_name, dev);
+                         "cb_pcidas", dev);
        if (ret) {
                dev_dbg(dev->class_dev, "unable to allocate irq %d\n",
                        pcidev->irq);
index fa987bb..6d3ba39 100644 (file)
@@ -4035,7 +4035,7 @@ static int auto_attach(struct comedi_device *dev,
        init_stc_registers(dev);
 
        retval = request_irq(pcidev->irq, handle_interrupt, IRQF_SHARED,
-                            dev->board_name, dev);
+                            "cb_pcidas64", dev);
        if (retval) {
                dev_dbg(dev->class_dev, "unable to allocate irq %u\n",
                        pcidev->irq);
index 7956abc..9f92081 100644 (file)
@@ -877,5 +877,4 @@ module_comedi_usb_driver(vmk80xx_driver, vmk80xx_usb_driver);
 
 MODULE_AUTHOR("Manuel Gebele <forensixs@gmx.de>");
 MODULE_DESCRIPTION("Velleman USB Board Low-Level Driver");
-MODULE_SUPPORTED_DEVICE("K8055/K8061 aka VM110/VM140");
 MODULE_LICENSE("GPL");
index b84f00b..4cabaf2 100644 (file)
@@ -1105,7 +1105,7 @@ struct rtllib_network {
        bool    bWithAironetIE;
        bool    bCkipSupported;
        bool    bCcxRmEnable;
-       u16     CcxRmState[2];
+       u8      CcxRmState[2];
        bool    bMBssidValid;
        u8      MBssidMask;
        u8      MBssid[ETH_ALEN];
index 66c1353..15bbb63 100644 (file)
@@ -1967,7 +1967,7 @@ static void rtllib_parse_mife_generic(struct rtllib_device *ieee,
            info_element->data[2] == 0x96 &&
            info_element->data[3] == 0x01) {
                if (info_element->len == 6) {
-                       memcpy(network->CcxRmState, &info_element[4], 2);
+                       memcpy(network->CcxRmState, &info_element->data[4], 2);
                        if (network->CcxRmState[0] != 0)
                                network->bCcxRmEnable = true;
                        else
index e7061d3..c3c2c15 100644 (file)
@@ -150,7 +150,7 @@ struct vnt_cts {
        u16 reserved;
        struct ieee80211_cts data;
        u16 reserved2;
-} __packed;
+} __packed __aligned(2);
 
 struct vnt_cts_fb {
        struct vnt_phy_field b;
@@ -160,7 +160,7 @@ struct vnt_cts_fb {
        __le16 cts_duration_ba_f1;
        struct ieee80211_cts data;
        u16 reserved2;
-} __packed;
+} __packed __aligned(2);
 
 struct vnt_tx_fifo_head {
        u8 tx_key[WLAN_KEY_LEN_CCMP];
index 3cbc074..9ee797b 100644 (file)
@@ -882,7 +882,6 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
                        if (!bio) {
 new_bio:
                                nr_vecs = bio_max_segs(nr_pages);
-                               nr_pages -= nr_vecs;
                                /*
                                 * Calls bio_kmalloc() and sets bio->bi_end_io()
                                 */
@@ -939,6 +938,14 @@ new_bio:
 
        return 0;
 fail:
+       if (bio)
+               bio_put(bio);
+       while (req->bio) {
+               bio = req->bio;
+               req->bio = bio->bi_next;
+               bio_put(bio);
+       }
+       req->biotail = NULL;
        return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 }
 
index cf4718c..319a1e7 100644 (file)
@@ -747,7 +747,6 @@ module_platform_driver(optee_driver);
 
 MODULE_AUTHOR("Linaro");
 MODULE_DESCRIPTION("OP-TEE driver");
-MODULE_SUPPORTED_DEVICE("");
 MODULE_VERSION("1.0");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("platform:optee");
index 345917a..1c4aac8 100644 (file)
@@ -674,6 +674,9 @@ void thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
 {
        struct cooling_dev_stats *stats = cdev->stats;
 
+       if (!stats)
+               return;
+
        spin_lock(&stats->lock);
 
        if (stats->state == new_state)
index b63fecc..2a95b4c 100644 (file)
@@ -768,12 +768,6 @@ static int tb_init_port(struct tb_port *port)
 
        tb_dump_port(port->sw->tb, &port->config);
 
-       /* Control port does not need HopID allocation */
-       if (port->port) {
-               ida_init(&port->in_hopids);
-               ida_init(&port->out_hopids);
-       }
-
        INIT_LIST_HEAD(&port->list);
        return 0;
 
@@ -1842,10 +1836,8 @@ static void tb_switch_release(struct device *dev)
        dma_port_free(sw->dma_port);
 
        tb_switch_for_each_port(sw, port) {
-               if (!port->disabled) {
-                       ida_destroy(&port->in_hopids);
-                       ida_destroy(&port->out_hopids);
-               }
+               ida_destroy(&port->in_hopids);
+               ida_destroy(&port->out_hopids);
        }
 
        kfree(sw->uuid);
@@ -2025,6 +2017,12 @@ struct tb_switch *tb_switch_alloc(struct tb *tb, struct device *parent,
                /* minimum setup for tb_find_cap and tb_drom_read to work */
                sw->ports[i].sw = sw;
                sw->ports[i].port = i;
+
+               /* Control port does not need HopID allocation */
+               if (i) {
+                       ida_init(&sw->ports[i].in_hopids);
+                       ida_init(&sw->ports[i].out_hopids);
+               }
        }
 
        ret = tb_switch_find_vse_cap(sw, TB_VSE_CAP_PLUG_EVENTS);
index 1f000ac..c348b1f 100644 (file)
@@ -138,6 +138,10 @@ static void tb_discover_tunnels(struct tb_switch *sw)
                                parent->boot = true;
                                parent = tb_switch_parent(parent);
                        }
+               } else if (tb_tunnel_is_dp(tunnel)) {
+                       /* Keep the domain from powering down */
+                       pm_runtime_get_sync(&tunnel->src_port->sw->dev);
+                       pm_runtime_get_sync(&tunnel->dst_port->sw->dev);
                }
 
                list_add_tail(&tunnel->list, &tcm->tunnel_list);
index 9a87275..94af7a5 100644 (file)
@@ -1639,8 +1639,6 @@ module_exit(icom_exit);
 
 MODULE_AUTHOR("Michael Anderson <mjanders@us.ibm.com>");
 MODULE_DESCRIPTION("IBM iSeries Serial IOA driver");
-MODULE_SUPPORTED_DEVICE
-    ("IBM iSeries 2745, 2771, 2772, 2742, 2793 and 2805 Communications adapters");
 MODULE_LICENSE("GPL");
 MODULE_FIRMWARE("icom_call_setup.bin");
 MODULE_FIRMWARE("icom_res_dce.bin");
index cd30da0..0ea799b 100644 (file)
@@ -19,7 +19,6 @@
 MODULE_AUTHOR("Digi International, https://www.digi.com");
 MODULE_DESCRIPTION("Driver for the Digi International Neo and Classic PCI based product line");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("jsm");
 
 #define JSM_DRIVER_NAME "jsm"
 #define NR_PORTS       32
index 291649f..0d85b55 100644 (file)
@@ -1177,12 +1177,6 @@ static inline void qcom_geni_serial_enable_early_read(struct geni_se *se,
                                                      struct console *con) { }
 #endif
 
-static int qcom_geni_serial_earlycon_exit(struct console *con)
-{
-       geni_remove_earlycon_icc_vote();
-       return 0;
-}
-
 static struct qcom_geni_private_data earlycon_private_data;
 
 static int __init qcom_geni_serial_earlycon_setup(struct earlycon_device *dev,
@@ -1233,7 +1227,6 @@ static int __init qcom_geni_serial_earlycon_setup(struct earlycon_device *dev,
        writel(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
 
        dev->con->write = qcom_geni_serial_earlycon_write;
-       dev->con->exit = qcom_geni_serial_earlycon_exit;
        dev->con->setup = NULL;
        qcom_geni_serial_enable_early_read(&se, dev->con);
 
index f9170d1..5f0513c 100644 (file)
@@ -2197,7 +2197,10 @@ static int cdnsp_queue_isoc_tx(struct cdnsp_device *pdev,
         * inverted in the first TDs isoc TRB.
         */
        field = TRB_TYPE(TRB_ISOC) | TRB_TLBPC(last_burst_pkt) |
-               start_cycle ? 0 : 1 | TRB_SIA | TRB_TBC(burst_count);
+               TRB_SIA | TRB_TBC(burst_count);
+
+       if (!start_cycle)
+               field |= TRB_CYCLE;
 
        /* Fill the rest of the TRB fields, and remaining normal TRBs. */
        for (i = 0; i < trbs_per_td; i++) {
index 39ddb55..3fda1ec 100644 (file)
@@ -147,17 +147,29 @@ static inline int acm_set_control(struct acm *acm, int control)
 #define acm_send_break(acm, ms) \
        acm_ctrl_msg(acm, USB_CDC_REQ_SEND_BREAK, ms, NULL, 0)
 
-static void acm_kill_urbs(struct acm *acm)
+static void acm_poison_urbs(struct acm *acm)
 {
        int i;
 
-       usb_kill_urb(acm->ctrlurb);
+       usb_poison_urb(acm->ctrlurb);
        for (i = 0; i < ACM_NW; i++)
-               usb_kill_urb(acm->wb[i].urb);
+               usb_poison_urb(acm->wb[i].urb);
        for (i = 0; i < acm->rx_buflimit; i++)
-               usb_kill_urb(acm->read_urbs[i]);
+               usb_poison_urb(acm->read_urbs[i]);
+}
+
+static void acm_unpoison_urbs(struct acm *acm)
+{
+       int i;
+
+       for (i = 0; i < acm->rx_buflimit; i++)
+               usb_unpoison_urb(acm->read_urbs[i]);
+       for (i = 0; i < ACM_NW; i++)
+               usb_unpoison_urb(acm->wb[i].urb);
+       usb_unpoison_urb(acm->ctrlurb);
 }
 
+
 /*
  * Write buffer management.
  * All of these assume proper locks taken by the caller.
@@ -226,9 +238,10 @@ static int acm_start_wb(struct acm *acm, struct acm_wb *wb)
 
        rc = usb_submit_urb(wb->urb, GFP_ATOMIC);
        if (rc < 0) {
-               dev_err(&acm->data->dev,
-                       "%s - usb_submit_urb(write bulk) failed: %d\n",
-                       __func__, rc);
+               if (rc != -EPERM)
+                       dev_err(&acm->data->dev,
+                               "%s - usb_submit_urb(write bulk) failed: %d\n",
+                               __func__, rc);
                acm_write_done(acm, wb);
        }
        return rc;
@@ -313,8 +326,10 @@ static void acm_process_notification(struct acm *acm, unsigned char *buf)
                        acm->iocount.dsr++;
                if (difference & ACM_CTRL_DCD)
                        acm->iocount.dcd++;
-               if (newctrl & ACM_CTRL_BRK)
+               if (newctrl & ACM_CTRL_BRK) {
                        acm->iocount.brk++;
+                       tty_insert_flip_char(&acm->port, 0, TTY_BREAK);
+               }
                if (newctrl & ACM_CTRL_RI)
                        acm->iocount.rng++;
                if (newctrl & ACM_CTRL_FRAMING)
@@ -480,11 +495,6 @@ static void acm_read_bulk_callback(struct urb *urb)
        dev_vdbg(&acm->data->dev, "got urb %d, len %d, status %d\n",
                rb->index, urb->actual_length, status);
 
-       if (!acm->dev) {
-               dev_dbg(&acm->data->dev, "%s - disconnected\n", __func__);
-               return;
-       }
-
        switch (status) {
        case 0:
                usb_mark_last_busy(acm->dev);
@@ -649,7 +659,8 @@ static void acm_port_dtr_rts(struct tty_port *port, int raise)
 
        res = acm_set_control(acm, val);
        if (res && (acm->ctrl_caps & USB_CDC_CAP_LINE))
-               dev_err(&acm->control->dev, "failed to set dtr/rts\n");
+               /* This is broken in too many devices to spam the logs */
+               dev_dbg(&acm->control->dev, "failed to set dtr/rts\n");
 }
 
 static int acm_port_activate(struct tty_port *port, struct tty_struct *tty)
@@ -731,6 +742,7 @@ static void acm_port_shutdown(struct tty_port *port)
         * Need to grab write_lock to prevent race with resume, but no need to
         * hold it due to the tty-port initialised flag.
         */
+       acm_poison_urbs(acm);
        spin_lock_irq(&acm->write_lock);
        spin_unlock_irq(&acm->write_lock);
 
@@ -747,7 +759,8 @@ static void acm_port_shutdown(struct tty_port *port)
                usb_autopm_put_interface_async(acm->control);
        }
 
-       acm_kill_urbs(acm);
+       acm_unpoison_urbs(acm);
+
 }
 
 static void acm_tty_cleanup(struct tty_struct *tty)
@@ -1296,13 +1309,6 @@ skip_normal_probe:
        if (!combined_interfaces && intf != control_interface)
                return -ENODEV;
 
-       if (!combined_interfaces && usb_interface_claimed(data_interface)) {
-               /* valid in this context */
-               dev_dbg(&intf->dev, "The data interface isn't available\n");
-               return -EBUSY;
-       }
-
-
        if (data_interface->cur_altsetting->desc.bNumEndpoints < 2 ||
            control_interface->cur_altsetting->desc.bNumEndpoints == 0)
                return -EINVAL;
@@ -1323,8 +1329,8 @@ made_compressed_probe:
        dev_dbg(&intf->dev, "interfaces are valid\n");
 
        acm = kzalloc(sizeof(struct acm), GFP_KERNEL);
-       if (acm == NULL)
-               goto alloc_fail;
+       if (!acm)
+               return -ENOMEM;
 
        tty_port_init(&acm->port);
        acm->port.ops = &acm_port_ops;
@@ -1341,7 +1347,7 @@ made_compressed_probe:
 
        minor = acm_alloc_minor(acm);
        if (minor < 0)
-               goto alloc_fail1;
+               goto err_put_port;
 
        acm->minor = minor;
        acm->dev = usb_dev;
@@ -1372,15 +1378,15 @@ made_compressed_probe:
 
        buf = usb_alloc_coherent(usb_dev, ctrlsize, GFP_KERNEL, &acm->ctrl_dma);
        if (!buf)
-               goto alloc_fail1;
+               goto err_put_port;
        acm->ctrl_buffer = buf;
 
        if (acm_write_buffers_alloc(acm) < 0)
-               goto alloc_fail2;
+               goto err_free_ctrl_buffer;
 
        acm->ctrlurb = usb_alloc_urb(0, GFP_KERNEL);
        if (!acm->ctrlurb)
-               goto alloc_fail3;
+               goto err_free_write_buffers;
 
        for (i = 0; i < num_rx_buf; i++) {
                struct acm_rb *rb = &(acm->read_buffers[i]);
@@ -1389,13 +1395,13 @@ made_compressed_probe:
                rb->base = usb_alloc_coherent(acm->dev, readsize, GFP_KERNEL,
                                                                &rb->dma);
                if (!rb->base)
-                       goto alloc_fail4;
+                       goto err_free_read_urbs;
                rb->index = i;
                rb->instance = acm;
 
                urb = usb_alloc_urb(0, GFP_KERNEL);
                if (!urb)
-                       goto alloc_fail4;
+                       goto err_free_read_urbs;
 
                urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
                urb->transfer_dma = rb->dma;
@@ -1416,8 +1422,8 @@ made_compressed_probe:
                struct acm_wb *snd = &(acm->wb[i]);
 
                snd->urb = usb_alloc_urb(0, GFP_KERNEL);
-               if (snd->urb == NULL)
-                       goto alloc_fail5;
+               if (!snd->urb)
+                       goto err_free_write_urbs;
 
                if (usb_endpoint_xfer_int(epwrite))
                        usb_fill_int_urb(snd->urb, usb_dev, acm->out,
@@ -1435,7 +1441,7 @@ made_compressed_probe:
 
        i = device_create_file(&intf->dev, &dev_attr_bmCapabilities);
        if (i < 0)
-               goto alloc_fail5;
+               goto err_free_write_urbs;
 
        if (h.usb_cdc_country_functional_desc) { /* export the country data */
                struct usb_cdc_country_functional_desc * cfd =
@@ -1480,20 +1486,21 @@ skip_countries:
        acm->nb_index = 0;
        acm->nb_size = 0;
 
-       dev_info(&intf->dev, "ttyACM%d: USB ACM device\n", minor);
-
        acm->line.dwDTERate = cpu_to_le32(9600);
        acm->line.bDataBits = 8;
        acm_set_line(acm, &acm->line);
 
-       usb_driver_claim_interface(&acm_driver, data_interface, acm);
-       usb_set_intfdata(data_interface, acm);
+       if (!acm->combined_interfaces) {
+               rv = usb_driver_claim_interface(&acm_driver, data_interface, acm);
+               if (rv)
+                       goto err_remove_files;
+       }
 
        tty_dev = tty_port_register_device(&acm->port, acm_tty_driver, minor,
                        &control_interface->dev);
        if (IS_ERR(tty_dev)) {
                rv = PTR_ERR(tty_dev);
-               goto alloc_fail6;
+               goto err_release_data_interface;
        }
 
        if (quirks & CLEAR_HALT_CONDITIONS) {
@@ -1501,32 +1508,39 @@ skip_countries:
                usb_clear_halt(usb_dev, acm->out);
        }
 
+       dev_info(&intf->dev, "ttyACM%d: USB ACM device\n", minor);
+
        return 0;
-alloc_fail6:
+
+err_release_data_interface:
+       if (!acm->combined_interfaces) {
+               /* Clear driver data so that disconnect() returns early. */
+               usb_set_intfdata(data_interface, NULL);
+               usb_driver_release_interface(&acm_driver, data_interface);
+       }
+err_remove_files:
        if (acm->country_codes) {
                device_remove_file(&acm->control->dev,
                                &dev_attr_wCountryCodes);
                device_remove_file(&acm->control->dev,
                                &dev_attr_iCountryCodeRelDate);
-               kfree(acm->country_codes);
        }
        device_remove_file(&acm->control->dev, &dev_attr_bmCapabilities);
-alloc_fail5:
-       usb_set_intfdata(intf, NULL);
+err_free_write_urbs:
        for (i = 0; i < ACM_NW; i++)
                usb_free_urb(acm->wb[i].urb);
-alloc_fail4:
+err_free_read_urbs:
        for (i = 0; i < num_rx_buf; i++)
                usb_free_urb(acm->read_urbs[i]);
        acm_read_buffers_free(acm);
        usb_free_urb(acm->ctrlurb);
-alloc_fail3:
+err_free_write_buffers:
        acm_write_buffers_free(acm);
-alloc_fail2:
+err_free_ctrl_buffer:
        usb_free_coherent(usb_dev, ctrlsize, acm->ctrl_buffer, acm->ctrl_dma);
-alloc_fail1:
+err_put_port:
        tty_port_put(&acm->port);
-alloc_fail:
+
        return rv;
 }
 
@@ -1540,8 +1554,14 @@ static void acm_disconnect(struct usb_interface *intf)
        if (!acm)
                return;
 
-       mutex_lock(&acm->mutex);
        acm->disconnected = true;
+       /*
+        * there is a circular dependency. acm_softint() can resubmit
+        * the URBs in error handling so we need to block any
+        * submission right away
+        */
+       acm_poison_urbs(acm);
+       mutex_lock(&acm->mutex);
        if (acm->country_codes) {
                device_remove_file(&acm->control->dev,
                                &dev_attr_wCountryCodes);
@@ -1560,7 +1580,6 @@ static void acm_disconnect(struct usb_interface *intf)
                tty_kref_put(tty);
        }
 
-       acm_kill_urbs(acm);
        cancel_delayed_work_sync(&acm->dwork);
 
        tty_unregister_device(acm_tty_driver, acm->minor);
@@ -1602,7 +1621,7 @@ static int acm_suspend(struct usb_interface *intf, pm_message_t message)
        if (cnt)
                return 0;
 
-       acm_kill_urbs(acm);
+       acm_poison_urbs(acm);
        cancel_delayed_work_sync(&acm->dwork);
        acm->urbs_in_error_delay = 0;
 
@@ -1615,6 +1634,7 @@ static int acm_resume(struct usb_interface *intf)
        struct urb *urb;
        int rv = 0;
 
+       acm_unpoison_urbs(acm);
        spin_lock_irq(&acm->write_lock);
 
        if (--acm->susp_count)
index 6ade3da..76ac5d6 100644 (file)
@@ -498,6 +498,10 @@ static const struct usb_device_id usb_quirk_list[] = {
        /* DJI CineSSD */
        { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM },
 
+       /* Fibocom L850-GL LTE Modem */
+       { USB_DEVICE(0x2cb7, 0x0007), .driver_info =
+                       USB_QUIRK_IGNORE_REMOTE_WAKEUP },
+
        /* INTEL VALUE SSD */
        { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME },
 
index fc3269f..1a9789e 100644 (file)
@@ -4322,7 +4322,8 @@ static int _dwc2_hcd_suspend(struct usb_hcd *hcd)
        if (hsotg->op_state == OTG_STATE_B_PERIPHERAL)
                goto unlock;
 
-       if (hsotg->params.power_down > DWC2_POWER_DOWN_PARAM_PARTIAL)
+       if (hsotg->params.power_down != DWC2_POWER_DOWN_PARAM_PARTIAL ||
+           hsotg->flags.b.port_connect_status == 0)
                goto skip_power_saving;
 
        /*
@@ -5398,7 +5399,7 @@ int dwc2_host_enter_hibernation(struct dwc2_hsotg *hsotg)
        dwc2_writel(hsotg, hprt0, HPRT0);
 
        /* Wait for the HPRT0.PrtSusp register field to be set */
-       if (dwc2_hsotg_wait_bit_set(hsotg, HPRT0, HPRT0_SUSP, 3000))
+       if (dwc2_hsotg_wait_bit_set(hsotg, HPRT0, HPRT0_SUSP, 5000))
                dev_warn(hsotg->dev, "Suspend wasn't generated\n");
 
        /*
index 3d3918a..4c5c697 100644 (file)
@@ -120,6 +120,8 @@ static const struct property_entry dwc3_pci_intel_properties[] = {
 static const struct property_entry dwc3_pci_mrfld_properties[] = {
        PROPERTY_ENTRY_STRING("dr_mode", "otg"),
        PROPERTY_ENTRY_STRING("linux,extcon-name", "mrfld_bcove_pwrsrc"),
+       PROPERTY_ENTRY_BOOL("snps,dis_u3_susphy_quirk"),
+       PROPERTY_ENTRY_BOOL("snps,dis_u2_susphy_quirk"),
        PROPERTY_ENTRY_BOOL("linux,sysdev_is_parent"),
        {}
 };
index fcaf044..3de291a 100644 (file)
@@ -244,6 +244,9 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
        struct device *dev = qcom->dev;
        int ret;
 
+       if (has_acpi_companion(dev))
+               return 0;
+
        qcom->icc_path_ddr = of_icc_get(dev, "usb-ddr");
        if (IS_ERR(qcom->icc_path_ddr)) {
                dev_err(dev, "failed to get usb-ddr path: %ld\n",
index aebcf8e..c7ef218 100644 (file)
@@ -783,8 +783,6 @@ static int __dwc3_gadget_ep_disable(struct dwc3_ep *dep)
 
        trace_dwc3_gadget_ep_disable(dep);
 
-       dwc3_remove_requests(dwc, dep);
-
        /* make sure HW endpoint isn't stalled */
        if (dep->flags & DWC3_EP_STALL)
                __dwc3_gadget_ep_set_halt(dep, 0, false);
@@ -793,16 +791,18 @@ static int __dwc3_gadget_ep_disable(struct dwc3_ep *dep)
        reg &= ~DWC3_DALEPENA_EP(dep->number);
        dwc3_writel(dwc->regs, DWC3_DALEPENA, reg);
 
-       dep->stream_capable = false;
-       dep->type = 0;
-       dep->flags = 0;
-
        /* Clear out the ep descriptors for non-ep0 */
        if (dep->number > 1) {
                dep->endpoint.comp_desc = NULL;
                dep->endpoint.desc = NULL;
        }
 
+       dwc3_remove_requests(dwc, dep);
+
+       dep->stream_capable = false;
+       dep->type = 0;
+       dep->flags = 0;
+
        return 0;
 }
 
@@ -1617,7 +1617,7 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req)
 {
        struct dwc3             *dwc = dep->dwc;
 
-       if (!dep->endpoint.desc || !dwc->pullups_connected) {
+       if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) {
                dev_err(dwc->dev, "%s: can't queue to disabled endpoint\n",
                                dep->name);
                return -ESHUTDOWN;
@@ -2083,7 +2083,7 @@ static void __dwc3_gadget_set_speed(struct dwc3 *dwc)
        u32                     reg;
 
        speed = dwc->gadget_max_speed;
-       if (speed > dwc->maximum_speed)
+       if (speed == USB_SPEED_UNKNOWN || speed > dwc->maximum_speed)
                speed = dwc->maximum_speed;
 
        if (speed == USB_SPEED_SUPER_PLUS &&
@@ -2247,6 +2247,7 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
        if (!is_on) {
                u32 count;
 
+               dwc->connected = false;
                /*
                 * In the Synopsis DesignWare Cores USB3 Databook Rev. 3.30a
                 * Section 4.1.8 Table 4-7, it states that for a device-initiated
@@ -2271,7 +2272,6 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on)
                        dwc->ev_buf->lpos = (dwc->ev_buf->lpos + count) %
                                                dwc->ev_buf->length;
                }
-               dwc->connected = false;
        } else {
                __dwc3_gadget_start(dwc);
        }
@@ -2523,6 +2523,7 @@ static void dwc3_gadget_set_ssp_rate(struct usb_gadget *g,
        unsigned long           flags;
 
        spin_lock_irqsave(&dwc->lock, flags);
+       dwc->gadget_max_speed = USB_SPEED_SUPER_PLUS;
        dwc->gadget_ssp_rate = rate;
        spin_unlock_irqrestore(&dwc->lock, flags);
 }
@@ -3321,8 +3322,6 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 *dwc)
 {
        u32                     reg;
 
-       dwc->connected = true;
-
        /*
         * WORKAROUND: DWC3 revisions <1.88a have an issue which
         * would cause a missing Disconnect Event if there's a
@@ -3362,6 +3361,7 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 *dwc)
         * transfers."
         */
        dwc3_stop_active_transfers(dwc);
+       dwc->connected = true;
 
        reg = dwc3_readl(dwc->regs, DWC3_DCTL);
        reg &= ~DWC3_DCTL_TSTCTRL_MASK;
index 0d56f33..15a607c 100644 (file)
@@ -97,6 +97,8 @@ struct gadget_config_name {
        struct list_head list;
 };
 
+#define USB_MAX_STRING_WITH_NULL_LEN   (USB_MAX_STRING_LEN+1)
+
 static int usb_string_copy(const char *s, char **s_copy)
 {
        int ret;
@@ -106,12 +108,16 @@ static int usb_string_copy(const char *s, char **s_copy)
        if (ret > USB_MAX_STRING_LEN)
                return -EOVERFLOW;
 
-       str = kstrdup(s, GFP_KERNEL);
-       if (!str)
-               return -ENOMEM;
+       if (copy) {
+               str = copy;
+       } else {
+               str = kmalloc(USB_MAX_STRING_WITH_NULL_LEN, GFP_KERNEL);
+               if (!str)
+                       return -ENOMEM;
+       }
+       strcpy(str, s);
        if (str[ret - 1] == '\n')
                str[ret - 1] = '\0';
-       kfree(copy);
        *s_copy = str;
        return 0;
 }
index 8d387e0..c80f9bd 100644 (file)
@@ -153,6 +153,11 @@ static int udc_pci_probe(
        pci_set_master(pdev);
        pci_try_set_mwi(pdev);
 
+       dev->phys_addr = resource;
+       dev->irq = pdev->irq;
+       dev->pdev = pdev;
+       dev->dev = &pdev->dev;
+
        /* init dma pools */
        if (use_dma) {
                retval = init_dma_pools(dev);
@@ -160,11 +165,6 @@ static int udc_pci_probe(
                        goto err_dma;
        }
 
-       dev->phys_addr = resource;
-       dev->irq = pdev->irq;
-       dev->pdev = pdev;
-       dev->dev = &pdev->dev;
-
        /* general probing */
        if (udc_probe(dev)) {
                retval = -ENODEV;
index fe010cc..2f27dc0 100644 (file)
@@ -397,6 +397,13 @@ static void xhci_mtk_quirks(struct device *dev, struct xhci_hcd *xhci)
        xhci->quirks |= XHCI_SPURIOUS_SUCCESS;
        if (mtk->lpm_support)
                xhci->quirks |= XHCI_LPM_SUPPORT;
+
+       /*
+        * MTK xHCI 0.96: PSA is 1 by default even if doesn't support stream,
+        * and it's 3 when support it.
+        */
+       if (xhci->hci_version < 0x100 && HCC_MAX_PSA(xhci->hcc_params) == 4)
+               xhci->quirks |= XHCI_BROKEN_STREAMS;
 }
 
 /* called during probe() after chip reset completes */
@@ -548,7 +555,8 @@ static int xhci_mtk_probe(struct platform_device *pdev)
        if (ret)
                goto put_usb3_hcd;
 
-       if (HCC_MAX_PSA(xhci->hcc_params) >= 4)
+       if (HCC_MAX_PSA(xhci->hcc_params) >= 4 &&
+           !(xhci->quirks & XHCI_BROKEN_STREAMS))
                xhci->shared_hcd->can_do_streams = 1;
 
        ret = usb_add_hcd(xhci->shared_hcd, irq, IRQF_SHARED);
index 670e4d9..dcc88df 100644 (file)
@@ -117,7 +117,6 @@ MODULE_DEVICE_TABLE(usb, ld_usb_table);
 MODULE_AUTHOR("Michael Hund <mhund@ld-didactic.de>");
 MODULE_DESCRIPTION("LD USB Driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("LD USB Devices");
 
 /* All interrupt in transfers are collected in a ring buffer to
  * avoid racing conditions and get better performance of the driver.
index 1cd8772..fc0457d 100644 (file)
@@ -2004,10 +2004,14 @@ static void musb_pm_runtime_check_session(struct musb *musb)
                MUSB_DEVCTL_HR;
        switch (devctl & ~s) {
        case MUSB_QUIRK_B_DISCONNECT_99:
-               musb_dbg(musb, "Poll devctl in case of suspend after disconnect\n");
-               schedule_delayed_work(&musb->irq_work,
-                                     msecs_to_jiffies(1000));
-               break;
+               if (musb->quirk_retries && !musb->flush_irq_work) {
+                       musb_dbg(musb, "Poll devctl in case of suspend after disconnect\n");
+                       schedule_delayed_work(&musb->irq_work,
+                                             msecs_to_jiffies(1000));
+                       musb->quirk_retries--;
+                       break;
+               }
+               fallthrough;
        case MUSB_QUIRK_B_INVALID_VBUS_91:
                if (musb->quirk_retries && !musb->flush_irq_work) {
                        musb_dbg(musb,
index 5eb895b..f4304ce 100644 (file)
@@ -656,6 +656,13 @@ void usb_stor_invoke_transport(struct scsi_cmnd *srb, struct us_data *us)
                need_auto_sense = 1;
        }
 
+       /* Some devices (Kindle) require another command after SYNC CACHE */
+       if ((us->fflags & US_FL_SENSE_AFTER_SYNC) &&
+                       srb->cmnd[0] == SYNCHRONIZE_CACHE) {
+               usb_stor_dbg(us, "-- sense after SYNC CACHE\n");
+               need_auto_sense = 1;
+       }
+
        /*
         * If we have a failure, we're going to do a REQUEST_SENSE 
         * automatically.  Note that we differentiate between a command
index 5732e96..efa972b 100644 (file)
@@ -2212,6 +2212,18 @@ UNUSUAL_DEV( 0x1908, 0x3335, 0x0200, 0x0200,
                US_FL_NO_READ_DISC_INFO ),
 
 /*
+ * Reported by Matthias Schwarzott <zzam@gentoo.org>
+ * The Amazon Kindle treats SYNCHRONIZE CACHE as an indication that
+ * the host may be finished with it, and automatically ejects its
+ * emulated media unless it receives another command within one second.
+ */
+UNUSUAL_DEV( 0x1949, 0x0004, 0x0000, 0x9999,
+               "Amazon",
+               "Kindle",
+               USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+               US_FL_SENSE_AFTER_SYNC ),
+
+/*
  * Reported by Oliver Neukum <oneukum@suse.com>
  * This device morphes spontaneously into another device if the access
  * pattern of Windows isn't followed. Thus writable media would be dirty
index be0b646..ce7af39 100644 (file)
@@ -942,6 +942,7 @@ static int tcpm_set_current_limit(struct tcpm_port *port, u32 max_ma, u32 mv)
 
        port->supply_voltage = mv;
        port->current_limit = max_ma;
+       power_supply_changed(port->psy);
 
        if (port->tcpc->set_current_limit)
                ret = port->tcpc->set_current_limit(port->tcpc, max_ma, mv);
@@ -2928,6 +2929,7 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo,
 
        port->pps_data.supported = false;
        port->usb_type = POWER_SUPPLY_USB_TYPE_PD;
+       power_supply_changed(port->psy);
 
        /*
         * Select the source PDO providing the most power which has a
@@ -2952,6 +2954,7 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo,
                                port->pps_data.supported = true;
                                port->usb_type =
                                        POWER_SUPPLY_USB_TYPE_PD_PPS;
+                               power_supply_changed(port->psy);
                        }
                        continue;
                default:
@@ -3109,6 +3112,7 @@ static unsigned int tcpm_pd_select_pps_apdo(struct tcpm_port *port)
                                                  port->pps_data.out_volt));
                port->pps_data.op_curr = min(port->pps_data.max_curr,
                                             port->pps_data.op_curr);
+               power_supply_changed(port->psy);
        }
 
        return src_pdo;
@@ -3344,6 +3348,7 @@ static int tcpm_set_charge(struct tcpm_port *port, bool charge)
                        return ret;
        }
        port->vbus_charge = charge;
+       power_supply_changed(port->psy);
        return 0;
 }
 
@@ -3523,6 +3528,7 @@ static void tcpm_reset_port(struct tcpm_port *port)
        port->try_src_count = 0;
        port->try_snk_count = 0;
        port->usb_type = POWER_SUPPLY_USB_TYPE_C;
+       power_supply_changed(port->psy);
        port->nr_sink_caps = 0;
        port->sink_cap_done = false;
        if (port->tcpc->enable_frs)
@@ -5167,7 +5173,7 @@ static void tcpm_enable_frs_work(struct kthread_work *work)
                goto unlock;
 
        /* Send when the state machine is idle */
-       if (port->state != SNK_READY || port->vdm_state != VDM_STATE_DONE || port->send_discover)
+       if (port->state != SNK_READY || port->vdm_sm_running || port->send_discover)
                goto resched;
 
        port->upcoming_state = GET_SINK_CAP;
@@ -5905,7 +5911,7 @@ static int tcpm_psy_set_prop(struct power_supply *psy,
                ret = -EINVAL;
                break;
        }
-
+       power_supply_changed(port->psy);
        return ret;
 }
 
@@ -6058,6 +6064,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc)
        err = devm_tcpm_psy_register(port);
        if (err)
                goto out_role_sw_put;
+       power_supply_changed(port->psy);
 
        port->typec_port = typec_register_port(port->dev, &port->typec_caps);
        if (IS_ERR(port->typec_port)) {
index 6e6ef63..29bd1c5 100644 (file)
@@ -64,7 +64,6 @@ enum {
 struct tps6598x_rx_identity_reg {
        u8 status;
        struct usb_pd_identity identity;
-       u32 vdo[3];
 } __packed;
 
 /* Standard Task return codes */
index 3209b5d..a20a838 100644 (file)
@@ -594,6 +594,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
                                pr_err("invalid port number %d\n", wIndex);
                                goto error;
                        }
+                       if (wValue >= 32)
+                               goto error;
                        if (hcd->speed == HCD_USB3) {
                                if ((vhci_hcd->port_status[rhport] &
                                     USB_SS_PORT_STAT_POWER) != 0) {
index a3ec39f..7383a54 100644 (file)
@@ -174,7 +174,7 @@ static ssize_t usbip_sockfd_store(struct device *dev,
 
                udc->ud.tcp_socket = socket;
                udc->ud.tcp_rx = tcp_rx;
-               udc->ud.tcp_rx = tcp_tx;
+               udc->ud.tcp_tx = tcp_tx;
                udc->ud.status = SDEV_ST_USED;
 
                spin_unlock_irq(&udc->ud.lock);
index 7c8bbfc..d555a6a 100644 (file)
@@ -431,8 +431,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        }
 
        adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
-                                   dev, &ifc_vdpa_ops,
-                                   IFCVF_MAX_QUEUE_PAIRS * 2, NULL);
+                                   dev, &ifc_vdpa_ops, NULL);
        if (adapter == NULL) {
                IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
                return -ENOMEM;
@@ -456,7 +455,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
                vf->vring[i].irq = -EINVAL;
 
-       ret = vdpa_register_device(&adapter->vdpa);
+       ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2);
        if (ret) {
                IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
                goto err;
index 10e9b09..71397fd 100644 (file)
@@ -1982,7 +1982,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
        max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
        ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-                                2 * mlx5_vdpa_max_qps(max_vqs), NULL);
+                                NULL);
        if (IS_ERR(ndev))
                return PTR_ERR(ndev);
 
@@ -2009,7 +2009,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
        if (err)
                goto err_res;
 
-       err = vdpa_register_device(&mvdev->vdev);
+       err = vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
        if (err)
                goto err_reg;
 
index da67f07..5cffce6 100644 (file)
@@ -69,7 +69,6 @@ static void vdpa_release_dev(struct device *d)
  * initialized but before registered.
  * @parent: the parent device
  * @config: the bus operations that is supported by this device
- * @nvqs: number of virtqueues supported by this device
  * @size: size of the parent structure that contains private data
  * @name: name of the vdpa device; optional.
  *
@@ -81,7 +80,7 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
-                                       int nvqs, size_t size, const char *name)
+                                       size_t size, const char *name)
 {
        struct vdpa_device *vdev;
        int err = -EINVAL;
@@ -107,7 +106,6 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
        vdev->index = err;
        vdev->config = config;
        vdev->features_valid = false;
-       vdev->nvqs = nvqs;
 
        if (name)
                err = dev_set_name(&vdev->dev, "%s", name);
@@ -136,10 +134,12 @@ static int vdpa_name_match(struct device *dev, const void *data)
        return (strcmp(dev_name(&vdev->dev), data) == 0);
 }
 
-static int __vdpa_register_device(struct vdpa_device *vdev)
+static int __vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
        struct device *dev;
 
+       vdev->nvqs = nvqs;
+
        lockdep_assert_held(&vdpa_dev_mutex);
        dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
        if (dev) {
@@ -155,15 +155,16 @@ static int __vdpa_register_device(struct vdpa_device *vdev)
  * Caller must invoke this routine in the management device dev_add()
  * callback after setting up valid mgmtdev for this vdpa device.
  * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
  *
  * Returns an error when fail to add device to vDPA bus
  */
-int _vdpa_register_device(struct vdpa_device *vdev)
+int _vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
        if (!vdev->mdev)
                return -EINVAL;
 
-       return __vdpa_register_device(vdev);
+       return __vdpa_register_device(vdev, nvqs);
 }
 EXPORT_SYMBOL_GPL(_vdpa_register_device);
 
@@ -171,15 +172,16 @@ EXPORT_SYMBOL_GPL(_vdpa_register_device);
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
  * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
  *
  * Returns an error when fail to add to vDPA bus
  */
-int vdpa_register_device(struct vdpa_device *vdev)
+int vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
        int err;
 
        mutex_lock(&vdpa_dev_mutex);
-       err = __vdpa_register_device(vdev);
+       err = __vdpa_register_device(vdev, nvqs);
        mutex_unlock(&vdpa_dev_mutex);
        return err;
 }
index d594284..5b6b2f8 100644 (file)
@@ -235,7 +235,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
                ops = &vdpasim_config_ops;
 
        vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
-                                   dev_attr->nvqs, dev_attr->name);
+                                   dev_attr->name);
        if (!vdpasim)
                goto err_alloc;
 
index d344c5b..a1ab616 100644 (file)
@@ -110,8 +110,7 @@ out:
 
 static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
 {
-       struct virtio_net_config *net_config =
-               (struct virtio_net_config *)config;
+       struct virtio_net_config *net_config = config;
 
        net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
        net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
@@ -147,7 +146,7 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
        if (IS_ERR(simdev))
                return PTR_ERR(simdev);
 
-       ret = _vdpa_register_device(&simdev->vdpa);
+       ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM);
        if (ret)
                goto reg_err;
 
index 5533df9..67d0bf4 100644 (file)
@@ -21,8 +21,8 @@ config VFIO_VIRQFD
 
 menuconfig VFIO
        tristate "VFIO Non-Privileged userspace driver framework"
-       depends on IOMMU_API
-       select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64)
+       select IOMMU_API
+       select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
        help
          VFIO provides a framework for secure userspace device drivers.
          See Documentation/driver-api/vfio.rst for more details.
index ac3c1dd..4abddbe 100644 (file)
@@ -42,6 +42,6 @@ config VFIO_PCI_IGD
 
 config VFIO_PCI_NVLINK2
        def_bool y
-       depends on VFIO_PCI && PPC_POWERNV
+       depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU
        help
          VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
index dc1a3c4..ab34110 100644 (file)
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config VFIO_PLATFORM
        tristate "VFIO support for platform devices"
-       depends on VFIO && EVENTFD && (ARM || ARM64)
+       depends on VFIO && EVENTFD && (ARM || ARM64 || COMPILE_TEST)
        select VFIO_VIRQFD
        help
          Support for platform devices with VFIO. This is required to make
@@ -12,7 +12,7 @@ config VFIO_PLATFORM
 
 config VFIO_AMBA
        tristate "VFIO support for AMBA devices"
-       depends on VFIO_PLATFORM && ARM_AMBA
+       depends on VFIO_PLATFORM && (ARM_AMBA || COMPILE_TEST)
        help
          Support for ARM AMBA devices with VFIO. This is required to make
          use of ARM AMBA devices present on the system using the VFIO
index 4bb162c..45cbfd4 100644 (file)
@@ -189,7 +189,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 }
 
 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
-                                               dma_addr_t start, size_t size)
+                                               dma_addr_t start, u64 size)
 {
        struct rb_node *res = NULL;
        struct rb_node *node = iommu->dma_list.rb_node;
@@ -739,6 +739,12 @@ out:
        ret = vfio_lock_acct(dma, lock_acct, false);
 
 unpin_out:
+       if (batch->size == 1 && !batch->offset) {
+               /* May be a VM_PFNMAP pfn, which the batch can't remember. */
+               put_pfn(pfn, dma->prot);
+               batch->size = 0;
+       }
+
        if (ret < 0) {
                if (pinned && !rsvd) {
                        for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
@@ -785,7 +791,12 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
                return -ENODEV;
 
        ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
-       if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+       if (ret != 1)
+               goto out;
+
+       ret = 0;
+
+       if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
                ret = vfio_lock_acct(dma, 1, true);
                if (ret) {
                        put_pfn(*pfn_base, dma->prot);
@@ -797,6 +808,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
                }
        }
 
+out:
        mmput(mm);
        return ret;
 }
@@ -1288,7 +1300,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
        int ret = -EINVAL, retries = 0;
        unsigned long pgshift;
        dma_addr_t iova = unmap->iova;
-       unsigned long size = unmap->size;
+       u64 size = unmap->size;
        bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
        bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
        struct rb_node *n, *first_n;
@@ -1304,14 +1316,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
        if (unmap_all) {
                if (iova || size)
                        goto unlock;
-               size = SIZE_MAX;
-       } else if (!size || size & (pgsize - 1)) {
+               size = U64_MAX;
+       } else if (!size || size & (pgsize - 1) ||
+                  iova + size - 1 < iova || size > SIZE_MAX) {
                goto unlock;
        }
 
-       if (iova + size - 1 < iova || size > SIZE_MAX)
-               goto unlock;
-
        /* When dirty tracking is enabled, allow only min supported pgsize */
        if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
            (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
index ef688c8..e0a27e3 100644 (file)
@@ -308,8 +308,10 @@ static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
 
 static void vhost_vdpa_config_put(struct vhost_vdpa *v)
 {
-       if (v->config_ctx)
+       if (v->config_ctx) {
                eventfd_ctx_put(v->config_ctx);
+               v->config_ctx = NULL;
+       }
 }
 
 static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
@@ -329,8 +331,12 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
        if (!IS_ERR_OR_NULL(ctx))
                eventfd_ctx_put(ctx);
 
-       if (IS_ERR(v->config_ctx))
-               return PTR_ERR(v->config_ctx);
+       if (IS_ERR(v->config_ctx)) {
+               long ret = PTR_ERR(v->config_ctx);
+
+               v->config_ctx = NULL;
+               return ret;
+       }
 
        v->vdpa->config->set_config_cb(v->vdpa, &cb);
 
@@ -900,14 +906,10 @@ err:
 
 static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
 {
-       struct vhost_virtqueue *vq;
        int i;
 
-       for (i = 0; i < v->nvqs; i++) {
-               vq = &v->vqs[i];
-               if (vq->call_ctx.producer.irq)
-                       irq_bypass_unregister_producer(&vq->call_ctx.producer);
-       }
+       for (i = 0; i < v->nvqs; i++)
+               vhost_vdpa_unsetup_vq_irq(v, i);
 }
 
 static int vhost_vdpa_release(struct inode *inode, struct file *filep)
index a262e12..5ccb070 100644 (file)
@@ -332,8 +332,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
        vq->error_ctx = NULL;
        vq->kick = NULL;
        vq->log_ctx = NULL;
-       vhost_reset_is_le(vq);
        vhost_disable_cross_endian(vq);
+       vhost_reset_is_le(vq);
        vq->busyloop_timeout = 0;
        vq->umem = NULL;
        vq->iotlb = NULL;
index 44a5cd2..3406067 100644 (file)
@@ -1333,6 +1333,9 @@ static void fbcon_cursor(struct vc_data *vc, int mode)
 
        ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1;
 
+       if (!ops->cursor)
+               return;
+
        ops->cursor(vc, info, mode, get_color(vc, info, c, 1),
                    get_color(vc, info, c, 0));
 }
index c8b0ae6..4dc9077 100644 (file)
@@ -1031,7 +1031,6 @@ static int hvfb_getmem(struct hv_device *hdev, struct fb_info *info)
                        PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
                if (!pdev) {
                        pr_err("Unable to find PCI Hyper-V video\n");
-                       kfree(info->apertures);
                        return -ENODEV;
                }
 
@@ -1129,7 +1128,6 @@ getmem_done:
        } else {
                pci_dev_put(pdev);
        }
-       kfree(info->apertures);
 
        return 0;
 
@@ -1141,7 +1139,6 @@ err2:
 err1:
        if (!gen2vm)
                pci_dev_put(pdev);
-       kfree(info->apertures);
 
        return -ENOMEM;
 }
index 42e09cc..4b15c00 100644 (file)
@@ -141,15 +141,14 @@ void virtio_config_changed(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(virtio_config_changed);
 
-void virtio_config_disable(struct virtio_device *dev)
+static void virtio_config_disable(struct virtio_device *dev)
 {
        spin_lock_irq(&dev->config_lock);
        dev->config_enabled = false;
        spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_disable);
 
-void virtio_config_enable(struct virtio_device *dev)
+static void virtio_config_enable(struct virtio_device *dev)
 {
        spin_lock_irq(&dev->config_lock);
        dev->config_enabled = true;
@@ -158,7 +157,6 @@ void virtio_config_enable(struct virtio_device *dev)
        dev->config_change_pending = false;
        spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_enable);
 
 void virtio_add_status(struct virtio_device *dev, unsigned int status)
 {
index a286d22..56128b9 100644 (file)
@@ -548,8 +548,7 @@ static void virtio_mmio_release_dev(struct device *_d)
 {
        struct virtio_device *vdev =
                        container_of(_d, struct virtio_device, dev);
-       struct virtio_mmio_device *vm_dev =
-                       container_of(vdev, struct virtio_mmio_device, vdev);
+       struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
        struct platform_device *pdev = vm_dev->pdev;
 
        devm_kfree(&pdev->dev, vm_dev);
index 9867a3a..688b112 100644 (file)
@@ -273,7 +273,6 @@ module_exit(cpu5wdt_exit_module);
 
 MODULE_AUTHOR("Heiko Ronsdorf <hero@ihg.uni-duisburg.de>");
 MODULE_DESCRIPTION("sma cpu5 watchdog driver");
-MODULE_SUPPORTED_DEVICE("sma cpu5 watchdog");
 MODULE_LICENSE("GPL");
 
 module_param_hw(port, int, ioport, 0);
index 808eeb4..1eafe0b 100644 (file)
@@ -172,7 +172,6 @@ MODULE_PARM_DESC(wd2_timeout, "Default watchdog2 timeout in 1/10secs");
 MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
 MODULE_DESCRIPTION("Hardware watchdog driver for Sun Microsystems CP1400/1500");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("watchdog");
 
 static void cpwd_writew(u16 val, void __iomem *addr)
 {
index 7008596..747e346 100644 (file)
@@ -46,7 +46,6 @@
 
 MODULE_AUTHOR("David S. Miller <davem@davemloft.net>");
 MODULE_DESCRIPTION("Hardware watchdog driver for Sun RIO");
-MODULE_SUPPORTED_DEVICE("watchdog");
 MODULE_LICENSE("GPL");
 
 #define DRIVER_NAME    "riowd"
index 41645fe..ea0efd2 100644 (file)
@@ -50,11 +50,11 @@ config XEN_BALLOON_MEMORY_HOTPLUG
 
          SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
 
-config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+config XEN_MEMORY_HOTPLUG_LIMIT
        int "Hotplugged memory limit (in GiB) for a PV guest"
        default 512
        depends on XEN_HAVE_PVMMU
-       depends on XEN_BALLOON_MEMORY_HOTPLUG
+       depends on MEMORY_HOTPLUG
        help
          Maxmium amount of memory (in GiB) that a PV guest can be
          expanded to when using memory hotplug.
index 714fcca..17548c1 100644 (file)
@@ -70,7 +70,6 @@ const struct inode_operations afs_dir_inode_operations = {
        .permission     = afs_permission,
        .getattr        = afs_getattr,
        .setattr        = afs_setattr,
-       .listxattr      = afs_listxattr,
 };
 
 const struct address_space_operations afs_dir_aops = {
index 85f5adf..960b642 100644 (file)
@@ -43,7 +43,6 @@ const struct inode_operations afs_file_inode_operations = {
        .getattr        = afs_getattr,
        .setattr        = afs_setattr,
        .permission     = afs_permission,
-       .listxattr      = afs_listxattr,
 };
 
 const struct address_space_operations afs_fs_aops = {
index 97cab12..71c5872 100644 (file)
@@ -181,10 +181,13 @@ void afs_wait_for_operation(struct afs_operation *op)
                if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
                    op->ops->issue_yfs_rpc)
                        op->ops->issue_yfs_rpc(op);
-               else
+               else if (op->ops->issue_afs_rpc)
                        op->ops->issue_afs_rpc(op);
+               else
+                       op->ac.error = -ENOTSUPP;
 
-               op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
+               if (op->call)
+                       op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
        }
 
        switch (op->error) {
index 1156b2d..12be887 100644 (file)
@@ -27,7 +27,6 @@
 
 static const struct inode_operations afs_symlink_inode_operations = {
        .get_link       = page_get_link,
-       .listxattr      = afs_listxattr,
 };
 
 static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
index b626e38..1627b18 100644 (file)
@@ -1509,7 +1509,6 @@ extern int afs_launder_page(struct page *);
  * xattr.c
  */
 extern const struct xattr_handler *afs_xattr_handlers[];
-extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
 
 /*
  * yfsclient.c
index 052dab2..bbb2c21 100644 (file)
@@ -32,7 +32,6 @@ const struct inode_operations afs_mntpt_inode_operations = {
        .lookup         = afs_mntpt_lookup,
        .readlink       = page_readlink,
        .getattr        = afs_getattr,
-       .listxattr      = afs_listxattr,
 };
 
 const struct inode_operations afs_autocell_inode_operations = {
index c9195fc..eb737ed 100644 (file)
@@ -851,8 +851,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
        fscache_wait_on_page_write(vnode->cache, vmf->page);
 #endif
 
-       if (PageWriteback(vmf->page) &&
-           wait_on_page_bit_killable(vmf->page, PG_writeback) < 0)
+       if (wait_on_page_writeback_killable(vmf->page))
                return VM_FAULT_RETRY;
 
        if (lock_page_killable(vmf->page) < 0)
index c629caa..7751b0b 100644 (file)
 #include <linux/xattr.h>
 #include "internal.h"
 
-static const char afs_xattr_list[] =
-       "afs.acl\0"
-       "afs.cell\0"
-       "afs.fid\0"
-       "afs.volume\0"
-       "afs.yfs.acl\0"
-       "afs.yfs.acl_inherited\0"
-       "afs.yfs.acl_num_cleaned\0"
-       "afs.yfs.vol_acl";
-
-/*
- * Retrieve a list of the supported xattrs.
- */
-ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-       if (size == 0)
-               return sizeof(afs_xattr_list);
-       if (size < sizeof(afs_xattr_list))
-               return -ERANGE;
-       memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list));
-       return sizeof(afs_xattr_list);
-}
-
 /*
  * Deal with the result of a successful fetch ACL operation.
  */
@@ -231,6 +208,8 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
                        else
                                ret = -ERANGE;
                }
+       } else if (ret == -ENOTSUPP) {
+               ret = -ENODATA;
        }
 
 error_yacl:
@@ -256,6 +235,7 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler,
 {
        struct afs_operation *op;
        struct afs_vnode *vnode = AFS_FS_I(inode);
+       int ret;
 
        if (flags == XATTR_CREATE ||
            strcmp(name, "acl") != 0)
@@ -270,7 +250,10 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler,
                return afs_put_operation(op);
 
        op->ops = &yfs_store_opaque_acl2_operation;
-       return afs_do_sync_operation(op);
+       ret = afs_do_sync_operation(op);
+       if (ret == -ENOTSUPP)
+               ret = -ENODATA;
+       return ret;
 }
 
 static const struct xattr_handler afs_xattr_yfs_handler = {
index 92ed7d5..09d6f72 100644 (file)
@@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
                bio.bi_opf = dio_bio_write_op(iocb);
                task_io_account_write(ret);
        }
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               bio.bi_opf |= REQ_NOWAIT;
        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(&bio, iocb);
 
@@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                        bio->bi_opf = dio_bio_write_op(iocb);
                        task_io_account_write(bio->bi_iter.bi_size);
                }
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       bio->bi_opf |= REQ_NOWAIT;
 
                dio->size += bio->bi_iter.bi_size;
                pos += bio->bi_iter.bi_size;
@@ -1240,13 +1244,13 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 
        lockdep_assert_held(&bdev->bd_mutex);
 
-       clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
-
 rescan:
        ret = blk_drop_partitions(bdev);
        if (ret)
                return ret;
 
+       clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
        /*
         * Historically we only set the capacity to zero for devices that
         * support partitions (independ of actually having partitions created).
index b634c42..b4fb997 100644 (file)
@@ -7,10 +7,12 @@ subdir-ccflags-y += -Wmissing-format-attribute
 subdir-ccflags-y += -Wmissing-prototypes
 subdir-ccflags-y += -Wold-style-definition
 subdir-ccflags-y += -Wmissing-include-dirs
-subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
-subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
-subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
-subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
+condflags := \
+       $(call cc-option, -Wunused-but-set-variable)            \
+       $(call cc-option, -Wunused-const-variable)              \
+       $(call cc-option, -Wpacked-not-aligned)                 \
+       $(call cc-option, -Wstringop-truncation)
+subdir-ccflags-y += $(condflags)
 # The following turn off the warnings enabled by -Wextra
 subdir-ccflags-y += -Wno-missing-field-initializers
 subdir-ccflags-y += -Wno-sign-compare
index d56730a..34b929b 100644 (file)
@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
+                       btrfs_tree_read_lock(old);
                        eb = btrfs_clone_extent_buffer(old);
+                       btrfs_tree_read_unlock(old);
                        free_extent_buffer(old);
                }
        } else if (old_root) {
index 3a9c1e0..d05f735 100644 (file)
@@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
        struct btrfs_dev_replace_item *ptr;
        u64 src_devid;
 
+       if (!dev_root)
+               return 0;
+
        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
index 41b718c..289f1f0 100644 (file)
@@ -2387,8 +2387,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        } else {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->dev_root = root;
-               btrfs_init_devices_late(fs_info);
        }
+       /* Initialize fs_info for all devices in any case */
+       btrfs_init_devices_late(fs_info);
 
        /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
        if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
@@ -3009,6 +3010,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
                }
        }
 
+       /*
+        * btrfs_find_orphan_roots() is responsible for finding all the dead
+        * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+        * them into the fs_info->fs_roots_radix tree. This must be done before
+        * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+        * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+        * item before the root's tree is deleted - this means that if we unmount
+        * or crash before the deletion completes, on the next mount we will not
+        * delete what remains of the tree because the orphan item does not
+        * exist anymore, which is what tells us we have a pending deletion.
+        */
+       ret = btrfs_find_orphan_roots(fs_info);
+       if (ret)
+               goto out;
+
        ret = btrfs_cleanup_fs_roots(fs_info);
        if (ret)
                goto out;
@@ -3068,7 +3084,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
                }
        }
 
-       ret = btrfs_find_orphan_roots(fs_info);
 out:
        return ret;
 }
index 78ad31a..36a3c97 100644 (file)
@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
        if (last_ref && btrfs_header_generation(buf) == trans->transid) {
                struct btrfs_block_group *cache;
+               bool must_pin = false;
 
                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                        ret = check_ref_cleanup(trans, buf->start);
@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                        goto out;
                }
 
-               if (btrfs_is_zoned(fs_info)) {
+               /*
+                * If this is a leaf and there are tree mod log users, we may
+                * have recorded mod log operations that point to this leaf.
+                * So we must make sure no one reuses this leaf's extent before
+                * mod log operations are applied to a node, otherwise after
+                * rewinding a node using the mod log operations we get an
+                * inconsistent btree, as the leaf's extent may now be used as
+                * a node or leaf for another different btree.
+                * We are safe from races here because at this point no other
+                * node or root points to this extent buffer, so if after this
+                * check a new tree mod log user joins, it will not be able to
+                * find a node pointing to this leaf and record operations that
+                * point to this leaf.
+                */
+               if (btrfs_header_level(buf) == 0) {
+                       read_lock(&fs_info->tree_mod_log_lock);
+                       must_pin = !list_empty(&fs_info->tree_mod_seq_list);
+                       read_unlock(&fs_info->tree_mod_log_lock);
+               }
+
+               if (must_pin || btrfs_is_zoned(fs_info)) {
                        btrfs_redirty_list_add(trans->transaction, buf);
                        pin_down_extent(trans, cache, buf->start, buf->len, 1);
                        btrfs_put_block_group(cache);
index 191e358..910769d 100644 (file)
@@ -2886,6 +2886,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
 }
 
 /*
+ * Find extent buffer for a given bytenr.
+ *
+ * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
+ * in endio context.
+ */
+static struct extent_buffer *find_extent_buffer_readpage(
+               struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+       struct extent_buffer *eb;
+
+       /*
+        * For regular sectorsize, we can use page->private to grab extent
+        * buffer
+        */
+       if (fs_info->sectorsize == PAGE_SIZE) {
+               ASSERT(PagePrivate(page) && page->private);
+               return (struct extent_buffer *)page->private;
+       }
+
+       /* For subpage case, we need to lookup buffer radix tree */
+       rcu_read_lock();
+       eb = radix_tree_lookup(&fs_info->buffer_radix,
+                              bytenr >> fs_info->sectorsize_bits);
+       rcu_read_unlock();
+       ASSERT(eb);
+       return eb;
+}
+
+/*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
  * set the uptodate bits if things worked
@@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio)
                } else {
                        struct extent_buffer *eb;
 
-                       eb = (struct extent_buffer *)page->private;
+                       eb = find_extent_buffer_readpage(fs_info, page, start);
                        set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
                        eb->read_mirror = mirror;
                        atomic_dec(&eb->io_pages);
@@ -3020,7 +3049,7 @@ readpage_ok:
                         */
                        if (page->index == end_index && i_size <= end) {
                                u32 zero_start = max(offset_in_page(i_size),
-                                                    offset_in_page(end));
+                                                    offset_in_page(start));
 
                                zero_user_segment(page, zero_start,
                                                  offset_in_page(end) + 1);
index 35bfa05..a520775 100644 (file)
@@ -3099,11 +3099,13 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
  * @bio_offset:        offset to the beginning of the bio (in bytes)
  * @page:      page where is the data to be verified
  * @pgoff:     offset inside the page
+ * @start:     logical offset in the file
  *
  * The length of such check is always one sector size.
  */
 static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
-                          u32 bio_offset, struct page *page, u32 pgoff)
+                          u32 bio_offset, struct page *page, u32 pgoff,
+                          u64 start)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -3130,8 +3132,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
        kunmap_atomic(kaddr);
        return 0;
 zeroit:
-       btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
-                                   csum, csum_expected, io_bio->mirror_num);
+       btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
+                                   io_bio->mirror_num);
        if (io_bio->device)
                btrfs_dev_stat_inc_and_print(io_bio->device,
                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3184,7 +3186,8 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
             pg_off += sectorsize, bio_offset += sectorsize) {
                int ret;
 
-               ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+               ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+                                     page_offset(page) + pg_off);
                if (ret < 0)
                        return -EIO;
        }
@@ -7910,7 +7913,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
                        ASSERT(pgoff < PAGE_SIZE);
                        if (uptodate &&
                            (!csum || !check_data_csum(inode, io_bio,
-                                       bio_offset, bvec.bv_page, pgoff))) {
+                                                      bio_offset, bvec.bv_page,
+                                                      pgoff, start))) {
                                clean_io_failure(fs_info, failure_tree, io_tree,
                                                 start, bvec.bv_page,
                                                 btrfs_ino(BTRFS_I(inode)),
@@ -8169,10 +8173,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
                bio->bi_end_io = btrfs_end_dio_bio;
                btrfs_io_bio(bio)->logical = file_offset;
 
-               WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
-                            fs_info->max_zone_append_size &&
-                            bio_op(bio) != REQ_OP_ZONE_APPEND);
-
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                        status = extract_ordered_extent(BTRFS_I(inode), bio,
                                                        file_offset);
@@ -9008,7 +9008,7 @@ int __init btrfs_init_cachep(void)
 
        btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
                                                        PAGE_SIZE, PAGE_SIZE,
-                                                       SLAB_RED_ZONE, NULL);
+                                                       SLAB_MEM_SPREAD, NULL);
        if (!btrfs_free_space_bitmap_cachep)
                goto fail;
 
@@ -9877,6 +9877,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
        struct btrfs_path *path;
        u64 start = ins->objectid;
        u64 len = ins->offset;
+       int qgroup_released;
        int ret;
 
        memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9889,16 +9890,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
        btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
        /* Encryption and other encoding is reserved and all 0 */
 
-       ret = btrfs_qgroup_release_data(inode, file_offset, len);
-       if (ret < 0)
-               return ERR_PTR(ret);
+       qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
+       if (qgroup_released < 0)
+               return ERR_PTR(qgroup_released);
 
        if (trans) {
                ret = insert_reserved_file_extent(trans, inode,
                                                  file_offset, &stack_fi,
-                                                 true, ret);
+                                                 true, qgroup_released);
                if (ret)
-                       return ERR_PTR(ret);
+                       goto free_qgroup;
                return trans;
        }
 
@@ -9909,21 +9910,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
        extent_info.file_offset = file_offset;
        extent_info.extent_buf = (char *)&stack_fi;
        extent_info.is_new_extent = true;
-       extent_info.qgroup_reserved = ret;
+       extent_info.qgroup_reserved = qgroup_released;
        extent_info.insertions = 0;
 
        path = btrfs_alloc_path();
-       if (!path)
-               return ERR_PTR(-ENOMEM);
+       if (!path) {
+               ret = -ENOMEM;
+               goto free_qgroup;
+       }
 
        ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
                                     file_offset + len - 1, &extent_info,
                                     &trans);
        btrfs_free_path(path);
        if (ret)
-               return ERR_PTR(ret);
-
+               goto free_qgroup;
        return trans;
+
+free_qgroup:
+       /*
+        * We have released qgroup data range at the beginning of the function,
+        * and normally qgroup_released bytes will be freed when committing
+        * transaction.
+        * But if we error out early, we have to free what we have released
+        * or we leak qgroup data reservation.
+        */
+       btrfs_qgroup_free_refroot(inode->root->fs_info,
+                       inode->root->root_key.objectid, qgroup_released,
+                       BTRFS_QGROUP_RSV_DATA);
+       return ERR_PTR(ret);
 }
 
 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
index 14ff388..f0b9ef1 100644 (file)
@@ -226,7 +226,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
 {
        struct btrfs_qgroup_list *list;
 
-       btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
        list_del(&qgroup->dirty);
        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
@@ -243,7 +242,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
                list_del(&list->next_member);
                kfree(list);
        }
-       kfree(qgroup);
 }
 
 /* must be called with qgroup_lock held */
@@ -569,6 +567,8 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                rb_erase(n, &fs_info->qgroup_tree);
                __del_qgroup_rb(fs_info, qgroup);
+               btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+               kfree(qgroup);
        }
        /*
         * We call btrfs_free_qgroup_config() when unmounting
@@ -1578,6 +1578,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
        spin_lock(&fs_info->qgroup_lock);
        del_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);
+
+       /*
+        * Remove the qgroup from sysfs now without holding the qgroup_lock
+        * spinlock, since the sysfs_remove_group() function needs to take
+        * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+        */
+       btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+       kfree(qgroup);
 out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
index 20fd4aa..06713a8 100644 (file)
@@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
        /* find extent */
        spin_lock(&fs_info->reada_lock);
        re = radix_tree_lookup(&fs_info->reada_tree,
-                              eb->start >> PAGE_SHIFT);
+                              eb->start >> fs_info->sectorsize_bits);
        if (re)
                re->refcnt++;
        spin_unlock(&fs_info->reada_lock);
@@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
        zone = NULL;
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-                                    logical >> PAGE_SHIFT, 1);
+                                    logical >> fs_info->sectorsize_bits, 1);
        if (ret == 1 && logical >= zone->start && logical <= zone->end) {
                kref_get(&zone->refcnt);
                spin_unlock(&fs_info->reada_lock);
@@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
 
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&dev->reada_zones,
-                               (unsigned long)(zone->end >> PAGE_SHIFT),
-                               zone);
+                       (unsigned long)(zone->end >> fs_info->sectorsize_bits),
+                       zone);
 
        if (ret == -EEXIST) {
                kfree(zone);
                ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-                                            logical >> PAGE_SHIFT, 1);
+                                       logical >> fs_info->sectorsize_bits, 1);
                if (ret == 1 && logical >= zone->start && logical <= zone->end)
                        kref_get(&zone->refcnt);
                else
@@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
        u64 length;
        int real_stripes;
        int nzones = 0;
-       unsigned long index = logical >> PAGE_SHIFT;
+       unsigned long index = logical >> fs_info->sectorsize_bits;
        int dev_replace_is_ongoing;
        int have_zone = 0;
 
@@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
                             struct reada_extent *re)
 {
        int i;
-       unsigned long index = re->logical >> PAGE_SHIFT;
+       unsigned long index = re->logical >> fs_info->sectorsize_bits;
 
        spin_lock(&fs_info->reada_lock);
        if (--re->refcnt) {
@@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
 static void reada_zone_release(struct kref *kref)
 {
        struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+       struct btrfs_fs_info *fs_info = zone->device->fs_info;
 
-       lockdep_assert_held(&zone->device->fs_info->reada_lock);
+       lockdep_assert_held(&fs_info->reada_lock);
 
        radix_tree_delete(&zone->device->reada_zones,
-                         zone->end >> PAGE_SHIFT);
+                         zone->end >> fs_info->sectorsize_bits);
 
        kfree(zone);
 }
@@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
 static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
 {
        int i;
-       unsigned long index = zone->end >> PAGE_SHIFT;
+       unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
 
        for (i = 0; i < zone->ndevs; ++i) {
                struct reada_zone *peer;
@@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
                                             (void **)&zone, index, 1);
                if (ret == 0)
                        break;
-               index = (zone->end >> PAGE_SHIFT) + 1;
+               index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
                if (zone->locked) {
                        if (zone->elems > top_locked_elems) {
                                top_locked_elems = zone->elems;
@@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
         * plugging to speed things up
         */
        ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-                                    dev->reada_next >> PAGE_SHIFT, 1);
+                               dev->reada_next >> fs_info->sectorsize_bits, 1);
        if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
                ret = reada_pick_zone(dev);
                if (!ret) {
@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
                }
                re = NULL;
                ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-                                       dev->reada_next >> PAGE_SHIFT, 1);
+                               dev->reada_next >> fs_info->sectorsize_bits, 1);
        }
        if (ret == 0) {
                spin_unlock(&fs_info->reada_lock);
@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                                pr_cont(" curr off %llu",
                                        device->reada_next - zone->start);
                        pr_cont("\n");
-                       index = (zone->end >> PAGE_SHIFT) + 1;
+                       index = (zone->end >> fs_info->sectorsize_bits) + 1;
                }
                cnt = 0;
                index = 0;
@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                                }
                        }
                        pr_cont("\n");
-                       index = (re->logical >> PAGE_SHIFT) + 1;
+                       index = (re->logical >> fs_info->sectorsize_bits) + 1;
                        if (++cnt > 15)
                                break;
                }
@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                if (ret == 0)
                        break;
                if (!re->scheduled) {
-                       index = (re->logical >> PAGE_SHIFT) + 1;
+                       index = (re->logical >> fs_info->sectorsize_bits) + 1;
                        continue;
                }
                pr_debug("re: logical %llu size %u list empty %d scheduled %d",
@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
                        }
                }
                pr_cont("\n");
-               index = (re->logical >> PAGE_SHIFT) + 1;
+               index = (re->logical >> fs_info->sectorsize_bits) + 1;
        }
        spin_unlock(&fs_info->reada_lock);
 }
index 2f1acc9..92a3686 100644 (file)
@@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        mutex_lock(&log_root_tree->log_mutex);
 
-       index2 = log_root_tree->log_transid % 2;
-       list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
-       root_log_ctx.log_transid = log_root_tree->log_transid;
-
        if (btrfs_is_zoned(fs_info)) {
                if (!log_root_tree->node) {
                        ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
@@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                }
        }
 
+       index2 = log_root_tree->log_transid % 2;
+       list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+       root_log_ctx.log_transid = log_root_tree->log_transid;
+
        /*
         * Now we are safe to update the log_root_tree because we're under the
         * log_mutex, and we're a current writer so we're holding the commit
index bc3b33e..1c6810b 100644 (file)
@@ -7448,6 +7448,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
        int item_size;
        int i, ret, slot;
 
+       if (!device->fs_info->dev_root)
+               return 0;
+
        key.objectid = BTRFS_DEV_STATS_OBJECTID;
        key.type = BTRFS_PERSISTENT_ITEM_KEY;
        key.offset = device->devid;
index dfb14db..38bb776 100644 (file)
@@ -118,6 +118,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
        cache->mnt = path.mnt;
        root = path.dentry;
 
+       ret = -EINVAL;
+       if (mnt_user_ns(path.mnt) != &init_user_ns) {
+               pr_warn("File cache on idmapped mounts not supported");
+               goto error_unsupported;
+       }
+
        /* check parameters */
        ret = -EOPNOTSUPP;
        if (d_is_negative(root) ||
index e027c71..8ffc40e 100644 (file)
@@ -24,17 +24,16 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
                container_of(wait, struct cachefiles_one_read, monitor);
        struct cachefiles_object *object;
        struct fscache_retrieval *op = monitor->op;
-       struct wait_bit_key *key = _key;
+       struct wait_page_key *key = _key;
        struct page *page = wait->private;
 
        ASSERT(key);
 
        _enter("{%lu},%u,%d,{%p,%u}",
               monitor->netfs_page->index, mode, sync,
-              key->flags, key->bit_nr);
+              key->page, key->bit_nr);
 
-       if (key->flags != &page->flags ||
-           key->bit_nr != PG_locked)
+       if (key->page != page || key->bit_nr != PG_locked)
                return 0;
 
        _debug("--- monitor %p %lx ---", page, page->flags);
index f2d730f..d829b8b 100644 (file)
@@ -248,7 +248,7 @@ nlmsg_fail:
 
 /*
  * Try to find a matching registration for the tcon's server name and share name.
- * Calls to this funciton must be protected by cifs_swnreg_idr_mutex.
+ * Calls to this function must be protected by cifs_swnreg_idr_mutex.
  * TODO Try to avoid memory allocations
  */
 static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon)
index 9d29eb9..d178cf8 100644 (file)
@@ -1118,7 +1118,6 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
        /* Retain old ACEs which we can retain */
        for (i = 0; i < src_num_aces; ++i) {
                pntace = (struct cifs_ace *) (acl_base + size);
-               pnntace = (struct cifs_ace *) (nacl_base + nsize);
 
                if (!new_aces_set && (pntace->flags & INHERITED_ACE)) {
                        /* Place the new ACEs in between existing explicit and inherited */
@@ -1131,14 +1130,17 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
                }
 
                /* If it's any one of the ACE we're replacing, skip! */
-               if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
+               if (((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
                                (compare_sids(&pntace->sid, pownersid) == 0) ||
                                (compare_sids(&pntace->sid, pgrpsid) == 0) ||
                                (compare_sids(&pntace->sid, &sid_everyone) == 0) ||
-                               (compare_sids(&pntace->sid, &sid_authusers) == 0)) {
+                               (compare_sids(&pntace->sid, &sid_authusers) == 0))) {
                        goto next_ace;
                }
 
+               /* update the pointer to the next ACE to populate */
+               pnntace = (struct cifs_ace *) (nacl_base + nsize);
+
                nsize += cifs_copy_ace(pnntace, pntace, NULL);
                num_aces++;
 
index 31fc869..67c056a 100644 (file)
@@ -919,8 +919,8 @@ struct cifs_ses {
        bool binding:1; /* are we binding the session? */
        __u16 session_flags;
        __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
-       __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
-       __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+       __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
+       __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
        __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
 
        __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
index 64fe5a4..9adc74b 100644 (file)
  */
 #define SMB3_SIGN_KEY_SIZE (16)
 
+/*
+ * Size of the smb3 encryption/decryption keys
+ */
+#define SMB3_ENC_DEC_KEY_SIZE (32)
+
 #define CIFS_CLIENT_CHALLENGE_SIZE (8)
 #define CIFS_SERVER_CHALLENGE_SIZE (8)
 #define CIFS_HMAC_MD5_HASH_SIZE (16)
index 26de432..042e24a 100644 (file)
@@ -165,6 +165,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
                        goto posix_open_ret;
                }
        } else {
+               cifs_revalidate_mapping(*pinode);
                cifs_fattr_to_inode(*pinode, &fattr);
        }
 
index 892f51a..7888902 100644 (file)
@@ -1196,9 +1196,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
                pr_warn_once("Witness protocol support is experimental\n");
                break;
        case Opt_rootfs:
-#ifdef CONFIG_CIFS_ROOT
-               ctx->rootfs = true;
+#ifndef CONFIG_CIFS_ROOT
+               cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n");
+               goto cifs_parse_mount_err;
 #endif
+               ctx->rootfs = true;
                break;
        case Opt_posixpaths:
                if (result.negated)
index 7c61bc9..f2df442 100644 (file)
@@ -2395,7 +2395,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
         * We need to be sure that all dirty pages are written and the server
         * has actual ctime, mtime and file length.
         */
-       if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) &&
+       if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE | STATX_BLOCKS)) &&
            !CIFS_CACHE_READ(CIFS_I(inode)) &&
            inode->i_mapping && inode->i_mapping->nrpages != 0) {
                rc = filemap_fdatawait(inode->i_mapping);
@@ -2585,6 +2585,14 @@ set_size_out:
        if (rc == 0) {
                cifsInode->server_eof = attrs->ia_size;
                cifs_setsize(inode, attrs->ia_size);
+               /*
+                * i_blocks is not derived from (i_size / i_blksize); it is
+                * counted in 512-byte (2**9) units.  Until we can query the
+                * server for the actual allocation size, this is the best
+                * estimate we have of the blocks allocated for the file.
+                * Round up so that a file of size 1 does not report 0 blocks.
+                */
+               inode->i_blocks = (512 - 1 + attrs->ia_size) >> 9;
 
                /*
                 * The man page of truncate says if the size changed,
index 99a1951..d9a990c 100644 (file)
@@ -58,6 +58,7 @@
 #define SMB2_HMACSHA256_SIZE (32)
 #define SMB2_CMACAES_SIZE (16)
 #define SMB3_SIGNKEY_SIZE (16)
+#define SMB3_GCM128_CRYPTKEY_SIZE (16)
 #define SMB3_GCM256_CRYPTKEY_SIZE (32)
 
 /* Maximum buffer size value we can send with 1 credit */
index b50164e..aac384f 100644 (file)
@@ -754,8 +754,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
                }
        }
        spin_unlock(&cifs_tcp_ses_lock);
-       cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
-       return false;
+       cifs_dbg(FYI, "No file id matched, oplock break ignored\n");
+       return true;
 }
 
 void
index 9bae7e8..f703204 100644 (file)
@@ -2038,6 +2038,7 @@ smb2_duplicate_extents(const unsigned int xid,
 {
        int rc;
        unsigned int ret_data_len;
+       struct inode *inode;
        struct duplicate_extents_to_file dup_ext_buf;
        struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink);
 
@@ -2054,10 +2055,21 @@ smb2_duplicate_extents(const unsigned int xid,
        cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n",
                src_off, dest_off, len);
 
-       rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
-       if (rc)
-               goto duplicate_extents_out;
+       inode = d_inode(trgtfile->dentry);
+       if (inode->i_size < dest_off + len) {
+               rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
+               if (rc)
+                       goto duplicate_extents_out;
 
+               /*
+                * Although we could also set a plausible allocation size
+                * (i_blocks) here in addition to the file size, with reflink
+                * the target file is likely to be sparse. Its allocation size
+                * will be queried on the next revalidate, but it is important
+                * that the file's cached size is updated immediately.
+                */
+               cifs_setsize(inode, dest_off + len);
+       }
        rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
                        trgtfile->fid.volatile_fid,
                        FSCTL_DUPLICATE_EXTENTS_TO_FILE,
@@ -4158,7 +4170,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
                        if (ses->Suid == ses_id) {
                                ses_enc_key = enc ? ses->smb3encryptionkey :
                                        ses->smb3decryptionkey;
-                               memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
+                               memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
                                spin_unlock(&cifs_tcp_ses_lock);
                                return 0;
                        }
@@ -4185,7 +4197,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
        int rc = 0;
        struct scatterlist *sg;
        u8 sign[SMB2_SIGNATURE_SIZE] = {};
-       u8 key[SMB3_SIGN_KEY_SIZE];
+       u8 key[SMB3_ENC_DEC_KEY_SIZE];
        struct aead_request *req;
        char *iv;
        unsigned int iv_len;
@@ -4209,10 +4221,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
        tfm = enc ? server->secmech.ccmaesencrypt :
                                                server->secmech.ccmaesdecrypt;
 
-       if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+       if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+               (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
                rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
        else
-               rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);
+               rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE);
 
        if (rc) {
                cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
index ebccd71..e6fa76a 100644 (file)
@@ -298,7 +298,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
 {
        unsigned char zero = 0x0;
        __u8 i[4] = {0, 0, 0, 1};
-       __u8 L[4] = {0, 0, 0, 128};
+       __u8 L128[4] = {0, 0, 0, 128};
+       __u8 L256[4] = {0, 0, 1, 0};
        int rc = 0;
        unsigned char prfhash[SMB2_HMACSHA256_SIZE];
        unsigned char *hashptr = prfhash;
@@ -354,8 +355,14 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
                goto smb3signkey_ret;
        }
 
-       rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
-                               L, 4);
+       if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+               (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
+               rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+                               L256, 4);
+       } else {
+               rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+                               L128, 4);
+       }
        if (rc) {
                cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
                goto smb3signkey_ret;
@@ -390,6 +397,9 @@ generate_smb3signingkey(struct cifs_ses *ses,
                        const struct derivation_triplet *ptriplet)
 {
        int rc;
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+       struct TCP_Server_Info *server = ses->server;
+#endif
 
        /*
         * All channels use the same encryption/decryption keys but
@@ -422,11 +432,11 @@ generate_smb3signingkey(struct cifs_ses *ses,
                rc = generate_key(ses, ptriplet->encryption.label,
                                  ptriplet->encryption.context,
                                  ses->smb3encryptionkey,
-                                 SMB3_SIGN_KEY_SIZE);
+                                 SMB3_ENC_DEC_KEY_SIZE);
                rc = generate_key(ses, ptriplet->decryption.label,
                                  ptriplet->decryption.context,
                                  ses->smb3decryptionkey,
-                                 SMB3_SIGN_KEY_SIZE);
+                                 SMB3_ENC_DEC_KEY_SIZE);
                if (rc)
                        return rc;
        }
@@ -442,14 +452,23 @@ generate_smb3signingkey(struct cifs_ses *ses,
         */
        cifs_dbg(VFS, "Session Id    %*ph\n", (int)sizeof(ses->Suid),
                        &ses->Suid);
+       cifs_dbg(VFS, "Cipher type   %d\n", server->cipher_type);
        cifs_dbg(VFS, "Session Key   %*ph\n",
                 SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
        cifs_dbg(VFS, "Signing Key   %*ph\n",
                 SMB3_SIGN_KEY_SIZE, ses->smb3signingkey);
-       cifs_dbg(VFS, "ServerIn Key  %*ph\n",
-                SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey);
-       cifs_dbg(VFS, "ServerOut Key %*ph\n",
-                SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey);
+       if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+               (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
+               cifs_dbg(VFS, "ServerIn Key  %*ph\n",
+                               SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3encryptionkey);
+               cifs_dbg(VFS, "ServerOut Key %*ph\n",
+                               SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3decryptionkey);
+       } else {
+               cifs_dbg(VFS, "ServerIn Key  %*ph\n",
+                               SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3encryptionkey);
+               cifs_dbg(VFS, "ServerOut Key %*ph\n",
+                               SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3decryptionkey);
+       }
 #endif
        return rc;
 }
index 007d994..c1725b5 100644 (file)
@@ -1196,9 +1196,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
        /*
         * Compounding is never used during session establish.
         */
-       if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP))
+       if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+               mutex_lock(&server->srv_mutex);
                smb311_update_preauth_hash(ses, rqst[0].rq_iov,
                                           rqst[0].rq_nvec);
+               mutex_unlock(&server->srv_mutex);
+       }
 
        for (i = 0; i < num_rqst; i++) {
                rc = wait_for_response(server, midQ[i]);
@@ -1266,7 +1269,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
                        .iov_base = resp_iov[0].iov_base,
                        .iov_len = resp_iov[0].iov_len
                };
+               mutex_lock(&server->srv_mutex);
                smb311_update_preauth_hash(ses, &iov, 1);
+               mutex_unlock(&server->srv_mutex);
        }
 
 out:
index f45f9fe..74a5172 100644 (file)
@@ -626,27 +626,41 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
 
 /**
  * ext4_should_retry_alloc() - check if a block allocation should be retried
- * @sb:                        super block
- * @retries:           number of attemps has been made
+ * @sb:                        superblock
+ * @retries:           number of retry attempts made so far
  *
- * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE.  We will only retry once.
+ * ext4_should_retry_alloc() is called when ENOSPC is returned while
+ * attempting to allocate blocks.  If there's an indication that a pending
+ * journal transaction might free some space and allow another attempt to
+ * succeed, this function will wait for the current or committing transaction
+ * to complete and then return TRUE.
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-       if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
-           (*retries)++ > 1 ||
-           !EXT4_SB(sb)->s_journal)
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!sbi->s_journal)
                return 0;
 
-       smp_mb();
-       if (EXT4_SB(sb)->s_mb_free_pending == 0)
+       if (++(*retries) > 3) {
+               percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
                return 0;
+       }
 
+       /*
+        * if there's no indication that blocks are about to be freed it's
+        * possible we just missed a transaction commit that did so
+        */
+       smp_mb();
+       if (sbi->s_mb_free_pending == 0)
+               return ext4_has_free_clusters(sbi, 1, 0);
+
+       /*
+        * it's possible we've just missed a transaction commit here,
+        * so ignore the returned status
+        */
        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
-       jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+       (void) jbd2_journal_force_commit_nested(sbi->s_journal);
        return 1;
 }
 
index 644fd69..826a56e 100644 (file)
@@ -1484,6 +1484,7 @@ struct ext4_sb_info {
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyclusters_counter;
+       struct percpu_counter s_sra_exceeded_retry_limit;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
@@ -2793,6 +2794,8 @@ void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+                           struct dentry *dentry);
 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
 void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
 void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
index 77c7c8a..77c84d6 100644 (file)
@@ -4382,7 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 {
        struct inode *inode = file_inode(file);
        handle_t *handle;
-       int ret, ret2 = 0, ret3 = 0;
+       int ret = 0, ret2 = 0, ret3 = 0;
        int retries = 0;
        int depth = 0;
        struct ext4_map_blocks map;
index 6c4f19b..7541d0b 100644 (file)
@@ -513,10 +513,10 @@ void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
        __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 }
 
-void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+                         struct dentry *dentry)
 {
        struct __track_dentry_update_args args;
-       struct inode *inode = d_inode(dentry);
        int ret;
 
        args.dentry = dentry;
@@ -527,6 +527,11 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
        trace_ext4_fc_track_create(inode, dentry, ret);
 }
 
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+{
+       __ext4_fc_track_create(handle, d_inode(dentry), dentry);
+}
+
 /* __track_fn for inode tracking */
 static int __track_inode(struct inode *inode, void *arg, bool update)
 {
index 650c5ac..0948a43 100644 (file)
@@ -1938,13 +1938,13 @@ static int __ext4_journalled_writepage(struct page *page,
        if (!ret)
                ret = err;
 
-       if (!ext4_has_inline_data(inode))
-               ext4_walk_page_buffers(NULL, page_bufs, 0, len,
-                                      NULL, bput_one);
        ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
        unlock_page(page);
 out_no_pagelock:
+       if (!inline_data && page_bufs)
+               ext4_walk_page_buffers(NULL, page_bufs, 0, len,
+                                      NULL, bput_one);
        brelse(inode_bh);
        return ret;
 }
@@ -5026,7 +5026,7 @@ static int ext4_do_update_inode(handle_t *handle,
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh = iloc->bh;
        struct super_block *sb = inode->i_sb;
-       int err = 0, rc, block;
+       int err = 0, block;
        int need_datasync = 0, set_large_file = 0;
        uid_t i_uid;
        gid_t i_gid;
@@ -5138,9 +5138,9 @@ static int ext4_do_update_inode(handle_t *handle,
                                              bh->b_data);
 
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       rc = ext4_handle_dirty_metadata(handle, NULL, bh);
-       if (!err)
-               err = rc;
+       err = ext4_handle_dirty_metadata(handle, NULL, bh);
+       if (err)
+               goto out_brelse;
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        if (set_large_file) {
                BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
@@ -5387,8 +5387,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
                        inode->i_gid = attr->ia_gid;
                error = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
-               if (unlikely(error))
+               if (unlikely(error)) {
+                       ext4_fc_stop_update(inode);
                        return error;
+               }
        }
 
        if (attr->ia_valid & ATTR_SIZE) {
index 99bf091..a02fadf 100644 (file)
@@ -2709,8 +2709,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
        }
 
        if (ext4_has_feature_flex_bg(sb)) {
-               /* a single flex group is supposed to be read by a single IO */
-               sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
+               /* a single flex group is supposed to be read by a single IO.
+                * s_mb_prefetch is an unsigned integer, so the shift count
+                * s_log_groups_per_flex must be less than 32.
+                */
+               if (sbi->s_es->s_log_groups_per_flex >= 32) {
+                       ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
+                       goto err_freesgi;
+               }
+               sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
                        BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
                sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
        } else {
index 686bf98..883e2a7 100644 (file)
@@ -3613,6 +3613,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
        return retval;
 }
 
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
+                         unsigned ino, unsigned file_type)
+{
+       struct ext4_renament old = *ent;
+       int retval = 0;
+
+       /*
+        * old.de could have moved from under us while the directory was
+        * being made indexed, so it may no longer be valid and has to be
+        * looked up again before the old entry's inode info is reset.
+        */
+       old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+       if (IS_ERR(old.bh))
+               retval = PTR_ERR(old.bh);
+       if (!old.bh)
+               retval = -ENOENT;
+       if (retval) {
+               ext4_std_error(old.dir->i_sb, retval);
+               return;
+       }
+
+       ext4_setent(handle, &old, ino, file_type);
+       brelse(old.bh);
+}
+
 static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
                                  const struct qstr *d_name)
 {
@@ -3774,14 +3799,14 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
         */
        retval = -ENOENT;
        if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
-               goto end_rename;
+               goto release_bh;
 
        new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
                                 &new.de, &new.inlined);
        if (IS_ERR(new.bh)) {
                retval = PTR_ERR(new.bh);
                new.bh = NULL;
-               goto end_rename;
+               goto release_bh;
        }
        if (new.bh) {
                if (!new.inode) {
@@ -3798,15 +3823,13 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
                handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
                if (IS_ERR(handle)) {
                        retval = PTR_ERR(handle);
-                       handle = NULL;
-                       goto end_rename;
+                       goto release_bh;
                }
        } else {
                whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle);
                if (IS_ERR(whiteout)) {
                        retval = PTR_ERR(whiteout);
-                       whiteout = NULL;
-                       goto end_rename;
+                       goto release_bh;
                }
        }
 
@@ -3850,6 +3873,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
                retval = ext4_mark_inode_dirty(handle, whiteout);
                if (unlikely(retval))
                        goto end_rename;
+
        }
        if (!new.bh) {
                retval = ext4_add_entry(handle, new.dentry, old.inode);
@@ -3923,6 +3947,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
                        ext4_fc_track_unlink(handle, new.dentry);
                __ext4_fc_track_link(handle, old.inode, new.dentry);
                __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+               if (whiteout)
+                       __ext4_fc_track_create(handle, whiteout, old.dentry);
        }
 
        if (new.inode) {
@@ -3937,19 +3963,21 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 end_rename:
        if (whiteout) {
                if (retval) {
-                       ext4_setent(handle, &old,
-                               old.inode->i_ino, old_file_type);
+                       ext4_resetent(handle, &old,
+                                     old.inode->i_ino, old_file_type);
                        drop_nlink(whiteout);
+                       ext4_orphan_add(handle, whiteout);
                }
                unlock_new_inode(whiteout);
+               ext4_journal_stop(handle);
                iput(whiteout);
-
+       } else {
+               ext4_journal_stop(handle);
        }
+release_bh:
        brelse(old.dir_bh);
        brelse(old.bh);
        brelse(new.bh);
-       if (handle)
-               ext4_journal_stop(handle);
        return retval;
 }
 
index ad34a37..b969368 100644 (file)
@@ -1210,6 +1210,7 @@ static void ext4_put_super(struct super_block *sb)
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+       percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -5012,6 +5013,9 @@ no_journal:
                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                          GFP_KERNEL);
        if (!err)
+               err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
+                                         GFP_KERNEL);
+       if (!err)
                err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
 
        if (err) {
@@ -5124,6 +5128,7 @@ failed_mount6:
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+       percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
 failed_mount5:
        ext4_ext_release(sb);
@@ -5149,8 +5154,8 @@ failed_mount_wq:
 failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
 failed_mount3:
-       del_timer_sync(&sbi->s_err_report);
        flush_work(&sbi->s_error_work);
+       del_timer_sync(&sbi->s_err_report);
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
index 075aa3a..a3d0827 100644 (file)
@@ -24,6 +24,7 @@ typedef enum {
        attr_session_write_kbytes,
        attr_lifetime_write_kbytes,
        attr_reserved_clusters,
+       attr_sra_exceeded_retry_limit,
        attr_inode_readahead,
        attr_trigger_test_error,
        attr_first_error_time,
@@ -202,6 +203,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
                 ext4_sb_info, s_inode_readahead_blks);
@@ -251,6 +253,7 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
        ATTR_LIST(reserved_clusters),
+       ATTR_LIST(sra_exceeded_retry_limit),
        ATTR_LIST(inode_readahead_blks),
        ATTR_LIST(inode_goal),
        ATTR_LIST(mb_stats),
@@ -374,6 +377,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
                return snprintf(buf, PAGE_SIZE, "%llu\n",
                                (unsigned long long)
                                atomic64_read(&sbi->s_resv_clusters));
+       case attr_sra_exceeded_retry_limit:
+               return snprintf(buf, PAGE_SIZE, "%llu\n",
+                               (unsigned long long)
+                       percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
        case attr_inode_readahead:
        case attr_pointer_ui:
                if (!ptr)
index 5b7ba8f..00e3cbd 100644 (file)
@@ -201,55 +201,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
        struct inode *inode = file_inode(filp);
        const int credits = 2; /* superblock and inode for ext4_orphan_del() */
        handle_t *handle;
+       struct ext4_iloc iloc;
        int err = 0;
-       int err2;
 
-       if (desc != NULL) {
-               /* Succeeded; write the verity descriptor. */
-               err = ext4_write_verity_descriptor(inode, desc, desc_size,
-                                                  merkle_tree_size);
-
-               /* Write all pages before clearing VERITY_IN_PROGRESS. */
-               if (!err)
-                       err = filemap_write_and_wait(inode->i_mapping);
-       }
+       /*
+        * If an error already occurred (which fs/verity/ signals by passing
+        * desc == NULL), then only clean-up is needed.
+        */
+       if (desc == NULL)
+               goto cleanup;
 
-       /* If we failed, truncate anything we wrote past i_size. */
-       if (desc == NULL || err)
-               ext4_truncate(inode);
+       /* Append the verity descriptor. */
+       err = ext4_write_verity_descriptor(inode, desc, desc_size,
+                                          merkle_tree_size);
+       if (err)
+               goto cleanup;
 
        /*
-        * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
-        * deleting the inode from the orphan list, even if something failed.
-        * If everything succeeded, we'll also set the verity bit in the same
-        * transaction.
+        * Write all pages (both data and verity metadata).  Note that this must
+        * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
+        * beyond i_size won't be written properly.  For crash consistency, this
+        * also must happen before the verity inode flag gets persisted.
         */
+       err = filemap_write_and_wait(inode->i_mapping);
+       if (err)
+               goto cleanup;
 
-       ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+       /*
+        * Finally, set the verity inode flag and remove the inode from the
+        * orphan list (in a single transaction).
+        */
 
        handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
        if (IS_ERR(handle)) {
-               ext4_orphan_del(NULL, inode);
-               return PTR_ERR(handle);
+               err = PTR_ERR(handle);
+               goto cleanup;
        }
 
-       err2 = ext4_orphan_del(handle, inode);
-       if (err2)
-               goto out_stop;
+       err = ext4_orphan_del(handle, inode);
+       if (err)
+               goto stop_and_cleanup;
 
-       if (desc != NULL && !err) {
-               struct ext4_iloc iloc;
+       err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto stop_and_cleanup;
 
-               err = ext4_reserve_inode_write(handle, inode, &iloc);
-               if (err)
-                       goto out_stop;
-               ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
-               ext4_set_inode_flags(inode, false);
-               err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-       }
-out_stop:
+       ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+       ext4_set_inode_flags(inode, false);
+       err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+       if (err)
+               goto stop_and_cleanup;
+
+       ext4_journal_stop(handle);
+
+       ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+       return 0;
+
+stop_and_cleanup:
        ext4_journal_stop(handle);
-       return err ?: err2;
+cleanup:
+       /*
+        * Verity failed to be enabled, so clean up by truncating any verity
+        * metadata that was written beyond i_size (both from cache and from
+        * disk), removing the inode from the orphan list (if it wasn't done
+        * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
+        */
+       truncate_inode_pages(inode->i_mapping, inode->i_size);
+       ext4_truncate(inode);
+       ext4_orphan_del(NULL, inode);
+       ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+       return err;
 }
 
 static int ext4_get_verity_descriptor_location(struct inode *inode,
index 3722085..6c10182 100644 (file)
@@ -1462,6 +1462,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
        if (!ce)
                return NULL;
 
+       WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
+                    !(current->flags & PF_MEMALLOC_NOFS));
+
        ea_data = kvmalloc(value_len, GFP_KERNEL);
        if (!ea_data) {
                mb_cache_entry_put(ea_inode_cache, ce);
@@ -2327,6 +2330,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                        error = -ENOSPC;
                        goto cleanup;
                }
+               WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
        }
 
        error = ext4_reserve_inode_write(handle, inode, &is.iloc);
@@ -2400,7 +2404,7 @@ retry_inode:
                                 * external inode if possible.
                                 */
                                if (ext4_has_feature_ea_inode(inode->i_sb) &&
-                                   !i.in_inode) {
+                                   i.value_len && !i.in_inode) {
                                        i.in_inode = 1;
                                        goto retry_inode;
                                }
index c6636b4..c0fee83 100644 (file)
@@ -2229,19 +2229,21 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
                           unsigned long arg)
 {
-       int err = -ENOTTY;
+       int res;
+       int oldfd;
+       struct fuse_dev *fud = NULL;
 
-       if (cmd == FUSE_DEV_IOC_CLONE) {
-               int oldfd;
+       if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC)
+               return -ENOTTY;
 
-               err = -EFAULT;
-               if (!get_user(oldfd, (__u32 __user *) arg)) {
+       switch (_IOC_NR(cmd)) {
+       case _IOC_NR(FUSE_DEV_IOC_CLONE):
+               res = -EFAULT;
+               if (!get_user(oldfd, (__u32 __user *)arg)) {
                        struct file *old = fget(oldfd);
 
-                       err = -EINVAL;
+                       res = -EINVAL;
                        if (old) {
-                               struct fuse_dev *fud = NULL;
-
                                /*
                                 * Check against file->f_op because CUSE
                                 * uses the same ioctl handler.
@@ -2252,14 +2254,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 
                                if (fud) {
                                        mutex_lock(&fuse_mutex);
-                                       err = fuse_device_clone(fud->fc, file);
+                                       res = fuse_device_clone(fud->fc, file);
                                        mutex_unlock(&fuse_mutex);
                                }
                                fput(old);
                        }
                }
+               break;
+       default:
+               res = -ENOTTY;
+               break;
        }
-       return err;
+       return res;
 }
 
 const struct file_operations fuse_dev_operations = {
index 68cca8d..63d97a1 100644 (file)
@@ -863,6 +863,7 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
 
 static inline void fuse_make_bad(struct inode *inode)
 {
+       remove_inode_hash(inode);
        set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
 }
 
index 8868ac3..4ee6f73 100644 (file)
@@ -1324,8 +1324,15 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
 
        /* virtiofs allocates and installs its own fuse devices */
        ctx->fudptr = NULL;
-       if (ctx->dax)
+       if (ctx->dax) {
+               if (!fs->dax_dev) {
+                       err = -EINVAL;
+                       pr_err("virtio-fs: dax can't be enabled as filesystem"
+                              " device does not support it.\n");
+                       goto err_free_fuse_devs;
+               }
                ctx->dax_dev = fs->dax_dev;
+       }
        err = fuse_fill_super_common(sb, ctx);
        if (err < 0)
                goto err_free_fuse_devs;
index 97076d3..8fb9602 100644 (file)
@@ -162,8 +162,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
        int error;
 
        error = init_threads(sdp);
-       if (error)
+       if (error) {
+               gfs2_withdraw_delayed(sdp);
                return error;
+       }
 
        j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
        if (gfs2_withdrawn(sdp)) {
@@ -750,11 +752,13 @@ void gfs2_freeze_func(struct work_struct *work)
 static int gfs2_freeze(struct super_block *sb)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
-       int error = 0;
+       int error;
 
        mutex_lock(&sdp->sd_freeze_mutex);
-       if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+       if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
+               error = -EBUSY;
                goto out;
+       }
 
        for (;;) {
                if (gfs2_withdrawn(sdp)) {
@@ -795,10 +799,10 @@ static int gfs2_unfreeze(struct super_block *sb)
        struct gfs2_sbd *sdp = sb->s_fs_info;
 
        mutex_lock(&sdp->sd_freeze_mutex);
-        if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
+       if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
            !gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
                mutex_unlock(&sdp->sd_freeze_mutex);
-                return 0;
+               return -EINVAL;
        }
 
        gfs2_freeze_unlock(&sdp->sd_freeze_gh);
index 0ae9eca..433c4d3 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
 #include <linux/tracehook.h>
-#include <linux/freezer.h>
 
 #include "../kernel/sched/sched.h"
 #include "io-wq.h"
@@ -386,13 +385,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
        return NULL;
 }
 
-static void io_flush_signals(void)
+static bool io_flush_signals(void)
 {
-       if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
-               if (current->task_works)
-                       task_work_run();
-               clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
+       if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
+               __set_current_state(TASK_RUNNING);
+               tracehook_notify_signal();
+               return true;
        }
+       return false;
 }
 
 static void io_assign_current_work(struct io_worker *worker,
@@ -484,10 +484,12 @@ static int io_wqe_worker(void *data)
        worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
        io_wqe_inc_running(worker);
 
-       sprintf(buf, "iou-wrk-%d", wq->task_pid);
+       snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task_pid);
        set_task_comm(current, buf);
 
        while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+               long ret;
+
                set_current_state(TASK_INTERRUPTIBLE);
 loop:
                raw_spin_lock_irq(&wqe->lock);
@@ -497,11 +499,18 @@ loop:
                }
                __io_worker_idle(wqe, worker);
                raw_spin_unlock_irq(&wqe->lock);
-               io_flush_signals();
-               if (schedule_timeout(WORKER_IDLE_TIMEOUT))
+               if (io_flush_signals())
                        continue;
-               if (fatal_signal_pending(current))
+               ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
+               if (signal_pending(current)) {
+                       struct ksignal ksig;
+
+                       if (!get_signal(&ksig))
+                               continue;
                        break;
+               }
+               if (ret)
+                       continue;
                /* timed out, exit unless we're the fixed worker */
                if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
                    !(worker->flags & IO_WORKER_F_FIXED))
@@ -702,15 +711,20 @@ static int io_wq_manager(void *data)
        char buf[TASK_COMM_LEN];
        int node;
 
-       sprintf(buf, "iou-mgr-%d", wq->task_pid);
+       snprintf(buf, sizeof(buf), "iou-mgr-%d", wq->task_pid);
        set_task_comm(current, buf);
 
        do {
                set_current_state(TASK_INTERRUPTIBLE);
                io_wq_check_workers(wq);
                schedule_timeout(HZ);
-               if (fatal_signal_pending(current))
+               if (signal_pending(current)) {
+                       struct ksignal ksig;
+
+                       if (!get_signal(&ksig))
+                               continue;
                        set_bit(IO_WQ_BIT_EXIT, &wq->state);
+               }
        } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
 
        io_wq_check_workers(wq);
@@ -1057,7 +1071,11 @@ static void io_wq_destroy(struct io_wq *wq)
 
        for_each_node(node) {
                struct io_wqe *wqe = wq->wqes[node];
-               WARN_ON_ONCE(!wq_list_empty(&wqe->work_list));
+               struct io_cb_cancel_data match = {
+                       .fn             = io_wq_work_match_all,
+                       .cancel_all     = true,
+               };
+               io_wqe_cancel_pending_work(wqe, &match);
                kfree(wqe);
        }
        io_wq_put_hash(wq->hash);
index 1ac2f32..80d5905 100644 (file)
@@ -2,7 +2,6 @@
 #define INTERNAL_IO_WQ_H
 
 #include <linux/refcount.h>
-#include <linux/io_uring.h>
 
 struct io_wq;
 
@@ -21,6 +20,15 @@ enum io_wq_cancel {
        IO_WQ_CANCEL_NOTFOUND,  /* work not found */
 };
 
+struct io_wq_work_node {
+       struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+       struct io_wq_work_node *first;
+       struct io_wq_work_node *last;
+};
+
 static inline void wq_list_add_after(struct io_wq_work_node *node,
                                     struct io_wq_work_node *pos,
                                     struct io_wq_work_list *list)
index a4bce17..65a17d5 100644 (file)
@@ -78,7 +78,6 @@
 #include <linux/task_work.h>
 #include <linux/pagemap.h>
 #include <linux/io_uring.h>
-#include <linux/freezer.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -258,7 +257,8 @@ enum {
 
 struct io_sq_data {
        refcount_t              refs;
-       struct rw_semaphore     rw_lock;
+       atomic_t                park_pending;
+       struct mutex            lock;
 
        /* ctx's that are using this sqd */
        struct list_head        ctx_list;
@@ -273,6 +273,7 @@ struct io_sq_data {
 
        unsigned long           state;
        struct completion       exited;
+       struct callback_head    *park_task_work;
 };
 
 #define IO_IOPOLL_BATCH                        8
@@ -402,7 +403,7 @@ struct io_ring_ctx {
        struct socket           *ring_sock;
 #endif
 
-       struct idr              io_buffer_idr;
+       struct xarray           io_buffers;
 
        struct xarray           personalities;
        u32                     pers_next;
@@ -454,6 +455,22 @@ struct io_ring_ctx {
        struct list_head                tctx_list;
 };
 
+struct io_uring_task {
+       /* submission side */
+       struct xarray           xa;
+       struct wait_queue_head  wait;
+       const struct io_ring_ctx *last;
+       struct io_wq            *io_wq;
+       struct percpu_counter   inflight;
+       atomic_t                in_idle;
+       bool                    sqpoll;
+
+       spinlock_t              task_lock;
+       struct io_wq_work_list  task_list;
+       unsigned long           task_state;
+       struct callback_head    task_work;
+};
+
 /*
  * First field must be the file pointer in all the
  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
@@ -680,6 +697,7 @@ enum {
        REQ_F_NO_FILE_TABLE_BIT,
        REQ_F_LTIMEOUT_ACTIVE_BIT,
        REQ_F_COMPLETE_INLINE_BIT,
+       REQ_F_REISSUE_BIT,
 
        /* not a real bit, just to check we're not overflowing the space */
        __REQ_F_LAST_BIT,
@@ -723,6 +741,8 @@ enum {
        REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
        /* completion is deferred through io_comp_state */
        REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
+       /* caller should reissue async */
+       REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
 };
 
 struct async_poll {
@@ -1077,8 +1097,6 @@ static bool io_match_task(struct io_kiocb *head,
        io_for_each_link(req, head) {
                if (req->flags & REQ_F_INFLIGHT)
                        return true;
-               if (req->task->files == files)
-                       return true;
        }
        return false;
 }
@@ -1135,7 +1153,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        init_waitqueue_head(&ctx->cq_wait);
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        init_completion(&ctx->ref_comp);
-       idr_init(&ctx->io_buffer_idr);
+       xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
        xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
        mutex_init(&ctx->uring_lock);
        init_waitqueue_head(&ctx->wait);
@@ -1198,7 +1216,7 @@ static void io_prep_async_work(struct io_kiocb *req)
        if (req->flags & REQ_F_ISREG) {
                if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
                        io_wq_hash_work(&req->work, file_inode(req->file));
-       } else {
+       } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
        }
@@ -1221,16 +1239,16 @@ static void io_queue_async_work(struct io_kiocb *req)
        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);
 
-       trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
-                                       &req->work, req->flags);
        /* init ->work of the whole link before punting */
        io_prep_async_link(req);
+       trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
+                                       &req->work, req->flags);
        io_wq_enqueue(tctx->io_wq, &req->work);
        if (link)
                io_queue_linked_timeout(link);
 }
 
-static void io_kill_timeout(struct io_kiocb *req)
+static void io_kill_timeout(struct io_kiocb *req, int status)
 {
        struct io_timeout_data *io = req->async_data;
        int ret;
@@ -1240,31 +1258,11 @@ static void io_kill_timeout(struct io_kiocb *req)
                atomic_set(&req->ctx->cq_timeouts,
                        atomic_read(&req->ctx->cq_timeouts) + 1);
                list_del_init(&req->timeout.list);
-               io_cqring_fill_event(req, 0);
+               io_cqring_fill_event(req, status);
                io_put_req_deferred(req, 1);
        }
 }
 
-/*
- * Returns true if we found and killed one or more timeouts
- */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
-                            struct files_struct *files)
-{
-       struct io_kiocb *req, *tmp;
-       int canceled = 0;
-
-       spin_lock_irq(&ctx->completion_lock);
-       list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
-               if (io_match_task(req, tsk, files)) {
-                       io_kill_timeout(req);
-                       canceled++;
-               }
-       }
-       spin_unlock_irq(&ctx->completion_lock);
-       return canceled != 0;
-}
-
 static void __io_queue_deferred(struct io_ring_ctx *ctx)
 {
        do {
@@ -1309,7 +1307,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
                        break;
 
                list_del_init(&req->timeout.list);
-               io_kill_timeout(req);
+               io_kill_timeout(req, 0);
        } while (!list_empty(&ctx->timeout_list));
 
        ctx->cq_last_tm_flush = seq;
@@ -1550,14 +1548,17 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
                io_put_task(req->task, 1);
                list_add(&req->compl.list, &cs->locked_free_list);
                cs->locked_free_nr++;
-       } else
-               req = NULL;
+       } else {
+               if (!percpu_ref_tryget(&ctx->refs))
+                       req = NULL;
+       }
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
-       io_cqring_ev_posted(ctx);
 
-       if (req)
+       if (req) {
+               io_cqring_ev_posted(ctx);
                percpu_ref_put(&ctx->refs);
+       }
 }
 
 static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -1925,17 +1926,44 @@ static int io_req_task_work_add(struct io_kiocb *req)
        return ret;
 }
 
-static void io_req_task_work_add_fallback(struct io_kiocb *req,
-                                         task_work_func_t cb)
+static bool io_run_task_work_head(struct callback_head **work_head)
+{
+       struct callback_head *work, *next;
+       bool executed = false;
+
+       do {
+               work = xchg(work_head, NULL);
+               if (!work)
+                       break;
+
+               do {
+                       next = work->next;
+                       work->func(work);
+                       work = next;
+                       cond_resched();
+               } while (work);
+               executed = true;
+       } while (1);
+
+       return executed;
+}
+
+static void io_task_work_add_head(struct callback_head **work_head,
+                                 struct callback_head *task_work)
 {
-       struct io_ring_ctx *ctx = req->ctx;
        struct callback_head *head;
 
-       init_task_work(&req->task_work, cb);
        do {
-               head = READ_ONCE(ctx->exit_task_work);
-               req->task_work.next = head;
-       } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
+               head = READ_ONCE(*work_head);
+               task_work->next = head;
+       } while (cmpxchg(work_head, head, task_work) != head);
+}
+
+static void io_req_task_work_add_fallback(struct io_kiocb *req,
+                                         task_work_func_t cb)
+{
+       init_task_work(&req->task_work, cb);
+       io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
 }
 
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@ -2451,6 +2479,11 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
                return false;
        return true;
 }
+#else
+static bool io_rw_should_reissue(struct io_kiocb *req)
+{
+       return false;
+}
 #endif
 
 static bool io_rw_reissue(struct io_kiocb *req)
@@ -2476,13 +2509,14 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 {
        int cflags = 0;
 
-       if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
+       if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+               kiocb_end_write(req);
+       if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) {
+               req->flags |= REQ_F_REISSUE;
                return;
+       }
        if (res != req->result)
                req_set_fail_links(req);
-
-       if (req->rw.kiocb.ki_flags & IOCB_WRITE)
-               kiocb_end_write(req);
        if (req->flags & REQ_F_BUFFER_SELECTED)
                cflags = io_put_rw_kbuf(req);
        __io_req_complete(req, issue_flags, res, cflags);
@@ -2843,7 +2877,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
 
        lockdep_assert_held(&req->ctx->uring_lock);
 
-       head = idr_find(&req->ctx->io_buffer_idr, bgid);
+       head = xa_load(&req->ctx->io_buffers, bgid);
        if (head) {
                if (!list_empty(&head->list)) {
                        kbuf = list_last_entry(&head->list, struct io_buffer,
@@ -2851,7 +2885,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
                        list_del(&kbuf->list);
                } else {
                        kbuf = head;
-                       idr_remove(&req->ctx->io_buffer_idr, bgid);
+                       xa_erase(&req->ctx->io_buffers, bgid);
                }
                if (*len > kbuf->len)
                        *len = kbuf->len;
@@ -3259,11 +3293,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 
        ret = io_iter_do_read(req, iter);
 
-       if (ret == -EIOCBQUEUED) {
-               if (req->async_data)
-                       iov_iter_revert(iter, io_size - iov_iter_count(iter));
-               goto out_free;
-       } else if (ret == -EAGAIN) {
+       if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto done;
@@ -3273,6 +3303,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                /* some cases will consume bytes even on error returns */
                iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = 0;
+       } else if (ret == -EIOCBQUEUED) {
+               goto out_free;
        } else if (ret <= 0 || ret == io_size || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
                /* read all, failed, already did sync or don't want to retry */
@@ -3385,6 +3417,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
        else
                ret2 = -EINVAL;
 
+       if (req->flags & REQ_F_REISSUE)
+               ret2 = -EAGAIN;
+
        /*
         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
         * retry them without IOCB_NOWAIT.
@@ -3394,8 +3429,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
        /* no retry on NONBLOCK nor RWF_NOWAIT */
        if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                goto done;
-       if (ret2 == -EIOCBQUEUED && req->async_data)
-               iov_iter_revert(iter, io_size - iov_iter_count(iter));
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
@@ -3892,7 +3925,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
        }
        i++;
        kfree(buf);
-       idr_remove(&ctx->io_buffer_idr, bgid);
+       xa_erase(&ctx->io_buffers, bgid);
 
        return i;
 }
@@ -3910,7 +3943,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
        lockdep_assert_held(&ctx->uring_lock);
 
        ret = -ENOENT;
-       head = idr_find(&ctx->io_buffer_idr, p->bgid);
+       head = xa_load(&ctx->io_buffers, p->bgid);
        if (head)
                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
        if (ret < 0)
@@ -3930,6 +3963,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 static int io_provide_buffers_prep(struct io_kiocb *req,
                                   const struct io_uring_sqe *sqe)
 {
+       unsigned long size;
        struct io_provide_buf *p = &req->pbuf;
        u64 tmp;
 
@@ -3943,7 +3977,8 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);
 
-       if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
+       size = (unsigned long)p->len * p->nbufs;
+       if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;
 
        p->bgid = READ_ONCE(sqe->buf_group);
@@ -3993,21 +4028,14 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
        lockdep_assert_held(&ctx->uring_lock);
 
-       list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
+       list = head = xa_load(&ctx->io_buffers, p->bgid);
 
        ret = io_add_buffers(p, &head);
-       if (ret < 0)
-               goto out;
-
-       if (!list) {
-               ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
-                                       GFP_KERNEL);
-               if (ret < 0) {
+       if (ret >= 0 && !list) {
+               ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
+               if (ret < 0)
                        __io_remove_buffers(ctx, head, p->bgid, -1U);
-                       goto out;
-               }
        }
-out:
        if (ret < 0)
                req_set_fail_links(req);
 
@@ -4345,6 +4373,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
        struct io_async_msghdr iomsg, *kmsg;
        struct socket *sock;
        unsigned flags;
+       int min_ret = 0;
        int ret;
 
        sock = sock_from_file(req->file);
@@ -4359,12 +4388,15 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
                kmsg = &iomsg;
        }
 
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
        if (flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;
        else if (issue_flags & IO_URING_F_NONBLOCK)
                flags |= MSG_DONTWAIT;
 
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
        ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
        if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
                return io_setup_async_msg(req, kmsg);
@@ -4375,7 +4407,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
        if (kmsg->free_iov)
                kfree(kmsg->free_iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
-       if (ret < 0)
+       if (ret < min_ret)
                req_set_fail_links(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
@@ -4388,6 +4420,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
        struct iovec iov;
        struct socket *sock;
        unsigned flags;
+       int min_ret = 0;
        int ret;
 
        sock = sock_from_file(req->file);
@@ -4403,12 +4436,15 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
        msg.msg_controllen = 0;
        msg.msg_namelen = 0;
 
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
        if (flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;
        else if (issue_flags & IO_URING_F_NONBLOCK)
                flags |= MSG_DONTWAIT;
 
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&msg.msg_iter);
+
        msg.msg_flags = flags;
        ret = sock_sendmsg(sock, &msg);
        if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
@@ -4416,7 +4452,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
        if (ret == -ERESTARTSYS)
                ret = -EINTR;
 
-       if (ret < 0)
+       if (ret < min_ret)
                req_set_fail_links(req);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
@@ -4568,6 +4604,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
        struct socket *sock;
        struct io_buffer *kbuf;
        unsigned flags;
+       int min_ret = 0;
        int ret, cflags = 0;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
@@ -4593,12 +4630,15 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
                                1, req->sr_msg.len);
        }
 
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
        if (flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;
        else if (force_nonblock)
                flags |= MSG_DONTWAIT;
 
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
        ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
                                        kmsg->uaddr, flags);
        if (force_nonblock && ret == -EAGAIN)
@@ -4612,7 +4652,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
        if (kmsg->free_iov)
                kfree(kmsg->free_iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
-       if (ret < 0)
+       if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
                req_set_fail_links(req);
        __io_req_complete(req, issue_flags, ret, cflags);
        return 0;
@@ -4627,6 +4667,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
        struct socket *sock;
        struct iovec iov;
        unsigned flags;
+       int min_ret = 0;
        int ret, cflags = 0;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
@@ -4652,12 +4693,15 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
        msg.msg_iocb = NULL;
        msg.msg_flags = 0;
 
-       flags = req->sr_msg.msg_flags;
+       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
        if (flags & MSG_DONTWAIT)
                req->flags |= REQ_F_NOWAIT;
        else if (force_nonblock)
                flags |= MSG_DONTWAIT;
 
+       if (flags & MSG_WAITALL)
+               min_ret = iov_iter_count(&msg.msg_iter);
+
        ret = sock_recvmsg(sock, &msg, flags);
        if (force_nonblock && ret == -EAGAIN)
                return -EAGAIN;
@@ -4666,7 +4710,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 out_free:
        if (req->flags & REQ_F_BUFFER_SELECTED)
                cflags = io_put_recv_kbuf(req);
-       if (ret < 0)
+       if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
                req_set_fail_links(req);
        __io_req_complete(req, issue_flags, ret, cflags);
        return 0;
@@ -4763,7 +4807,6 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
                        ret = -ENOMEM;
                        goto out;
                }
-               io = req->async_data;
                memcpy(req->async_data, &__io, sizeof(__io));
                return -EAGAIN;
        }
@@ -5526,7 +5569,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
        data->mode = io_translate_timeout_mode(flags);
        hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
-       io_req_track_inflight(req);
+       if (is_timeout_link)
+               io_req_track_inflight(req);
        return 0;
 }
 
@@ -6129,6 +6173,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
                ret = -ECANCELED;
 
        if (!ret) {
+               req->flags &= ~REQ_F_REISSUE;
                do {
                        ret = io_issue_sqe(req, 0);
                        /*
@@ -6204,7 +6249,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
        if (prev) {
-               req_set_fail_links(prev);
                io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
                io_put_req_deferred(prev, 1);
        } else {
@@ -6423,8 +6467,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
        ret = io_init_req(ctx, req, sqe);
        if (unlikely(ret)) {
 fail_req:
-               io_put_req(req);
-               io_req_complete(req, ret);
                if (link->head) {
                        /* fail even hard links since we don't submit */
                        link->head->flags |= REQ_F_FAIL_LINK;
@@ -6432,6 +6474,8 @@ fail_req:
                        io_req_complete(link->head, -ECANCELED);
                        link->head = NULL;
                }
+               io_put_req(req);
+               io_req_complete(req, ret);
                return ret;
        }
        ret = io_req_prep(req, sqe);
@@ -6684,7 +6728,7 @@ static int io_sq_thread(void *data)
        char buf[TASK_COMM_LEN];
        DEFINE_WAIT(wait);
 
-       sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+       snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
        set_task_comm(current, buf);
        current->pf_io_worker = NULL;
 
@@ -6694,22 +6738,30 @@ static int io_sq_thread(void *data)
                set_cpus_allowed_ptr(current, cpu_online_mask);
        current->flags |= PF_NO_SETAFFINITY;
 
-       down_read(&sqd->rw_lock);
-
+       mutex_lock(&sqd->lock);
        while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
                int ret;
                bool cap_entries, sqt_spin, needs_sched;
 
-               if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
-                       up_read(&sqd->rw_lock);
+               if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
+                   signal_pending(current)) {
+                       bool did_sig = false;
+
+                       mutex_unlock(&sqd->lock);
+                       if (signal_pending(current)) {
+                               struct ksignal ksig;
+
+                               did_sig = get_signal(&ksig);
+                       }
                        cond_resched();
-                       down_read(&sqd->rw_lock);
+                       mutex_lock(&sqd->lock);
+                       if (did_sig)
+                               break;
                        io_run_task_work();
+                       io_run_task_work_head(&sqd->park_task_work);
                        timeout = jiffies + sqd->sq_thread_idle;
                        continue;
                }
-               if (fatal_signal_pending(current))
-                       break;
                sqt_spin = false;
                cap_entries = !list_is_singular(&sqd->ctx_list);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -6750,32 +6802,27 @@ static int io_sq_thread(void *data)
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_set_wakeup_flag(ctx);
 
-                       up_read(&sqd->rw_lock);
+                       mutex_unlock(&sqd->lock);
                        schedule();
-                       down_read(&sqd->rw_lock);
+                       mutex_lock(&sqd->lock);
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_clear_wakeup_flag(ctx);
                }
 
                finish_wait(&sqd->wait, &wait);
+               io_run_task_work_head(&sqd->park_task_work);
                timeout = jiffies + sqd->sq_thread_idle;
        }
-       up_read(&sqd->rw_lock);
-       down_write(&sqd->rw_lock);
-       /*
-        * someone may have parked and added a cancellation task_work, run
-        * it first because we don't want it in io_uring_cancel_sqpoll()
-        */
-       io_run_task_work();
 
        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                io_uring_cancel_sqpoll(ctx);
        sqd->thread = NULL;
        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                io_ring_set_wakeup_flag(ctx);
-       up_write(&sqd->rw_lock);
+       mutex_unlock(&sqd->lock);
 
        io_run_task_work();
+       io_run_task_work_head(&sqd->park_task_work);
        complete(&sqd->exited);
        do_exit(0);
 }
@@ -6821,7 +6868,7 @@ static int io_run_task_work_sig(void)
                return 1;
        if (!signal_pending(current))
                return 0;
-       if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
+       if (test_thread_flag(TIF_NOTIFY_SIGNAL))
                return -ERESTARTSYS;
        return -EINTR;
 }
@@ -7075,23 +7122,28 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 }
 
 static void io_sq_thread_unpark(struct io_sq_data *sqd)
-       __releases(&sqd->rw_lock)
+       __releases(&sqd->lock)
 {
        WARN_ON_ONCE(sqd->thread == current);
 
+       /*
+        * Do the dance but not conditional clear_bit() because it'd race with
+        * other threads incrementing park_pending and setting the bit.
+        */
        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-       up_write(&sqd->rw_lock);
+       if (atomic_dec_return(&sqd->park_pending))
+               set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       mutex_unlock(&sqd->lock);
 }
 
 static void io_sq_thread_park(struct io_sq_data *sqd)
-       __acquires(&sqd->rw_lock)
+       __acquires(&sqd->lock)
 {
        WARN_ON_ONCE(sqd->thread == current);
 
+       atomic_inc(&sqd->park_pending);
        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-       down_write(&sqd->rw_lock);
-       /* set again for consistency, in case concurrent parks are happening */
-       set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       mutex_lock(&sqd->lock);
        if (sqd->thread)
                wake_up_process(sqd->thread);
 }
@@ -7100,17 +7152,19 @@ static void io_sq_thread_stop(struct io_sq_data *sqd)
 {
        WARN_ON_ONCE(sqd->thread == current);
 
-       down_write(&sqd->rw_lock);
+       mutex_lock(&sqd->lock);
        set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
        if (sqd->thread)
                wake_up_process(sqd->thread);
-       up_write(&sqd->rw_lock);
+       mutex_unlock(&sqd->lock);
        wait_for_completion(&sqd->exited);
 }
 
 static void io_put_sq_data(struct io_sq_data *sqd)
 {
        if (refcount_dec_and_test(&sqd->refs)) {
+               WARN_ON_ONCE(atomic_read(&sqd->park_pending));
+
                io_sq_thread_stop(sqd);
                kfree(sqd);
        }
@@ -7184,9 +7238,10 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
        if (!sqd)
                return ERR_PTR(-ENOMEM);
 
+       atomic_set(&sqd->park_pending, 0);
        refcount_set(&sqd->refs, 1);
        INIT_LIST_HEAD(&sqd->ctx_list);
-       init_rwsem(&sqd->rw_lock);
+       mutex_init(&sqd->lock);
        init_waitqueue_head(&sqd->wait);
        init_completion(&sqd->exited);
        return sqd;
@@ -7866,22 +7921,17 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 
                ret = 0;
                io_sq_thread_park(sqd);
+               list_add(&ctx->sqd_list, &sqd->ctx_list);
+               io_sqd_update_thread_idle(sqd);
                /* don't attach to a dying SQPOLL thread, would be racy */
-               if (attached && !sqd->thread) {
+               if (attached && !sqd->thread)
                        ret = -ENXIO;
-               } else {
-                       list_add(&ctx->sqd_list, &sqd->ctx_list);
-                       io_sqd_update_thread_idle(sqd);
-               }
                io_sq_thread_unpark(sqd);
 
-               if (ret < 0) {
-                       io_put_sq_data(sqd);
-                       ctx->sq_data = NULL;
-                       return ret;
-               } else if (attached) {
+               if (ret < 0)
+                       goto err;
+               if (attached)
                        return 0;
-               }
 
                if (p->flags & IORING_SETUP_SQ_AFF) {
                        int cpu = p->sq_thread_cpu;
@@ -8332,19 +8382,13 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
        return -ENXIO;
 }
 
-static int __io_destroy_buffers(int id, void *p, void *data)
-{
-       struct io_ring_ctx *ctx = data;
-       struct io_buffer *buf = p;
-
-       __io_remove_buffers(ctx, buf, id, -1U);
-       return 0;
-}
-
 static void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
-       idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
-       idr_destroy(&ctx->io_buffer_idr);
+       struct io_buffer *buf;
+       unsigned long index;
+
+       xa_for_each(&ctx->io_buffers, index, buf)
+               __io_remove_buffers(ctx, buf, index, -1U);
 }
 
 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
@@ -8386,11 +8430,13 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
        /*
         * Some may use context even when all refs and requests have been put,
-        * and they are free to do so while still holding uring_lock, see
-        * __io_req_task_submit(). Wait for them to finish.
+        * and they are free to do so while still holding uring_lock or
+        * completion_lock, see __io_req_task_submit(). Wait for them to finish.
         */
        mutex_lock(&ctx->uring_lock);
        mutex_unlock(&ctx->uring_lock);
+       spin_lock_irq(&ctx->completion_lock);
+       spin_unlock_irq(&ctx->completion_lock);
 
        io_sq_thread_finish(ctx);
        io_sqe_buffers_unregister(ctx);
@@ -8478,26 +8524,9 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
        return -EINVAL;
 }
 
-static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
+static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
 {
-       struct callback_head *work, *next;
-       bool executed = false;
-
-       do {
-               work = xchg(&ctx->exit_task_work, NULL);
-               if (!work)
-                       break;
-
-               do {
-                       next = work->next;
-                       work->func(work);
-                       work = next;
-                       cond_resched();
-               } while (work);
-               executed = true;
-       } while (1);
-
-       return executed;
+       return io_run_task_work_head(&ctx->exit_task_work);
 }
 
 struct io_tctx_exit {
@@ -8529,6 +8558,14 @@ static void io_ring_exit_work(struct work_struct *work)
        struct io_tctx_node *node;
        int ret;
 
+       /* prevent SQPOLL from submitting new requests */
+       if (ctx->sq_data) {
+               io_sq_thread_park(ctx->sq_data);
+               list_del_init(&ctx->sqd_list);
+               io_sqd_update_thread_idle(ctx->sq_data);
+               io_sq_thread_unpark(ctx->sq_data);
+       }
+
        /*
         * If we're doing polled IO and end up having requests being
         * submitted async (out-of-line), then completions can come in while
@@ -8565,6 +8602,28 @@ static void io_ring_exit_work(struct work_struct *work)
        io_ring_ctx_free(ctx);
 }
 
+/* Returns true if we found and killed one or more timeouts */
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+                            struct files_struct *files)
+{
+       struct io_kiocb *req, *tmp;
+       int canceled = 0;
+
+       spin_lock_irq(&ctx->completion_lock);
+       list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
+               if (io_match_task(req, tsk, files)) {
+                       io_kill_timeout(req, -ECANCELED);
+                       canceled++;
+               }
+       }
+       if (canceled != 0)
+               io_commit_cqring(ctx);
+       spin_unlock_irq(&ctx->completion_lock);
+       if (canceled != 0)
+               io_cqring_ev_posted(ctx);
+       return canceled != 0;
+}
+
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        unsigned long index;
@@ -8879,7 +8938,7 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
        if (task) {
                init_completion(&work.completion);
                init_task_work(&work.task_work, io_sqpoll_cancel_cb);
-               WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
+               io_task_work_add_head(&sqd->park_task_work, &work.task_work);
                wake_up_process(task);
        }
        io_sq_thread_unpark(sqd);
@@ -8956,6 +9015,8 @@ void __io_uring_task_cancel(void)
 
        /* make sure overflow events are dropped */
        atomic_inc(&tctx->in_idle);
+       __io_uring_files_cancel(NULL);
+
        do {
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx);
index a648dbf..a5e478d 100644 (file)
@@ -170,6 +170,16 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
                        return ret;
        }
 
+       /*
+        * If this swapfile doesn't contain even a single page-aligned
+        * contiguous range of blocks, reject this useless swapfile to
+        * prevent confusion later on.
+        */
+       if (isi.nr_pages == 0) {
+               pr_warn("swapon: Cannot find a single usable page in file.\n");
+               return -EINVAL;
+       }
+
        *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
        sis->max = isi.nr_pages;
        sis->pages = isi.nr_pages - 1;
index 99ca97e..6125d2d 100644 (file)
@@ -1808,9 +1808,6 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
 
        if (flags & FL_LAYOUT)
                return 0;
-       if (flags & FL_DELEG)
-               /* We leave these checks to the caller. */
-               return 0;
 
        if (arg == F_RDLCK)
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
index 821e591..d6cff5f 100644 (file)
@@ -73,6 +73,7 @@ config NFSD_V4
        select NFSD_V3
        select FS_POSIX_ACL
        select SUNRPC_GSS
+       select CRYPTO
        select CRYPTO_MD5
        select CRYPTO_SHA256
        select GRACE_PERIOD
index 53fcbf7..7629248 100644 (file)
@@ -898,6 +898,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
                        continue;
                if (!nfsd_match_cred(nf->nf_cred, current_cred()))
                        continue;
+               if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+                       continue;
                if (nfsd_file_get(nf) != NULL)
                        return nf;
        }
index 052be5b..7325592 100644 (file)
@@ -1189,6 +1189,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
                switch (task->tk_status) {
                case -EIO:
                case -ETIMEDOUT:
+               case -EACCES:
                        nfsd4_mark_cb_down(clp, task->tk_status);
                }
                break;
index acdb3cd..dd9f38d 100644 (file)
@@ -1302,7 +1302,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
                        struct nfsd_file *dst)
 {
        nfs42_ssc_close(src->nf_file);
-       /* 'src' is freed by nfsd4_do_async_copy */
+       fput(src->nf_file);
        nfsd_file_put(dst);
        mntput(ss_mnt);
 }
index 423fd66..97447a6 100644 (file)
@@ -4940,31 +4940,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
        return fl;
 }
 
-static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
-                                               struct nfs4_file *fp)
-{
-       struct nfs4_clnt_odstate *co;
-       struct file *f = fp->fi_deleg_file->nf_file;
-       struct inode *ino = locks_inode(f);
-       int writes = atomic_read(&ino->i_writecount);
-
-       if (fp->fi_fds[O_WRONLY])
-               writes--;
-       if (fp->fi_fds[O_RDWR])
-               writes--;
-       if (writes > 0)
-               return -EAGAIN;
-       spin_lock(&fp->fi_lock);
-       list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) {
-               if (co->co_client != clp) {
-                       spin_unlock(&fp->fi_lock);
-                       return -EAGAIN;
-               }
-       }
-       spin_unlock(&fp->fi_lock);
-       return 0;
-}
-
 static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
                    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
@@ -4984,12 +4959,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 
        nf = find_readable_file(fp);
        if (!nf) {
-               /*
-                * We probably could attempt another open and get a read
-                * delegation, but for now, don't bother until the
-                * client actually sends us one.
-                */
-               return ERR_PTR(-EAGAIN);
+               /* We should always have a readable file here */
+               WARN_ON_ONCE(1);
+               return ERR_PTR(-EBADF);
        }
        spin_lock(&state_lock);
        spin_lock(&fp->fi_lock);
@@ -5019,19 +4991,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
        if (!fl)
                goto out_clnt_odstate;
 
-       status = nfsd4_check_conflicting_opens(clp, fp);
-       if (status) {
-               locks_free_lock(fl);
-               goto out_clnt_odstate;
-       }
        status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
        if (fl)
                locks_free_lock(fl);
        if (status)
                goto out_clnt_odstate;
-       status = nfsd4_check_conflicting_opens(clp, fp);
-       if (status)
-               goto out_clnt_odstate;
 
        spin_lock(&state_lock);
        spin_lock(&fp->fi_lock);
@@ -5113,6 +5077,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
                                goto out_no_deleg;
                        if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
                                goto out_no_deleg;
+                       /*
+                        * Also, if the file was opened for write or
+                        * create, there's a good chance the client's
+                        * about to write to it, resulting in an
+                        * immediate recall (since we don't support
+                        * write delegations):
+                        */
+                       if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+                               goto out_no_deleg;
+                       if (open->op_create == NFS4_OPEN_CREATE)
+                               goto out_no_deleg;
                        break;
                default:
                        goto out_no_deleg;
@@ -5389,7 +5364,7 @@ nfs4_laundromat(struct nfsd_net *nn)
        idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
                cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
                if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
-                               cps->cpntf_time > cutoff)
+                               cps->cpntf_time < cutoff)
                        _free_cpntf_state_locked(nn, cps);
        }
        spin_unlock(&nn->s2s_cp_lock);
index 9b3b06d..e47fde1 100644 (file)
@@ -44,7 +44,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec);
 
 static inline int reiserfs_xattrs_initialized(struct super_block *sb)
 {
-       return REISERFS_SB(sb)->priv_root != NULL;
+       return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
 }
 
 #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
index 37aaa83..945896d 100644 (file)
@@ -1055,10 +1055,9 @@ static long do_restart_poll(struct restart_block *restart_block)
 
        ret = do_sys_poll(ufds, nfds, to);
 
-       if (ret == -ERESTARTNOHAND) {
-               restart_block->fn = do_restart_poll;
-               ret = -ERESTART_RESTARTBLOCK;
-       }
+       if (ret == -ERESTARTNOHAND)
+               ret = set_restart_fn(restart_block, do_restart_poll);
+
        return ret;
 }
 
@@ -1080,7 +1079,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                struct restart_block *restart_block;
 
                restart_block = &current->restart_block;
-               restart_block->fn = do_restart_poll;
                restart_block->poll.ufds = ufds;
                restart_block->poll.nfds = nfds;
 
@@ -1091,7 +1089,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                } else
                        restart_block->poll.has_timeout = 0;
 
-               ret = -ERESTART_RESTARTBLOCK;
+               ret = set_restart_fn(restart_block, do_restart_poll);
        }
        return ret;
 }
index eb02072..7237637 100644 (file)
@@ -152,14 +152,18 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
                start = le64_to_cpu(table[n]);
                end = le64_to_cpu(table[n + 1]);
 
-               if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+               if (start >= end
+                   || (end - start) >
+                   (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
                        kfree(table);
                        return ERR_PTR(-EINVAL);
                }
        }
 
        start = le64_to_cpu(table[indexes - 1]);
-       if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) {
+       if (start >= lookup_table_start ||
+           (lookup_table_start - start) >
+           (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
                kfree(table);
                return ERR_PTR(-EINVAL);
        }
index 11581bf..ea53876 100644 (file)
@@ -97,14 +97,16 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
                start = le64_to_cpu(table[n]);
                end = le64_to_cpu(table[n + 1]);
 
-               if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+               if (start >= end || (end - start) >
+                               (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
                        kfree(table);
                        return ERR_PTR(-EINVAL);
                }
        }
 
        start = le64_to_cpu(table[indexes - 1]);
-       if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) {
+       if (start >= id_table_start || (id_table_start - start) >
+                               (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
                kfree(table);
                return ERR_PTR(-EINVAL);
        }
index 8d64edb..b3fdc82 100644 (file)
@@ -17,6 +17,7 @@
 
 /* size of metadata (inode and directory) blocks */
 #define SQUASHFS_METADATA_SIZE         8192
+#define SQUASHFS_BLOCK_OFFSET          2
 
 /* default size of block device I/O */
 #ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
index ead6667..087cab8 100644 (file)
@@ -109,14 +109,16 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
                start = le64_to_cpu(table[n]);
                end = le64_to_cpu(table[n + 1]);
 
-               if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+               if (start >= end || (end - start) >
+                               (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
                        kfree(table);
                        return ERR_PTR(-EINVAL);
                }
        }
 
        start = le64_to_cpu(table[indexes - 1]);
-       if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) {
+       if (start >= table_start || (table_start - start) >
+                               (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
                kfree(table);
                return ERR_PTR(-EINVAL);
        }
index 46a861d..f93370b 100644 (file)
@@ -1007,9 +1007,10 @@ xfs_create(
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-                                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-                                       &udqp, &gdqp, &pdqp);
+       error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
+                       fsgid_into_mnt(mnt_userns), prid,
+                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                       &udqp, &gdqp, &pdqp);
        if (error)
                return error;
 
@@ -1157,9 +1158,10 @@ xfs_create_tmpfile(
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-                               XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-                               &udqp, &gdqp, &pdqp);
+       error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
+                       fsgid_into_mnt(mnt_userns), prid,
+                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                       &udqp, &gdqp, &pdqp);
        if (error)
                return error;
 
index ca310a1..3498b97 100644 (file)
@@ -168,6 +168,12 @@ xfs_bulkstat_one(
        };
        int                     error;
 
+       if (breq->mnt_userns != &init_user_ns) {
+               xfs_warn_ratelimited(breq->mp,
+                       "bulkstat not supported inside of idmapped mounts.");
+               return -EINVAL;
+       }
+
        ASSERT(breq->icount == 1);
 
        bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
index 52370d0..1c97b15 100644 (file)
@@ -635,6 +635,47 @@ xfs_check_summary_counts(
 }
 
 /*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in an iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background.  This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+       struct xfs_mount        *mp)
+{
+       xfs_log_force(mp, XFS_LOG_SYNC);
+       xfs_extent_busy_wait_all(mp);
+       flush_workqueue(xfs_discard_wq);
+
+       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+       xfs_ail_push_all_sync(mp->m_ail);
+       cancel_delayed_work_sync(&mp->m_reclaim_work);
+       xfs_reclaim_inodes(mp);
+       xfs_health_unmount(mp);
+}
+
+/*
  * This function does the following on an initial mount of a file system:
  *     - reads the superblock from disk and init the mount struct
  *     - if we're a 32-bit kernel, do a size check on the superblock
@@ -1008,7 +1049,7 @@ xfs_mountfs(
        /* Clean out dquots that might be in memory after quotacheck. */
        xfs_qm_unmount(mp);
        /*
-        * Cancel all delayed reclaim work and reclaim the inodes directly.
+        * Flush all inode reclamation work and flush the log.
         * We have to do this /after/ rtunmount and qm_unmount because those
         * two will have scheduled delayed reclaim for the rt/quota inodes.
         *
@@ -1018,11 +1059,8 @@ xfs_mountfs(
         * qm_unmount_quotas and therefore rely on qm_unmount to release the
         * quota inodes.
         */
-       cancel_delayed_work_sync(&mp->m_reclaim_work);
-       xfs_reclaim_inodes(mp);
-       xfs_health_unmount(mp);
+       xfs_unmount_flush_inodes(mp);
  out_log_dealloc:
-       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
        xfs_log_mount_cancel(mp);
  out_fail_wait:
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1063,47 +1101,7 @@ xfs_unmountfs(
        xfs_rtunmount_inodes(mp);
        xfs_irele(mp->m_rootip);
 
-       /*
-        * We can potentially deadlock here if we have an inode cluster
-        * that has been freed has its buffer still pinned in memory because
-        * the transaction is still sitting in a iclog. The stale inodes
-        * on that buffer will be pinned to the buffer until the
-        * transaction hits the disk and the callbacks run. Pushing the AIL will
-        * skip the stale inodes and may never see the pinned buffer, so
-        * nothing will push out the iclog and unpin the buffer. Hence we
-        * need to force the log here to ensure all items are flushed into the
-        * AIL before we go any further.
-        */
-       xfs_log_force(mp, XFS_LOG_SYNC);
-
-       /*
-        * Wait for all busy extents to be freed, including completion of
-        * any discard operation.
-        */
-       xfs_extent_busy_wait_all(mp);
-       flush_workqueue(xfs_discard_wq);
-
-       /*
-        * We now need to tell the world we are unmounting. This will allow
-        * us to detect that the filesystem is going away and we should error
-        * out anything that we have been retrying in the background. This will
-        * prevent neverending retries in AIL pushing from hanging the unmount.
-        */
-       mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
-       /*
-        * Flush all pending changes from the AIL.
-        */
-       xfs_ail_push_all_sync(mp->m_ail);
-
-       /*
-        * Reclaim all inodes. At this point there should be no dirty inodes and
-        * none should be pinned or locked. Stop background inode reclaim here
-        * if it is still running.
-        */
-       cancel_delayed_work_sync(&mp->m_reclaim_work);
-       xfs_reclaim_inodes(mp);
-       xfs_health_unmount(mp);
+       xfs_unmount_flush_inodes(mp);
 
        xfs_qm_unmount(mp);
 
index 1379013..7f368b1 100644 (file)
@@ -182,7 +182,8 @@ xfs_symlink(
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+       error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
+                       fsgid_into_mnt(mnt_userns), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
                        &udqp, &gdqp, &pdqp);
        if (error)
index 0fe76f3..049e36c 100644 (file)
@@ -165,6 +165,21 @@ static int zonefs_writepages(struct address_space *mapping,
        return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 }
 
+static int zonefs_swap_activate(struct swap_info_struct *sis,
+                               struct file *swap_file, sector_t *span)
+{
+       struct inode *inode = file_inode(swap_file);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+       if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
+               zonefs_err(inode->i_sb,
+                          "swap file: not a conventional zone file\n");
+               return -EINVAL;
+       }
+
+       return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+}
+
 static const struct address_space_operations zonefs_file_aops = {
        .readpage               = zonefs_readpage,
        .readahead              = zonefs_readahead,
@@ -177,6 +192,7 @@ static const struct address_space_operations zonefs_file_aops = {
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
        .direct_IO              = noop_direct_IO,
+       .swap_activate          = zonefs_swap_activate,
 };
 
 static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
@@ -728,6 +744,68 @@ out_release:
 }
 
 /*
+ * Do not exceed the LFS limits nor the file zone size. If pos is under the
+ * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
+ */
+static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
+                                       loff_t count)
+{
+       struct inode *inode = file_inode(file);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       loff_t limit = rlimit(RLIMIT_FSIZE);
+       loff_t max_size = zi->i_max_size;
+
+       if (limit != RLIM_INFINITY) {
+               if (pos >= limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
+               }
+               count = min(count, limit - pos);
+       }
+
+       if (!(file->f_flags & O_LARGEFILE))
+               max_size = min_t(loff_t, MAX_NON_LFS, max_size);
+
+       if (unlikely(pos >= max_size))
+               return -EFBIG;
+
+       return min(count, max_size - pos);
+}
+
+static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       loff_t count;
+
+       if (IS_SWAPFILE(inode))
+               return -ETXTBSY;
+
+       if (!iov_iter_count(from))
+               return 0;
+
+       if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+               return -EINVAL;
+
+       if (iocb->ki_flags & IOCB_APPEND) {
+               if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+                       return -EINVAL;
+               mutex_lock(&zi->i_truncate_mutex);
+               iocb->ki_pos = zi->i_wpoffset;
+               mutex_unlock(&zi->i_truncate_mutex);
+       }
+
+       count = zonefs_write_check_limits(file, iocb->ki_pos,
+                                         iov_iter_count(from));
+       if (count < 0)
+               return count;
+
+       iov_iter_truncate(from, count);
+       return iov_iter_count(from);
+}
+
+/*
  * Handle direct writes. For sequential zone files, this is the only possible
  * write path. For these files, check that the user is issuing writes
  * sequentially from the end of the file. This code assumes that the block layer
@@ -744,8 +822,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
        struct super_block *sb = inode->i_sb;
        bool sync = is_sync_kiocb(iocb);
        bool append = false;
-       size_t count;
-       ssize_t ret;
+       ssize_t ret, count;
 
        /*
         * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
@@ -763,12 +840,11 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
                inode_lock(inode);
        }
 
-       ret = generic_write_checks(iocb, from);
-       if (ret <= 0)
+       count = zonefs_write_checks(iocb, from);
+       if (count <= 0) {
+               ret = count;
                goto inode_unlock;
-
-       iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
-       count = iov_iter_count(from);
+       }
 
        if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
                ret = -EINVAL;
@@ -828,12 +904,10 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
                inode_lock(inode);
        }
 
-       ret = generic_write_checks(iocb, from);
+       ret = zonefs_write_checks(iocb, from);
        if (ret <= 0)
                goto inode_unlock;
 
-       iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos);
-
        ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
        if (ret > 0)
                iocb->ki_pos += ret;
@@ -966,9 +1040,7 @@ static int zonefs_open_zone(struct inode *inode)
 
        mutex_lock(&zi->i_truncate_mutex);
 
-       zi->i_wr_refcnt++;
-       if (zi->i_wr_refcnt == 1) {
-
+       if (!zi->i_wr_refcnt) {
                if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
                        atomic_dec(&sbi->s_open_zones);
                        ret = -EBUSY;
@@ -978,7 +1050,6 @@ static int zonefs_open_zone(struct inode *inode)
                if (i_size_read(inode) < zi->i_max_size) {
                        ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
                        if (ret) {
-                               zi->i_wr_refcnt--;
                                atomic_dec(&sbi->s_open_zones);
                                goto unlock;
                        }
@@ -986,6 +1057,8 @@ static int zonefs_open_zone(struct inode *inode)
                }
        }
 
+       zi->i_wr_refcnt++;
+
 unlock:
        mutex_unlock(&zi->i_truncate_mutex);
 
index 02a716a..f28b097 100644 (file)
@@ -233,6 +233,7 @@ struct acpi_pnp_type {
 
 struct acpi_device_pnp {
        acpi_bus_id bus_id;             /* Object name */
+       int instance_no;                /* Instance number of this object */
        struct acpi_pnp_type type;      /* ID type */
        acpi_bus_address bus_address;   /* _ADR */
        char *unique_id;                /* _UID */
index e17be32..b8ca136 100644 (file)
@@ -612,9 +612,11 @@ static inline void ttm_bo_pin(struct ttm_buffer_object *bo)
 static inline void ttm_bo_unpin(struct ttm_buffer_object *bo)
 {
        dma_resv_assert_held(bo->base.resv);
-       WARN_ON_ONCE(!bo->pin_count);
        WARN_ON_ONCE(!kref_read(&bo->kref));
-       --bo->pin_count;
+       if (bo->pin_count)
+               --bo->pin_count;
+       else
+               WARN_ON_ONCE(true);
 }
 
 int ttm_mem_evict_first(struct ttm_bo_device *bdev,
index fcdaab7..3bdcfc4 100644 (file)
@@ -222,10 +222,14 @@ void __iomem *__acpi_map_table(unsigned long phys, unsigned long size);
 void __acpi_unmap_table(void __iomem *map, unsigned long size);
 int early_acpi_boot_init(void);
 int acpi_boot_init (void);
+void acpi_boot_table_prepare (void);
 void acpi_boot_table_init (void);
 int acpi_mps_check (void);
 int acpi_numa_init (void);
 
+int acpi_locate_initial_tables (void);
+void acpi_reserve_initial_tables (void);
+void acpi_table_init_complete (void);
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
 int __init acpi_table_parse_entries(char *id, unsigned long table_size,
@@ -814,9 +818,12 @@ static inline int acpi_boot_init(void)
        return 0;
 }
 
+static inline void acpi_boot_table_prepare(void)
+{
+}
+
 static inline void acpi_boot_table_init(void)
 {
-       return;
 }
 
 static inline int acpi_mps_check(void)
index 6cc93ab..c68d87b 100644 (file)
@@ -105,8 +105,19 @@ extern struct bus_type amba_bustype;
 #define amba_get_drvdata(d)    dev_get_drvdata(&d->dev)
 #define amba_set_drvdata(d,p)  dev_set_drvdata(&d->dev, p)
 
+#ifdef CONFIG_ARM_AMBA
 int amba_driver_register(struct amba_driver *);
 void amba_driver_unregister(struct amba_driver *);
+#else
+static inline int amba_driver_register(struct amba_driver *drv)
+{
+       return -EINVAL;
+}
+static inline void amba_driver_unregister(struct amba_driver *drv)
+{
+}
+#endif
+
 struct amba_device *amba_device_alloc(const char *, resource_size_t, size_t);
 void amba_device_put(struct amba_device *);
 int amba_device_add(struct amba_device *, struct resource *);
index bc6bc83..158aefa 100644 (file)
@@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_ELVPRIV            ((__force req_flags_t)(1 << 12))
 /* account into disk and partition IO statistics */
 #define RQF_IO_STAT            ((__force req_flags_t)(1 << 13))
-/* request came from our alloc pool */
-#define RQF_ALLOCED            ((__force req_flags_t)(1 << 14))
 /* runtime pm request */
 #define RQF_PM                 ((__force req_flags_t)(1 << 15))
 /* on IO scheduler merge hash */
index cccaef1..3625f01 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/capability.h>
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
+#include <linux/percpu-refcount.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -556,7 +557,8 @@ struct bpf_tramp_progs {
  *      fentry = a set of program to run before calling original function
  *      fexit = a set of program to run after original function
  */
-int arch_prepare_bpf_trampoline(void *image, void *image_end,
+struct bpf_tramp_image;
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_progs *tprogs,
                                void *orig_call);
@@ -565,6 +567,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog);
 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
 u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
 void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 
 struct bpf_ksym {
        unsigned long            start;
@@ -583,6 +587,18 @@ enum bpf_tramp_prog_type {
        BPF_TRAMP_REPLACE, /* more than MAX */
 };
 
+struct bpf_tramp_image {
+       void *image;
+       struct bpf_ksym ksym;
+       struct percpu_ref pcref;
+       void *ip_after_call;
+       void *ip_epilogue;
+       union {
+               struct rcu_head rcu;
+               struct work_struct work;
+       };
+};
+
 struct bpf_trampoline {
        /* hlist for trampoline_table */
        struct hlist_node hlist;
@@ -605,9 +621,8 @@ struct bpf_trampoline {
        /* Number of attached programs. A counter per kind. */
        int progs_cnt[BPF_TRAMP_MAX];
        /* Executable image of trampoline */
-       void *image;
+       struct bpf_tramp_image *cur_image;
        u64 selector;
-       struct bpf_ksym ksym;
 };
 
 struct bpf_attach_target_info {
@@ -691,6 +706,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
+int bpf_jit_charge_modmem(u32 pages);
+void bpf_jit_uncharge_modmem(u32 pages);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
                                           struct bpf_trampoline *tr)
@@ -787,7 +804,6 @@ struct bpf_prog_aux {
        bool func_proto_unreliable;
        bool sleepable;
        bool tail_call_reachable;
-       enum bpf_tramp_prog_type trampoline_prog_type;
        struct hlist_node tramp_hlist;
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
@@ -1093,7 +1109,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                _ret;                                                   \
         })
 
-#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \
+#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \
        ({                                              \
                struct bpf_prog_array_item *_item;      \
                struct bpf_prog *_prog;                 \
@@ -1106,7 +1122,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        goto _out;                      \
                _item = &_array->items[0];              \
                while ((_prog = READ_ONCE(_item->prog))) {              \
-                       bpf_cgroup_storage_set(_item->cgroup_storage);  \
+                       if (set_cg_storage)             \
+                               bpf_cgroup_storage_set(_item->cgroup_storage);  \
                        _ret &= func(_prog, ctx);       \
                        _item++;                        \
                }                                       \
@@ -1153,10 +1170,10 @@ _out:                                                   \
        })
 
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)           \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+       __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
 
 #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)     \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+       __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
index acb77dc..4452354 100644 (file)
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_MISC)
+SUBSYS(misc)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
index 7f4ac87..5c641f9 100644 (file)
@@ -253,7 +253,11 @@ struct target_type {
 #define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY)
 
 /*
- * Indicates that a target supports host-managed zoned block devices.
+ * Indicates support for zoned block devices:
+ * - DM_TARGET_ZONED_HM: the target also supports host-managed zoned
+ *   block devices but does not support combining different zoned models.
+ * - DM_TARGET_MIXED_ZONED_MODEL: the target supports combining multiple
+ *   devices with different zoned models.
  */
 #ifdef CONFIG_BLK_DEV_ZONED
 #define DM_TARGET_ZONED_HM             0x00000040
@@ -275,6 +279,15 @@ struct target_type {
 #define DM_TARGET_PASSES_CRYPTO                0x00000100
 #define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO)
 
+#ifdef CONFIG_BLK_DEV_ZONED
+#define DM_TARGET_MIXED_ZONED_MODEL    0x00000200
+#define dm_target_supports_mixed_zoned_model(type) \
+       ((type)->features & DM_TARGET_MIXED_ZONED_MODEL)
+#else
+#define DM_TARGET_MIXED_ZONED_MODEL    0x00000000
+#define dm_target_supports_mixed_zoned_model(type) (false)
+#endif
+
 struct dm_target {
        struct dm_table *table;
        struct target_type *type;
index 8710f57..6b5d36b 100644 (file)
@@ -72,8 +72,10 @@ typedef void *efi_handle_t;
  */
 typedef guid_t efi_guid_t __aligned(__alignof__(u32));
 
-#define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \
-       GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)
+#define EFI_GUID(a, b, c, d...) (efi_guid_t){ {                                        \
+       (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff,  \
+       (b) & 0xff, ((b) >> 8) & 0xff,                                          \
+       (c) & 0xff, ((c) >> 8) & 0xff, d } }
 
 /*
  * Generic EFI table header
index fd183fb..0c19010 100644 (file)
@@ -271,6 +271,29 @@ static inline  void devm_extcon_unregister_notifier(struct device *dev,
                                struct extcon_dev *edev, unsigned int id,
                                struct notifier_block *nb) { }
 
+static inline int extcon_register_notifier_all(struct extcon_dev *edev,
+                                              struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline int extcon_unregister_notifier_all(struct extcon_dev *edev,
+                                                struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline int devm_extcon_register_notifier_all(struct device *dev,
+                                                   struct extcon_dev *edev,
+                                                   struct notifier_block *nb)
+{
+       return 0;
+}
+
+static inline void devm_extcon_unregister_notifier_all(struct device *dev,
+                                                      struct extcon_dev *edev,
+                                                      struct notifier_block *nb) { }
+
 static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name)
 {
        return ERR_PTR(-ENODEV);
index ebc2956..19781b0 100644 (file)
@@ -56,7 +56,7 @@
  * COMMAND_RECONFIG_FLAG_PARTIAL:
  * Set to FPGA configuration type (full or partial).
  */
-#define COMMAND_RECONFIG_FLAG_PARTIAL  1
+#define COMMAND_RECONFIG_FLAG_PARTIAL  0
 
 /*
  * Timeout settings for service clients:
index ce59a6a..9eb77c8 100644 (file)
@@ -320,7 +320,14 @@ static inline struct host1x_device *to_host1x_device(struct device *dev)
 int host1x_device_init(struct host1x_device *device);
 int host1x_device_exit(struct host1x_device *device);
 
-int host1x_client_register(struct host1x_client *client);
+int __host1x_client_register(struct host1x_client *client,
+                            struct lock_class_key *key);
+#define host1x_client_register(client) \
+       ({ \
+               static struct lock_class_key __key; /* one key per expansion */ \
+               __host1x_client_register(client, &__key); \
+       })
+
 int host1x_client_unregister(struct host1x_client *client);
 
 int host1x_client_suspend(struct host1x_client *client);
index 2ad6e92..0bff345 100644 (file)
@@ -113,6 +113,11 @@ static inline bool hugetlb_cgroup_disabled(void)
        return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
 }
 
+static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
+{
+       css_put(&h_cg->css);
+}
+
 extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                        struct hugetlb_cgroup **ptr);
 extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
@@ -138,7 +143,8 @@ extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv,
 
 extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                                struct file_region *rg,
-                                               unsigned long nr_pages);
+                                               unsigned long nr_pages,
+                                               bool region_del);
 
 extern void hugetlb_cgroup_file_init(void) __init;
 extern void hugetlb_cgroup_migrate(struct page *oldhpage,
@@ -147,7 +153,8 @@ extern void hugetlb_cgroup_migrate(struct page *oldhpage,
 #else
 static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                                       struct file_region *rg,
-                                                      unsigned long nr_pages)
+                                                      unsigned long nr_pages,
+                                                      bool region_del)
 {
 }
 
@@ -185,6 +192,10 @@ static inline bool hugetlb_cgroup_disabled(void)
        return true;
 }
 
+static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
+{
+}
+
 static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                               struct hugetlb_cgroup **ptr)
 {
index 96556c6..10c94a3 100644 (file)
@@ -43,13 +43,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
        if (likely(success)) {
                struct vlan_pcpu_stats *pcpu_stats;
 
-               pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
+               pcpu_stats = get_cpu_ptr(vlan->pcpu_stats);
                u64_stats_update_begin(&pcpu_stats->syncp);
                pcpu_stats->rx_packets++;
                pcpu_stats->rx_bytes += len;
                if (multicast)
                        pcpu_stats->rx_multicast++;
                u64_stats_update_end(&pcpu_stats->syncp);
+               put_cpu_ptr(vlan->pcpu_stats);
        } else {
                this_cpu_inc(vlan->pcpu_stats->rx_errors);
        }
index 9761a0e..79cde99 100644 (file)
@@ -5,31 +5,6 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
-struct io_wq_work_node {
-       struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-       struct io_wq_work_node *first;
-       struct io_wq_work_node *last;
-};
-
-struct io_uring_task {
-       /* submission side */
-       struct xarray           xa;
-       struct wait_queue_head  wait;
-       void                    *last;
-       void                    *io_wq;
-       struct percpu_counter   inflight;
-       atomic_t                in_idle;
-       bool                    sqpoll;
-
-       spinlock_t              task_lock;
-       struct io_wq_work_list  task_list;
-       unsigned long           task_state;
-       struct callback_head    task_work;
-};
-
 #if defined(CONFIG_IO_URING)
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_task_cancel(void);
index 1b65e72..8895b95 100644 (file)
@@ -192,8 +192,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr);
 
@@ -218,6 +218,20 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+struct kvm_gfn_range {
+       struct kvm_memory_slot *slot;
+       gfn_t start;
+       gfn_t end;
+       pte_t pte;
+       bool may_block;
+};
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+#endif
+
 enum {
        OUTSIDE_GUEST_MODE,
        IN_GUEST_MODE,
@@ -640,6 +654,7 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
+bool file_is_kvm(struct file *file);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
@@ -886,7 +901,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       struct kvm_memory_slot *memslot);
+                                       const struct kvm_memory_slot *memslot);
 #else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
@@ -945,6 +960,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 
@@ -1116,7 +1132,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 }
 
 static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
        return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 }
index d13e3cd..5984fff 100644 (file)
@@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
-static inline __init void memblock_set_bottom_up(bool enable)
+static inline __init_memblock void memblock_set_bottom_up(bool enable)
 {
        memblock.bottom_up = enable;
 }
@@ -470,7 +470,7 @@ static inline __init void memblock_set_bottom_up(bool enable)
  * if this is true, that said, memblock will allocate memory
  * in bottom-up direction.
  */
-static inline __init bool memblock_bottom_up(void)
+static inline __init_memblock bool memblock_bottom_up(void)
 {
        return memblock.bottom_up;
 }
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
new file mode 100644 (file)
index 0000000..c5af592
--- /dev/null
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Miscellaneous cgroup controller.
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+#ifndef _MISC_CGROUP_H_
+#define _MISC_CGROUP_H_
+
+/**
+ * enum misc_res_type - Types of misc cgroup entries supported by the host.
+ */
+enum misc_res_type {
+#ifdef CONFIG_KVM_AMD_SEV
+       /* AMD SEV ASIDs resource */
+       MISC_CG_RES_SEV,
+       /* AMD SEV-ES ASIDs resource */
+       MISC_CG_RES_SEV_ES,
+#endif
+       MISC_CG_RES_TYPES
+};
+
+struct misc_cg;
+
+#ifdef CONFIG_CGROUP_MISC
+
+#include <linux/cgroup.h>
+
+/**
+ * struct misc_res: Per cgroup per misc type resource
+ * @max: Maximum limit on the resource.
+ * @usage: Current usage of the resource.
+ * @failed: True if charging failed for the resource in a cgroup.
+ */
+struct misc_res {
+       unsigned long max;
+       atomic_long_t usage;
+       bool failed;
+};
+
+/**
+ * struct misc_cg - Miscellaneous controller's cgroup structure.
+ * @css: cgroup subsys state object.
+ * @res: Array of misc resources usage in the cgroup.
+ */
+struct misc_cg {
+       struct cgroup_subsys_state css;
+       struct misc_res res[MISC_CG_RES_TYPES];
+};
+
+unsigned long misc_cg_res_total_usage(enum misc_res_type type);
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+                      unsigned long amount);
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+                     unsigned long amount);
+
+/**
+ * css_misc() - Get misc cgroup from the css.
+ * @css: cgroup subsys state object.
+ *
+ * Context: Any context.
+ * Return:
+ * * %NULL - If @css is null.
+ * * struct misc_cg* - misc cgroup pointer of the passed css.
+ */
+static inline struct misc_cg *css_misc(struct cgroup_subsys_state *css)
+{
+       return css ? container_of(css, struct misc_cg, css) : NULL;
+}
+
+/*
+ * get_current_misc_cg() - Find and get the misc cgroup of the current task.
+ *
+ * Returned cgroup has its ref count increased by 1. Caller must call
+ * put_misc_cg() to return the reference.
+ *
+ * Return: Misc cgroup to which the current task belongs.
+ */
+static inline struct misc_cg *get_current_misc_cg(void)
+{
+       return css_misc(task_get_css(current, misc_cgrp_id));
+}
+
+/*
+ * put_misc_cg() - Put the misc cgroup and reduce its ref count.
+ * @cg - cgroup to put.
+ */
+static inline void put_misc_cg(struct misc_cg *cg)
+{
+       if (cg)
+               css_put(&cg->css);
+}
+
+#else /* !CONFIG_CGROUP_MISC */
+
+static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+       return 0;       /* misc controller compiled out: no usage is tracked */
+}
+
+static inline int misc_cg_set_capacity(enum misc_res_type type,
+                                      unsigned long capacity)
+{
+       return 0;
+}
+
+static inline int misc_cg_try_charge(enum misc_res_type type,
+                                    struct misc_cg *cg,
+                                    unsigned long amount)
+{
+       return 0;
+}
+
+static inline void misc_cg_uncharge(enum misc_res_type type,
+                                   struct misc_cg *cg,
+                                   unsigned long amount)
+{
+}
+
+static inline struct misc_cg *get_current_misc_cg(void)
+{
+       return NULL;
+}
+
+static inline void put_misc_cg(struct misc_cg *cg)
+{
+}
+
+#endif /* CONFIG_CGROUP_MISC */
+#endif /* _MISC_CGROUP_H_ */
index d75ef8a..b7deb79 100644 (file)
@@ -547,4 +547,11 @@ static inline const char *mlx5_qp_state_str(int state)
        }
 }
 
+static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev)
+{
+       if (MLX5_CAP_ROCE(dev, qp_ts_format))   /* non-zero format capability */
+               return MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT;
+       return MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING;
+}
+
 #endif /* MLX5_QP_H */
index 64a71bf..8ba4342 100644 (file)
@@ -1461,16 +1461,28 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
+/*
+ * KASAN per-page tags are stored xor'ed with 0xff. This avoids having to set
+ * the tags of all pages to the native kernel tag value 0xff, as the default
+ * value 0x00 maps to 0xff.
+ */
+
 static inline u8 page_kasan_tag(const struct page *page)
 {
-       if (kasan_enabled())
-               return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
-       return 0xff;
+       u8 tag = 0xff;
+
+       if (kasan_enabled()) {
+               tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+               tag ^= 0xff;
+       }
+
+       return tag;
 }
 
 static inline void page_kasan_tag_set(struct page *page, u8 tag)
 {
        if (kasan_enabled()) {
+               tag ^= 0xff;
                page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
                page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
        }
index b820078..1a6a9eb 100644 (file)
@@ -169,11 +169,11 @@ struct mmu_notifier_ops {
         * the last refcount is dropped.
         *
         * If blockable argument is set to false then the callback cannot
-        * sleep and has to return with -EAGAIN. 0 should be returned
-        * otherwise. Please note that if invalidate_range_start approves
-        * a non-blocking behavior then the same applies to
-        * invalidate_range_end.
-        *
+        * sleep and has to return with -EAGAIN if sleeping would be required.
+        * 0 should be returned otherwise. Please note that notifiers that can
+        * fail invalidate_range_start are not allowed to implement
+        * invalidate_range_end, as there is no mechanism for informing the
+        * notifier that its start failed.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
index 59f094f..da4b6fb 100644 (file)
@@ -30,9 +30,6 @@
 #include <linux/percpu.h>
 #include <asm/module.h>
 
-/* Not Yet Implemented */
-#define MODULE_SUPPORTED_DEVICE(name)
-
 #define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN
 
 struct modversion_info {
index 0cd631a..515cff7 100644 (file)
@@ -185,7 +185,7 @@ extern void mutex_lock_io(struct mutex *lock);
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
 # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
 # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
-# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock)
+# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
 #endif
 
 /*
index 5b67ea8..87a5d18 100644 (file)
@@ -360,6 +360,7 @@ enum {
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
        NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
+       NAPI_STATE_SCHED_THREADED,      /* Napi is currently scheduled in threaded mode */
 };
 
 enum {
@@ -372,6 +373,7 @@ enum {
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
        NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
+       NAPIF_STATE_SCHED_THREADED      = BIT(NAPI_STATE_SCHED_THREADED),
 };
 
 enum gro_result {
index 8ebb641..8ec4846 100644 (file)
@@ -227,7 +227,7 @@ struct xt_table {
        unsigned int valid_hooks;
 
        /* Man behind the curtain... */
-       struct xt_table_info __rcu *private;
+       struct xt_table_info *private;
 
        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;
@@ -376,7 +376,7 @@ static inline unsigned int xt_write_recseq_begin(void)
         * since addend is most likely 1
         */
        __this_cpu_add(xt_recseq.sequence, addend);
-       smp_wmb();
+       smp_mb();
 
        return addend;
 }
@@ -448,9 +448,6 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 
 struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
 
-struct xt_table_info
-*xt_table_get_private_protected(const struct xt_table *table);
-
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
index 20225b0..8c9947f 100644 (file)
@@ -559,7 +559,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
        return pgoff;
 }
 
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
 struct wait_page_key {
        struct page *page;
        int bit_nr;
@@ -683,6 +682,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
+int wait_on_page_writeback_killable(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
 
index b801ead..d48a719 100644 (file)
@@ -73,6 +73,7 @@ enum sev_cmd {
        SEV_CMD_SEND_UPDATE_DATA        = 0x041,
        SEV_CMD_SEND_UPDATE_VMSA        = 0x042,
        SEV_CMD_SEND_FINISH             = 0x043,
+       SEV_CMD_SEND_CANCEL             = 0x044,
 
        /* Guest migration commands (incoming) */
        SEV_CMD_RECEIVE_START           = 0x050,
@@ -326,11 +327,11 @@ struct sev_data_send_start {
        u64 pdh_cert_address;                   /* In */
        u32 pdh_cert_len;                       /* In */
        u32 reserved1;
-       u64 plat_cert_address;                  /* In */
-       u32 plat_cert_len;                      /* In */
+       u64 plat_certs_address;                 /* In */
+       u32 plat_certs_len;                     /* In */
        u32 reserved2;
-       u64 amd_cert_address;                   /* In */
-       u32 amd_cert_len;                       /* In */
+       u64 amd_certs_address;                  /* In */
+       u32 amd_certs_len;                      /* In */
        u32 reserved3;
        u64 session_address;                    /* In */
        u32 session_len;                        /* In/Out */
@@ -393,6 +394,15 @@ struct sev_data_send_finish {
 } __packed;
 
 /**
+ * struct sev_data_send_cancel - SEND_CANCEL command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_cancel {
+       u32 handle;                             /* In */
+} __packed;
+
+/**
  * struct sev_data_receive_start - RECEIVE_START command parameters
  *
  * @handle: handle of the VM to perform receive operation
index ec2ad4b..c4fdb44 100644 (file)
@@ -460,7 +460,5 @@ void geni_icc_set_tag(struct geni_se *se, u32 tag);
 int geni_icc_enable(struct geni_se *se);
 
 int geni_icc_disable(struct geni_se *se);
-
-void geni_remove_earlycon_icc_vote(void);
 #endif
 #endif
index bba2920..980a655 100644 (file)
@@ -23,6 +23,7 @@ enum timespec_type {
  * System call restart block.
  */
 struct restart_block {
+       unsigned long arch_data;
        long (*fn)(struct restart_block *);
        union {
                /* For futex_wait and futex_wait_requeue_pi */
index 6d0a33d..f2c9ee7 100644 (file)
@@ -285,6 +285,7 @@ struct nf_bridge_info {
 struct tc_skb_ext {
        __u32 chain;
        __u16 mru;
+       bool post_ct;
 };
 #endif
 
index 7c693b3..1e76ed6 100644 (file)
@@ -104,7 +104,6 @@ struct svcxprt_rdma {
 
        wait_queue_head_t    sc_send_wait;      /* SQ exhaustion waitlist */
        unsigned long        sc_flags;
-       u32                  sc_pending_recvs;
        struct list_head     sc_read_complete_q;
        struct work_struct   sc_work;
 
index 9b2158c..157762d 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/restart_block.h>
+#include <linux/errno.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 /*
@@ -59,6 +60,18 @@ enum syscall_work_bit {
 
 #ifdef __KERNEL__
 
+#ifndef arch_set_restart_data
+#define arch_set_restart_data(restart) do { } while (0)
+#endif
+
+static inline long set_restart_fn(struct restart_block *restart,
+                                       long (*fn)(struct restart_block *))
+{
+       restart->fn = fn;
+       arch_set_restart_data(restart);
+       return -ERESTART_RESTARTBLOCK;
+}
+
 #ifndef THREAD_ALIGN
 #define THREAD_ALIGN   THREAD_SIZE
 #endif
index 6b03fdd..712363c 100644 (file)
@@ -86,6 +86,8 @@
                /* lies about caching, so always sync */        \
        US_FLAG(NO_SAME, 0x40000000)                            \
                /* Cannot handle WRITE_SAME */                  \
+       US_FLAG(SENSE_AFTER_SYNC, 0x80000000)                   \
+               /* Do REQUEST_SENSE after SYNCHRONIZE_CACHE */  \
 
 #define US_FLAG(name, value)   US_FL_##name = value ,
 enum { US_DO_ALL_FLAGS };
index 073a9e0..ad97041 100644 (file)
@@ -14,5 +14,6 @@ struct umd_info {
 int umd_load_blob(struct umd_info *info, const void *data, size_t len);
 int umd_unload_blob(struct umd_info *info);
 int fork_usermode_driver(struct umd_info *info);
+void umd_cleanup_helper(struct umd_info *info);
 
 #endif /* __LINUX_USERMODE_DRIVER_H__ */
index 4ab5494..15fa085 100644 (file)
@@ -250,20 +250,20 @@ struct vdpa_config_ops {
 
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
                                        const struct vdpa_config_ops *config,
-                                       int nvqs, size_t size, const char *name);
+                                       size_t size, const char *name);
 
-#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs, name)   \
+#define vdpa_alloc_device(dev_struct, member, parent, config, name)   \
                          container_of(__vdpa_alloc_device( \
-                                      parent, config, nvqs, \
+                                      parent, config, \
                                       sizeof(dev_struct) + \
                                       BUILD_BUG_ON_ZERO(offsetof( \
                                       dev_struct, member)), name), \
                                       dev_struct, member)
 
-int vdpa_register_device(struct vdpa_device *vdev);
+int vdpa_register_device(struct vdpa_device *vdev, int nvqs);
 void vdpa_unregister_device(struct vdpa_device *vdev);
 
-int _vdpa_register_device(struct vdpa_device *vdev);
+int _vdpa_register_device(struct vdpa_device *vdev, int nvqs);
 void _vdpa_unregister_device(struct vdpa_device *vdev);
 
 /**
index 55ea329..b1894e0 100644 (file)
@@ -132,8 +132,6 @@ bool is_virtio_device(struct device *dev);
 void virtio_break_device(struct virtio_device *dev);
 
 void virtio_config_changed(struct virtio_device *dev);
-void virtio_config_disable(struct virtio_device *dev);
-void virtio_config_enable(struct virtio_device *dev);
 int virtio_finalize_features(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
index 850424e..6ecf2a0 100644 (file)
@@ -173,9 +173,10 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
  */
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
-#ifdef CONFIG_DEBUG_MUTEXES
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
        mutex_release(&ctx->dep_map, _THIS_IP_);
-
+#endif
+#ifdef CONFIG_DEBUG_MUTEXES
        DEBUG_LOCKS_WARN_ON(ctx->acquired);
        if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
                /*
index 92c0160..a91e3d9 100644 (file)
@@ -229,9 +229,10 @@ static inline int xa_err(void *entry)
  *
  * This structure is used either directly or via the XA_LIMIT() macro
  * to communicate the range of IDs that are valid for allocation.
- * Two common ranges are predefined for you:
+ * Three common ranges are predefined for you:
  * * xa_limit_32b      - [0 - UINT_MAX]
  * * xa_limit_31b      - [0 - INT_MAX]
+ * * xa_limit_16b      - [0 - USHRT_MAX]
  */
 struct xa_limit {
        u32 max;
@@ -242,6 +243,7 @@ struct xa_limit {
 
 #define xa_limit_32b   XA_LIMIT(0, UINT_MAX)
 #define xa_limit_31b   XA_LIMIT(0, INT_MAX)
+#define xa_limit_16b   XA_LIMIT(0, USHRT_MAX)
 
 typedef unsigned __bitwise xa_mark_t;
 #define XA_MARK_0              ((__force xa_mark_t)0U)
index 26f134a..75b1e73 100644 (file)
@@ -550,4 +550,15 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
                dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
 }
 
+struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
+void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+                              struct sk_buff *skb, u32 mtu, bool confirm_neigh);
+void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+                           struct sk_buff *skb);
+u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
+struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
+                                            struct sk_buff *skb,
+                                            const void *daddr);
+unsigned int dst_blackhole_mtu(const struct dst_entry *dst);
+
 #endif /* _NET_DST_H */
index 10a6257..3c8c594 100644 (file)
@@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
        return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
 }
 
-void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
+bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
 void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);
 
 static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
index fdec57d..5aaced6 100644 (file)
@@ -1536,6 +1536,7 @@ struct nft_trans_flowtable {
        struct nft_flowtable            *flowtable;
        bool                            update;
        struct list_head                hook_list;
+       u32                             flags;
 };
 
 #define nft_trans_flowtable(trans)     \
@@ -1544,6 +1545,8 @@ struct nft_trans_flowtable {
        (((struct nft_trans_flowtable *)trans->data)->update)
 #define nft_trans_flowtable_hooks(trans)       \
        (((struct nft_trans_flowtable *)trans->data)->hook_list)
+#define nft_trans_flowtable_flags(trans)       \
+       (((struct nft_trans_flowtable *)trans->data)->flags)
 
 int __init nft_chain_filter_init(void);
 void nft_chain_filter_fini(void);
index 7bc057a..a10a319 100644 (file)
@@ -410,6 +410,7 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
 int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack);
 
+/* Caller should either hold rcu_read_lock(), or RTNL. */
 static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 {
        struct nh_info *nhi;
@@ -430,6 +431,29 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
        return NULL;
 }
 
+/* Variant of nexthop_fib6_nh().
+ * Caller should either hold rcu_read_lock_bh(), or RTNL.
+ */
+static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
+{
+       struct nh_info *nhi;
+
+       if (nh->is_group) {
+               struct nh_group *nh_grp;
+
+               nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp);
+               nh = nexthop_mpath_select(nh_grp, 0);
+               if (!nh)
+                       return NULL;
+       }
+
+       nhi = rcu_dereference_bh_rtnl(nh->nh_info);
+       if (nhi->family == AF_INET6)
+               return &nhi->fib6_nh;
+
+       return NULL;
+}
+
 static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
 {
        struct fib6_nh *fib6_nh;
index 932f0d7..0b39eff 100644 (file)
@@ -168,7 +168,8 @@ static inline void red_set_vars(struct red_vars *v)
        v->qcount       = -1;
 }
 
-static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log)
+static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog,
+                                   u8 Scell_log, u8 *stab)
 {
        if (fls(qth_min) + Wlog > 32)
                return false;
@@ -178,6 +179,13 @@ static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_
                return false;
        if (qth_max < qth_min)
                return false;
+       if (stab) {
+               int i;
+
+               for (i = 0; i < RED_STAB_SIZE; i++)
+                       if (stab[i] >= 32)
+                               return false;
+       }
        return true;
 }
 
@@ -287,7 +295,7 @@ static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms
        int  shift;
 
        /*
-        * The problem: ideally, average length queue recalcultion should
+        * The problem: ideally, average length queue recalculation should
         * be done over constant clock intervals. This is too expensive, so
         * that the calculation is driven by outgoing packets.
         * When the queue is idle we have to model this clock by hand.
index e2091bb..4da61c9 100644 (file)
@@ -33,6 +33,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
  *
  *     @list: Used internally
  *     @kind: Identifier
+ *     @netns_refund: Physical device, move to init_net on netns exit
  *     @maxtype: Highest device specific netlink attribute number
  *     @policy: Netlink policy for device specific attribute validation
  *     @validate: Optional validation function for netlink/changelink parameters
@@ -64,6 +65,7 @@ struct rtnl_link_ops {
        size_t                  priv_size;
        void                    (*setup)(struct net_device *dev);
 
+       bool                    netns_refund;
        unsigned int            maxtype;
        const struct nla_policy *policy;
        int                     (*validate)(struct nlattr *tb[],
index 636810d..0b6266f 100644 (file)
@@ -936,7 +936,7 @@ static inline void sk_acceptq_added(struct sock *sk)
 
 static inline bool sk_acceptq_is_full(const struct sock *sk)
 {
-       return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
+       return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 }
 
 /*
index 8a26a2f..fc5a398 100644 (file)
@@ -193,6 +193,7 @@ enum iscsi_connection_state {
        ISCSI_CONN_UP = 0,
        ISCSI_CONN_DOWN,
        ISCSI_CONN_FAILED,
+       ISCSI_CONN_BOUND,
 };
 
 struct iscsi_cls_conn {
index 49d7d0f..37e1e1a 100644 (file)
@@ -255,30 +255,6 @@ TRACE_EVENT(kvm_fpu,
        TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
 );
 
-TRACE_EVENT(kvm_age_page,
-       TP_PROTO(ulong gfn, int level, struct kvm_memory_slot *slot, int ref),
-       TP_ARGS(gfn, level, slot, ref),
-
-       TP_STRUCT__entry(
-               __field(        u64,    hva             )
-               __field(        u64,    gfn             )
-               __field(        u8,     level           )
-               __field(        u8,     referenced      )
-       ),
-
-       TP_fast_assign(
-               __entry->gfn            = gfn;
-               __entry->level          = level;
-               __entry->hva            = ((gfn - slot->base_gfn) <<
-                                           PAGE_SHIFT) + slot->userspace_addr;
-               __entry->referenced     = ref;
-       ),
-
-       TP_printk("hva %llx gfn %llx level %u %s",
-                 __entry->hva, __entry->gfn, __entry->level,
-                 __entry->referenced ? "YOUNG" : "OLD")
-);
-
 #ifdef CONFIG_KVM_ASYNC_PF
 DECLARE_EVENT_CLASS(kvm_async_get_page_class,
 
@@ -462,6 +438,72 @@ TRACE_EVENT(kvm_dirty_ring_exit,
        TP_printk("vcpu %d", __entry->vcpu_id)
 );
 
+TRACE_EVENT(kvm_unmap_hva_range,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+       TP_PROTO(unsigned long start, unsigned long end),
+       TP_ARGS(start, end),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  start           )
+               __field(        unsigned long,  end             )
+       ),
+
+       TP_fast_assign(
+               __entry->start          = start;
+               __entry->end            = end;
+       ),
+
+       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
+                 __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+       TP_PROTO(unsigned long hva),
+       TP_ARGS(hva),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  hva             )
+       ),
+
+       TP_fast_assign(
+               __entry->hva            = hva;
+       ),
+
+       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
index 970cc2e..6154a2e 100644 (file)
@@ -30,7 +30,7 @@ TRACE_EVENT(workqueue_queue_work,
        TP_STRUCT__entry(
                __field( void *,        work    )
                __field( void *,        function)
-               __field( const char *,  workqueue)
+               __string( workqueue,    pwq->wq->name)
                __field( unsigned int,  req_cpu )
                __field( unsigned int,  cpu     )
        ),
@@ -38,13 +38,13 @@ TRACE_EVENT(workqueue_queue_work,
        TP_fast_assign(
                __entry->work           = work;
                __entry->function       = work->func;
-               __entry->workqueue      = pwq->wq->name;
+               __assign_str(workqueue, pwq->wq->name);
                __entry->req_cpu        = req_cpu;
                __entry->cpu            = pwq->pool->cpu;
        ),
 
        TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%u cpu=%u",
-                 __entry->work, __entry->function, __entry->workqueue,
+                 __entry->work, __entry->function, __get_str(workqueue),
                  __entry->req_cpu, __entry->cpu)
 );
 
index ac6474e..d0a64ee 100644 (file)
@@ -2,29 +2,6 @@
 #ifndef _UAPI__LINUX_BLKPG_H
 #define _UAPI__LINUX_BLKPG_H
 
-/*
- * Partition table and disk geometry handling
- *
- * A single ioctl with lots of subfunctions:
- *
- * Device number stuff:
- *    get_whole_disk()         (given the device number of a partition,
- *                               find the device number of the encompassing disk)
- *    get_all_partitions()     (given the device number of a disk, return the
- *                              device numbers of all its known partitions)
- *
- * Partition stuff:
- *    add_partition()
- *    delete_partition()
- *    test_partition_in_use()  (also for test_disk_in_use)
- *
- * Geometry stuff:
- *    get_geometry()
- *    set_geometry()
- *    get_bios_drivedata()
- *
- * For today, only the partition stuff - aeb, 990515
- */
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
 
@@ -52,9 +29,8 @@ struct blkpg_partition {
        long long start;                /* starting offset in bytes */
        long long length;               /* length in bytes */
        int pno;                        /* partition number */
-       char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2,
-                                          to be used in kernel messages */
-       char volname[BLKPG_VOLNAMELTH]; /* volume label */
+       char devname[BLKPG_DEVNAMELTH]; /* unused / ignored */
+       char volname[BLKPG_VOLNAMELTH]; /* unused / ignored */
 };
 
 #endif /* _UAPI__LINUX_BLKPG_H */
index 79c8933..4ba4ef0 100644 (file)
@@ -3850,7 +3850,7 @@ union bpf_attr {
  *
  * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
  *     Description
- *             Check ctx packet size against exceeding MTU of net device (based
+ *             Check packet size against exceeding MTU of net device (based
  *             on *ifindex*).  This helper will likely be used in combination
  *             with helpers that adjust/change the packet size.
  *
@@ -3867,6 +3867,14 @@ union bpf_attr {
  *             against the current net device.  This is practical if this isn't
  *             used prior to redirect.
  *
+ *             On input *mtu_len* must be a valid pointer, else verifier will
+ *             reject BPF program.  If the value *mtu_len* is initialized to
+ *             zero then the ctx packet size is used.  When value *mtu_len* is
+ *             provided as input this specifies the L3 length that the MTU check
+ *             is done against. Remember XDP and TC length operate at L2, but
+ *             this value is L3 as this correlates to MTU and IP-header tot_len
+ *             values which are L3 (similar behavior as bpf_fib_lookup).
+ *
  *             The Linux kernel route table can configure MTUs on a more
  *             specific per route level, which is not provided by this helper.
  *             For route level MTU checks use the **bpf_fib_lookup**\ ()
@@ -3891,11 +3899,9 @@ union bpf_attr {
  *
  *             On return *mtu_len* pointer contains the MTU value of the net
  *             device.  Remember the net device configured MTU is the L3 size,
- *             which is returned here and XDP and TX length operate at L2.
+ *             which is returned here and XDP and TC length operate at L2.
  *             Helper take this into account for you, but remember when using
- *             MTU value in your BPF-code.  On input *mtu_len* must be a valid
- *             pointer and be initialized (to zero), else verifier will reject
- *             BPF program.
+ *             MTU value in your BPF-code.
  *
  *     Return
  *             * 0 on success, and populate MTU value in *mtu_len* pointer.
index 98ca64d..5444261 100644 (file)
@@ -903,7 +903,8 @@ struct fuse_notify_retrieve_in {
 };
 
 /* Device ioctls: */
-#define FUSE_DEV_IOC_CLONE     _IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_MAGIC             229
+#define FUSE_DEV_IOC_CLONE             _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
 
 struct fuse_lseek_in {
        uint64_t        fh;
index 0e0f70c..3fd9a7e 100644 (file)
@@ -1078,7 +1078,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
-#define KVM_CAP_PTP_KVM 195
+#define KVM_CAP_SET_GUEST_DEBUG2 195
+#define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
+#define KVM_CAP_PTP_KVM 198
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1672,6 +1675,8 @@ enum sev_cmd_id {
        KVM_SEV_CERT_EXPORT,
        /* Attestation report */
        KVM_SEV_GET_ATTESTATION_REPORT,
+       /* Guest Migration Extension */
+       KVM_SEV_SEND_CANCEL,
 
        KVM_SEV_NR_MAX,
 };
@@ -1730,6 +1735,45 @@ struct kvm_sev_attestation_report {
        __u32 len;
 };
 
+struct kvm_sev_send_start {
+       __u32 policy;
+       __u64 pdh_cert_uaddr;
+       __u32 pdh_cert_len;
+       __u64 plat_certs_uaddr;
+       __u32 plat_certs_len;
+       __u64 amd_certs_uaddr;
+       __u32 amd_certs_len;
+       __u64 session_uaddr;
+       __u32 session_len;
+};
+
+struct kvm_sev_send_update_data {
+       __u64 hdr_uaddr;
+       __u32 hdr_len;
+       __u64 guest_uaddr;
+       __u32 guest_len;
+       __u64 trans_uaddr;
+       __u32 trans_len;
+};
+
+struct kvm_sev_receive_start {
+       __u32 handle;
+       __u32 policy;
+       __u64 pdh_uaddr;
+       __u32 pdh_len;
+       __u64 session_uaddr;
+       __u32 session_len;
+};
+
+struct kvm_sev_receive_update_data {
+       __u64 hdr_uaddr;
+       __u32 hdr_len;
+       __u64 guest_uaddr;
+       __u32 guest_len;
+       __u64 trans_uaddr;
+       __u32 trans_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
index aea26ab..bff5032 100644 (file)
@@ -3,7 +3,6 @@
 #define __UAPI_PSAMPLE_H
 
 enum {
-       /* sampled packet metadata */
        PSAMPLE_ATTR_IIFINDEX,
        PSAMPLE_ATTR_OIFINDEX,
        PSAMPLE_ATTR_ORIGSIZE,
@@ -11,10 +10,8 @@ enum {
        PSAMPLE_ATTR_GROUP_SEQ,
        PSAMPLE_ATTR_SAMPLE_RATE,
        PSAMPLE_ATTR_DATA,
-       PSAMPLE_ATTR_TUNNEL,
-
-       /* commands attributes */
        PSAMPLE_ATTR_GROUP_REFCOUNT,
+       PSAMPLE_ATTR_TUNNEL,
 
        __PSAMPLE_ATTR_MAX
 };
index 5f5c776..18ece59 100644 (file)
@@ -1110,6 +1110,20 @@ config CGROUP_BPF
          BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
          inet sockets.
 
+config CGROUP_MISC
+       bool "Misc resource controller"
+       default n
+       help
+         Provides a controller for miscellaneous resources on a host.
+
+         Miscellaneous scalar resources are the resources on the host system
+         which cannot be abstracted like the other cgroups. This controller
+         tracks and limits the miscellaneous resources used by a process
+         attached to a cgroup hierarchy.
+
+         For more information, please check the misc cgroup section in
+         Documentation/admin-guide/cgroup-v2.rst.
+
 config CGROUP_DEBUG
        bool "Debug controller"
        default n
index 6639640..b58b2ef 100644 (file)
@@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
        fd = *(int *)key;
        f = fget_raw(fd);
        if (!f)
-               return NULL;
+               return ERR_PTR(-EBADF);
 
        sdata = inode_storage_lookup(f->f_inode, map, true);
        fput(f);
index 1a666a9..70f6fd4 100644 (file)
@@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
                tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
                tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
-               err = arch_prepare_bpf_trampoline(image,
+               err = arch_prepare_bpf_trampoline(NULL, image,
                                                  st_map->image + PAGE_SIZE,
                                                  &st_ops->func_models[i], 0,
                                                  tprogs, NULL);
index 3a283bf..75244ec 100644 (file)
@@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void)
 }
 pure_initcall(bpf_jit_charge_init);
 
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 pages)
 {
        if (atomic_long_add_return(pages, &bpf_jit_current) >
            (bpf_jit_limit >> PAGE_SHIFT)) {
@@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages)
        return 0;
 }
 
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 pages)
 {
        atomic_long_sub(pages, &bpf_jit_current);
 }
index 79c5772..53736e5 100644 (file)
@@ -60,9 +60,12 @@ static int finish(void)
                         &magic, sizeof(magic), &pos);
        if (n != sizeof(magic))
                return -EPIPE;
+
        tgid = umd_ops.info.tgid;
-       wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
-       umd_ops.info.tgid = NULL;
+       if (tgid) {
+               wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+               umd_cleanup_helper(&umd_ops.info);
+       }
        return 0;
 }
 
@@ -80,10 +83,18 @@ static int __init load_umd(void)
 
 static void __exit fini_umd(void)
 {
+       struct pid *tgid;
+
        bpf_preload_ops = NULL;
+
        /* kill UMD in case it's still there due to earlier error */
-       kill_pid(umd_ops.info.tgid, SIGKILL, 1);
-       umd_ops.info.tgid = NULL;
+       tgid = umd_ops.info.tgid;
+       if (tgid) {
+               kill_pid(tgid, SIGKILL, 1);
+
+               wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+               umd_cleanup_helper(&umd_ops.info);
+       }
        umd_unload_blob(&umd_ops.info);
 }
 late_initcall(load_umd);
index c859bc4..2505034 100644 (file)
@@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr)
                        err = PTR_ERR(btf);
                        goto free_map;
                }
+               if (btf_is_kernel(btf)) {
+                       btf_put(btf);
+                       err = -EACCES;
+                       goto free_map;
+               }
                map->btf = btf;
 
                if (attr->btf_value_type_id) {
index 7bc3b32..1f3a4be 100644 (file)
@@ -57,19 +57,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
                           PAGE_SIZE, true, ksym->name);
 }
 
-static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
-{
-       struct bpf_ksym *ksym = &tr->ksym;
-
-       snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
-       bpf_image_ksym_add(tr->image, ksym);
-}
-
 static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 {
        struct bpf_trampoline *tr;
        struct hlist_head *head;
-       void *image;
        int i;
 
        mutex_lock(&trampoline_mutex);
@@ -84,14 +75,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
        if (!tr)
                goto out;
 
-       /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
-       image = bpf_jit_alloc_exec_page();
-       if (!image) {
-               kfree(tr);
-               tr = NULL;
-               goto out;
-       }
-
        tr->key = key;
        INIT_HLIST_NODE(&tr->hlist);
        hlist_add_head(&tr->hlist, head);
@@ -99,9 +82,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
        mutex_init(&tr->mutex);
        for (i = 0; i < BPF_TRAMP_MAX; i++)
                INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-       tr->image = image;
-       INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
-       bpf_trampoline_ksym_add(tr);
 out:
        mutex_unlock(&trampoline_mutex);
        return tr;
@@ -185,10 +165,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
        return tprogs;
 }
 
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+       struct bpf_tramp_image *im;
+
+       im = container_of(work, struct bpf_tramp_image, work);
+       bpf_image_ksym_del(&im->ksym);
+       bpf_jit_free_exec(im->image);
+       bpf_jit_uncharge_modmem(1);
+       percpu_ref_exit(&im->pcref);
+       kfree_rcu(im, rcu);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+       struct bpf_tramp_image *im;
+
+       im = container_of(rcu, struct bpf_tramp_image, rcu);
+       INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+       schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+       struct bpf_tramp_image *im;
+
+       im = container_of(pcref, struct bpf_tramp_image, pcref);
+       call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+       struct bpf_tramp_image *im;
+
+       im = container_of(rcu, struct bpf_tramp_image, rcu);
+       if (im->ip_after_call)
+               /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+               percpu_ref_kill(&im->pcref);
+       else
+               /* the case of fentry trampoline */
+               call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+       /* The trampoline image that calls original function is using:
+        * rcu_read_lock_trace to protect sleepable bpf progs
+        * rcu_read_lock to protect normal bpf progs
+        * percpu_ref to protect trampoline itself
+        * rcu tasks to protect trampoline asm not covered by percpu_ref
+        * (which are few asm insns before __bpf_tramp_enter and
+        *  after __bpf_tramp_exit)
+        *
+        * The trampoline is unreachable before bpf_tramp_image_put().
+        *
+        * First, patch the trampoline to avoid calling into fexit progs.
+        * The progs will be freed even if the original function is still
+        * executing or sleeping.
+        * In case of CONFIG_PREEMPTION=y use call_rcu_tasks() to wait on
+        * first few asm instructions to execute and call into
+        * __bpf_tramp_enter->percpu_ref_get.
+        * Then use percpu_ref_kill to wait for the trampoline and the original
+        * function to finish.
+        * Then use call_rcu_tasks() to make sure few asm insns in
+        * the trampoline epilogue are done as well.
+        *
+        * In !PREEMPTION case the task that got interrupted in the first asm
+        * insns won't go through an RCU quiescent state which the
+        * percpu_ref_kill will be waiting for. Hence the first
+        * call_rcu_tasks() is not necessary.
+        */
+       if (im->ip_after_call) {
+               int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
+                                            NULL, im->ip_epilogue);
+               WARN_ON(err);
+               if (IS_ENABLED(CONFIG_PREEMPTION))
+                       call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+               else
+                       percpu_ref_kill(&im->pcref);
+               return;
+       }
+
+       /* The trampoline without fexit and fmod_ret progs doesn't call original
+        * function and doesn't use percpu_ref.
+        * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+        * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+        * and normal progs.
+        */
+       call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+}
+
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+{
+       struct bpf_tramp_image *im;
+       struct bpf_ksym *ksym;
+       void *image;
+       int err = -ENOMEM;
+
+       im = kzalloc(sizeof(*im), GFP_KERNEL);
+       if (!im)
+               goto out;
+
+       err = bpf_jit_charge_modmem(1);
+       if (err)
+               goto out_free_im;
+
+       err = -ENOMEM;
+       im->image = image = bpf_jit_alloc_exec_page();
+       if (!image)
+               goto out_uncharge;
+
+       err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+       if (err)
+               goto out_free_image;
+
+       ksym = &im->ksym;
+       INIT_LIST_HEAD_RCU(&ksym->lnode);
+       snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+       bpf_image_ksym_add(image, ksym);
+       return im;
+
+out_free_image:
+       bpf_jit_free_exec(im->image);
+out_uncharge:
+       bpf_jit_uncharge_modmem(1);
+out_free_im:
+       kfree(im);
+out:
+       return ERR_PTR(err);
+}
+
 static int bpf_trampoline_update(struct bpf_trampoline *tr)
 {
-       void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
-       void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+       struct bpf_tramp_image *im;
        struct bpf_tramp_progs *tprogs;
        u32 flags = BPF_TRAMP_F_RESTORE_REGS;
        int err, total;
@@ -198,41 +310,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
                return PTR_ERR(tprogs);
 
        if (total == 0) {
-               err = unregister_fentry(tr, old_image);
+               err = unregister_fentry(tr, tr->cur_image->image);
+               bpf_tramp_image_put(tr->cur_image);
+               tr->cur_image = NULL;
                tr->selector = 0;
                goto out;
        }
 
+       im = bpf_tramp_image_alloc(tr->key, tr->selector);
+       if (IS_ERR(im)) {
+               err = PTR_ERR(im);
+               goto out;
+       }
+
        if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
            tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
                flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
 
-       /* Though the second half of trampoline page is unused a task could be
-        * preempted in the middle of the first half of trampoline and two
-        * updates to trampoline would change the code from underneath the
-        * preempted task. Hence wait for tasks to voluntarily schedule or go
-        * to userspace.
-        * The same trampoline can hold both sleepable and non-sleepable progs.
-        * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
-        * programs finish executing.
-        * Wait for these two grace periods together.
-        */
-       synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
-
-       err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
+       err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
                                          &tr->func.model, flags, tprogs,
                                          tr->func.addr);
        if (err < 0)
                goto out;
 
-       if (tr->selector)
+       WARN_ON(tr->cur_image && tr->selector == 0);
+       WARN_ON(!tr->cur_image && tr->selector);
+       if (tr->cur_image)
                /* progs already running at this address */
-               err = modify_fentry(tr, old_image, new_image);
+               err = modify_fentry(tr, tr->cur_image->image, im->image);
        else
                /* first time registering */
-               err = register_fentry(tr, new_image);
+               err = register_fentry(tr, im->image);
        if (err)
                goto out;
+       if (tr->cur_image)
+               bpf_tramp_image_put(tr->cur_image);
+       tr->cur_image = im;
        tr->selector++;
 out:
        kfree(tprogs);
@@ -364,17 +477,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
                goto out;
        if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
                goto out;
-       bpf_image_ksym_del(&tr->ksym);
-       /* This code will be executed when all bpf progs (both sleepable and
-        * non-sleepable) went through
-        * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
-        * Hence no need for another synchronize_rcu_tasks_trace() here,
-        * but synchronize_rcu_tasks() is still needed, since trampoline
-        * may not have had any sleepable programs and we need to wait
-        * for tasks to get out of trampoline code before freeing it.
+       /* This code will be executed even when the last bpf_tramp_image
+        * is alive. All progs are detached from the trampoline and the
+        * trampoline image is patched with jmp into epilogue to skip
+        * fexit progs. The fentry-only trampoline will be freed via
+        * multiple rcu callbacks.
         */
-       synchronize_rcu_tasks();
-       bpf_jit_free_exec(tr->image);
        hlist_del(&tr->hlist);
        kfree(tr);
 out:
@@ -478,8 +586,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
        rcu_read_unlock_trace();
 }
 
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+       percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+       percpu_ref_put(&tr->pcref);
+}
+
 int __weak
-arch_prepare_bpf_trampoline(void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
                            const struct btf_func_model *m, u32 flags,
                            struct bpf_tramp_progs *tprogs,
                            void *orig_call)
index c56e3fc..44e4ec1 100644 (file)
@@ -5861,10 +5861,14 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
 {
        bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
                            (opcode == BPF_SUB && !off_is_neg);
-       u32 off;
+       u32 off, max;
 
        switch (ptr_reg->type) {
        case PTR_TO_STACK:
+               /* Offset 0 is out-of-bounds, but acceptable start for the
+                * left direction, see BPF_REG_FP.
+                */
+               max = MAX_BPF_STACK + mask_to_left;
                /* Indirect variable offset stack access is prohibited in
                 * unprivileged mode so it's not handled here.
                 */
@@ -5872,16 +5876,17 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
                if (mask_to_left)
                        *ptr_limit = MAX_BPF_STACK + off;
                else
-                       *ptr_limit = -off;
-               return 0;
+                       *ptr_limit = -off - 1;
+               return *ptr_limit >= max ? -ERANGE : 0;
        case PTR_TO_MAP_VALUE:
+               max = ptr_reg->map_ptr->value_size;
                if (mask_to_left) {
                        *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
                } else {
                        off = ptr_reg->smin_value + ptr_reg->off;
-                       *ptr_limit = ptr_reg->map_ptr->value_size - off;
+                       *ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
                }
-               return 0;
+               return *ptr_limit >= max ? -ERANGE : 0;
        default:
                return -EINVAL;
        }
@@ -5934,6 +5939,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
        u32 alu_state, alu_limit;
        struct bpf_reg_state tmp;
        bool ret;
+       int err;
 
        if (can_skip_alu_sanitation(env, insn))
                return 0;
@@ -5949,10 +5955,13 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
        alu_state |= ptr_is_dst_reg ?
                     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
 
-       if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
-               return 0;
-       if (update_alu_sanitation_state(aux, alu_state, alu_limit))
-               return -EACCES;
+       err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg);
+       if (err < 0)
+               return err;
+
+       err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+       if (err < 0)
+               return err;
 do_sim:
        /* Simulate and find potential out-of-bounds access under
         * speculative execution from truncation as a result of
@@ -6103,7 +6112,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        case BPF_ADD:
                ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
                if (ret < 0) {
-                       verbose(env, "R%d tried to add from different maps or paths\n", dst);
+                       verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
                        return ret;
                }
                /* We can take a fixed offset as long as it doesn't overflow
@@ -6158,7 +6167,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        case BPF_SUB:
                ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
                if (ret < 0) {
-                       verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+                       verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
                        return ret;
                }
                if (dst_reg == off_reg) {
@@ -9056,6 +9065,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
        btf = btf_get_by_fd(attr->prog_btf_fd);
        if (IS_ERR(btf))
                return PTR_ERR(btf);
+       if (btf_is_kernel(btf)) {
+               btf_put(btf);
+               return -EACCES;
+       }
        env->prog->aux->btf = btf;
 
        err = check_btf_func(env, attr, uattr);
@@ -11660,7 +11673,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                        off_reg = issrc ? insn->src_reg : insn->dst_reg;
                        if (isneg)
                                *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
-                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                        *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
                        *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
                        *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
index 5d7a76b..12f8457 100644 (file)
@@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_MISC) += misc.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
new file mode 100644 (file)
index 0000000..ec02d96
--- /dev/null
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Miscellaneous cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+#include <linux/slab.h>
+#include <linux/misc_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM ULONG_MAX
+
+/*
+ * Miscellaneous res name, keep it in sync with enum misc_res_type.
+ *
+ * The table is indexed by resource type: the interface files print
+ * misc_res_name[type] next to each value and misc.max writes are matched
+ * against these strings.
+ */
+static const char *const misc_res_name[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+       /* AMD SEV ASIDs resource */
+       "sev",
+       /* AMD SEV-ES ASIDs resource */
+       "sev_es",
+#endif
+};
+
+/* Root misc cgroup */
+static struct misc_cg root_cg;
+
+/*
+ * Miscellaneous resources capacity for the entire machine. 0 capacity means
+ * resource is not initialized or not present in the host.
+ *
+ * root_cg.max and capacity are independent of each other. root_cg.max can be
+ * more than the actual capacity. We are using Limits resource distribution
+ * model of cgroup for miscellaneous controller.
+ */
+static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Look up the misc cgroup one level above @cgroup.
+ * @cgroup: misc cgroup to start from; may be NULL.
+ *
+ * Context: Any context.
+ * Return:
+ * * %NULL - If @cgroup is NULL.
+ * * Otherwise the result of css_misc() applied to the parent css of @cgroup
+ *   (documented by the original author to be %NULL when there is no parent).
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+       if (!cgroup)
+               return NULL;
+
+       return css_misc(cgroup->css.parent);
+}
+
+/**
+ * valid_type() - Range-check a misc resource type.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If @type lies in [0, MISC_CG_RES_TYPES).
+ * * false - Otherwise.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+       if (type < 0)
+               return false;
+
+       return type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_res_total_usage() - Read the usage counter of the root misc cgroup.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return: Usage recorded in the root cgroup for @type, or 0 if @type is not a
+ * valid resource type.
+ */
+unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+       if (!valid_type(type))
+               return 0;
+
+       return atomic_long_read(&root_cg.res[type].usage);
+}
+EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+{
+       if (valid_type(type)) {
+               WRITE_ONCE(misc_res_capacity[type], capacity);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Subtracts @amount from the usage counter of @cg only (ancestors are not
+ * touched) and warns once if the counter ends up negative.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+                                 unsigned long amount)
+{
+       bool went_negative;
+
+       went_negative = atomic_long_add_negative(-amount,
+                                                &cg->res[type].usage);
+       WARN_ONCE(went_negative, "misc cgroup resource %s became less than 0",
+                 misc_res_name[type]);
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to @cg and to every ancestor reached through parent_misc().
+ * If any level would exceed its own max limit or the registered capacity, the
+ * charges applied so far (including the one that tripped the check) are
+ * rolled back and the call fails. Caller must use the same cgroup during the
+ * uncharge call.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged, or if @amount is 0.
+ * * -EINVAL - If @type is invalid, @cg is NULL or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ *           capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+                      unsigned long amount)
+{
+       struct misc_cg *i, *j;
+       int ret;
+       struct misc_res *res;
+       /*
+        * Must be as wide as the usage counter, res->max and the capacity:
+        * a narrower (int) temporary would truncate large usages and make
+        * the limit checks below compare against a bogus value.
+        */
+       unsigned long new_usage;
+
+       if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+               return -EINVAL;
+
+       if (!amount)
+               return 0;
+
+       for (i = cg; i; i = parent_misc(i)) {
+               res = &i->res[type];
+
+               /* Charge first, then verify; a failure is undone below. */
+               new_usage = atomic_long_add_return(amount, &res->usage);
+               if (new_usage > READ_ONCE(res->max) ||
+                   new_usage > READ_ONCE(misc_res_capacity[type])) {
+                       /* Report only the first rejection for this res. */
+                       if (!res->failed) {
+                               pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
+                                       misc_res_name[type]);
+                               pr_cont_cgroup_path(i->css.cgroup);
+                               pr_cont("\n");
+                               res->failed = true;
+                       }
+                       ret = -EBUSY;
+                       goto err_charge;
+               }
+       }
+       return 0;
+
+err_charge:
+       /* Undo the charge on every level below @i, then on @i itself. */
+       for (j = cg; j != i; j = parent_misc(j))
+               misc_cg_cancel_charge(type, j, amount);
+       misc_cg_cancel_charge(type, i, amount);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * Removes @amount from @cg and from each of its ancestors. The call is a
+ * no-op when @amount is 0, @type is invalid or @cg is NULL.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+                     unsigned long amount)
+{
+       struct misc_cg *node;
+
+       if (!amount || !valid_type(type) || !cg)
+               return;
+
+       node = cg;
+       while (node) {
+               misc_cg_cancel_charge(type, node, amount);
+               node = parent_misc(node);
+       }
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Emits one "<name> <limit>" line per resource with a non-zero capacity; a
+ * limit equal to MAX_NUM is printed as "max".
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+       struct misc_cg *cg = css_misc(seq_css(sf));
+       unsigned long limit;
+       int type;
+
+       for (type = 0; type < MISC_CG_RES_TYPES; type++) {
+               /* Resources without any capacity are not listed. */
+               if (!READ_ONCE(misc_res_capacity[type]))
+                       continue;
+
+               limit = READ_ONCE(cg->res[type].max);
+               if (limit != MAX_NUM)
+                       seq_printf(sf, "%s %lu\n", misc_res_name[type],
+                                  limit);
+               else
+                       seq_printf(sf, "%s max\n", misc_res_name[type]);
+       }
+
+       return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user, in the form "<resource name> <limit>" where the
+ *      limit is either "max", 0, or a positive integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid, the resource name is unknown or the
+ *            resource has 0 capacity.
+ * * -ERANGE - If number is bigger than the unsigned long capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+                                size_t nbytes, loff_t off)
+{
+       enum misc_res_type type = MISC_CG_RES_TYPES;
+       struct misc_cg *cg;
+       unsigned long limit;
+       char *name;
+       int err, idx;
+
+       /* Split "<name> <value>": name is the first token, buf the rest. */
+       buf = strstrip(buf);
+       name = strsep(&buf, " ");
+       if (!name || !buf)
+               return -EINVAL;
+
+       /* MISC_CG_RES_TYPES doubles as the "no match" marker. */
+       for (idx = 0; idx < MISC_CG_RES_TYPES; idx++) {
+               if (strcmp(misc_res_name[idx], name))
+                       continue;
+               type = idx;
+               break;
+       }
+       if (type == MISC_CG_RES_TYPES)
+               return -EINVAL;
+
+       if (strcmp(MAX_STR, buf)) {
+               err = kstrtoul(buf, 0, &limit);
+               if (err)
+                       return err;
+       } else {
+               limit = MAX_NUM;
+       }
+
+       cg = css_misc(of_css(of));
+
+       /* A limit is only accepted for a resource with non-zero capacity. */
+       if (!READ_ONCE(misc_res_capacity[type]))
+               return -EINVAL;
+
+       WRITE_ONCE(cg->res[type].max, limit);
+       return nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * A resource is listed when it has a non-zero capacity or when its usage in
+ * this cgroup is non-zero.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+       struct misc_cg *cg = css_misc(seq_css(sf));
+       unsigned long used;
+       int type;
+
+       for (type = 0; type < MISC_CG_RES_TYPES; type++) {
+               used = atomic_long_read(&cg->res[type].usage);
+               if (!READ_ONCE(misc_res_capacity[type]) && !used)
+                       continue;
+
+               seq_printf(sf, "%s %lu\n", misc_res_name[type], used);
+       }
+
+       return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory. Resources whose capacity is 0
+ * are omitted from the output.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+       unsigned long capacity;
+       int type;
+
+       for (type = 0; type < MISC_CG_RES_TYPES; type++) {
+               capacity = READ_ONCE(misc_res_capacity[type]);
+               if (!capacity)
+                       continue;
+
+               seq_printf(sf, "%s %lu\n", misc_res_name[type], capacity);
+       }
+
+       return 0;
+}
+
+/*
+ * Misc cgroup interface files.
+ *
+ * The empty entry at the end terminates the table.
+ */
+static struct cftype misc_cg_files[] = {
+       {
+               /* Per-resource limit: read via seq_show, updated via write. */
+               .name = "max",
+               .write = misc_cg_max_write,
+               .seq_show = misc_cg_max_show,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       {
+               /* Per-resource usage of the cgroup; read-only. */
+               .name = "current",
+               .seq_show = misc_cg_current_show,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       {
+               /* Host-wide capacity; read-only, root cgroup only. */
+               .name = "capacity",
+               .seq_show = misc_cg_capacity_show,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+       {}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup; NULL selects the statically allocated root.
+ *
+ * Every resource of the returned cgroup starts with a max limit of MAX_NUM
+ * and a usage of 0.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+misc_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+       struct misc_cg *cg = &root_cg;
+       enum misc_res_type type;
+
+       /* Only non-root cgroups need a dynamically allocated object. */
+       if (parent_css) {
+               cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+               if (!cg)
+                       return ERR_PTR(-ENOMEM);
+       }
+
+       for (type = 0; type < MISC_CG_RES_TYPES; type++) {
+               atomic_long_set(&cg->res[type].usage, 0);
+               WRITE_ONCE(cg->res[type].max, MAX_NUM);
+       }
+
+       return &cg->css;
+}
+
+/**
+ * misc_cg_free() - Release the misc cgroup that embeds @css.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_free(struct cgroup_subsys_state *css)
+{
+       struct misc_cg *cg = css_misc(css);
+
+       kfree(cg);
+}
+
+/*
+ * Cgroup controller callbacks.
+ *
+ * The same interface file table is used for both the legacy and the default
+ * cftypes.
+ */
+struct cgroup_subsys misc_cgrp_subsys = {
+       .css_alloc = misc_cg_alloc,
+       .css_free = misc_cg_free,
+       .legacy_cftypes = misc_cg_files,
+       .dfl_cftypes = misc_cg_files,
+};
index 0acc8ed..426cd0c 100644 (file)
@@ -1948,8 +1948,14 @@ static __latent_entropy struct task_struct *copy_process(
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
-       if (args->io_thread)
+       if (args->io_thread) {
+               /*
+                * Mark us an IO worker, and block any signal that isn't
+                * fatal or STOP
+                */
                p->flags |= PF_IO_WORKER;
+               siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
+       }
 
        /*
         * This _must_ happen before we call free_task(), i.e. before we jump
@@ -2438,15 +2444,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
                .stack_size     = (unsigned long)arg,
                .io_thread      = 1,
        };
-       struct task_struct *tsk;
 
-       tsk = copy_process(NULL, 0, node, &args);
-       if (!IS_ERR(tsk)) {
-               sigfillset(&tsk->blocked);
-               sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
-               tsk->flags |= PF_NOFREEZE;
-       }
-       return tsk;
+       return copy_process(NULL, 0, node, &args);
 }
 
 /*
index e68db77..00febd6 100644 (file)
@@ -2728,14 +2728,13 @@ retry:
                goto out;
 
        restart = &current->restart_block;
-       restart->fn = futex_wait_restart;
        restart->futex.uaddr = uaddr;
        restart->futex.val = val;
        restart->futex.time = *abs_time;
        restart->futex.bitset = bitset;
        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
-       ret = -ERESTART_RESTARTBLOCK;
+       ret = set_restart_fn(restart, futex_wait_restart);
 
 out:
        if (to) {
index c94b820..8743150 100644 (file)
@@ -75,7 +75,9 @@ struct gcov_fn_info {
 
        u32 num_counters;
        u64 *counters;
+#if CONFIG_CLANG_VERSION < 110000
        const char *function_name;
+#endif
 };
 
 static struct gcov_info *current_info;
@@ -105,6 +107,7 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
 }
 EXPORT_SYMBOL(llvm_gcov_init);
 
+#if CONFIG_CLANG_VERSION < 110000
 void llvm_gcda_start_file(const char *orig_filename, const char version[4],
                u32 checksum)
 {
@@ -113,7 +116,17 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4],
        current_info->checksum = checksum;
 }
 EXPORT_SYMBOL(llvm_gcda_start_file);
+#else
+/* Record file name, version and checksum in the current profiling data set. */
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
+{
+       struct gcov_info *info = current_info;
+
+       info->filename = orig_filename;
+       info->version = version;
+       info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+#endif
 
+#if CONFIG_CLANG_VERSION < 110000
 void llvm_gcda_emit_function(u32 ident, const char *function_name,
                u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
 {
@@ -133,6 +146,24 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
        list_add_tail(&info->head, &current_info->functions);
 }
 EXPORT_SYMBOL(llvm_gcda_emit_function);
+#else
+/*
+ * Append a new function record to the current profiling data set. If the
+ * record cannot be allocated the function is silently left out.
+ */
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum,
+               u8 use_extra_checksum, u32 cfg_checksum)
+{
+       struct gcov_fn_info *fn;
+
+       fn = kzalloc(sizeof(*fn), GFP_KERNEL);
+       if (fn) {
+               fn->ident = ident;
+               fn->checksum = func_checksum;
+               fn->use_extra_checksum = use_extra_checksum;
+               fn->cfg_checksum = cfg_checksum;
+
+               INIT_LIST_HEAD(&fn->head);
+               list_add_tail(&fn->head, &current_info->functions);
+       }
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+#endif
 
 void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
 {
@@ -295,6 +326,7 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
        }
 }
 
+#if CONFIG_CLANG_VERSION < 110000
 static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
 {
        size_t cv_size; /* counter values size */
@@ -322,6 +354,28 @@ err_name:
        kfree(fn_dup);
        return NULL;
 }
+#else
+/*
+ * Duplicate a function record together with its counter array. The copy gets
+ * a fresh list head; NULL is returned if either allocation fails.
+ */
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+       struct gcov_fn_info *copy;
+       size_t bytes; /* size of the counter array */
+
+       copy = kmemdup(fn, sizeof(*fn), GFP_KERNEL);
+       if (!copy)
+               return NULL;
+       INIT_LIST_HEAD(&copy->head);
+
+       bytes = fn->num_counters * sizeof(fn->counters[0]);
+       copy->counters = vmalloc(bytes);
+       if (copy->counters) {
+               memcpy(copy->counters, fn->counters, bytes);
+               return copy;
+       }
+
+       kfree(copy);
+       return NULL;
+}
+#endif
 
 /**
  * gcov_info_dup - duplicate profiling data set
@@ -362,6 +416,7 @@ err:
  * gcov_info_free - release memory for profiling data set duplicate
  * @info: profiling data set duplicate to free
  */
+#if CONFIG_CLANG_VERSION < 110000
 void gcov_info_free(struct gcov_info *info)
 {
        struct gcov_fn_info *fn, *tmp;
@@ -375,6 +430,20 @@ void gcov_info_free(struct gcov_info *info)
        kfree(info->filename);
        kfree(info);
 }
+#else
+void gcov_info_free(struct gcov_info *info)
+{
+       struct gcov_fn_info *cur, *next;
+
+       /* Unlink and release every function record with its counters. */
+       list_for_each_entry_safe(cur, next, &info->functions, head) {
+               list_del(&cur->head);
+               vfree(cur->counters);
+               kfree(cur);
+       }
+
+       kfree(info->filename);
+       kfree(info);
+}
+#endif
 
 #define ITER_STRIDE    PAGE_SIZE
 
index 4800660..40880c3 100644 (file)
@@ -159,7 +159,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
  * irq_domain_create_sim - Create a new interrupt simulator irq_domain and
  *                         allocate a range of dummy interrupts.
  *
- * @fnode:      struct fwnode_handle to be associated with this domain.
+ * @fwnode:     struct fwnode_handle to be associated with this domain.
  * @num_irqs:   Number of interrupts to allocate.
  *
  * On success: return a new irq_domain object.
@@ -228,7 +228,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)
  *                              a managed device.
  *
  * @dev:        Device to initialize the simulator object for.
- * @fnode:      struct fwnode_handle to be associated with this domain.
+ * @fwnode:     struct fwnode_handle to be associated with this domain.
  * @num_irqs:   Number of interrupts to allocate
  *
  * On success: return a new irq_domain object.
index dec3f73..21ea370 100644 (file)
@@ -1142,11 +1142,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
        irqreturn_t ret;
 
        local_bh_disable();
+       if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_disable();
        ret = action->thread_fn(action->irq, action->dev_id);
        if (ret == IRQ_HANDLED)
                atomic_inc(&desc->threads_handled);
 
        irq_finalize_oneshot(desc, action);
+       if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_enable();
        local_bh_enable();
        return ret;
 }
index c6a39d6..ba39fbb 100644 (file)
@@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
                return false;
 
        if (!kernel_text_address(jump_entry_code(entry))) {
+               /*
+                * This skips patching built-in __exit, which
+                * is part of init_section_contains() but is
+                * not part of kernel_text_address().
+                *
+                * Skipping built-in __exit is fine since it
+                * will never be executed.
+                */
                WARN_ONCE(!jump_entry_is_init(entry),
                          "can't patch jump_label at %pS",
                          (void *)jump_entry_code(entry));
index adb9350..622ebdf 100644 (file)
@@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
  */
 static __always_inline bool
 mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
-                     const bool use_ww_ctx, struct mutex_waiter *waiter)
+                     struct mutex_waiter *waiter)
 {
        if (!waiter) {
                /*
@@ -702,7 +702,7 @@ fail:
 #else
 static __always_inline bool
 mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
-                     const bool use_ww_ctx, struct mutex_waiter *waiter)
+                     struct mutex_waiter *waiter)
 {
        return false;
 }
@@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
        struct ww_mutex *ww;
        int ret;
 
+       if (!use_ww_ctx)
+               ww_ctx = NULL;
+
        might_sleep();
 
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 #endif
 
        ww = container_of(lock, struct ww_mutex, base);
-       if (use_ww_ctx && ww_ctx) {
+       if (ww_ctx) {
                if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
                        return -EALREADY;
 
@@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
        mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
        if (__mutex_trylock(lock) ||
-           mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+           mutex_optimistic_spin(lock, ww_ctx, NULL)) {
                /* got the lock, yay! */
                lock_acquired(&lock->dep_map, ip);
-               if (use_ww_ctx && ww_ctx)
+               if (ww_ctx)
                        ww_mutex_set_context_fastpath(ww, ww_ctx);
                preempt_enable();
                return 0;
@@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
         * After waiting to acquire the wait_lock, try again.
         */
        if (__mutex_trylock(lock)) {
-               if (use_ww_ctx && ww_ctx)
+               if (ww_ctx)
                        __ww_mutex_check_waiters(lock, ww_ctx);
 
                goto skip_wait;
@@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                        goto err;
                }
 
-               if (use_ww_ctx && ww_ctx) {
+               if (ww_ctx) {
                        ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
                        if (ret)
                                goto err;
@@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 * ww_mutex needs to always recheck its position since its waiter
                 * list is not FIFO ordered.
                 */
-               if ((use_ww_ctx && ww_ctx) || !first) {
+               if (ww_ctx || !first) {
                        first = __mutex_waiter_is_first(lock, &waiter);
                        if (first)
                                __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
@@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 * or we must see its unlock and acquire.
                 */
                if (__mutex_trylock(lock) ||
-                   (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+                   (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
                        break;
 
                spin_lock(&lock->wait_lock);
@@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 acquired:
        __set_current_state(TASK_RUNNING);
 
-       if (use_ww_ctx && ww_ctx) {
+       if (ww_ctx) {
                /*
                 * Wound-Wait; we stole the lock (!first_waiter), check the
                 * waiters as anyone might want to wound us.
@@ -1068,7 +1071,7 @@ skip_wait:
        /* got the lock - cleanup and rejoice! */
        lock_acquired(&lock->dep_map, ip);
 
-       if (use_ww_ctx && ww_ctx)
+       if (ww_ctx)
                ww_mutex_lock_acquired(ww, ww_ctx);
 
        spin_unlock(&lock->wait_lock);
index 1358fa4..0f4530b 100644 (file)
@@ -98,7 +98,7 @@ static int __init em_debug_init(void)
 
        return 0;
 }
-core_initcall(em_debug_init);
+fs_initcall(em_debug_init);
 #else /* CONFIG_DEBUG_FS */
 static void em_debug_create_pd(struct device *dev) {}
 static void em_debug_remove_pd(struct device *dev) {}
index 821cf17..61db50f 100644 (file)
@@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,
        audit_ptrace(task);
 
        retval = -EPERM;
-       if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER)))
+       if (unlikely(task->flags & PF_KTHREAD))
                goto out;
        if (same_thread_group(task, current))
                goto out;
index eb1b158..a6ad5eb 100644 (file)
@@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void)
 void kernel_restart(char *cmd)
 {
        kernel_restart_prepare(cmd);
-       if (pm_power_off_prepare)
-               pm_power_off_prepare();
        migrate_to_reboot_cpu();
        syscore_shutdown();
        if (!cmd)
index ba4d1ef..f271835 100644 (file)
@@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
                return true;
 
        /* Only allow kernel generated signals to this kthread */
-       if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
+       if (unlikely((t->flags & PF_KTHREAD) &&
                     (handler == SIG_KTHREAD_KERNEL) && !force))
                return true;
 
@@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
        /*
         * Skip useless siginfo allocation for SIGKILL and kernel threads.
         */
-       if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER)))
+       if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
                goto out_set;
 
        /*
@@ -2768,13 +2768,21 @@ relock:
                }
 
                /*
+                * PF_IO_WORKER threads will catch and exit on fatal signals
+                * themselves. They have cleanup that must be performed, so
+                * we cannot call do_exit() on their behalf.
+                */
+               if (current->flags & PF_IO_WORKER)
+                       goto out;
+
+               /*
                 * Death signals, no core dump.
                 */
                do_group_exit(ksig->info.si_signo);
                /* NOTREACHED */
        }
        spin_unlock_irq(&sighand->siglock);
-
+out:
        ksig->sig = signr;
 
        if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
index ae82529..2c5950b 100644 (file)
@@ -35,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site)
        return (void *)((long)site->addr + (long)&site->addr);
 }
 
+/* Turn the self-relative ->key field into an absolute value, flag bits kept. */
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+       long self = (long)&site->key;
+       long rel = (long)site->key;
+
+       return rel + self;
+}
 
 static inline struct static_call_key *static_call_key(const struct static_call_site *site)
 {
-       return (struct static_call_key *)
-               (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS);
+       return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
 }
 
 /* These assume the key is word-aligned. */
 static inline bool static_call_is_init(struct static_call_site *site)
 {
-       return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT;
+       return __static_call_key(site) & STATIC_CALL_SITE_INIT;
 }
 
 static inline bool static_call_is_tail(struct static_call_site *site)
 {
-       return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL;
+       return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
 }
 
 static inline void static_call_set_init(struct static_call_site *site)
 {
-       site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) -
+       site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
                    (long)&site->key;
 }
 
@@ -146,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
        };
 
        for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+               bool init = system_state < SYSTEM_RUNNING;
                struct module *mod = site_mod->mod;
 
                if (!site_mod->sites) {
@@ -165,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
                if (mod) {
                        stop = mod->static_call_sites +
                               mod->num_static_call_sites;
+                       init = mod->state == MODULE_STATE_COMING;
                }
 #endif
 
@@ -172,25 +177,26 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
                     site < stop && static_call_key(site) == key; site++) {
                        void *site_addr = static_call_addr(site);
 
-                       if (static_call_is_init(site)) {
-                               /*
-                                * Don't write to call sites which were in
-                                * initmem and have since been freed.
-                                */
-                               if (!mod && system_state >= SYSTEM_RUNNING)
-                                       continue;
-                               if (mod && !within_module_init((unsigned long)site_addr, mod))
-                                       continue;
-                       }
+                       if (!init && static_call_is_init(site))
+                               continue;
 
                        if (!kernel_text_address((unsigned long)site_addr)) {
-                               WARN_ONCE(1, "can't patch static call site at %pS",
+                               /*
+                                * This skips patching built-in __exit, which
+                                * is part of init_section_contains() but is
+                                * not part of kernel_text_address().
+                                *
+                                * Skipping built-in __exit is fine since it
+                                * will never be executed.
+                                */
+                               WARN_ONCE(!static_call_is_init(site),
+                                         "can't patch static call site at %pS",
                                          site_addr);
                                continue;
                        }
 
                        arch_static_call_transform(site_addr, NULL, func,
-                               static_call_is_tail(site));
+                                                  static_call_is_tail(site));
                }
        }
 
@@ -349,7 +355,7 @@ static int static_call_add_module(struct module *mod)
        struct static_call_site *site;
 
        for (site = start; site != stop; site++) {
-               unsigned long s_key = (long)site->key + (long)&site->key;
+               unsigned long s_key = __static_call_key(site);
                unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
                unsigned long key;
 
index 98d7a15..4d94e2b 100644 (file)
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
        if (flags == TIMER_ABSTIME)
                return -ERESTARTNOHAND;
 
-       restart->fn = alarm_timer_nsleep_restart;
        restart->nanosleep.clockid = type;
        restart->nanosleep.expires = exp;
+       set_restart_fn(restart, alarm_timer_nsleep_restart);
        return ret;
 }
 
index 788b9d1..5c9d968 100644 (file)
@@ -1957,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
        }
 
        restart = &current->restart_block;
-       restart->fn = hrtimer_nanosleep_restart;
        restart->nanosleep.clockid = t.timer.base->clockid;
        restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+       set_restart_fn(restart, hrtimer_nanosleep_restart);
 out:
        destroy_hrtimer_on_stack(&t.timer);
        return ret;
index a71758e..9abe152 100644 (file)
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
                if (flags & TIMER_ABSTIME)
                        return -ERESTARTNOHAND;
 
-               restart_block->fn = posix_cpu_nsleep_restart;
                restart_block->nanosleep.clockid = which_clock;
+               set_restart_fn(restart_block, posix_cpu_nsleep_restart);
        }
        return error;
 }
index 4d8e355..3ba52d4 100644 (file)
@@ -3231,7 +3231,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
        pg = start_pg;
        while (pg) {
                order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-               free_pages((unsigned long)pg->records, order);
+               if (order >= 0)
+                       free_pages((unsigned long)pg->records, order);
                start_pg = pg->next;
                kfree(pg);
                pg = start_pg;
@@ -5045,6 +5046,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
        return NULL;
 }
 
+static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
+{
+       struct ftrace_direct_func *direct;
+
+       direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+       if (!direct)
+               return NULL;
+       direct->addr = addr;
+       direct->count = 0;
+       list_add_rcu(&direct->next, &ftrace_direct_funcs);
+       ftrace_direct_func_count++;
+       return direct;
+}
+
 /**
  * register_ftrace_direct - Call a custom trampoline directly
  * @ip: The address of the nop at the beginning of a function
@@ -5120,15 +5135,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 
        direct = ftrace_find_direct_func(addr);
        if (!direct) {
-               direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+               direct = ftrace_alloc_direct_func(addr);
                if (!direct) {
                        kfree(entry);
                        goto out_unlock;
                }
-               direct->addr = addr;
-               direct->count = 0;
-               list_add_rcu(&direct->next, &ftrace_direct_funcs);
-               ftrace_direct_func_count++;
        }
 
        entry->ip = ip;
@@ -5329,6 +5340,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
 int modify_ftrace_direct(unsigned long ip,
                         unsigned long old_addr, unsigned long new_addr)
 {
+       struct ftrace_direct_func *direct, *new_direct = NULL;
        struct ftrace_func_entry *entry;
        struct dyn_ftrace *rec;
        int ret = -ENODEV;
@@ -5344,6 +5356,20 @@ int modify_ftrace_direct(unsigned long ip,
        if (entry->direct != old_addr)
                goto out_unlock;
 
+       direct = ftrace_find_direct_func(old_addr);
+       if (WARN_ON(!direct))
+               goto out_unlock;
+       if (direct->count > 1) {
+               ret = -ENOMEM;
+               new_direct = ftrace_alloc_direct_func(new_addr);
+               if (!new_direct)
+                       goto out_unlock;
+               direct->count--;
+               new_direct->count++;
+       } else {
+               direct->addr = new_addr;
+       }
+
        /*
         * If there's no other ftrace callback on the rec->ip location,
         * then it can be changed directly by the architecture.
@@ -5357,6 +5383,14 @@ int modify_ftrace_direct(unsigned long ip,
                ret = 0;
        }
 
+       if (unlikely(ret && new_direct)) {
+               direct->count++;
+               list_del_rcu(&new_direct->next);
+               synchronize_rcu_tasks();
+               kfree(new_direct);
+               ftrace_direct_func_count--;
+       }
+
  out_unlock:
        mutex_unlock(&ftrace_lock);
        mutex_unlock(&direct_mutex);
@@ -6418,7 +6452,8 @@ void ftrace_release_mod(struct module *mod)
                clear_mod_from_hashes(pg);
 
                order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-               free_pages((unsigned long)pg->records, order);
+               if (order >= 0)
+                       free_pages((unsigned long)pg->records, order);
                tmp_page = pg->next;
                kfree(pg);
                ftrace_number_of_pages -= 1 << order;
@@ -6778,7 +6813,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
                if (!pg->index) {
                        *last_pg = pg->next;
                        order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-                       free_pages((unsigned long)pg->records, order);
+                       if (order >= 0)
+                               free_pages((unsigned long)pg->records, order);
                        ftrace_number_of_pages -= 1 << order;
                        ftrace_number_of_groups--;
                        kfree(pg);
index eccb4e1..5c77762 100644 (file)
@@ -2984,7 +2984,8 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 
        size = nr_entries * sizeof(unsigned long);
        event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
-                                           sizeof(*entry) + size, trace_ctx);
+                                   (sizeof(*entry) - sizeof(entry->caller)) + size,
+                                   trace_ctx);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);
index 0b35212..bb7bb3b 100644 (file)
@@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info)
        struct umd_info *umd_info = info->data;
 
        /* cleanup if umh_setup() was successful but exec failed */
-       if (info->retval) {
-               fput(umd_info->pipe_to_umh);
-               fput(umd_info->pipe_from_umh);
-               put_pid(umd_info->tgid);
-               umd_info->tgid = NULL;
-       }
+       if (info->retval)
+               umd_cleanup_helper(umd_info);
+}
+
+/**
+ * umd_cleanup_helper - release the resources which were allocated in umd_setup
+ * @info: information about usermode driver
+ */
+void umd_cleanup_helper(struct umd_info *info)
+{
+       fput(info->pipe_to_umh);
+       fput(info->pipe_from_umh);
+       put_pid(info->tgid);
+       info->tgid = NULL;
 }
+EXPORT_SYMBOL_GPL(umd_cleanup_helper);
 
 /**
  * fork_usermode_driver - fork a usermode driver
index 064d68a..4686639 100644 (file)
@@ -232,4 +232,5 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
 
        return res + div64_u64(a * b, c);
 }
+EXPORT_SYMBOL(mul_u64_u64_div_u64);
 #endif
index 8294f43..8b1c318 100644 (file)
@@ -1530,24 +1530,24 @@ static noinline void check_store_range(struct xarray *xa)
 
 #ifdef CONFIG_XARRAY_MULTI
 static void check_split_1(struct xarray *xa, unsigned long index,
-                                                       unsigned int order)
+                               unsigned int order, unsigned int new_order)
 {
-       XA_STATE(xas, xa, index);
-       void *entry;
-       unsigned int i = 0;
+       XA_STATE_ORDER(xas, xa, index, new_order);
+       unsigned int i;
 
        xa_store_order(xa, index, order, xa, GFP_KERNEL);
 
        xas_split_alloc(&xas, xa, order, GFP_KERNEL);
        xas_lock(&xas);
        xas_split(&xas, xa, order);
+       for (i = 0; i < (1 << order); i += (1 << new_order))
+               __xa_store(xa, index + i, xa_mk_index(index + i), 0);
        xas_unlock(&xas);
 
-       xa_for_each(xa, index, entry) {
-               XA_BUG_ON(xa, entry != xa);
-               i++;
+       for (i = 0; i < (1 << order); i++) {
+               unsigned int val = index + (i & ~((1 << new_order) - 1));
+               XA_BUG_ON(xa, xa_load(xa, index + i) != xa_mk_index(val));
        }
-       XA_BUG_ON(xa, i != 1 << order);
 
        xa_set_mark(xa, index, XA_MARK_0);
        XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0));
@@ -1557,14 +1557,16 @@ static void check_split_1(struct xarray *xa, unsigned long index,
 
 static noinline void check_split(struct xarray *xa)
 {
-       unsigned int order;
+       unsigned int order, new_order;
 
        XA_BUG_ON(xa, !xa_empty(xa));
 
        for (order = 1; order < 2 * XA_CHUNK_SHIFT; order++) {
-               check_split_1(xa, 0, order);
-               check_split_1(xa, 1UL << order, order);
-               check_split_1(xa, 3UL << order, order);
+               for (new_order = 0; new_order < order; new_order++) {
+                       check_split_1(xa, 0, order, new_order);
+                       check_split_1(xa, 1UL << order, order, new_order);
+                       check_split_1(xa, 3UL << order, order, new_order);
+               }
        }
 }
 #else
index 5fa5161..f5d8f54 100644 (file)
@@ -987,7 +987,7 @@ static void node_set_marks(struct xa_node *node, unsigned int offset,
  * xas_split_alloc() - Allocate memory for splitting an entry.
  * @xas: XArray operation state.
  * @entry: New entry which will be stored in the array.
- * @order: New entry order.
+ * @order: Current entry order.
  * @gfp: Memory allocation flags.
  *
  * This function should be called before calling xas_split().
@@ -1011,7 +1011,7 @@ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
 
        do {
                unsigned int i;
-               void *sibling;
+               void *sibling = NULL;
                struct xa_node *node;
 
                node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
@@ -1021,7 +1021,7 @@ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
                for (i = 0; i < XA_CHUNK_SIZE; i++) {
                        if ((i & mask) == 0) {
                                RCU_INIT_POINTER(node->slots[i], entry);
-                               sibling = xa_mk_sibling(0);
+                               sibling = xa_mk_sibling(i);
                        } else {
                                RCU_INIT_POINTER(node->slots[i], sibling);
                        }
@@ -1041,9 +1041,10 @@ EXPORT_SYMBOL_GPL(xas_split_alloc);
  * xas_split() - Split a multi-index entry into smaller entries.
  * @xas: XArray operation state.
  * @entry: New entry to store in the array.
- * @order: New entry order.
+ * @order: Current entry order.
  *
- * The value in the entry is copied to all the replacement entries.
+ * The size of the new entries is set in @xas.  The value in @entry is
+ * copied to all the replacement entries.
  *
  * Context: Any context.  The caller should hold the xa_lock.
  */
index 86f2b94..6ef8f5e 100644 (file)
@@ -618,7 +618,7 @@ void __kmap_local_sched_out(void)
                int idx;
 
                /* With debug all even slots are unmapped and act as guard */
-               if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+               if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
                        WARN_ON_ONCE(!pte_none(pteval));
                        continue;
                }
@@ -654,7 +654,7 @@ void __kmap_local_sched_in(void)
                int idx;
 
                /* With debug all even slots are unmapped and act as guard */
-               if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+               if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
                        WARN_ON_ONCE(!pte_none(pteval));
                        continue;
                }
index 5b1ab1f..a86a58e 100644 (file)
@@ -280,6 +280,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
                nrg->reservation_counter =
                        &h_cg->rsvd_hugepage[hstate_index(h)];
                nrg->css = &h_cg->css;
+               /*
+                * The caller will hold exactly one h_cg->css reference for the
+                * whole contiguous reservation region. But this area might be
+                * scattered when some file_regions already reside in it.
+                * As a result, many file_regions may share only one css
+                * reference. In order to ensure that one file_region must hold
+                * exactly one h_cg->css reference, we should do css_get for
+                * each file_region and leave the reference held by caller
+                * untouched.
+                */
+               css_get(&h_cg->css);
                if (!resv->pages_per_hpage)
                        resv->pages_per_hpage = pages_per_huge_page(h);
                /* pages_per_hpage should be the same for all entries in
@@ -293,6 +304,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
 #endif
 }
 
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       if (rg->css)
+               css_put(rg->css);
+#endif
+}
+
 static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
 {
@@ -316,6 +335,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
                prg->to = rg->to;
 
                list_del(&rg->link);
+               put_uncharge_info(rg);
                kfree(rg);
 
                rg = prg;
@@ -327,6 +347,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
                nrg->from = rg->from;
 
                list_del(&rg->link);
+               put_uncharge_info(rg);
                kfree(rg);
        }
 }
@@ -662,7 +683,7 @@ retry:
 
                        del += t - f;
                        hugetlb_cgroup_uncharge_file_region(
-                               resv, rg, t - f);
+                               resv, rg, t - f, false);
 
                        /* New entry for end of split region */
                        nrg->from = t;
@@ -683,7 +704,7 @@ retry:
                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
-                                                           rg->to - rg->from);
+                                                           rg->to - rg->from, true);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
@@ -691,13 +712,13 @@ retry:
 
                if (f <= rg->from) {    /* Trim beginning of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
-                                                           t - rg->from);
+                                                           t - rg->from, false);
 
                        del += t - rg->from;
                        rg->from = t;
                } else {                /* Trim end of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
-                                                           rg->to - f);
+                                                           rg->to - f, false);
 
                        del += rg->to - f;
                        rg->to = f;
@@ -5187,6 +5208,10 @@ bool hugetlb_reserve_pages(struct inode *inode,
                         */
                        long rsv_adjust;
 
+                       /*
+                        * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+                        * reference to h_cg->css. See comment below for detail.
+                        */
                        hugetlb_cgroup_uncharge_cgroup_rsvd(
                                hstate_index(h),
                                (chg - add) * pages_per_huge_page(h), h_cg);
@@ -5194,6 +5219,14 @@ bool hugetlb_reserve_pages(struct inode *inode,
                        rsv_adjust = hugepage_subpool_put_pages(spool,
                                                                chg - add);
                        hugetlb_acct_memory(h, -rsv_adjust);
+               } else if (h_cg) {
+                       /*
+                        * The file_regions will hold their own reference to
+                        * h_cg->css. So we should release the reference held
+                        * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+                        * done.
+                        */
+                       hugetlb_cgroup_put_rsvd_cgroup(h_cg);
                }
        }
        return true;
index f68b51f..603a131 100644 (file)
@@ -391,7 +391,8 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
 
 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                         struct file_region *rg,
-                                        unsigned long nr_pages)
+                                        unsigned long nr_pages,
+                                        bool region_del)
 {
        if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
                return;
@@ -400,7 +401,12 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
            !resv->reservation_counter) {
                page_counter_uncharge(rg->reservation_counter,
                                      nr_pages * resv->pages_per_hpage);
-               css_put(rg->css);
+               /*
+                * Only do css_put(rg->css) when we delete the entire region
+                * because one file_region must hold exactly one css reference.
+                */
+               if (region_del)
+                       css_put(rg->css);
        }
 }
 
index 3b8ec93..d53c91f 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/debugfs.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
+#include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/memblock.h>
@@ -480,6 +481,14 @@ static bool __init kfence_init_pool(void)
                addr += 2 * PAGE_SIZE;
        }
 
+       /*
+        * The pool is live and will never be deallocated from this point on.
+        * Remove the pool object from the kmemleak object tree, as it would
+        * otherwise overlap with allocations returned by kfence_alloc(), which
+        * are registered with kmemleak through the slab post-alloc hook.
+        */
+       kmemleak_free(__kfence_pool);
+
        return true;
 
 err:
index c0014d3..fe6e3ae 100644 (file)
@@ -97,6 +97,7 @@
 #include <linux/atomic.h>
 
 #include <linux/kasan.h>
+#include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/memory_hotplug.h>
 
@@ -589,7 +590,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
        atomic_set(&object->use_count, 1);
        object->flags = OBJECT_ALLOCATED;
        object->pointer = ptr;
-       object->size = size;
+       object->size = kfence_ksize((void *)ptr) ?: size;
        object->excess_ref = 0;
        object->min_count = min_count;
        object->count = 0;                      /* white color initially */
index 5efa07f..550405f 100644 (file)
@@ -166,7 +166,7 @@ static int __init init_zero_pfn(void)
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
 }
-core_initcall(init_zero_pfn);
+early_initcall(init_zero_pfn);
 
 void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
 {
index 61ee40e..459d195 100644 (file)
@@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start(
                                                "");
                                WARN_ON(mmu_notifier_range_blockable(range) ||
                                        _ret != -EAGAIN);
+                               /*
+                                * We call all the notifiers on any EAGAIN,
+                                * there is no way for a notifier to know if
+                                * its start method failed, thus a start that
+                                * does EAGAIN can't also do end.
+                                */
+                               WARN_ON(ops->invalidate_range_end);
                                ret = _ret;
                        }
                }
        }
+
+       if (ret) {
+               /*
+                * Must be non-blocking to get here.  If there are multiple
+                * notifiers and one or more failed start, any that succeeded
+                * start are expecting their end to be called.  Do so now.
+                */
+               hlist_for_each_entry_rcu(subscription, &subscriptions->list,
+                                        hlist, srcu_read_lock_held(&srcu)) {
+                       if (!subscription->ops->invalidate_range_end)
+                               continue;
+
+                       subscription->ops->invalidate_range_end(subscription,
+                                                               range);
+               }
+       }
        srcu_read_unlock(&srcu, id);
 
        return ret;
index eb34d20..9e35b63 100644 (file)
@@ -2833,6 +2833,22 @@ void wait_on_page_writeback(struct page *page)
 }
 EXPORT_SYMBOL_GPL(wait_on_page_writeback);
 
+/*
+ * Wait for a page to complete writeback.  Returns -EINTR if we get a
+ * fatal signal while waiting.
+ */
+int wait_on_page_writeback_killable(struct page *page)
+{
+       while (PageWriteback(page)) {
+               trace_wait_on_page_writeback(page, page_mapping(page));
+               if (wait_on_page_bit_killable(page, PG_writeback))
+                       return -EINTR;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+
 /**
  * wait_for_stable_page() - wait for writeback to finish, if necessary.
  * @page:      The page to wait on.
index b5dafa7..9d889ad 100644 (file)
@@ -1346,8 +1346,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
                        page = list_entry(pos, struct page, lru);
 
                        zhdr = page_address(page);
-                       if (test_bit(PAGE_HEADLESS, &page->private))
+                       if (test_bit(PAGE_HEADLESS, &page->private)) {
+                               /*
+                                * For non-headless pages, we wait to do this
+                                * until we have the page lock to avoid racing
+                                * with __z3fold_alloc(). Headless pages don't
+                                * have a lock (and __z3fold_alloc() will never
+                                * see them), but we still need to test and set
+                                * PAGE_CLAIMED to avoid racing with
+                                * z3fold_free(), so just do it now before
+                                * leaving the loop.
+                                */
+                               if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+                                       continue;
+
                                break;
+                       }
 
                        if (kref_get_unless_zero(&zhdr->refcount) == 0) {
                                zhdr = NULL;
index e48f7ac..3ddd66e 100644 (file)
@@ -702,7 +702,6 @@ MODULE_LICENSE("GPL");
 
 MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
 MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
-MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
 MODULE_VERSION(BATADV_SOURCE_VERSION);
 MODULE_ALIAS_RTNL_LINK("batadv");
 MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
index b895038..1e24d9a 100644 (file)
@@ -128,6 +128,8 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 {
        if (!fdb->dst)
                return;
+       if (test_bit(BR_FDB_LOCAL, &fdb->flags))
+               return;
 
        switch (type) {
        case RTM_DELNEIGH:
index 3ef7f78..15ea123 100644 (file)
@@ -196,7 +196,7 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
        nskb->dev = dev;
        can_skb_set_owner(nskb, sk);
        ncf = (struct canfd_frame *)nskb->data;
-       skb_put(nskb, so->ll.mtu);
+       skb_put_zero(nskb, so->ll.mtu);
 
        /* create & send flow control reply */
        ncf->can_id = so->txid;
@@ -215,8 +215,7 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
        if (ae)
                ncf->data[0] = so->opt.ext_address;
 
-       if (so->ll.mtu == CANFD_MTU)
-               ncf->flags = so->ll.tx_flags;
+       ncf->flags = so->ll.tx_flags;
 
        can_send_ret = can_send(nskb, 1);
        if (can_send_ret)
@@ -780,7 +779,7 @@ isotp_tx_burst:
                can_skb_prv(skb)->skbcnt = 0;
 
                cf = (struct canfd_frame *)skb->data;
-               skb_put(skb, so->ll.mtu);
+               skb_put_zero(skb, so->ll.mtu);
 
                /* create consecutive frame */
                isotp_fill_dataframe(cf, so, ae, 0);
@@ -790,8 +789,7 @@ isotp_tx_burst:
                so->tx.sn %= 16;
                so->tx.bs++;
 
-               if (so->ll.mtu == CANFD_MTU)
-                       cf->flags = so->ll.tx_flags;
+               cf->flags = so->ll.tx_flags;
 
                skb->dev = dev;
                can_skb_set_owner(skb, sk);
@@ -897,7 +895,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
        so->tx.idx = 0;
 
        cf = (struct canfd_frame *)skb->data;
-       skb_put(skb, so->ll.mtu);
+       skb_put_zero(skb, so->ll.mtu);
 
        /* check for single frame transmission depending on TX_DL */
        if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
@@ -939,8 +937,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
        }
 
        /* send the first or only CAN frame */
-       if (so->ll.mtu == CANFD_MTU)
-               cf->flags = so->ll.tx_flags;
+       cf->flags = so->ll.tx_flags;
 
        skb->dev = dev;
        skb->sk = sk;
@@ -1228,7 +1225,8 @@ static int isotp_setsockopt(struct socket *sock, int level, int optname,
                        if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU)
                                return -EINVAL;
 
-                       if (ll.mtu == CAN_MTU && ll.tx_dl > CAN_MAX_DLEN)
+                       if (ll.mtu == CAN_MTU &&
+                           (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0))
                                return -EINVAL;
 
                        memcpy(&so->ll, &ll, sizeof(ll));
index 6c5967e..0f72ff5 100644 (file)
@@ -1184,6 +1184,18 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
                        return -ENOMEM;
 
                for_each_netdev(net, d) {
+                       struct netdev_name_node *name_node;
+                       list_for_each_entry(name_node, &d->name_node->list, list) {
+                               if (!sscanf(name_node->name, name, &i))
+                                       continue;
+                               if (i < 0 || i >= max_netdevices)
+                                       continue;
+
+                               /*  avoid cases where sscanf is not exact inverse of printf */
+                               snprintf(buf, IFNAMSIZ, name, i);
+                               if (!strncmp(buf, name_node->name, IFNAMSIZ))
+                                       set_bit(i, inuse);
+                       }
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
@@ -4294,6 +4306,13 @@ static inline void ____napi_schedule(struct softnet_data *sd,
                 */
                thread = READ_ONCE(napi->thread);
                if (thread) {
+                       /* Avoid doing set_bit() if the thread is in
+                        * INTERRUPTIBLE state, because napi_thread_wait()
+                        * makes sure to proceed with napi polling
+                        * if the thread is explicitly woken from here.
+                        */
+                       if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE)
+                               set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
                        wake_up_process(thread);
                        return;
                }
@@ -6486,6 +6505,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 
                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+                             NAPIF_STATE_SCHED_THREADED |
                              NAPIF_STATE_PREFER_BUSY_POLL);
 
                /* If STATE_MISSED was set, leave STATE_SCHED set,
@@ -6968,16 +6988,25 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 
 static int napi_thread_wait(struct napi_struct *napi)
 {
+       bool woken = false;
+
        set_current_state(TASK_INTERRUPTIBLE);
 
        while (!kthread_should_stop() && !napi_disable_pending(napi)) {
-               if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+               /* Testing SCHED_THREADED bit here to make sure the current
+                * kthread owns this napi and could poll on this napi.
+                * Testing SCHED bit is not enough because SCHED bit might be
+                * set by some other busy poll thread or by napi_disable().
+                */
+               if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
                        WARN_ON(!list_empty(&napi->poll_list));
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }
 
                schedule();
+               /* woken being true indicates this thread owns this napi. */
+               woken = true;
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
@@ -11346,7 +11375,7 @@ static void __net_exit default_device_exit(struct net *net)
                        continue;
 
                /* Leave virtual devices for the generic cleanup */
-               if (dev->rtnl_link_ops)
+               if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
                        continue;
 
                /* Push remaining network devices to init_net */
index 571f191..db65ce6 100644 (file)
@@ -1053,6 +1053,20 @@ static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
        return 0;
 
 err_module_put:
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+               struct sk_buff *skb;
+
+               del_timer_sync(&hw_data->send_timer);
+               cancel_work_sync(&hw_data->dm_alert_work);
+               while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+                       struct devlink_trap_metadata *hw_metadata;
+
+                       hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+                       net_dm_hw_metadata_free(hw_metadata);
+                       consume_skb(skb);
+               }
+       }
        module_put(THIS_MODULE);
        return rc;
 }
@@ -1134,6 +1148,15 @@ static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
 err_unregister_trace:
        unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
 err_module_put:
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+               struct sk_buff *skb;
+
+               del_timer_sync(&data->send_timer);
+               cancel_work_sync(&data->dm_alert_work);
+               while ((skb = __skb_dequeue(&data->drop_queue)))
+                       consume_skb(skb);
+       }
        module_put(THIS_MODULE);
        return rc;
 }
index 0c01bd8..fb3bcba 100644 (file)
@@ -237,37 +237,62 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
 }
 EXPORT_SYMBOL(__dst_destroy_metrics_generic);
 
-static struct dst_ops md_dst_ops = {
-       .family =               AF_UNSPEC,
-};
+struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie)
+{
+       return NULL;
+}
 
-static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
-       WARN_ONCE(1, "Attempting to call output on metadata dst\n");
-       kfree_skb(skb);
-       return 0;
+       return NULL;
 }
 
-static int dst_md_discard(struct sk_buff *skb)
+struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
+                                            struct sk_buff *skb,
+                                            const void *daddr)
 {
-       WARN_ONCE(1, "Attempting to call input on metadata dst\n");
-       kfree_skb(skb);
-       return 0;
+       return NULL;
+}
+
+void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+                              struct sk_buff *skb, u32 mtu,
+                              bool confirm_neigh)
+{
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu);
+
+void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+                           struct sk_buff *skb)
+{
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_redirect);
+
+unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
+{
+       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+       return mtu ? : dst->dev->mtu;
 }
+EXPORT_SYMBOL_GPL(dst_blackhole_mtu);
+
+static struct dst_ops dst_blackhole_ops = {
+       .family         = AF_UNSPEC,
+       .neigh_lookup   = dst_blackhole_neigh_lookup,
+       .check          = dst_blackhole_check,
+       .cow_metrics    = dst_blackhole_cow_metrics,
+       .update_pmtu    = dst_blackhole_update_pmtu,
+       .redirect       = dst_blackhole_redirect,
+       .mtu            = dst_blackhole_mtu,
+};
 
 static void __metadata_dst_init(struct metadata_dst *md_dst,
                                enum metadata_type type, u8 optslen)
-
 {
        struct dst_entry *dst;
 
        dst = &md_dst->dst;
-       dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+       dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
                 DST_METADATA | DST_NOCOUNT);
-
-       dst->input = dst_md_discard;
-       dst->output = dst_md_discard_out;
-
        memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
        md_dst->type = type;
 }
index adfdad2..9323d34 100644 (file)
@@ -5658,7 +5658,7 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
        if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
                return -EINVAL;
 
-       if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+       if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
                return -EINVAL;
 
        dev = __dev_via_ifindex(dev, ifindex);
@@ -5668,7 +5668,11 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
        mtu = READ_ONCE(dev->mtu);
 
        dev_len = mtu + dev->hard_header_len;
-       skb_len = skb->len + len_diff; /* minus result pass check */
+
+       /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+       skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+
+       skb_len += len_diff; /* minus result pass check */
        if (skb_len <= dev_len) {
                ret = BPF_MTU_CHK_RET_SUCCESS;
                goto out;
@@ -5713,6 +5717,10 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
        /* Add L2-header as dev MTU is L3 size */
        dev_len = mtu + dev->hard_header_len;
 
+       /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+       if (*mtu_len)
+               xdp_len = *mtu_len + dev->hard_header_len;
+
        xdp_len += len_diff; /* minus result pass check */
        if (xdp_len > dev_len)
                ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
index 2ef2224..a96a4f5 100644 (file)
@@ -176,7 +176,7 @@ void skb_flow_get_icmp_tci(const struct sk_buff *skb,
         * avoid confusion with packets without such field
         */
        if (icmp_has_id(ih->type))
-               key_icmp->id = ih->un.echo.id ? : 1;
+               key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
        else
                key_icmp->id = 0;
 }
index 0ed98f2..cc31b60 100644 (file)
@@ -3440,6 +3440,32 @@ static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
        twsk_prot->twsk_slab = NULL;
 }
 
+static int tw_prot_init(const struct proto *prot)
+{
+       struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
+
+       if (!twsk_prot)
+               return 0;
+
+       twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
+                                             prot->name);
+       if (!twsk_prot->twsk_slab_name)
+               return -ENOMEM;
+
+       twsk_prot->twsk_slab =
+               kmem_cache_create(twsk_prot->twsk_slab_name,
+                                 twsk_prot->twsk_obj_size, 0,
+                                 SLAB_ACCOUNT | prot->slab_flags,
+                                 NULL);
+       if (!twsk_prot->twsk_slab) {
+               pr_crit("%s: Can't create timewait sock SLAB cache!\n",
+                       prot->name);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
 {
        if (!rsk_prot)
@@ -3496,22 +3522,8 @@ int proto_register(struct proto *prot, int alloc_slab)
                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;
 
-               if (prot->twsk_prot != NULL) {
-                       prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
-
-                       if (prot->twsk_prot->twsk_slab_name == NULL)
-                               goto out_free_request_sock_slab;
-
-                       prot->twsk_prot->twsk_slab =
-                               kmem_cache_create(prot->twsk_prot->twsk_slab_name,
-                                                 prot->twsk_prot->twsk_obj_size,
-                                                 0,
-                                                 SLAB_ACCOUNT |
-                                                 prot->slab_flags,
-                                                 NULL);
-                       if (prot->twsk_prot->twsk_slab == NULL)
-                               goto out_free_timewait_sock_slab;
-               }
+               if (tw_prot_init(prot))
+                       goto out_free_timewait_sock_slab;
        }
 
        mutex_lock(&proto_list_mutex);
index 1f73603..2be5c69 100644 (file)
@@ -319,6 +319,11 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
        if (!ipv6_unicast_destination(skb))
                return 0;       /* discard, don't send a reset here */
 
+       if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+               __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+               return 0;
+       }
+
        if (dccp_bad_service_code(sk, service)) {
                dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
                goto drop;
index 4d4956e..d142eb2 100644 (file)
@@ -1066,6 +1066,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master)
 {
        struct dsa_switch *ds = dp->ds;
        struct dsa_switch_tree *dst = ds->dst;
+       const struct dsa_device_ops *tag_ops;
        enum dsa_tag_protocol tag_protocol;
 
        tag_protocol = dsa_get_tag_protocol(dp, master);
@@ -1080,14 +1081,16 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master)
                 * nothing to do here.
                 */
        } else {
-               dst->tag_ops = dsa_tag_driver_get(tag_protocol);
-               if (IS_ERR(dst->tag_ops)) {
-                       if (PTR_ERR(dst->tag_ops) == -ENOPROTOOPT)
+               tag_ops = dsa_tag_driver_get(tag_protocol);
+               if (IS_ERR(tag_ops)) {
+                       if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
                                return -EPROBE_DEFER;
                        dev_warn(ds->dev, "No tagger for this switch\n");
                        dp->master = NULL;
-                       return PTR_ERR(dst->tag_ops);
+                       return PTR_ERR(tag_ops);
                }
+
+               dst->tag_ops = tag_ops;
        }
 
        dp->master = master;
index 6bd7ca0..fd472ea 100644 (file)
@@ -705,12 +705,15 @@ static bool reqsk_queue_unlink(struct request_sock *req)
        return found;
 }
 
-void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
 {
-       if (reqsk_queue_unlink(req)) {
+       bool unlinked = reqsk_queue_unlink(req);
+
+       if (unlinked) {
                reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
                reqsk_put(req);
        }
+       return unlinked;
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
 
index 47db1bf..bc2f6ca 100644 (file)
@@ -309,7 +309,7 @@ have_carrier:
  */
 static void __init ic_close_devs(void)
 {
-       struct net_device *selected_dev = ic_dev->dev;
+       struct net_device *selected_dev = ic_dev ? ic_dev->dev : NULL;
        struct ic_device *d, *next;
        struct net_device *dev;
 
@@ -317,16 +317,18 @@ static void __init ic_close_devs(void)
        next = ic_first_dev;
        while ((d = next)) {
                bool bring_down = (d != ic_dev);
-               struct net_device *lower_dev;
+               struct net_device *lower;
                struct list_head *iter;
 
                next = d->next;
                dev = d->dev;
 
-               netdev_for_each_lower_dev(selected_dev, lower_dev, iter) {
-                       if (dev == lower_dev) {
-                               bring_down = false;
-                               break;
+               if (selected_dev) {
+                       netdev_for_each_lower_dev(selected_dev, lower, iter) {
+                               if (dev == lower) {
+                                       bring_down = false;
+                                       break;
+                               }
                        }
                }
                if (bring_down) {
index c576a63..d1e04d2 100644 (file)
@@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
        local_bh_disable();
        addend = xt_write_recseq_begin();
-       private = rcu_access_pointer(table->private);
+       private = READ_ONCE(table->private); /* Address dependency. */
        cpu     = smp_processor_id();
        table_base = private->entries;
        jumpstack  = (struct arpt_entry **)private->jumpstack[cpu];
@@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
        unsigned int countersize;
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
 
        /* We need atomic snapshot of counters: rest doesn't change
         * (other than comefrom, which userspace doesn't care
@@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size,
        unsigned int off, num;
        const struct arpt_entry *e;
        struct xt_counters *counters;
-       struct xt_table_info *private = xt_table_get_private_protected(table);
+       struct xt_table_info *private = table->private;
        int ret = 0;
        void *loc_cpu_entry;
 
@@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
        t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
        if (!IS_ERR(t)) {
                struct arpt_getinfo info;
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
                struct xt_table_info tmp;
 
@@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 
        t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
        if (!IS_ERR(t)) {
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
 
                if (get.size == private->size)
                        ret = copy_entries_to_user(private->size,
@@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
        }
 
        local_bh_disable();
-       private = xt_table_get_private_protected(t);
+       private = t->private;
        if (private->number != tmp.num_counters) {
                ret = -EINVAL;
                goto unlock_up_free;
@@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
                                       void __user *userptr)
 {
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
        void __user *pos;
        unsigned int size;
        int ret = 0;
@@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net,
        xt_compat_lock(NFPROTO_ARP);
        t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
        if (!IS_ERR(t)) {
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
                struct xt_table_info info;
 
                ret = compat_table_info(private, &info);
index e8f6f9d..f15bc21 100644 (file)
@@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb,
        WARN_ON(!(table->valid_hooks & (1 << hook)));
        local_bh_disable();
        addend = xt_write_recseq_begin();
-       private = rcu_access_pointer(table->private);
+       private = READ_ONCE(table->private); /* Address dependency. */
        cpu        = smp_processor_id();
        table_base = private->entries;
        jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
@@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
        unsigned int countersize;
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
 
        /* We need atomic snapshot of counters: rest doesn't change
           (other than comefrom, which userspace doesn't care
@@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size,
        unsigned int off, num;
        const struct ipt_entry *e;
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
        int ret = 0;
        const void *loc_cpu_entry;
 
@@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
        t = xt_request_find_table_lock(net, AF_INET, name);
        if (!IS_ERR(t)) {
                struct ipt_getinfo info;
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
                struct xt_table_info tmp;
 
@@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
 
        t = xt_find_table_lock(net, AF_INET, get.name);
        if (!IS_ERR(t)) {
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
                if (get.size == private->size)
                        ret = copy_entries_to_user(private->size,
                                                   t, uptr->entrytable);
@@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
        }
 
        local_bh_disable();
-       private = xt_table_get_private_protected(t);
+       private = t->private;
        if (private->number != tmp.num_counters) {
                ret = -EINVAL;
                goto unlock_up_free;
@@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
                            void __user *userptr)
 {
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
        void __user *pos;
        unsigned int size;
        int ret = 0;
@@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
        xt_compat_lock(AF_INET);
        t = xt_find_table_lock(net, AF_INET, get.name);
        if (!IS_ERR(t)) {
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
                struct xt_table_info info;
                ret = compat_table_info(private, &info);
                if (!ret && get.size == info.size)
index 02d81d7..bba150f 100644 (file)
@@ -2687,44 +2687,15 @@ out:
        return rth;
 }
 
-static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
-{
-       return NULL;
-}
-
-static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
-{
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
-       return mtu ? : dst->dev->mtu;
-}
-
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                                         struct sk_buff *skb, u32 mtu,
-                                         bool confirm_neigh)
-{
-}
-
-static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
-                                      struct sk_buff *skb)
-{
-}
-
-static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
-                                         unsigned long old)
-{
-       return NULL;
-}
-
 static struct dst_ops ipv4_dst_blackhole_ops = {
-       .family                 =       AF_INET,
-       .check                  =       ipv4_blackhole_dst_check,
-       .mtu                    =       ipv4_blackhole_mtu,
-       .default_advmss         =       ipv4_default_advmss,
-       .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
-       .redirect               =       ipv4_rt_blackhole_redirect,
-       .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
-       .neigh_lookup           =       ipv4_neigh_lookup,
+       .family                 = AF_INET,
+       .default_advmss         = ipv4_default_advmss,
+       .neigh_lookup           = ipv4_neigh_lookup,
+       .check                  = dst_blackhole_check,
+       .cow_metrics            = dst_blackhole_cow_metrics,
+       .update_pmtu            = dst_blackhole_update_pmtu,
+       .redirect               = dst_blackhole_redirect,
+       .mtu                    = dst_blackhole_mtu,
 };
 
 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
index 0055ae0..7513ba4 100644 (file)
@@ -804,8 +804,11 @@ embryonic_reset:
                tcp_reset(sk, skb);
        }
        if (!fastopen) {
-               inet_csk_reqsk_queue_drop(sk, req);
-               __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+               bool unlinked = inet_csk_reqsk_queue_drop(sk, req);
+
+               if (unlinked)
+                       __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+               *req_stolen = !unlinked;
        }
        return NULL;
 }
index ef9d022..679699e 100644 (file)
@@ -2486,7 +2486,7 @@ static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
        const struct net_device *dev;
 
        if (rt->nh)
-               fib6_nh = nexthop_fib6_nh(rt->nh);
+               fib6_nh = nexthop_fib6_nh_bh(rt->nh);
 
        seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
 
index e9d2a4a..8025671 100644 (file)
@@ -245,16 +245,6 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
        if (ipv6_addr_is_multicast(&hdr->saddr))
                goto err;
 
-       /* While RFC4291 is not explicit about v4mapped addresses
-        * in IPv6 headers, it seems clear linux dual-stack
-        * model can not deal properly with these.
-        * Security models could be fooled by ::ffff:127.0.0.1 for example.
-        *
-        * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02
-        */
-       if (ipv6_addr_v4mapped(&hdr->saddr))
-               goto err;
-
        skb->transport_header = skb->network_header + sizeof(*hdr);
        IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
 
index 0d453fa..2e2119b 100644 (file)
@@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb,
 
        local_bh_disable();
        addend = xt_write_recseq_begin();
-       private = rcu_access_pointer(table->private);
+       private = READ_ONCE(table->private); /* Address dependency. */
        cpu        = smp_processor_id();
        table_base = private->entries;
        jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
@@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
        unsigned int countersize;
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
 
        /* We need atomic snapshot of counters: rest doesn't change
           (other than comefrom, which userspace doesn't care
@@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size,
        unsigned int off, num;
        const struct ip6t_entry *e;
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
        int ret = 0;
        const void *loc_cpu_entry;
 
@@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
        t = xt_request_find_table_lock(net, AF_INET6, name);
        if (!IS_ERR(t)) {
                struct ip6t_getinfo info;
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
                struct xt_table_info tmp;
 
@@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
 
        t = xt_find_table_lock(net, AF_INET6, get.name);
        if (!IS_ERR(t)) {
-               struct xt_table_info *private = xt_table_get_private_protected(t);
+               struct xt_table_info *private = t->private;
                if (get.size == private->size)
                        ret = copy_entries_to_user(private->size,
                                                   t, uptr->entrytable);
@@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
        }
 
        local_bh_disable();
-       private = xt_table_get_private_protected(t);
+       private = t->private;
        if (private->number != tmp.num_counters) {
                ret = -EINVAL;
                goto unlock_up_free;
@@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
                            void __user *userptr)
 {
        struct xt_counters *counters;
-       const struct xt_table_info *private = xt_table_get_private_protected(table);
+       const struct xt_table_info *private = table->private;
        void __user *pos;
        unsigned int size;
        int ret = 0;
@@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
        xt_compat_lock(AF_INET6);
        t = xt_find_table_lock(net, AF_INET6, get.name);
        if (!IS_ERR(t)) {
-               const struct xt_table_info *private = xt_table_get_private_protected(t);
+               const struct xt_table_info *private = t->private;
                struct xt_table_info info;
                ret = compat_table_info(private, &info);
                if (!ret && get.size == info.size)
index 1536f49..1056b02 100644 (file)
@@ -260,34 +260,16 @@ static struct dst_ops ip6_dst_ops_template = {
        .confirm_neigh          =       ip6_confirm_neigh,
 };
 
-static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
-{
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
-       return mtu ? : dst->dev->mtu;
-}
-
-static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                                        struct sk_buff *skb, u32 mtu,
-                                        bool confirm_neigh)
-{
-}
-
-static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
-                                     struct sk_buff *skb)
-{
-}
-
 static struct dst_ops ip6_dst_blackhole_ops = {
-       .family                 =       AF_INET6,
-       .destroy                =       ip6_dst_destroy,
-       .check                  =       ip6_dst_check,
-       .mtu                    =       ip6_blackhole_mtu,
-       .default_advmss         =       ip6_default_advmss,
-       .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
-       .redirect               =       ip6_rt_blackhole_redirect,
-       .cow_metrics            =       dst_cow_metrics_generic,
-       .neigh_lookup           =       ip6_dst_neigh_lookup,
+       .family                 = AF_INET6,
+       .default_advmss         = ip6_default_advmss,
+       .neigh_lookup           = ip6_dst_neigh_lookup,
+       .check                  = ip6_dst_check,
+       .destroy                = ip6_dst_destroy,
+       .cow_metrics            = dst_cow_metrics_generic,
+       .update_pmtu            = dst_blackhole_update_pmtu,
+       .redirect               = dst_blackhole_redirect,
+       .mtu                    = dst_blackhole_mtu,
 };
 
 static const u32 ip6_template_metrics[RTAX_MAX] = {
index bd44ded..d0f0077 100644 (file)
@@ -1175,6 +1175,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
        if (!ipv6_unicast_destination(skb))
                goto drop;
 
+       if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+               __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+               return 0;
+       }
+
        return tcp_conn_request(&tcp6_request_sock_ops,
                                &tcp_request_sock_ipv6_ops, sk, skb);
 
index d7b3d90..b00d6f5 100644 (file)
@@ -23,6 +23,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
        struct aead_request *aead_req;
        int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
        u8 *__aad;
+       int ret;
 
        aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC);
        if (!aead_req)
@@ -40,10 +41,10 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
        aead_request_set_crypt(aead_req, sg, sg, data_len, b_0);
        aead_request_set_ad(aead_req, sg[0].length);
 
-       crypto_aead_encrypt(aead_req);
+       ret = crypto_aead_encrypt(aead_req);
        kfree_sensitive(aead_req);
 
-       return 0;
+       return ret;
 }
 
 int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
index 6f3b3a0..512cab0 100644 (file)
@@ -22,6 +22,7 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
        struct aead_request *aead_req;
        int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
        const __le16 *fc;
+       int ret;
 
        if (data_len < GMAC_MIC_LEN)
                return -EINVAL;
@@ -59,10 +60,10 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
        aead_request_set_crypt(aead_req, sg, sg, 0, iv);
        aead_request_set_ad(aead_req, GMAC_AAD_LEN + data_len);
 
-       crypto_aead_encrypt(aead_req);
+       ret = crypto_aead_encrypt(aead_req);
        kfree_sensitive(aead_req);
 
-       return 0;
+       return ret;
 }
 
 struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
index c4c70e3..68a0de0 100644 (file)
@@ -2950,14 +2950,14 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
                        continue;
 
                for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) {
-                       if (~sdata->rc_rateidx_mcs_mask[i][j]) {
+                       if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) {
                                sdata->rc_has_mcs_mask[i] = true;
                                break;
                        }
                }
 
                for (j = 0; j < NL80211_VHT_NSS_MAX; j++) {
-                       if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) {
+                       if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) {
                                sdata->rc_has_vht_mcs_mask[i] = true;
                                break;
                        }
index 1f552f3..a7ac53a 100644 (file)
@@ -1874,6 +1874,8 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
 
        /* remove beacon */
        kfree(sdata->u.ibss.ie);
+       sdata->u.ibss.ie = NULL;
+       sdata->u.ibss.ie_len = 0;
 
        /* on the next join, re-program HT parameters */
        memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa));
index 4f3f8bb..1b9c826 100644 (file)
@@ -973,8 +973,19 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
                        continue;
 
                if (!dflt_chandef.chan) {
+                       /*
+                        * Assign the first enabled channel to dflt_chandef
+                        * from the list of channels
+                        */
+                       for (i = 0; i < sband->n_channels; i++)
+                               if (!(sband->channels[i].flags &
+                                               IEEE80211_CHAN_DISABLED))
+                                       break;
+                       /* if none found then use the first anyway */
+                       if (i == sband->n_channels)
+                               i = 0;
                        cfg80211_chandef_create(&dflt_chandef,
-                                               &sband->channels[0],
+                                               &sband->channels[i],
                                                NL80211_CHAN_NO_HT);
                        /* init channel we're on */
                        if (!local->use_chanctx && !local->_oper_chandef.chan) {
index 2e33a12..ce4e385 100644 (file)
@@ -5071,7 +5071,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
                he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION,
                                                  ies->data, ies->len);
                if (he_oper_ie &&
-                   he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3]))
+                   he_oper_ie[1] >= ieee80211_he_oper_size(&he_oper_ie[3]))
                        he_oper = (void *)(he_oper_ie + 3);
                else
                        he_oper = NULL;
index 2f44f49..ecad9b1 100644 (file)
@@ -805,7 +805,6 @@ minstrel_ht_group_min_rate_offset(struct minstrel_ht_sta *mi, int group,
 static u16
 minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur)
 {
-       struct minstrel_mcs_group_data *mg;
        u8 type = MINSTREL_SAMPLE_TYPE_INC;
        int i, index = 0;
        u8 group;
@@ -813,7 +812,6 @@ minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur)
        group = mi->sample[type].sample_group;
        for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) {
                group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups);
-               mg = &mi->groups[group];
 
                index = minstrel_ht_group_min_rate_offset(mi, group,
                                                          fast_rate_dur);
index f080fcf..c0fa526 100644 (file)
@@ -968,7 +968,7 @@ static void ieee80211_parse_extension_element(u32 *crc,
                break;
        case WLAN_EID_EXT_HE_OPERATION:
                if (len >= sizeof(*elems->he_operation) &&
-                   len == ieee80211_he_oper_size(data) - 1) {
+                   len >= ieee80211_he_oper_size(data) - 1) {
                        if (crc)
                                *crc = crc32_be(*crc, (void *)elem,
                                                elem->datalen + 2);
index 444a386..89a4225 100644 (file)
@@ -567,15 +567,15 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 }
 
 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
-                                 struct in_addr *addr)
+                                 struct in_addr *addr, u16 port)
 {
        u8 hmac[SHA256_DIGEST_SIZE];
        u8 msg[7];
 
        msg[0] = addr_id;
        memcpy(&msg[1], &addr->s_addr, 4);
-       msg[5] = 0;
-       msg[6] = 0;
+       msg[5] = port >> 8;
+       msg[6] = port & 0xFF;
 
        mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
 
@@ -584,15 +584,15 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
-                                  struct in6_addr *addr)
+                                  struct in6_addr *addr, u16 port)
 {
        u8 hmac[SHA256_DIGEST_SIZE];
        u8 msg[19];
 
        msg[0] = addr_id;
        memcpy(&msg[1], &addr->s6_addr, 16);
-       msg[17] = 0;
-       msg[18] = 0;
+       msg[17] = port >> 8;
+       msg[18] = port & 0xFF;
 
        mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
 
@@ -646,7 +646,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
                        opts->ahmac = add_addr_generate_hmac(msk->local_key,
                                                             msk->remote_key,
                                                             opts->addr_id,
-                                                            &opts->addr);
+                                                            &opts->addr,
+                                                            opts->port);
                }
        }
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -657,7 +658,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
                        opts->ahmac = add_addr6_generate_hmac(msk->local_key,
                                                              msk->remote_key,
                                                              opts->addr_id,
-                                                             &opts->addr6);
+                                                             &opts->addr6,
+                                                             opts->port);
                }
        }
 #endif
@@ -962,12 +964,14 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
        if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
                hmac = add_addr_generate_hmac(msk->remote_key,
                                              msk->local_key,
-                                             mp_opt->addr_id, &mp_opt->addr);
+                                             mp_opt->addr_id, &mp_opt->addr,
+                                             mp_opt->port);
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
        else
                hmac = add_addr6_generate_hmac(msk->remote_key,
                                               msk->local_key,
-                                              mp_opt->addr_id, &mp_opt->addr6);
+                                              mp_opt->addr_id, &mp_opt->addr6,
+                                              mp_opt->port);
 #endif
 
        pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
index 7695857..1590b9d 100644 (file)
@@ -2968,7 +2968,7 @@ static void mptcp_release_cb(struct sock *sk)
        for (;;) {
                flags = 0;
                if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
-                       flags |= MPTCP_PUSH_PENDING;
+                       flags |= BIT(MPTCP_PUSH_PENDING);
                if (!flags)
                        break;
 
@@ -2981,7 +2981,7 @@ static void mptcp_release_cb(struct sock *sk)
                 */
 
                spin_unlock_bh(&sk->sk_lock.slock);
-               if (flags & MPTCP_PUSH_PENDING)
+               if (flags & BIT(MPTCP_PUSH_PENDING))
                        __mptcp_push_pending(sk, 0);
 
                cond_resched();
index 3d47d67..d17d39c 100644 (file)
@@ -477,6 +477,11 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
        if (!ipv6_unicast_destination(skb))
                goto drop;
 
+       if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+               __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+               return 0;
+       }
+
        return tcp_conn_request(&mptcp_subflow_request_sock_ops,
                                &subflow_request_sock_ipv6_ops, sk, skb);
 
index 1469365..1d519b0 100644 (file)
@@ -2962,6 +2962,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
        memset(&m, 0xFF, sizeof(m));
        memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
        m.src.u.all = mask->src.u.all;
+       m.src.l3num = tuple->src.l3num;
        m.dst.protonum = tuple->dst.protonum;
 
        nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK);
index 5b05487..db11e40 100644 (file)
@@ -218,9 +218,6 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
                            enum ip_conntrack_info ctinfo,
                            const struct nf_hook_state *state)
 {
-       if (state->pf != NFPROTO_IPV4)
-               return -NF_ACCEPT;
-
        if (!nf_ct_is_confirmed(ct)) {
                unsigned int *timeouts = nf_ct_timeout_lookup(ct);
 
index 5fa657b..c77ba86 100644 (file)
@@ -506,7 +506,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
 {
        int err;
 
-       INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+       INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
        flow_block_init(&flowtable->flow_block);
        init_rwsem(&flowtable->flow_block_lock);
 
index 224c8e5..f57f1a6 100644 (file)
@@ -6783,6 +6783,9 @@ static int nft_register_flowtable_net_hooks(struct net *net,
 
        list_for_each_entry(hook, hook_list, list) {
                list_for_each_entry(ft, &table->flowtables, list) {
+                       if (!nft_is_active_next(net, ft))
+                               continue;
+
                        list_for_each_entry(hook2, &ft->hook_list, list) {
                                if (hook->ops.dev == hook2->ops.dev &&
                                    hook->ops.pf == hook2->ops.pf) {
@@ -6842,6 +6845,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
        struct nft_hook *hook, *next;
        struct nft_trans *trans;
        bool unregister = false;
+       u32 flags;
        int err;
 
        err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
@@ -6856,6 +6860,17 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
                }
        }
 
+       if (nla[NFTA_FLOWTABLE_FLAGS]) {
+               flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+               if (flags & ~NFT_FLOWTABLE_MASK)
+                       return -EOPNOTSUPP;
+               if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+                   (flags & NFT_FLOWTABLE_HW_OFFLOAD))
+                       return -EOPNOTSUPP;
+       } else {
+               flags = flowtable->data.flags;
+       }
+
        err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
                                               &flowtable_hook.list, flowtable);
        if (err < 0)
@@ -6869,6 +6884,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
                goto err_flowtable_update_hook;
        }
 
+       nft_trans_flowtable_flags(trans) = flags;
        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
@@ -6963,8 +6979,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flowtable->data.flags =
                        ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
-               if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
+               if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+                       err = -EOPNOTSUPP;
                        goto err3;
+               }
        }
 
        write_pnet(&flowtable->data.net, net);
@@ -8176,6 +8194,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
+                               nft_trans_flowtable(trans)->data.flags =
+                                       nft_trans_flowtable_flags(trans);
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
index bce6ca2..6bd31a7 100644 (file)
@@ -1351,14 +1351,6 @@ struct xt_counters *xt_counters_alloc(unsigned int counters)
 }
 EXPORT_SYMBOL(xt_counters_alloc);
 
-struct xt_table_info
-*xt_table_get_private_protected(const struct xt_table *table)
-{
-       return rcu_dereference_protected(table->private,
-                                        mutex_is_locked(&xt[table->af].mutex));
-}
-EXPORT_SYMBOL(xt_table_get_private_protected);
-
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
              unsigned int num_counters,
@@ -1366,6 +1358,7 @@ xt_replace_table(struct xt_table *table,
              int *error)
 {
        struct xt_table_info *private;
+       unsigned int cpu;
        int ret;
 
        ret = xt_jumpstack_alloc(newinfo);
@@ -1375,20 +1368,47 @@ xt_replace_table(struct xt_table *table,
        }
 
        /* Do the substitution. */
-       private = xt_table_get_private_protected(table);
+       local_bh_disable();
+       private = table->private;
 
        /* Check inside lock: is the old number correct? */
        if (num_counters != private->number) {
                pr_debug("num_counters != table->private->number (%u/%u)\n",
                         num_counters, private->number);
+               local_bh_enable();
                *error = -EAGAIN;
                return NULL;
        }
 
        newinfo->initial_entries = private->initial_entries;
+       /*
+        * Ensure contents of newinfo are visible before assigning to
+        * private.
+        */
+       smp_wmb();
+       table->private = newinfo;
+
+       /* make sure all cpus see new ->private value */
+       smp_mb();
 
-       rcu_assign_pointer(table->private, newinfo);
-       synchronize_rcu();
+       /*
+        * Even though table entries have now been swapped, other CPUs
+        * may still be using the old entries...
+        */
+       local_bh_enable();
+
+       /* ... so wait for even xt_recseq on all cpus */
+       for_each_possible_cpu(cpu) {
+               seqcount_t *s = &per_cpu(xt_recseq, cpu);
+               u32 seq = raw_read_seqcount(s);
+
+               if (seq & 1) {
+                       do {
+                               cond_resched();
+                               cpu_relax();
+                       } while (seq == raw_read_seqcount(s));
+               }
+       }
 
        audit_log_nfcfg(table->name, table->af, private->number,
                        !private->number ? AUDIT_XT_OP_REGISTER :
@@ -1424,12 +1444,12 @@ struct xt_table *xt_register_table(struct net *net,
        }
 
        /* Simplifies replace_table code. */
-       rcu_assign_pointer(table->private, bootstrap);
+       table->private = bootstrap;
 
        if (!xt_replace_table(table, 0, newinfo, &ret))
                goto unlock;
 
-       private = xt_table_get_private_protected(table);
+       private = table->private;
        pr_debug("table->private->number = %u\n", private->number);
 
        /* save number of initial entries */
@@ -1452,8 +1472,7 @@ void *xt_unregister_table(struct xt_table *table)
        struct xt_table_info *private;
 
        mutex_lock(&xt[table->af].mutex);
-       private = xt_table_get_private_protected(table);
-       RCU_INIT_POINTER(table->private, NULL);
+       private = table->private;
        list_del(&table->list);
        mutex_unlock(&xt[table->af].mutex);
        audit_log_nfcfg(table->name, table->af, private->number,
index 5eddfe7..71cec03 100644 (file)
@@ -271,9 +271,11 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 /* This is called to initialize CT key fields possibly coming in from the local
  * stack.
  */
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
+void ovs_ct_fill_key(const struct sk_buff *skb,
+                    struct sw_flow_key *key,
+                    bool post_ct)
 {
-       ovs_ct_update_key(skb, NULL, key, false, false);
+       ovs_ct_update_key(skb, NULL, key, post_ct, false);
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
@@ -1332,7 +1334,7 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
        if (skb_nfct(skb)) {
                nf_conntrack_put(skb_nfct(skb));
                nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
-               ovs_ct_fill_key(skb, key);
+               ovs_ct_fill_key(skb, key, false);
        }
 
        return 0;
index 59dc327..317e525 100644 (file)
@@ -25,7 +25,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
                   const struct ovs_conntrack_info *);
 int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
 
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
+void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key,
+                    bool post_ct);
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
                   const struct sw_flow_key *output, struct sk_buff *skb);
 void ovs_ct_free_action(const struct nlattr *a);
@@ -74,7 +75,8 @@ static inline int ovs_ct_clear(struct sk_buff *skb,
 }
 
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
-                                  struct sw_flow_key *key)
+                                  struct sw_flow_key *key,
+                                  bool post_ct)
 {
        key->ct_state = 0;
        key->ct_zone = 0;
index c7f34d6..e586424 100644 (file)
@@ -857,6 +857,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        struct tc_skb_ext *tc_ext;
 #endif
+       bool post_ct = false;
        int res, err;
 
        /* Extract metadata from packet. */
@@ -895,6 +896,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
                tc_ext = skb_ext_find(skb, TC_SKB_EXT);
                key->recirc_id = tc_ext ? tc_ext->chain : 0;
                OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
+               post_ct = tc_ext ? tc_ext->post_ct : false;
        } else {
                key->recirc_id = 0;
        }
@@ -904,7 +906,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 
        err = key_extract(skb, key);
        if (!err)
-               ovs_ct_fill_key(skb, key);   /* Must be after key_extract(). */
+               ovs_ct_fill_key(skb, key, post_ct);   /* Must be after key_extract(). */
        return err;
 }
 
index edb6ac1..dfc820e 100644 (file)
@@ -1058,6 +1058,11 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
        rc = copied;
 
        if (addr) {
+               /* There is an anonymous 2-byte hole after sq_family,
+                * make sure to clear it.
+                */
+               memset(addr, 0, sizeof(*addr));
+
                addr->sq_family = AF_QIPCRTR;
                addr->sq_node = cb->src_node;
                addr->sq_port = cb->src_port;
index f0a0aa1..16e888a 100644 (file)
@@ -945,13 +945,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
        tcf_lastuse_update(&c->tcf_tm);
 
        if (clear) {
+               qdisc_skb_cb(skb)->post_ct = false;
                ct = nf_ct_get(skb, &ctinfo);
                if (ct) {
                        nf_conntrack_put(&ct->ct_general);
                        nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
                }
 
-               goto out;
+               goto out_clear;
        }
 
        family = tcf_ct_skb_nf_family(skb);
@@ -1030,8 +1031,9 @@ out_push:
        skb_push_rcsum(skb, nh_ofs);
 
 out:
-       tcf_action_update_bstats(&c->common, skb);
        qdisc_skb_cb(skb)->post_ct = true;
+out_clear:
+       tcf_action_update_bstats(&c->common, skb);
        if (defrag)
                qdisc_skb_cb(skb)->pkt_len = skb->len;
        return retval;
index e37556c..13341e7 100644 (file)
@@ -1629,6 +1629,7 @@ int tcf_classify_ingress(struct sk_buff *skb,
                        return TC_ACT_SHOT;
                ext->chain = last_executed_chain;
                ext->mru = qdisc_skb_cb(skb)->mru;
+               ext->post_ct = qdisc_skb_cb(skb)->post_ct;
        }
 
        return ret;
index d097b5c..c69a4ba 100644 (file)
@@ -1451,7 +1451,7 @@ static int fl_set_key_ct(struct nlattr **tb,
                               &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
                               sizeof(key->ct_state));
 
-               err = fl_validate_ct_state(mask->ct_state,
+               err = fl_validate_ct_state(key->ct_state & mask->ct_state,
                                           tb[TCA_FLOWER_KEY_CT_STATE_MASK],
                                           extack);
                if (err)
index 50f680f..2adbd94 100644 (file)
@@ -345,6 +345,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
        struct sk_buff **old = NULL;
        unsigned int mask;
        u32 max_P;
+       u8 *stab;
 
        if (opt == NULL)
                return -EINVAL;
@@ -361,8 +362,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
        max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
 
        ctl = nla_data(tb[TCA_CHOKE_PARMS]);
-
-       if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log))
+       stab = nla_data(tb[TCA_CHOKE_STAB]);
+       if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
                return -EINVAL;
 
        if (ctl->limit > CHOKE_MAX_QUEUE)
@@ -412,7 +413,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
 
        red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
-                     nla_data(tb[TCA_CHOKE_STAB]),
+                     stab,
                      max_P);
        red_set_vars(&q->vars);
 
index e0bc775..f4132dc 100644 (file)
@@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
        struct gred_sched *table = qdisc_priv(sch);
        struct gred_sched_data *q = table->tab[dp];
 
-       if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) {
+       if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab)) {
                NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters");
                return -EINVAL;
        }
index dff3adf..62e12cb 100644 (file)
@@ -1020,6 +1020,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
        struct nlattr *tb[TCA_HTB_MAX + 1];
        struct tc_htb_glob *gopt;
        unsigned int ntx;
+       bool offload;
        int err;
 
        qdisc_watchdog_init(&q->watchdog, sch);
@@ -1044,9 +1045,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
        if (gopt->version != HTB_VER >> 16)
                return -EINVAL;
 
-       q->offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
+       offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
 
-       if (q->offload) {
+       if (offload) {
                if (sch->parent != TC_H_ROOT)
                        return -EOPNOTSUPP;
 
@@ -1076,7 +1077,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
                q->rate2quantum = 1;
        q->defcls = gopt->defcls;
 
-       if (!q->offload)
+       if (!offload)
                return 0;
 
        for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
@@ -1107,12 +1108,14 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
        if (err)
                goto err_free_qdiscs;
 
+       /* Defer this assignment, so that htb_destroy skips offload-related
+        * parts (especially calling ndo_setup_tc) on errors.
+        */
+       q->offload = true;
+
        return 0;
 
 err_free_qdiscs:
-       /* TC_HTB_CREATE call failed, avoid any further calls to the driver. */
-       q->offload = false;
-
        for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx];
             ntx++)
                qdisc_put(q->direct_qdiscs[ntx]);
@@ -1340,8 +1343,12 @@ htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
 {
        struct net_device *dev = qdisc_dev(sch);
        struct tc_htb_qopt_offload offload_opt;
+       struct htb_sched *q = qdisc_priv(sch);
        int err;
 
+       if (!q->offload)
+               return sch->dev_queue;
+
        offload_opt = (struct tc_htb_qopt_offload) {
                .command = TC_HTB_LEAF_QUERY_QUEUE,
                .classid = TC_H_MIN(tcm->tcm_parent),
index b4ae34d..40adf1f 100644 (file)
@@ -242,6 +242,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb,
        unsigned char flags;
        int err;
        u32 max_P;
+       u8 *stab;
 
        if (tb[TCA_RED_PARMS] == NULL ||
            tb[TCA_RED_STAB] == NULL)
@@ -250,7 +251,9 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb,
        max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
 
        ctl = nla_data(tb[TCA_RED_PARMS]);
-       if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log))
+       stab = nla_data(tb[TCA_RED_STAB]);
+       if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog,
+                             ctl->Scell_log, stab))
                return -EINVAL;
 
        err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS,
@@ -288,7 +291,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb,
        red_set_parms(&q->parms,
                      ctl->qth_min, ctl->qth_max, ctl->Wlog,
                      ctl->Plog, ctl->Scell_log,
-                     nla_data(tb[TCA_RED_STAB]),
+                     stab,
                      max_P);
        red_set_vars(&q->vars);
 
index b25e514..066754a 100644 (file)
@@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
        }
 
        if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
-                                       ctl_v1->Wlog, ctl_v1->Scell_log))
+                                       ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
                return -EINVAL;
        if (ctl_v1 && ctl_v1->qth_min) {
                p = kmalloc(sizeof(*p), GFP_KERNEL);
index 6614c9f..a6aa17d 100644 (file)
@@ -584,13 +584,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
                goto out;
        }
 
-       rcu_read_lock();
-       if (__sk_dst_get(sk) != tp->dst) {
-               dst_hold(tp->dst);
-               sk_setup_caps(sk, tp->dst);
-       }
-       rcu_read_unlock();
-
        /* pack up chunks */
        pkt_count = sctp_packet_pack(packet, head, gso, gfp);
        if (!pkt_count) {
index 3fd06a2..5cb1aa5 100644 (file)
@@ -1135,6 +1135,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 
 static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
 {
+       struct sock *sk = ctx->asoc->base.sk;
        struct list_head *ltransport;
        struct sctp_packet *packet;
        struct sctp_transport *t;
@@ -1144,6 +1145,12 @@ static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
                t = list_entry(ltransport, struct sctp_transport, send_ready);
                packet = &t->packet;
                if (!sctp_packet_empty(packet)) {
+                       rcu_read_lock();
+                       if (t->dst && __sk_dst_get(sk) != t->dst) {
+                               dst_hold(t->dst);
+                               sk_setup_caps(sk, t->dst);
+                       }
+                       rcu_read_unlock();
                        error = sctp_packet_transmit(packet, ctx->gfp);
                        if (error < 0)
                                ctx->q->asoc->base.sk->sk_err = -error;
index bd4678d..6dff643 100644 (file)
@@ -1825,11 +1825,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
        struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-       struct rpc_gss_wire_cred *gc = &gsd->clcred;
+       struct rpc_gss_wire_cred *gc;
        struct xdr_buf *resbuf = &rqstp->rq_res;
        int stat = -EINVAL;
        struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+       if (!gsd)
+               goto out;
+       gc = &gsd->clcred;
        if (gc->gc_proc != RPC_GSS_PROC_DATA)
                goto out;
        /* Release can be called twice, but we only wrap once. */
@@ -1870,10 +1873,10 @@ out_err:
        if (rqstp->rq_cred.cr_group_info)
                put_group_info(rqstp->rq_cred.cr_group_info);
        rqstp->rq_cred.cr_group_info = NULL;
-       if (gsd->rsci)
+       if (gsd && gsd->rsci) {
                cache_put(&gsd->rsci->h, sn->rsc_cache);
-       gsd->rsci = NULL;
-
+               gsd->rsci = NULL;
+       }
        return stat;
 }
 
index 61fb8a1..d76dc9d 100644 (file)
@@ -1413,7 +1413,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 
  sendit:
        if (svc_authorise(rqstp))
-               goto close;
+               goto close_xprt;
        return 1;               /* Caller can now send it */
 
 release_dropit:
@@ -1425,6 +1425,8 @@ release_dropit:
        return 0;
 
  close:
+       svc_authorise(rqstp);
+close_xprt:
        if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
                svc_close_xprt(rqstp->rq_xprt);
        dprintk("svc: svc_process close\n");
@@ -1433,7 +1435,7 @@ release_dropit:
 err_short_len:
        svc_printk(rqstp, "short len %zd, dropping request\n",
                        argv->iov_len);
-       goto close;
+       goto close_xprt;
 
 err_bad_rpc:
        serv->sv_stats->rpcbadfmt++;
index dcc50ae..3cdd71a 100644 (file)
@@ -1060,7 +1060,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st
        struct svc_xprt *xprt;
        int ret = 0;
 
-       spin_lock(&serv->sv_lock);
+       spin_lock_bh(&serv->sv_lock);
        list_for_each_entry(xprt, xprt_list, xpt_list) {
                if (xprt->xpt_net != net)
                        continue;
@@ -1068,7 +1068,7 @@ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, st
                set_bit(XPT_CLOSE, &xprt->xpt_flags);
                svc_xprt_enqueue(xprt);
        }
-       spin_unlock(&serv->sv_lock);
+       spin_unlock_bh(&serv->sv_lock);
        return ret;
 }
 
index 4a1edbb..9150df3 100644 (file)
@@ -252,9 +252,9 @@ xprt_setup_rdma_bc(struct xprt_create *args)
        xprt->timeout = &xprt_rdma_bc_timeout;
        xprt_set_bound(xprt);
        xprt_set_connected(xprt);
-       xprt->bind_timeout = RPCRDMA_BIND_TO;
-       xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
-       xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+       xprt->bind_timeout = 0;
+       xprt->reestablish_timeout = 0;
+       xprt->idle_timeout = 0;
 
        xprt->prot = XPRT_TRANSPORT_BC_RDMA;
        xprt->ops = &xprt_rdma_bc_procs;
index 6d28f23..7d34290 100644 (file)
@@ -266,46 +266,33 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp)
                svc_rdma_recv_ctxt_put(rdma, ctxt);
 }
 
-static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
-                                  unsigned int wanted, bool temp)
+static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
+                               struct svc_rdma_recv_ctxt *ctxt)
 {
-       const struct ib_recv_wr *bad_wr = NULL;
-       struct svc_rdma_recv_ctxt *ctxt;
-       struct ib_recv_wr *recv_chain;
        int ret;
 
-       recv_chain = NULL;
-       while (wanted--) {
-               ctxt = svc_rdma_recv_ctxt_get(rdma);
-               if (!ctxt)
-                       break;
-
-               trace_svcrdma_post_recv(ctxt);
-               ctxt->rc_temp = temp;
-               ctxt->rc_recv_wr.next = recv_chain;
-               recv_chain = &ctxt->rc_recv_wr;
-               rdma->sc_pending_recvs++;
-       }
-       if (!recv_chain)
-               return false;
-
-       ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
+       trace_svcrdma_post_recv(ctxt);
+       ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
        if (ret)
                goto err_post;
-       return true;
+       return 0;
 
 err_post:
-       while (bad_wr) {
-               ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt,
-                                   rc_recv_wr);
-               bad_wr = bad_wr->next;
-               svc_rdma_recv_ctxt_put(rdma, ctxt);
-       }
-
        trace_svcrdma_rq_post_err(rdma, ret);
-       /* Since we're destroying the xprt, no need to reset
-        * sc_pending_recvs. */
-       return false;
+       svc_rdma_recv_ctxt_put(rdma, ctxt);
+       return ret;
+}
+
+static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
+{
+       struct svc_rdma_recv_ctxt *ctxt;
+
+       if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+               return 0;
+       ctxt = svc_rdma_recv_ctxt_get(rdma);
+       if (!ctxt)
+               return -ENOMEM;
+       return __svc_rdma_post_recv(rdma, ctxt);
 }
 
 /**
@@ -316,7 +303,20 @@ err_post:
  */
 bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
 {
-       return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+       struct svc_rdma_recv_ctxt *ctxt;
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < rdma->sc_max_requests; i++) {
+               ctxt = svc_rdma_recv_ctxt_get(rdma);
+               if (!ctxt)
+                       return false;
+               ctxt->rc_temp = true;
+               ret = __svc_rdma_post_recv(rdma, ctxt);
+               if (ret)
+                       return false;
+       }
+       return true;
 }
 
 /**
@@ -324,6 +324,8 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
  * @cq: Completion Queue context
  * @wc: Work Completion object
  *
+ * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
+ * the Receive completion handler could be running.
  */
 static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
@@ -331,8 +333,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        struct ib_cqe *cqe = wc->wr_cqe;
        struct svc_rdma_recv_ctxt *ctxt;
 
-       rdma->sc_pending_recvs--;
-
        /* WARNING: Only wc->wr_cqe and wc->status are reliable */
        ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
 
@@ -340,6 +340,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        if (wc->status != IB_WC_SUCCESS)
                goto flushed;
 
+       if (svc_rdma_post_recv(rdma))
+               goto post_err;
+
        /* All wc fields are now known to be valid */
        ctxt->rc_byte_len = wc->byte_len;
 
@@ -350,18 +353,11 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        spin_unlock(&rdma->sc_rq_dto_lock);
        if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
                svc_xprt_enqueue(&rdma->sc_xprt);
-
-       if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) &&
-           rdma->sc_pending_recvs < rdma->sc_max_requests)
-               if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH,
-                                           false))
-                       goto post_err;
-
        return;
 
 flushed:
-       svc_rdma_recv_ctxt_put(rdma, ctxt);
 post_err:
+       svc_rdma_recv_ctxt_put(rdma, ctxt);
        set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
        svc_xprt_enqueue(&rdma->sc_xprt);
 }
index 008670d..136338b 100644 (file)
@@ -2895,17 +2895,22 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
 
 #ifdef CONFIG_TIPC_CRYPTO
 static int tipc_nl_retrieve_key(struct nlattr **attrs,
-                               struct tipc_aead_key **key)
+                               struct tipc_aead_key **pkey)
 {
        struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
+       struct tipc_aead_key *key;
 
        if (!attr)
                return -ENODATA;
 
-       *key = (struct tipc_aead_key *)nla_data(attr);
-       if (nla_len(attr) < tipc_aead_key_size(*key))
+       if (nla_len(attr) < sizeof(*key))
+               return -EINVAL;
+       key = (struct tipc_aead_key *)nla_data(attr);
+       if (key->keylen > TIPC_AEAD_KEYLEN_MAX ||
+           nla_len(attr) < tipc_aead_key_size(key))
                return -EINVAL;
 
+       *pkey = key;
        return 0;
 }
 
index 5546710..bc7fb9b 100644 (file)
@@ -755,6 +755,7 @@ static struct sock *__vsock_create(struct net *net,
                vsk->buffer_size = psk->buffer_size;
                vsk->buffer_min_size = psk->buffer_min_size;
                vsk->buffer_max_size = psk->buffer_max_size;
+               security_sk_clone(parent, sk);
        } else {
                vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
                vsk->owner = get_current_cred();
index 521d36b..034af85 100644 (file)
@@ -70,7 +70,7 @@ __cfg80211_wdev_from_attrs(struct cfg80211_registered_device *rdev,
        struct wireless_dev *result = NULL;
        bool have_ifidx = attrs[NL80211_ATTR_IFINDEX];
        bool have_wdev_id = attrs[NL80211_ATTR_WDEV];
-       u64 wdev_id;
+       u64 wdev_id = 0;
        int wiphy_idx = -1;
        int ifidx = -1;
 
@@ -14789,6 +14789,7 @@ bad_tid_conf:
 #define NL80211_FLAG_NEED_WDEV_UP      (NL80211_FLAG_NEED_WDEV |\
                                         NL80211_FLAG_CHECK_NETDEV_UP)
 #define NL80211_FLAG_CLEAR_SKB         0x20
+#define NL80211_FLAG_NO_WIPHY_MTX      0x40
 
 static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
                            struct genl_info *info)
@@ -14840,7 +14841,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
                info->user_ptr[0] = rdev;
        }
 
-       if (rdev) {
+       if (rdev && !(ops->internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) {
                wiphy_lock(&rdev->wiphy);
                /* we keep the mutex locked until post_doit */
                __release(&rdev->wiphy.mtx);
@@ -14865,7 +14866,8 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
                }
        }
 
-       if (info->user_ptr[0]) {
+       if (info->user_ptr[0] &&
+           !(ops->internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) {
                struct cfg80211_registered_device *rdev = info->user_ptr[0];
 
                /* we kept the mutex locked since pre_doit */
@@ -15329,7 +15331,9 @@ static const struct genl_small_ops nl80211_small_ops[] = {
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_wiphy_netns,
                .flags = GENL_UNS_ADMIN_PERM,
-               .internal_flags = NL80211_FLAG_NEED_WIPHY,
+               .internal_flags = NL80211_FLAG_NEED_WIPHY |
+                                 NL80211_FLAG_NEED_RTNL |
+                                 NL80211_FLAG_NO_WIPHY_MTX,
        },
        {
                .cmd = NL80211_CMD_GET_SURVEY,
index 168cd27..2c52535 100644 (file)
@@ -20,6 +20,7 @@ SECTIONS {
 
        __patchable_function_entries : { *(__patchable_function_entries) }
 
+#ifdef CONFIG_LTO_CLANG
        /*
         * With CONFIG_LTO_CLANG, LLD always enables -fdata-sections and
         * -ffunction-sections, which increases the size of the final module.
@@ -41,6 +42,7 @@ SECTIONS {
        }
 
        .text : { *(.text .text.[0-9a-zA-Z_]*) }
+#endif
 }
 
 /* bring in arch-specific sections */
index 1d20003..0ba0184 100644 (file)
@@ -98,6 +98,14 @@ struct integrity_iint_cache *integrity_inode_get(struct inode *inode)
        struct rb_node *node, *parent = NULL;
        struct integrity_iint_cache *iint, *test_iint;
 
+       /*
+        * The integrity's "iint_cache" is initialized at security_init(),
+        * unless it is not included in the ordered list of LSMs enabled
+        * on the boot command line.
+        */
+       if (!iint_cache)
+               panic("%s: lsm=integrity required.\n", __func__);
+
        iint = integrity_iint_find(inode);
        if (iint)
                return iint;
index 6fe2530..7650de0 100644 (file)
@@ -219,14 +219,21 @@ static inline bool selinux_policycap_genfs_seclabel_symlinks(void)
        return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]);
 }
 
+struct selinux_policy_convert_data;
+
+struct selinux_load_state {
+       struct selinux_policy *policy;
+       struct selinux_policy_convert_data *convert_data;
+};
+
 int security_mls_enabled(struct selinux_state *state);
 int security_load_policy(struct selinux_state *state,
-                       void *data, size_t len,
-                       struct selinux_policy **newpolicyp);
+                        void *data, size_t len,
+                        struct selinux_load_state *load_state);
 void selinux_policy_commit(struct selinux_state *state,
-                       struct selinux_policy *newpolicy);
+                          struct selinux_load_state *load_state);
 void selinux_policy_cancel(struct selinux_state *state,
-                       struct selinux_policy *policy);
+                          struct selinux_load_state *load_state);
 int security_read_policy(struct selinux_state *state,
                         void **data, size_t *len);
 int security_read_state_kernel(struct selinux_state *state,
index 01a7d50..fff6bab 100644 (file)
@@ -563,17 +563,13 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
 
        ret = sel_make_bools(newpolicy, tmp_bool_dir, &tmp_bool_num,
                             &tmp_bool_names, &tmp_bool_values);
-       if (ret) {
-               pr_err("SELinux: failed to load policy booleans\n");
+       if (ret)
                goto out;
-       }
 
        ret = sel_make_classes(newpolicy, tmp_class_dir,
                               &fsi->last_class_ino);
-       if (ret) {
-               pr_err("SELinux: failed to load policy classes\n");
+       if (ret)
                goto out;
-       }
 
        /* booleans */
        old_dentry = fsi->bool_dir;
@@ -616,7 +612,7 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf,
 
 {
        struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info;
-       struct selinux_policy *newpolicy;
+       struct selinux_load_state load_state;
        ssize_t length;
        void *data = NULL;
 
@@ -642,23 +638,23 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf,
        if (copy_from_user(data, buf, count) != 0)
                goto out;
 
-       length = security_load_policy(fsi->state, data, count, &newpolicy);
+       length = security_load_policy(fsi->state, data, count, &load_state);
        if (length) {
                pr_warn_ratelimited("SELinux: failed to load policy\n");
                goto out;
        }
 
-       length = sel_make_policy_nodes(fsi, newpolicy);
+       length = sel_make_policy_nodes(fsi, load_state.policy);
        if (length) {
-               selinux_policy_cancel(fsi->state, newpolicy);
-               goto out1;
+               pr_warn_ratelimited("SELinux: failed to initialize selinuxfs\n");
+               selinux_policy_cancel(fsi->state, &load_state);
+               goto out;
        }
 
-       selinux_policy_commit(fsi->state, newpolicy);
+       selinux_policy_commit(fsi->state, &load_state);
 
        length = count;
 
-out1:
        audit_log(audit_context(), GFP_KERNEL, AUDIT_MAC_POLICY_LOAD,
                "auid=%u ses=%u lsm=selinux res=1",
                from_kuid(&init_user_ns, audit_get_loginuid(current)),
index 3438d01..d91e41d 100644 (file)
 #include "policycap_names.h"
 #include "ima.h"
 
+struct convert_context_args {
+       struct selinux_state *state;
+       struct policydb *oldp;
+       struct policydb *newp;
+};
+
+struct selinux_policy_convert_data {
+       struct convert_context_args args;
+       struct sidtab_convert_params sidtab_params;
+};
+
 /* Forward declaration. */
 static int context_struct_to_string(struct policydb *policydb,
                                    struct context *context,
@@ -1974,12 +1985,6 @@ static inline int convert_context_handle_invalid_context(
        return 0;
 }
 
-struct convert_context_args {
-       struct selinux_state *state;
-       struct policydb *oldp;
-       struct policydb *newp;
-};
-
 /*
  * Convert the values in the security context
  * structure `oldc' from the values specified
@@ -2159,7 +2164,7 @@ static void selinux_policy_cond_free(struct selinux_policy *policy)
 }
 
 void selinux_policy_cancel(struct selinux_state *state,
-                       struct selinux_policy *policy)
+                          struct selinux_load_state *load_state)
 {
        struct selinux_policy *oldpolicy;
 
@@ -2167,7 +2172,8 @@ void selinux_policy_cancel(struct selinux_state *state,
                                        lockdep_is_held(&state->policy_mutex));
 
        sidtab_cancel_convert(oldpolicy->sidtab);
-       selinux_policy_free(policy);
+       selinux_policy_free(load_state->policy);
+       kfree(load_state->convert_data);
 }
 
 static void selinux_notify_policy_change(struct selinux_state *state,
@@ -2183,9 +2189,9 @@ static void selinux_notify_policy_change(struct selinux_state *state,
 }
 
 void selinux_policy_commit(struct selinux_state *state,
-                       struct selinux_policy *newpolicy)
+                          struct selinux_load_state *load_state)
 {
-       struct selinux_policy *oldpolicy;
+       struct selinux_policy *oldpolicy, *newpolicy = load_state->policy;
        u32 seqno;
 
        oldpolicy = rcu_dereference_protected(state->policy,
@@ -2225,6 +2231,7 @@ void selinux_policy_commit(struct selinux_state *state,
        /* Free the old policy */
        synchronize_rcu();
        selinux_policy_free(oldpolicy);
+       kfree(load_state->convert_data);
 
        /* Notify others of the policy change */
        selinux_notify_policy_change(state, seqno);
@@ -2241,11 +2248,10 @@ void selinux_policy_commit(struct selinux_state *state,
  * loading the new policy.
  */
 int security_load_policy(struct selinux_state *state, void *data, size_t len,
-                       struct selinux_policy **newpolicyp)
+                        struct selinux_load_state *load_state)
 {
        struct selinux_policy *newpolicy, *oldpolicy;
-       struct sidtab_convert_params convert_params;
-       struct convert_context_args args;
+       struct selinux_policy_convert_data *convert_data;
        int rc = 0;
        struct policy_file file = { data, len }, *fp = &file;
 
@@ -2275,10 +2281,10 @@ int security_load_policy(struct selinux_state *state, void *data, size_t len,
                goto err_mapping;
        }
 
-
        if (!selinux_initialized(state)) {
                /* First policy load, so no need to preserve state from old policy */
-               *newpolicyp = newpolicy;
+               load_state->policy = newpolicy;
+               load_state->convert_data = NULL;
                return 0;
        }
 
@@ -2292,29 +2298,38 @@ int security_load_policy(struct selinux_state *state, void *data, size_t len,
                goto err_free_isids;
        }
 
+       convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL);
+       if (!convert_data) {
+               rc = -ENOMEM;
+               goto err_free_isids;
+       }
+
        /*
         * Convert the internal representations of contexts
         * in the new SID table.
         */
-       args.state = state;
-       args.oldp = &oldpolicy->policydb;
-       args.newp = &newpolicy->policydb;
+       convert_data->args.state = state;
+       convert_data->args.oldp = &oldpolicy->policydb;
+       convert_data->args.newp = &newpolicy->policydb;
 
-       convert_params.func = convert_context;
-       convert_params.args = &args;
-       convert_params.target = newpolicy->sidtab;
+       convert_data->sidtab_params.func = convert_context;
+       convert_data->sidtab_params.args = &convert_data->args;
+       convert_data->sidtab_params.target = newpolicy->sidtab;
 
-       rc = sidtab_convert(oldpolicy->sidtab, &convert_params);
+       rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params);
        if (rc) {
                pr_err("SELinux:  unable to convert the internal"
                        " representation of contexts in the new SID"
                        " table\n");
-               goto err_free_isids;
+               goto err_free_convert_data;
        }
 
-       *newpolicyp = newpolicy;
+       load_state->policy = newpolicy;
+       load_state->convert_data = convert_data;
        return 0;
 
+err_free_convert_data:
+       kfree(convert_data);
 err_free_isids:
        sidtab_destroy(newpolicy->sidtab);
 err_mapping:
index 478f757..8dc6133 100644 (file)
@@ -613,7 +613,7 @@ static int tomoyo_check_unix_address(struct sockaddr *addr,
 static bool tomoyo_kernel_service(void)
 {
        /* Nothing to do if I am a kernel service. */
-       return (current->flags & (PF_KTHREAD | PF_IO_WORKER)) == PF_KTHREAD;
+       return current->flags & PF_KTHREAD;
 }
 
 /**
index 8a24e5a..5263718 100644 (file)
@@ -33,7 +33,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("A loopback soundcard");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA,Loopback soundcard}}");
 
 #define MAX_PCM_SUBSTREAMS     8
 
index 316c9af..01a3eab 100644 (file)
@@ -25,7 +25,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Dummy soundcard (/dev/null)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA,Dummy soundcard}}");
 
 #define MAX_PCM_DEVICES                4
 #define MAX_PCM_SUBSTREAMS     128
index ce5fd17..df4b7f9 100644 (file)
@@ -53,7 +53,6 @@
 MODULE_AUTHOR("Michael T. Mayers");
 MODULE_DESCRIPTION("MOTU MidiTimePiece AV multiport MIDI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{MOTU,MidiTimePiece AV multiport MIDI}}");
 
 // io resources
 #define MTPAV_IOBASE           0x378
index 9c708b6..322d530 100644 (file)
@@ -37,7 +37,6 @@ MODULE_PARM_DESC(enable, "Enable " CARD_NAME " soundcard.");
 MODULE_AUTHOR("Matthias Koenig <mk@phasorlab.de>");
 MODULE_DESCRIPTION("ESI Miditerminal 4140");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESI,Miditerminal 4140}}");
 
 /*********************************************************************
  * Chip specific
index fd79e57..7689fa2 100644 (file)
@@ -22,7 +22,6 @@
 MODULE_AUTHOR("Stas Sergeev <stsp@users.sourceforge.net>");
 MODULE_DESCRIPTION("PC-Speaker driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{PC-Speaker, pcsp}}");
 MODULE_ALIAS("platform:pcspkr");
 
 static int index = SNDRV_DEFAULT_IDX1; /* Index 0-MAX */
index c876cf9..2f4514e 100644 (file)
@@ -57,7 +57,6 @@ MODULE_PARM_DESC(enable, "Enable " CARD_NAME " soundcard.");
 MODULE_AUTHOR("Levent Guendogdu, Tobias Gehrig, Matthias Koenig");
 MODULE_DESCRIPTION("Midiman Portman2x4");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Midiman,Portman2x4}}");
 
 /*********************************************************************
  * Chip specific
index 3947f08..6d5d1ca 100644 (file)
@@ -34,7 +34,6 @@
 
 MODULE_DESCRIPTION("MIDI serial u16550");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA, MIDI serial u16550}}");
 
 #define SNDRV_SERIAL_SOUNDCANVAS 0 /* Roland Soundcanvas; F5 NN selects part */
 #define SNDRV_SERIAL_MS124T 1      /* Midiator MS-124T */
index f1fb68b..4206d93 100644 (file)
@@ -43,7 +43,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Dummy soundcard for virtual rawmidi devices");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA,Virtual rawmidi device}}");
 
 #define MAX_MIDI_DEVICES       4
 
index 8e0c038..1a14c08 100644 (file)
@@ -493,11 +493,10 @@ void snd_dice_stream_stop_duplex(struct snd_dice *dice)
        struct reg_params tx_params, rx_params;
 
        if (dice->substreams_counter == 0) {
-               if (get_register_params(dice, &tx_params, &rx_params) >= 0) {
-                       amdtp_domain_stop(&dice->domain);
+               if (get_register_params(dice, &tx_params, &rx_params) >= 0)
                        finish_session(dice, &tx_params, &rx_params);
-               }
 
+               amdtp_domain_stop(&dice->domain);
                release_resources(dice);
        }
 }
index ca18fe3..f11af98 100644 (file)
 MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_DESCRIPTION("AD1816A, AD1815");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Highscreen,Sound-Boostar 16 3D},"
-               "{Analog Devices,AD1815},"
-               "{Analog Devices,AD1816A},"
-               "{TerraTec,Base 64},"
-               "{TerraTec,AudioSystem EWS64S},"
-               "{Aztech/Newcom SC-16 3D},"
-               "{Shark Predator ISA}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 1-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 6f221ee..edafb49 100644 (file)
@@ -22,9 +22,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Tugrul Galatali <galatalt@stuy.edu>, Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Analog Devices,AD1848},"
-               "{Analog Devices,AD1847},"
-               "{Crystal Semiconductors,CS4248}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 1085f5b..bacb7a1 100644 (file)
 #define PFX "als100: "
 
 MODULE_DESCRIPTION("Avance Logic ALS007/ALS1X0");
-MODULE_SUPPORTED_DEVICE("{{Diamond Technologies DT-019X},"
-               "{Avance Logic ALS-007}}"
-               "{{Avance Logic,ALS100 - PRO16PNP},"
-               "{Avance Logic,ALS110},"
-               "{Avance Logic,ALS120},"
-               "{Avance Logic,ALS200},"
-               "{3D Melody,MF1000},"
-               "{Digimate,3D Sound},"
-               "{Avance Logic,ALS120},"
-               "{RTL,RTL3000}}");
-
 MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_LICENSE("GPL");
 
index 4ed5209..867e9ae 100644 (file)
 MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_DESCRIPTION("Aztech Systems AZT2320");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Aztech Systems,PRO16V},"
-               "{Aztech Systems,AZT2320},"
-               "{Aztech Systems,AZT3300},"
-               "{Aztech Systems,AZT2320},"
-               "{Aztech Systems,AZT3000}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 19e2585..bc112df 100644 (file)
@@ -51,7 +51,6 @@
 MODULE_AUTHOR("George Talusan <gstalusan@uwaterloo.ca>");
 MODULE_DESCRIPTION("C-Media CMI8330/CMI8329");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{C-Media,CMI8330,isapnp:{CMI0001,@@@0001,@X@0001}}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
index c56cbc0..ec054b9 100644 (file)
@@ -23,7 +23,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Crystal Semiconductors,CS4231}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 63fb0cb..186d7d4 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cirrus Logic CS4232-9");
-MODULE_SUPPORTED_DEVICE("{{Turtle Beach,TBS-2000},"
-               "{Turtle Beach,Tropez Plus},"
-               "{SIC CrystalWave 32},"
-               "{Hewlett Packard,Omnibook 5500},"
-               "{TerraTec,Maestro 32/96},"
-               "{Philips,PCA70PS}},"
-               "{{Crystal Semiconductors,CS4235},"
-               "{Crystal Semiconductors,CS4236},"
-               "{Crystal Semiconductors,CS4237},"
-               "{Crystal Semiconductors,CS4238},"
-               "{Crystal Semiconductors,CS4239},"
-               "{Acer,AW37},"
-               "{Acer,AW35/Pro},"
-               "{Crystal,3D},"
-               "{Crystal Computer,TidalWave128},"
-               "{Dell,Optiplex GX1},"
-               "{Dell,Workstation 400 sound},"
-               "{EliteGroup,P5TX-LA sound},"
-               "{Gallant,SC-70P},"
-               "{Gateway,E1000 Onboard CS4236B},"
-               "{Genius,Sound Maker 3DJ},"
-               "{Hewlett Packard,HP6330 sound},"
-               "{IBM,PC 300PL sound},"
-               "{IBM,Aptiva 2137 E24},"
-               "{IBM,IntelliStation M Pro},"
-               "{Intel,Marlin Spike Mobo CS4235},"
-               "{Intel PR440FX Onboard},"
-               "{Guillemot,MaxiSound 16 PnP},"
-               "{NewClear,3D},"
-               "{TerraTec,AudioSystem EWS64L/XL},"
-               "{Typhoon Soundsystem,CS4236B},"
-               "{Turtle Beach,Malibu},"
-               "{Unknown,Digital PC 5000 Onboard}}");
-
 MODULE_ALIAS("snd_cs4232");
 
 #define IDENT "CS4232+"
index 4a1f61f..750d499 100644 (file)
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,ES688 PnP AudioDrive,pnp:ESS0100},"
-               "{ESS,ES1688 PnP AudioDrive,pnp:ESS0102},"
-               "{ESS,ES688 AudioDrive,pnp:ESS6881},"
-               "{ESS,ES1688 AudioDrive,pnp:ESS1681}}");
-
 MODULE_ALIAS("snd_es968");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
index 9beef80..375a4a6 100644 (file)
@@ -1929,17 +1929,9 @@ static int snd_es18xx_mixer(struct snd_card *card)
 
 /* Card level */
 
-MODULE_AUTHOR("Christian Fischbach <fishbach@pool.informatik.rwth-aachen.de>, Abramo Bagnara <abramo@alsa-project.org>");  
+MODULE_AUTHOR("Christian Fischbach <fishbach@pool.informatik.rwth-aachen.de>, Abramo Bagnara <abramo@alsa-project.org>");
 MODULE_DESCRIPTION("ESS ES18xx AudioDrive");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,ES1868 PnP AudioDrive},"
-               "{ESS,ES1869 PnP AudioDrive},"
-               "{ESS,ES1878 PnP AudioDrive},"
-               "{ESS,ES1879 PnP AudioDrive},"
-               "{ESS,ES1887 PnP AudioDrive},"
-               "{ESS,ES1888 PnP AudioDrive},"
-               "{ESS,ES1887 AudioDrive},"
-               "{ESS,ES1888 AudioDrive}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 015f88a..0fba5d8 100644 (file)
@@ -23,7 +23,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound Classic}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index c9f31b4..da2b2ca 100644 (file)
@@ -27,7 +27,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound Extreme}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index dc09fbd..24b945f 100644 (file)
@@ -21,7 +21,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Gravis UltraSound MAX");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound MAX}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index e4d412e..99581fb 100644 (file)
@@ -28,14 +28,8 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
 #ifndef SNDRV_STB
 MODULE_DESCRIPTION("AMD InterWave");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound Plug & Play},"
-               "{STB,SoundRage32},"
-               "{MED,MED3210},"
-               "{Dynasonix,Dynasonix Pro},"
-               "{Panasonic,PCA761AW}}");
 #else
 MODULE_DESCRIPTION("AMD InterWave STB with TEA6330T");
-MODULE_SUPPORTED_DEVICE("{{AMD,InterWave STB with TEA6330T}}");
 #endif
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
index 7649a8a..9bde11d 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Yamaha OPL3SA2+");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Yamaha,YMF719E-S},"
-               "{Genius,Sound Maker 3DX},"
-               "{Yamaha,OPL3SA3},"
-               "{Intel,AL440LX sound},"
-               "{NeoMagic,MagicWave 3DX}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 2093334..a510b20 100644 (file)
@@ -33,9 +33,6 @@
 MODULE_AUTHOR("Martin Langer <martin-langer@gmx.de>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Miro miroSOUND PCM1 pro, PCM12, PCM20 Radio");
-MODULE_SUPPORTED_DEVICE("{{Miro,miroSOUND PCM1 pro}, "
-                       "{Miro,miroSOUND PCM12}, "
-                       "{Miro,miroSOUND PCM20 Radio}}");
 
 static int index = SNDRV_DEFAULT_IDX1;         /* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;          /* ID for this card */
index 758f5b5..08e61d9 100644 (file)
@@ -36,17 +36,11 @@ MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_LICENSE("GPL");
 #ifdef OPTi93X
 MODULE_DESCRIPTION("OPTi93X");
-MODULE_SUPPORTED_DEVICE("{{OPTi,82C931/3}}");
 #else  /* OPTi93X */
 #ifdef CS4231
 MODULE_DESCRIPTION("OPTi92X - CS4231");
-MODULE_SUPPORTED_DEVICE("{{OPTi,82C924 (CS4231)},"
-               "{OPTi,82C925 (CS4231)}}");
 #else  /* CS4231 */
 MODULE_DESCRIPTION("OPTi92X - AD1848");
-MODULE_SUPPORTED_DEVICE("{{OPTi,82C924 (AD1848)},"
-               "{OPTi,82C925 (AD1848)},"
-               "{OAK,Mozart}}");
 #endif /* CS4231 */
 #endif /* OPTi93X */
 
index 0e2e0ab..7ba5dd1 100644 (file)
@@ -28,9 +28,6 @@
 #define PFX "jazz16: "
 
 MODULE_DESCRIPTION("Media Vision Jazz16");
-MODULE_SUPPORTED_DEVICE("{{Media Vision ??? },"
-               "{RTL,RTL3000}}");
-
 MODULE_AUTHOR("Krzysztof Helt <krzysztof.h1@wp.pl>");
 MODULE_LICENSE("GPL");
 
index db284b7..63ef960 100644 (file)
@@ -31,16 +31,8 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
 #ifndef SNDRV_SBAWE
 MODULE_DESCRIPTION("Sound Blaster 16");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB 16},"
-               "{Creative Labs,SB Vibra16S},"
-               "{Creative Labs,SB Vibra16C},"
-               "{Creative Labs,SB Vibra16CL},"
-               "{Creative Labs,SB Vibra16X}}");
 #else
 MODULE_DESCRIPTION("Sound Blaster AWE");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB AWE 32},"
-               "{Creative Labs,SB AWE 64},"
-               "{Creative Labs,SB AWE 64 Gold}}");
 #endif
 
 #if 0
index 8e3e67b..6c9d534 100644 (file)
@@ -17,7 +17,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Sound Blaster 1.0/2.0/Pro");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB 1.0/SB 2.0/SB Pro}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index def1375..3462663 100644 (file)
@@ -29,9 +29,6 @@
 MODULE_AUTHOR("Krzysztof Helt");
 MODULE_DESCRIPTION("Gallant SC-6000");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gallant, SC-6000},"
-                       "{AudioExcel, Audio Excel DSP 16},"
-                       "{Zoltrix, AV302}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index b750a4f..a443797 100644 (file)
@@ -21,7 +21,6 @@
 MODULE_AUTHOR("Paul Barton-Davis <pbd@op.net>");
 MODULE_DESCRIPTION("Turtle Beach Wavefront");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Turtle Beach,Maui/Tropez/Tropez+}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;         /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;          /* ID for this card */
index 5bf1ea1..989f656 100644 (file)
@@ -32,7 +32,6 @@
 MODULE_AUTHOR("Vivien Chappelier <vivien.chappelier@linux-mips.org>");
 MODULE_DESCRIPTION("SGI O2 Audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Silicon Graphics, O2 Audio}}");
 
 static int index = SNDRV_DEFAULT_IDX1;  /* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;   /* ID for this card */
index 5d835d2..4520022 100644 (file)
@@ -43,7 +43,6 @@
 MODULE_AUTHOR("Kyle McMartin <kyle@parisc-linux.org>, Thibaut Varene <t-bone@parisc-linux.org>");
 MODULE_DESCRIPTION("Analog Devices AD1889 ALSA sound driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Analog Devices,AD1889}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 module_param_array(index, int, NULL, 0444);
index 51f2479..0d66b92 100644 (file)
@@ -29,7 +29,6 @@
 MODULE_AUTHOR("Matt Wu <Matt_Wu@acersoftech.com.cn>");
 MODULE_DESCRIPTION("ALI M5451");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALI,M5451,pci},{ALI,M5451}}");
 
 static int index = SNDRV_DEFAULT_IDX1; /* Index */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index 1dc8c4e..bd4fd09 100644 (file)
@@ -86,7 +86,6 @@ enum {DEVICE_ALS300, DEVICE_ALS300_PLUS};
 MODULE_AUTHOR("Ash Willis <ashwillis@programmer.net>");
 MODULE_DESCRIPTION("Avance Logic ALS300");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Avance Logic,ALS300},{Avance Logic,ALS300+}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
index 2edc745..139ac2a 100644 (file)
@@ -68,7 +68,6 @@
 MODULE_AUTHOR("Bart Hartgers <bart@etpmod.phys.tue.nl>, Andreas Mohr");
 MODULE_DESCRIPTION("Avance Logic ALS4000");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Avance Logic,ALS4000}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
index a25d754..579425c 100644 (file)
@@ -23,7 +23,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("ATI IXP AC97 controller");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ATI,IXP150/200/250/300/400/600}}");
 
 static int index = SNDRV_DEFAULT_IDX1; /* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index ae88217..45e75af 100644 (file)
@@ -23,7 +23,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("ATI IXP MC97 controller");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ATI,IXP150/200/250}}");
 
 static int index = -2; /* Exclude the first card */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index 5dd98e6..1b37b72 100644 (file)
@@ -41,8 +41,6 @@ MODULE_PARM_DESC(pcifix, "Enable VIA-workaround for " CARD_NAME " soundcard.");
 
 MODULE_DESCRIPTION("Aureal vortex");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Aureal Semiconductor Inc., Aureal Vortex Sound Processor}}");
-
 MODULE_DEVICE_TABLE(pci, snd_vortex_ids);
 
 static void vortex_fix_latency(struct pci_dev *vortex)
index 2ac594d..51dcf1b 100644 (file)
 MODULE_AUTHOR("Andreas Mohr <andi AT lisas.de>");
 MODULE_DESCRIPTION("Aztech AZF3328 (PCI168)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Aztech,AZF3328}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_GAMEPORT 1
index cf9f8d8..91512b3 100644 (file)
@@ -23,8 +23,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("Brooktree Bt87x audio driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Brooktree,Bt878},"
-               "{Brooktree,Bt879}}");
 
 static int index[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = -2}; /* Exclude the first card */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index ee20f9a..bee4710 100644 (file)
 MODULE_AUTHOR("James Courtier-Dutton <James@superbug.demon.co.uk>");
 MODULE_DESCRIPTION("CA0106");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Creative,SB CA0106 chip}}");
 
 // module parameters (see "Module Parameters")
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
index 7363d61..5984463 100644 (file)
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("C-Media CMI8x38 PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{C-Media,CMI8738},"
-               "{C-Media,CMI8738B},"
-               "{C-Media,CMI8338A},"
-               "{C-Media,CMI8338B}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
index 94d2a6a..bf3bb70 100644 (file)
@@ -25,7 +25,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Cirrus Logic CS4281");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Cirrus Logic,CS4281}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index a6e0a44..1db7b41 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Cirrus Logic Sound Fusion CS46XX");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Cirrus Logic,Sound Fusion (CS4280)},"
-               "{Cirrus Logic,Sound Fusion (CS4610)},"
-               "{Cirrus Logic,Sound Fusion (CS4612)},"
-               "{Cirrus Logic,Sound Fusion (CS4615)},"
-               "{Cirrus Logic,Sound Fusion (CS4622)},"
-               "{Cirrus Logic,Sound Fusion (CS4624)},"
-               "{Cirrus Logic,Sound Fusion (CS4630)}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 359bc6a..9b716b5 100644 (file)
@@ -393,4 +393,3 @@ module_pci_driver(cs5535audio_driver);
 MODULE_AUTHOR("Jaya Kumar");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("CS5535 Audio");
-MODULE_SUPPORTED_DEVICE("CS5535 Audio");
index 8c07c64..713d36e 100644 (file)
@@ -18,7 +18,6 @@
 MODULE_AUTHOR("Creative Technology Ltd");
 MODULE_DESCRIPTION("X-Fi driver version 1.03");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs, Sound Blaster X-Fi}");
 
 static unsigned int reference_rate = 48000;
 static unsigned int multiple = 2;
index a20b2bb..9bd67ac 100644 (file)
@@ -10,7 +10,6 @@
 MODULE_AUTHOR("Giuliano Pochini <pochini@shiny.it>");
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Echoaudio " ECHOCARD_NAME " soundcards driver");
-MODULE_SUPPORTED_DEVICE("{{Echoaudio," ECHOCARD_NAME "}}");
 MODULE_DEVICE_TABLE(pci, snd_echo_ids);
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
index 353934c..45833bc 100644 (file)
@@ -18,8 +18,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("EMU10K1");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB Live!/PCI512/E-mu APS},"
-              "{Creative Labs,SB Audigy}}");
 
 #if IS_ENABLED(CONFIG_SND_SEQUENCER)
 #define ENABLE_SYNTH
index 785ec0c..d9a12cd 100644 (file)
@@ -31,7 +31,6 @@
 MODULE_AUTHOR("Francisco Moraes <fmoraes@nc.rr.com>");
 MODULE_DESCRIPTION("EMU10K1X");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Dell Creative Labs,SB Live!}");
 
 // module parameters (see "Module Parameters")
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
index 93c4fd3..3ccccdb 100644 (file)
@@ -52,17 +52,9 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, Thomas Sailer <sailer@ife.ee.et
 MODULE_LICENSE("GPL");
 #ifdef CHIP1370
 MODULE_DESCRIPTION("Ensoniq AudioPCI ES1370");
-MODULE_SUPPORTED_DEVICE("{{Ensoniq,AudioPCI-97 ES1370},"
-               "{Creative Labs,SB PCI64/128 (ES1370)}}");
 #endif
 #ifdef CHIP1371
 MODULE_DESCRIPTION("Ensoniq/Creative AudioPCI ES1371+");
-MODULE_SUPPORTED_DEVICE("{{Ensoniq,AudioPCI ES1371/73},"
-               "{Ensoniq,AudioPCI ES1373},"
-               "{Creative Labs,Ectiva EV1938},"
-               "{Creative Labs,SB PCI64/128 (ES1371/73)},"
-               "{Creative Labs,Vibra PCI128},"
-               "{Ectiva,EV1938}}");
 #endif
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
index 3b5d68c..afc6634 100644 (file)
 MODULE_AUTHOR("Jaromir Koutek <miri@punknet.cz>");
 MODULE_DESCRIPTION("ESS Solo-1");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,ES1938},"
-                "{ESS,ES1946},"
-                "{ESS,ES1969},"
-               "{TerraTec,128i PCI}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
index 747fa69..5fa1861 100644 (file)
 
 MODULE_DESCRIPTION("ESS Maestro");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,Maestro 2e},"
-               "{ESS,Maestro 2},"
-               "{ESS,Maestro 1},"
-               "{TerraTec,DMX}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
index c6ad623..6279eb1 100644 (file)
@@ -26,8 +26,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("ForteMedia FM801");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ForteMedia,FM801},"
-               "{Genius,SoundMaker Live 5.1}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 8b7c550..f5cba7a 100644 (file)
@@ -4065,7 +4065,7 @@ static int add_micmute_led_hook(struct hda_codec *codec)
 
        spec->micmute_led.led_mode = MICMUTE_LED_FOLLOW_MUTE;
        spec->micmute_led.capture = 0;
-       spec->micmute_led.led_value = 0;
+       spec->micmute_led.led_value = -1;
        spec->micmute_led.old_hook = spec->cap_sync_hook;
        spec->cap_sync_hook = update_micmute_led;
        if (!snd_hda_gen_add_kctl(spec, NULL, &micmute_led_mode_ctl))
index 5eea130..79ade33 100644 (file)
@@ -208,40 +208,6 @@ MODULE_PARM_DESC(snoop, "Enable/disable snooping");
 
 
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Intel, ICH6},"
-                        "{Intel, ICH6M},"
-                        "{Intel, ICH7},"
-                        "{Intel, ESB2},"
-                        "{Intel, ICH8},"
-                        "{Intel, ICH9},"
-                        "{Intel, ICH10},"
-                        "{Intel, PCH},"
-                        "{Intel, CPT},"
-                        "{Intel, PPT},"
-                        "{Intel, LPT},"
-                        "{Intel, LPT_LP},"
-                        "{Intel, WPT_LP},"
-                        "{Intel, SPT},"
-                        "{Intel, SPT_LP},"
-                        "{Intel, HPT},"
-                        "{Intel, PBG},"
-                        "{Intel, SCH},"
-                        "{ATI, SB450},"
-                        "{ATI, SB600},"
-                        "{ATI, RS600},"
-                        "{ATI, RS690},"
-                        "{ATI, RS780},"
-                        "{ATI, R600},"
-                        "{ATI, RV630},"
-                        "{ATI, RV610},"
-                        "{ATI, RV670},"
-                        "{ATI, RV635},"
-                        "{ATI, RV620},"
-                        "{ATI, RV770},"
-                        "{VIA, VT8251},"
-                        "{VIA, VT8237A},"
-                        "{SiS, SIS966},"
-                        "{ULI, M5461}}");
 MODULE_DESCRIPTION("Intel HDA driver");
 
 #if defined(CONFIG_PM) && defined(CONFIG_VGA_SWITCHEROO)
@@ -1023,8 +989,12 @@ static int azx_prepare(struct device *dev)
        struct snd_card *card = dev_get_drvdata(dev);
        struct azx *chip;
 
+       if (!azx_is_pm_ready(card))
+               return 0;
+
        chip = card->private_data;
        chip->pm_prepared = 1;
+       snd_power_change_state(card, SNDRV_CTL_POWER_D3hot);
 
        flush_work(&azx_bus(chip)->unsol_work);
 
@@ -1039,7 +1009,11 @@ static void azx_complete(struct device *dev)
        struct snd_card *card = dev_get_drvdata(dev);
        struct azx *chip;
 
+       if (!azx_is_pm_ready(card))
+               return;
+
        chip = card->private_data;
+       snd_power_change_state(card, SNDRV_CTL_POWER_D0);
        chip->pm_prepared = 0;
 }
 
index b47504f..58946d0 100644 (file)
@@ -4225,6 +4225,12 @@ static void alc_fixup_hp_gpio_led(struct hda_codec *codec,
        }
 }
 
+static void alc236_fixup_hp_gpio_led(struct hda_codec *codec,
+                               const struct hda_fixup *fix, int action)
+{
+       alc_fixup_hp_gpio_led(codec, action, 0x02, 0x01);
+}
+
 static void alc269_fixup_hp_gpio_led(struct hda_codec *codec,
                                const struct hda_fixup *fix, int action)
 {
@@ -5250,7 +5256,7 @@ static void alc_determine_headset_type(struct hda_codec *codec)
        case 0x10ec0274:
        case 0x10ec0294:
                alc_process_coef_fw(codec, coef0274);
-               msleep(80);
+               msleep(850);
                val = alc_read_coef_idx(codec, 0x46);
                is_ctia = (val & 0x00f0) == 0x00f0;
                break;
@@ -5434,6 +5440,7 @@ static void alc_update_headset_jack_cb(struct hda_codec *codec,
                                       struct hda_jack_callback *jack)
 {
        snd_hda_gen_hp_automute(codec, jack);
+       alc_update_headset_mode(codec);
 }
 
 static void alc_probe_headset_mode(struct hda_codec *codec)
@@ -6381,6 +6388,7 @@ enum {
        ALC294_FIXUP_ASUS_GX502_VERBS,
        ALC285_FIXUP_HP_GPIO_LED,
        ALC285_FIXUP_HP_MUTE_LED,
+       ALC236_FIXUP_HP_GPIO_LED,
        ALC236_FIXUP_HP_MUTE_LED,
        ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET,
        ALC295_FIXUP_ASUS_MIC_NO_PRESENCE,
@@ -7616,6 +7624,10 @@ static const struct hda_fixup alc269_fixups[] = {
                .type = HDA_FIXUP_FUNC,
                .v.func = alc285_fixup_hp_mute_led,
        },
+       [ALC236_FIXUP_HP_GPIO_LED] = {
+               .type = HDA_FIXUP_FUNC,
+               .v.func = alc236_fixup_hp_gpio_led,
+       },
        [ALC236_FIXUP_HP_MUTE_LED] = {
                .type = HDA_FIXUP_FUNC,
                .v.func = alc236_fixup_hp_mute_led,
@@ -8045,9 +8057,13 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x8783, "HP ZBook Fury 15 G7 Mobile Workstation",
                      ALC285_FIXUP_HP_GPIO_AMP_INIT),
        SND_PCI_QUIRK(0x103c, 0x87c8, "HP", ALC287_FIXUP_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x87e5, "HP ProBook 440 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x87f2, "HP ProBook 640 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x87f4, "HP", ALC287_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x87f5, "HP", ALC287_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x87f7, "HP Spectre x360 14", ALC245_FIXUP_HP_X360_AMP),
+       SND_PCI_QUIRK(0x103c, 0x8846, "HP EliteBook 850 G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x884c, "HP EliteBook 840 G8 Notebook PC", ALC285_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC),
        SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300),
        SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
@@ -8242,7 +8258,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1b35, 0x1237, "CZC L101", ALC269_FIXUP_CZC_L101),
        SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */
        SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC),
+       SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC),
+       SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC),
        SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE),
        SND_PCI_QUIRK(0x1c06, 0x2013, "Lemote A1802", ALC269_FIXUP_LEMOTE_A1802),
        SND_PCI_QUIRK(0x1c06, 0x2015, "Lemote A190X", ALC269_FIXUP_LEMOTE_A190X),
index f814dbb..d54cd51 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("ICEnsemble ICE1712 (Envy24)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{"
-              HOONTECH_DEVICE_DESC
-              DELTA_DEVICE_DESC
-              EWS_DEVICE_DESC
-              "{ICEnsemble,Generic ICE1712},"
-              "{ICEnsemble,Generic Envy24}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index c0fca94..ef2367d 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("VIA ICEnsemble ICE1724/1720 (Envy24HT/PT)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{"
-              REVO_DEVICE_DESC
-              AMP_AUDIO2000_DEVICE_DESC
-              AUREON_DEVICE_DESC
-              VT1720_MOBO_DEVICE_DESC
-              PONTIS_DEVICE_DESC
-              PRODIGY192_DEVICE_DESC
-              PRODIGY_HIFI_DEVICE_DESC
-              JULI_DEVICE_DESC
-              MAYA44_DEVICE_DESC
-              PHASE_DEVICE_DESC
-              WTM_DEVICE_DESC
-              SE_DEVICE_DESC
-              QTET_DEVICE_DESC
-               "{VIA,VT1720},"
-               "{VIA,VT1724},"
-               "{ICEnsemble,Generic ICE1724},"
-               "{ICEnsemble,Generic Envy24HT}"
-               "{ICEnsemble,Generic Envy24PT}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 3349e45..35903d1 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Intel 82801AA,82901AB,i810,i820,i830,i840,i845,MX440; SiS 7012; Ali 5455");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Intel,82801AA-ICH},"
-               "{Intel,82901AB-ICH0},"
-               "{Intel,82801BA-ICH2},"
-               "{Intel,82801CA-ICH3},"
-               "{Intel,82801DB-ICH4},"
-               "{Intel,ICH5},"
-               "{Intel,ICH6},"
-               "{Intel,ICH7},"
-               "{Intel,6300ESB},"
-               "{Intel,ESB2},"
-               "{Intel,MX440},"
-               "{SiS,SI7012},"
-               "{NVidia,nForce Audio},"
-               "{NVidia,nForce2 Audio},"
-               "{NVidia,nForce3 Audio},"
-               "{NVidia,MCP04},"
-               "{NVidia,MCP501},"
-               "{NVidia,CK804},"
-               "{NVidia,CK8},"
-               "{NVidia,CK8S},"
-               "{AMD,AMD768},"
-               "{AMD,AMD8111},"
-               "{ALI,M5455}}");
 
 static int index = SNDRV_DEFAULT_IDX1; /* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index 19872ce..13ef838 100644 (file)
@@ -25,21 +25,6 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Intel 82801AA,82901AB,i810,i820,i830,i840,i845,MX440; "
                   "SiS 7013; NVidia MCP/2/2S/3 modems");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Intel,82801AA-ICH},"
-               "{Intel,82901AB-ICH0},"
-               "{Intel,82801BA-ICH2},"
-               "{Intel,82801CA-ICH3},"
-               "{Intel,82801DB-ICH4},"
-               "{Intel,ICH5},"
-               "{Intel,ICH6},"
-               "{Intel,ICH7},"
-               "{Intel,MX440},"
-               "{SiS,7013},"
-               "{NVidia,NForce Modem},"
-               "{NVidia,NForce2 Modem},"
-               "{NVidia,NForce2s Modem},"
-               "{NVidia,NForce3 Modem},"
-               "{AMD,AMD768}}");
 
 static int index = -2; /* Exclude the first card */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index 2eddd9d..80ac3c6 100644 (file)
@@ -388,7 +388,6 @@ struct snd_korg1212 {
 
 MODULE_DESCRIPTION("korg1212");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{KORG,korg1212}}");
 MODULE_FIRMWARE("korg/k1212.dsp");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
index 491c90f..03b4be4 100644 (file)
@@ -54,7 +54,6 @@ MODULE_PARM_DESC(sample_rate_min, "Minimal sample rate");
  */
 
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram, Lola}}");
 MODULE_DESCRIPTION("Digigram Lola driver");
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 
index b92ea07..1be97c3 100644 (file)
@@ -21,8 +21,6 @@
 MODULE_AUTHOR("Tim Blechmann");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("digigram lx6464es");
-MODULE_SUPPORTED_DEVICE("{digigram lx6464es{}}");
-
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
index d2c2cd6..cdc4b61 100644 (file)
 MODULE_AUTHOR("Zach Brown <zab@zabbo.net>, Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("ESS Maestro3 PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,Maestro3 PCI},"
-               "{ESS,ES1988},"
-               "{ESS,Allegro PCI},"
-               "{ESS,Allegro-1 PCI},"
-               "{ESS,Canyon3D-2/LE PCI}}");
 MODULE_FIRMWARE("ess/maestro3_assp_kernel.fw");
 MODULE_FIRMWARE("ess/maestro3_assp_minisrc.fw");
 
index efff220..a0bbb38 100644 (file)
@@ -32,7 +32,6 @@
 MODULE_AUTHOR("Digigram <alsa@digigram.com>");
 MODULE_DESCRIPTION("Digigram " CARD_NAME);
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram," CARD_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;             /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;              /* ID for this card */
index 9759946..6cb689a 100644 (file)
@@ -32,8 +32,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("NeoMagic NM256AV/ZX");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{NeoMagic,NM256AV},"
-               "{NeoMagic,NM256ZX}}");
 
 /*
  * some compile conditions.
index a751fcc..e335c4b 100644 (file)
@@ -56,9 +56,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("C-Media CMI8788 driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{C-Media,CMI8786}"
-                       ",{C-Media,CMI8787}"
-                       ",{C-Media,CMI8788}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
index 78c35a0..434f885 100644 (file)
@@ -29,7 +29,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("Studio Evolution SE6X driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Studio Evolution,SE6X}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
index 98ab163..baa3244 100644 (file)
@@ -16,7 +16,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("Asus Virtuoso driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Asus,AV66},{Asus,AV100},{Asus,AV200}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
index c2e4831..751f974 100644 (file)
@@ -35,7 +35,6 @@ MODULE_AUTHOR("Markus Bollinger <bollinger@digigram.com>, "
              "Marc Titinger <titinger@digigram.com>");
 MODULE_DESCRIPTION("Digigram " DRIVER_NAME " " PCXHR_DRIVER_VERSION_STRING);
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram," DRIVER_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index fcc2073..56827db 100644 (file)
 MODULE_AUTHOR("Peter Gruber <nokos@gmx.net>");
 MODULE_DESCRIPTION("riptide");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Conexant,Riptide}}");
 MODULE_FIRMWARE("riptide.hex");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
index 4eabece..54f3e39 100644 (file)
@@ -88,7 +88,6 @@ MODULE_PARM_DESC(fullduplex, "Support full-duplex mode.");
 MODULE_AUTHOR("Martin Langer <martin-langer@gmx.de>, Pilo Chambert <pilo.c@wanadoo.fr>");
 MODULE_DESCRIPTION("RME Digi32, Digi32/8, Digi32 PRO");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME,Digi32}," "{RME,Digi32/8}," "{RME,Digi32 PRO}}");
 
 /* Defines for RME Digi32 series */
 #define RME32_SPDIF_NCHANNELS 2
index 84eef6a..66082e9 100644 (file)
@@ -31,11 +31,6 @@ MODULE_AUTHOR("Anders Torger <torger@ludd.luth.se>");
 MODULE_DESCRIPTION("RME Digi96, Digi96/8, Digi96/8 PRO, Digi96/8 PST, "
                   "Digi96/8 PAD");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME,Digi96},"
-               "{RME,Digi96/8},"
-               "{RME,Digi96/8 PRO},"
-               "{RME,Digi96/8 PST},"
-               "{RME,Digi96/8 PAD}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 6d90293..4cf879c 100644 (file)
@@ -44,9 +44,6 @@ MODULE_PARM_DESC(enable, "Enable/disable specific Hammerfall DSP soundcards.");
 MODULE_AUTHOR("Paul Davis <paul@linuxaudiosystems.com>, Marcus Andersson, Thomas Charbonnel <thomas@undata.org>");
 MODULE_DESCRIPTION("RME Hammerfall DSP");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME Hammerfall-DSP},"
-               "{RME HDSP-9652},"
-               "{RME HDSP-9632}}");
 MODULE_FIRMWARE("rpm_firmware.bin");
 MODULE_FIRMWARE("multiface_firmware.bin");
 MODULE_FIRMWARE("multiface_firmware_rev11.bin");
index b667115..8d900c1 100644 (file)
@@ -165,7 +165,6 @@ MODULE_AUTHOR
 );
 MODULE_DESCRIPTION("RME HDSPM");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME HDSPM-MADI}}");
 
 /* --- Write registers. ---
   These are defined as byte-offsets from the iobase value.  */
index 012fbec..4df992e 100644 (file)
@@ -39,8 +39,6 @@ MODULE_PARM_DESC(precise_ptr, "Enable precise pointer (doesn't work reliably).")
 MODULE_AUTHOR("Paul Davis <pbd@op.net>, Winfried Ritsch");
 MODULE_DESCRIPTION("RME Digi9652/Digi9636");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME,Hammerfall},"
-               "{RME,Hammerfall-Light}}");
 
 /* The Hammerfall has two sets of 24 ADAT + 2 S/PDIF channels, one for
    capture, one for playback. Both the ADAT and S/PDIF channels appear
index 8ffa2f5..00ab51c 100644 (file)
@@ -24,7 +24,6 @@
 MODULE_AUTHOR("David Dillow <dave@thedillows.org>");
 MODULE_DESCRIPTION("SiS7019");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{SiS,SiS7019 Audio Accelerator}}");
 
 static int index = SNDRV_DEFAULT_IDX1; /* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index 26fd1d0..7de1099 100644 (file)
@@ -29,7 +29,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("S3 SonicVibes PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{S3,SonicVibes PCI}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
index 5bc79da..a510412 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, <audio@tridentmicro.com>");
 MODULE_DESCRIPTION("Trident 4D-WaveDX/NX & SiS SI7018");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Trident,4DWave DX},"
-               "{Trident,4DWave NX},"
-               "{SiS,SI7018 PCI Audio},"
-               "{Best Union,Miss Melody 4DWave PCI},"
-               "{HIS,4DWave PCI},"
-               "{Warpspeed,ONSpeed 4DWave PCI},"
-               "{Aztech Systems,PCI 64-Q3D},"
-               "{Addonics,SV 750},"
-               "{CHIC,True Sound 4Dwave},"
-               "{Shark,Predator4D-PCI},"
-               "{Jaton,SonicWave 4D},"
-               "{Hoontech,SoundTrack Digital 4DWave NX}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 154d88c..fd1f2f9 100644 (file)
@@ -56,7 +56,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("VIA VT82xx audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{VIA,VT82C686A/B/C,pci},{VIA,VT8233A/C,8235}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
index addfa19..3025330 100644 (file)
@@ -38,7 +38,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("VIA VT82xx modem");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{VIA,VT82C686A/B/C modem,pci}}");
 
 static int index = -2; /* Exclude the first card */
 static char *id = SNDRV_DEFAULT_STR1;  /* ID for this card */
index f7800ed..2a9e1a7 100644 (file)
@@ -20,7 +20,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Digigram VX222 V2/Mic");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram," CARD_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 9b0d18a..99be149 100644 (file)
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Yamaha DS-1 PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Yamaha,YMF724},"
-               "{Yamaha,YMF724F},"
-               "{Yamaha,YMF740},"
-               "{Yamaha,YMF740C},"
-               "{Yamaha,YMF744},"
-               "{Yamaha,YMF754}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 27d9da6..1445823 100644 (file)
@@ -22,7 +22,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Sound Core " CARD_NAME);
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sound Core," CARD_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index afd30a9..6363204 100644 (file)
 #include <sound/initval.h>
 #include <sound/tlv.h>
 
-/*
- */
-
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Digigram VXPocket");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram,VXPocket},{Digigram,VXPocket440}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 96ef550..9fb51eb 100644 (file)
@@ -18,7 +18,6 @@
 #define CHIP_NAME "PMac"
 
 MODULE_DESCRIPTION("PowerMac");
-MODULE_SUPPORTED_DEVICE("{{Apple,PowerMac}}");
 MODULE_LICENSE("GPL");
 
 static int index = SNDRV_DEFAULT_IDX1;         /* Index 0-MAX */
index 8fa6843..6e9d6bd 100644 (file)
@@ -32,7 +32,6 @@
 MODULE_AUTHOR("Adrian McMenamin <adrian@mcmen.demon.co.uk>");
 MODULE_DESCRIPTION("Dreamcast AICA sound (pcm) driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Yamaha/SEGA, AICA}}");
 MODULE_FIRMWARE("aica_firmware.bin");
 
 /* module parameters */
index feb2850..8ebd972 100644 (file)
@@ -25,7 +25,6 @@
 MODULE_AUTHOR("Rafael Ignacio Zurita <rizurita@yahoo.com>");
 MODULE_DESCRIPTION("SuperH DAC audio driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{SuperH DAC audio support}}");
 
 /* Module Parameters */
 static int index = SNDRV_DEFAULT_IDX1;
index e4cf14e..1c87b42 100644 (file)
@@ -186,7 +186,6 @@ config SND_SOC_ALL_CODECS
        imply SND_SOC_SI476X
        imply SND_SOC_SIMPLE_AMPLIFIER
        imply SND_SOC_SIMPLE_MUX
-       imply SND_SOC_SIRF_AUDIO_CODEC
        imply SND_SOC_SPDIF
        imply SND_SOC_SSM2305
        imply SND_SOC_SSM2518
@@ -1279,10 +1278,6 @@ config SND_SOC_SIMPLE_MUX
        tristate "Simple Audio Mux"
        select GPIOLIB
 
-config SND_SOC_SIRF_AUDIO_CODEC
-       tristate "SiRF SoC internal audio codec"
-       select REGMAP_MMIO
-
 config SND_SOC_SPDIF
        tristate "S/PDIF CODEC"
 
index 472caad..85a1d00 100644 (file)
@@ -812,6 +812,7 @@ static const struct of_device_id ak4458_of_match[] = {
        { .compatible = "asahi-kasei,ak4497", .data = &ak4497_drvdata},
        { },
 };
+MODULE_DEVICE_TABLE(of, ak4458_of_match);
 
 static struct i2c_driver ak4458_i2c_driver = {
        .driver = {
index 8a32b01..85bdd05 100644 (file)
@@ -419,6 +419,7 @@ static const struct of_device_id ak5558_i2c_dt_ids[] __maybe_unused = {
        { .compatible = "asahi-kasei,ak5558"},
        { }
 };
+MODULE_DEVICE_TABLE(of, ak5558_i2c_dt_ids);
 
 static struct i2c_driver ak5558_i2c_driver = {
        .driver = {
index 210fcbe..811b7b1 100644 (file)
@@ -401,7 +401,7 @@ static const struct regmap_config cs42l42_regmap = {
 };
 
 static DECLARE_TLV_DB_SCALE(adc_tlv, -9600, 100, false);
-static DECLARE_TLV_DB_SCALE(mixer_tlv, -6200, 100, false);
+static DECLARE_TLV_DB_SCALE(mixer_tlv, -6300, 100, true);
 
 static const char * const cs42l42_hpf_freq_text[] = {
        "1.86Hz", "120Hz", "235Hz", "466Hz"
@@ -458,7 +458,7 @@ static const struct snd_kcontrol_new cs42l42_snd_controls[] = {
                                CS42L42_DAC_HPF_EN_SHIFT, true, false),
        SOC_DOUBLE_R_TLV("Mixer Volume", CS42L42_MIXER_CHA_VOL,
                         CS42L42_MIXER_CHB_VOL, CS42L42_MIXER_CH_VOL_SHIFT,
-                               0x3e, 1, mixer_tlv)
+                               0x3f, 1, mixer_tlv)
 };
 
 static int cs42l42_hpdrv_evt(struct snd_soc_dapm_widget *w,
@@ -511,43 +511,6 @@ static const struct snd_soc_dapm_route cs42l42_audio_map[] = {
        {"HP", NULL, "HPDRV"}
 };
 
-static int cs42l42_set_bias_level(struct snd_soc_component *component,
-                                       enum snd_soc_bias_level level)
-{
-       struct cs42l42_private *cs42l42 = snd_soc_component_get_drvdata(component);
-       int ret;
-
-       switch (level) {
-       case SND_SOC_BIAS_ON:
-               break;
-       case SND_SOC_BIAS_PREPARE:
-               break;
-       case SND_SOC_BIAS_STANDBY:
-               if (snd_soc_component_get_bias_level(component) == SND_SOC_BIAS_OFF) {
-                       regcache_cache_only(cs42l42->regmap, false);
-                       regcache_sync(cs42l42->regmap);
-                       ret = regulator_bulk_enable(
-                                               ARRAY_SIZE(cs42l42->supplies),
-                                               cs42l42->supplies);
-                       if (ret != 0) {
-                               dev_err(component->dev,
-                                       "Failed to enable regulators: %d\n",
-                                       ret);
-                               return ret;
-                       }
-               }
-               break;
-       case SND_SOC_BIAS_OFF:
-
-               regcache_cache_only(cs42l42->regmap, true);
-               regulator_bulk_disable(ARRAY_SIZE(cs42l42->supplies),
-                                                   cs42l42->supplies);
-               break;
-       }
-
-       return 0;
-}
-
 static int cs42l42_component_probe(struct snd_soc_component *component)
 {
        struct cs42l42_private *cs42l42 =
@@ -560,7 +523,6 @@ static int cs42l42_component_probe(struct snd_soc_component *component)
 
 static const struct snd_soc_component_driver soc_component_dev_cs42l42 = {
        .probe                  = cs42l42_component_probe,
-       .set_bias_level         = cs42l42_set_bias_level,
        .dapm_widgets           = cs42l42_dapm_widgets,
        .num_dapm_widgets       = ARRAY_SIZE(cs42l42_dapm_widgets),
        .dapm_routes            = cs42l42_audio_map,
@@ -691,24 +653,6 @@ static int cs42l42_pll_config(struct snd_soc_component *component)
                                        CS42L42_CLK_OASRC_SEL_MASK,
                                        CS42L42_CLK_OASRC_SEL_12 <<
                                        CS42L42_CLK_OASRC_SEL_SHIFT);
-                       /* channel 1 on low LRCLK, 32 bit */
-                       snd_soc_component_update_bits(component,
-                                       CS42L42_ASP_RX_DAI0_CH1_AP_RES,
-                                       CS42L42_ASP_RX_CH_AP_MASK |
-                                       CS42L42_ASP_RX_CH_RES_MASK,
-                                       (CS42L42_ASP_RX_CH_AP_LOW <<
-                                       CS42L42_ASP_RX_CH_AP_SHIFT) |
-                                       (CS42L42_ASP_RX_CH_RES_32 <<
-                                       CS42L42_ASP_RX_CH_RES_SHIFT));
-                       /* Channel 2 on high LRCLK, 32 bit */
-                       snd_soc_component_update_bits(component,
-                                       CS42L42_ASP_RX_DAI0_CH2_AP_RES,
-                                       CS42L42_ASP_RX_CH_AP_MASK |
-                                       CS42L42_ASP_RX_CH_RES_MASK,
-                                       (CS42L42_ASP_RX_CH_AP_HI <<
-                                       CS42L42_ASP_RX_CH_AP_SHIFT) |
-                                       (CS42L42_ASP_RX_CH_RES_32 <<
-                                       CS42L42_ASP_RX_CH_RES_SHIFT));
                        if (pll_ratio_table[i].mclk_src_sel == 0) {
                                /* Pass the clock straight through */
                                snd_soc_component_update_bits(component,
@@ -797,27 +741,23 @@ static int cs42l42_set_dai_fmt(struct snd_soc_dai *codec_dai, unsigned int fmt)
        /* Bitclock/frame inversion */
        switch (fmt & SND_SOC_DAIFMT_INV_MASK) {
        case SND_SOC_DAIFMT_NB_NF:
+               asp_cfg_val |= CS42L42_ASP_SCPOL_NOR << CS42L42_ASP_SCPOL_SHIFT;
                break;
        case SND_SOC_DAIFMT_NB_IF:
-               asp_cfg_val |= CS42L42_ASP_POL_INV <<
-                               CS42L42_ASP_LCPOL_IN_SHIFT;
+               asp_cfg_val |= CS42L42_ASP_SCPOL_NOR << CS42L42_ASP_SCPOL_SHIFT;
+               asp_cfg_val |= CS42L42_ASP_LCPOL_INV << CS42L42_ASP_LCPOL_SHIFT;
                break;
        case SND_SOC_DAIFMT_IB_NF:
-               asp_cfg_val |= CS42L42_ASP_POL_INV <<
-                               CS42L42_ASP_SCPOL_IN_DAC_SHIFT;
                break;
        case SND_SOC_DAIFMT_IB_IF:
-               asp_cfg_val |= CS42L42_ASP_POL_INV <<
-                               CS42L42_ASP_LCPOL_IN_SHIFT;
-               asp_cfg_val |= CS42L42_ASP_POL_INV <<
-                               CS42L42_ASP_SCPOL_IN_DAC_SHIFT;
+               asp_cfg_val |= CS42L42_ASP_LCPOL_INV << CS42L42_ASP_LCPOL_SHIFT;
                break;
        }
 
-       snd_soc_component_update_bits(component, CS42L42_ASP_CLK_CFG,
-                               CS42L42_ASP_MODE_MASK |
-                               CS42L42_ASP_SCPOL_IN_DAC_MASK |
-                               CS42L42_ASP_LCPOL_IN_MASK, asp_cfg_val);
+       snd_soc_component_update_bits(component, CS42L42_ASP_CLK_CFG, CS42L42_ASP_MODE_MASK |
+                                                                     CS42L42_ASP_SCPOL_MASK |
+                                                                     CS42L42_ASP_LCPOL_MASK,
+                                                                     asp_cfg_val);
 
        return 0;
 }
@@ -828,14 +768,29 @@ static int cs42l42_pcm_hw_params(struct snd_pcm_substream *substream,
 {
        struct snd_soc_component *component = dai->component;
        struct cs42l42_private *cs42l42 = snd_soc_component_get_drvdata(component);
-       int retval;
+       unsigned int width = (params_width(params) / 8) - 1;
+       unsigned int val = 0;
 
        cs42l42->srate = params_rate(params);
-       cs42l42->swidth = params_width(params);
 
-       retval = cs42l42_pll_config(component);
+       switch(substream->stream) {
+       case SNDRV_PCM_STREAM_PLAYBACK:
+               val |= width << CS42L42_ASP_RX_CH_RES_SHIFT;
+               /* channel 1 on low LRCLK */
+               snd_soc_component_update_bits(component, CS42L42_ASP_RX_DAI0_CH1_AP_RES,
+                                                        CS42L42_ASP_RX_CH_AP_MASK |
+                                                        CS42L42_ASP_RX_CH_RES_MASK, val);
+               /* Channel 2 on high LRCLK */
+               val |= CS42L42_ASP_RX_CH_AP_HI << CS42L42_ASP_RX_CH_AP_SHIFT;
+               snd_soc_component_update_bits(component, CS42L42_ASP_RX_DAI0_CH2_AP_RES,
+                                                        CS42L42_ASP_RX_CH_AP_MASK |
+                                                        CS42L42_ASP_RX_CH_RES_MASK, val);
+               break;
+       default:
+               break;
+       }
 
-       return retval;
+       return cs42l42_pll_config(component);
 }
 
 static int cs42l42_set_sysclk(struct snd_soc_dai *dai,
@@ -900,9 +855,9 @@ static int cs42l42_mute(struct snd_soc_dai *dai, int mute, int direction)
        return 0;
 }
 
-#define CS42L42_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S18_3LE | \
-                       SNDRV_PCM_FMTBIT_S20_3LE | SNDRV_PCM_FMTBIT_S24_LE | \
-                       SNDRV_PCM_FMTBIT_S32_LE)
+#define CS42L42_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
+                        SNDRV_PCM_FMTBIT_S24_LE |\
+                        SNDRV_PCM_FMTBIT_S32_LE )
 
 
 static const struct snd_soc_dai_ops cs42l42_ops = {
@@ -1801,7 +1756,7 @@ static int cs42l42_i2c_probe(struct i2c_client *i2c_client,
                dev_dbg(&i2c_client->dev, "Found reset GPIO\n");
                gpiod_set_value_cansleep(cs42l42->reset_gpio, 1);
        }
-       mdelay(3);
+       usleep_range(CS42L42_BOOT_TIME_US, CS42L42_BOOT_TIME_US * 2);
 
        /* Request IRQ */
        ret = devm_request_threaded_irq(&i2c_client->dev,
@@ -1926,6 +1881,7 @@ static int cs42l42_runtime_resume(struct device *dev)
        }
 
        gpiod_set_value_cansleep(cs42l42->reset_gpio, 1);
+       usleep_range(CS42L42_BOOT_TIME_US, CS42L42_BOOT_TIME_US * 2);
 
        regcache_cache_only(cs42l42->regmap, false);
        regcache_sync(cs42l42->regmap);
index 9e3cc52..866d7c8 100644 (file)
 #define CS42L42_ASP_SLAVE_MODE         0x00
 #define CS42L42_ASP_MODE_SHIFT         4
 #define CS42L42_ASP_MODE_MASK          (1 << CS42L42_ASP_MODE_SHIFT)
-#define CS42L42_ASP_SCPOL_IN_DAC_SHIFT 2
-#define CS42L42_ASP_SCPOL_IN_DAC_MASK  (1 << CS42L42_ASP_SCPOL_IN_DAC_SHIFT)
-#define CS42L42_ASP_LCPOL_IN_SHIFT     0
-#define CS42L42_ASP_LCPOL_IN_MASK      (1 << CS42L42_ASP_LCPOL_IN_SHIFT)
-#define CS42L42_ASP_POL_INV            1
+#define CS42L42_ASP_SCPOL_SHIFT                2
+#define CS42L42_ASP_SCPOL_MASK         (3 << CS42L42_ASP_SCPOL_SHIFT)
+#define CS42L42_ASP_SCPOL_NOR          3
+#define CS42L42_ASP_LCPOL_SHIFT                0
+#define CS42L42_ASP_LCPOL_MASK         (3 << CS42L42_ASP_LCPOL_SHIFT)
+#define CS42L42_ASP_LCPOL_INV          3
 
 #define CS42L42_ASP_FRM_CFG            (CS42L42_PAGE_12 + 0x08)
 #define CS42L42_ASP_STP_SHIFT          4
 #define CS42L42_FRAC2_VAL(val) (((val) & 0xff0000) >> 16)
 
 #define CS42L42_NUM_SUPPLIES   5
+#define CS42L42_BOOT_TIME_US   3000
 
 static const char *const cs42l42_supply_names[CS42L42_NUM_SUPPLIES] = {
        "VA",
@@ -756,7 +758,6 @@ struct  cs42l42_private {
        struct completion pdn_done;
        u32 sclk;
        u32 srate;
-       u32 swidth;
        u8 plug_state;
        u8 hs_type;
        u8 ts_inv;
index d632055..067757d 100644 (file)
@@ -63,13 +63,8 @@ static const SNDRV_CTL_TLVD_DECLARE_DB_RANGE(adc_pga_gain_tlv,
        1, 1, TLV_DB_SCALE_ITEM(0, 0, 0),
        2, 2, TLV_DB_SCALE_ITEM(250, 0, 0),
        3, 3, TLV_DB_SCALE_ITEM(450, 0, 0),
-       4, 4, TLV_DB_SCALE_ITEM(700, 0, 0),
-       5, 5, TLV_DB_SCALE_ITEM(1000, 0, 0),
-       6, 6, TLV_DB_SCALE_ITEM(1300, 0, 0),
-       7, 7, TLV_DB_SCALE_ITEM(1600, 0, 0),
-       8, 8, TLV_DB_SCALE_ITEM(1800, 0, 0),
-       9, 9, TLV_DB_SCALE_ITEM(2100, 0, 0),
-       10, 10, TLV_DB_SCALE_ITEM(2400, 0, 0),
+       4, 7, TLV_DB_SCALE_ITEM(700, 300, 0),
+       8, 10, TLV_DB_SCALE_ITEM(1800, 300, 0),
 );
 
 static const SNDRV_CTL_TLVD_DECLARE_DB_RANGE(hpout_vol_tlv,
index c9c21d2..8c04b3b 100644 (file)
@@ -2895,7 +2895,7 @@ static int rx_macro_enable_echo(struct snd_soc_dapm_widget *w,
 {
        struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
        u16 val, ec_hq_reg;
-       int ec_tx;
+       int ec_tx = -1;
 
        val = snd_soc_component_read(component,
                        CDC_RX_INP_MUX_RX_MIX_CFG4);
index 91e6890..3d6976a 100644 (file)
@@ -189,7 +189,6 @@ struct va_macro {
        struct device *dev;
        unsigned long active_ch_mask[VA_MACRO_MAX_DAIS];
        unsigned long active_ch_cnt[VA_MACRO_MAX_DAIS];
-       unsigned long active_decimator[VA_MACRO_MAX_DAIS];
        u16 dmic_clk_div;
 
        int dec_mode[VA_MACRO_NUM_DECIMATORS];
@@ -549,11 +548,9 @@ static int va_macro_tx_mixer_put(struct snd_kcontrol *kcontrol,
        if (enable) {
                set_bit(dec_id, &va->active_ch_mask[dai_id]);
                va->active_ch_cnt[dai_id]++;
-               va->active_decimator[dai_id] = dec_id;
        } else {
                clear_bit(dec_id, &va->active_ch_mask[dai_id]);
                va->active_ch_cnt[dai_id]--;
-               va->active_decimator[dai_id] = -1;
        }
 
        snd_soc_dapm_mixer_update_power(widget->dapm, kcontrol, enable, update);
@@ -880,18 +877,19 @@ static int va_macro_digital_mute(struct snd_soc_dai *dai, int mute, int stream)
        struct va_macro *va = snd_soc_component_get_drvdata(component);
        u16 tx_vol_ctl_reg, decimator;
 
-       decimator = va->active_decimator[dai->id];
-
-       tx_vol_ctl_reg = CDC_VA_TX0_TX_PATH_CTL +
-                               VA_MACRO_TX_PATH_OFFSET * decimator;
-       if (mute)
-               snd_soc_component_update_bits(component, tx_vol_ctl_reg,
-                                             CDC_VA_TX_PATH_PGA_MUTE_EN_MASK,
-                                             CDC_VA_TX_PATH_PGA_MUTE_EN);
-       else
-               snd_soc_component_update_bits(component, tx_vol_ctl_reg,
-                                             CDC_VA_TX_PATH_PGA_MUTE_EN_MASK,
-                                             CDC_VA_TX_PATH_PGA_MUTE_DISABLE);
+       for_each_set_bit(decimator, &va->active_ch_mask[dai->id],
+                        VA_MACRO_DEC_MAX) {
+               tx_vol_ctl_reg = CDC_VA_TX0_TX_PATH_CTL +
+                                       VA_MACRO_TX_PATH_OFFSET * decimator;
+               if (mute)
+                       snd_soc_component_update_bits(component, tx_vol_ctl_reg,
+                                       CDC_VA_TX_PATH_PGA_MUTE_EN_MASK,
+                                       CDC_VA_TX_PATH_PGA_MUTE_EN);
+               else
+                       snd_soc_component_update_bits(component, tx_vol_ctl_reg,
+                                       CDC_VA_TX_PATH_PGA_MUTE_EN_MASK,
+                                       CDC_VA_TX_PATH_PGA_MUTE_DISABLE);
+       }
 
        return 0;
 }
index 5ebcd93..9ca49a1 100644 (file)
@@ -1211,14 +1211,16 @@ static int wsa_macro_enable_mix_path(struct snd_soc_dapm_widget *w,
                                     struct snd_kcontrol *kcontrol, int event)
 {
        struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm);
-       u16 gain_reg;
+       u16 path_reg, gain_reg;
        int val;
 
-       switch (w->reg) {
-       case CDC_WSA_RX0_RX_PATH_MIX_CTL:
+       switch (w->shift) {
+       case WSA_MACRO_RX_MIX0:
+               path_reg = CDC_WSA_RX0_RX_PATH_MIX_CTL;
                gain_reg = CDC_WSA_RX0_RX_VOL_MIX_CTL;
                break;
-       case CDC_WSA_RX1_RX_PATH_MIX_CTL:
+       case WSA_MACRO_RX_MIX1:
+               path_reg = CDC_WSA_RX1_RX_PATH_MIX_CTL;
                gain_reg = CDC_WSA_RX1_RX_VOL_MIX_CTL;
                break;
        default:
@@ -1231,7 +1233,7 @@ static int wsa_macro_enable_mix_path(struct snd_soc_dapm_widget *w,
                snd_soc_component_write(component, gain_reg, val);
                break;
        case SND_SOC_DAPM_POST_PMD:
-               snd_soc_component_update_bits(component, w->reg,
+               snd_soc_component_update_bits(component, path_reg,
                                              CDC_WSA_RX_PATH_MIX_CLK_EN_MASK,
                                              CDC_WSA_RX_PATH_MIX_CLK_DISABLE);
                break;
@@ -2068,14 +2070,14 @@ static const struct snd_soc_dapm_widget wsa_macro_dapm_widgets[] = {
        SND_SOC_DAPM_MUX("WSA_RX0 INP0", SND_SOC_NOPM, 0, 0, &rx0_prim_inp0_mux),
        SND_SOC_DAPM_MUX("WSA_RX0 INP1", SND_SOC_NOPM, 0, 0, &rx0_prim_inp1_mux),
        SND_SOC_DAPM_MUX("WSA_RX0 INP2", SND_SOC_NOPM, 0, 0, &rx0_prim_inp2_mux),
-       SND_SOC_DAPM_MUX_E("WSA_RX0 MIX INP", CDC_WSA_RX0_RX_PATH_MIX_CTL,
-                          0, 0, &rx0_mix_mux, wsa_macro_enable_mix_path,
+       SND_SOC_DAPM_MUX_E("WSA_RX0 MIX INP", SND_SOC_NOPM, WSA_MACRO_RX_MIX0,
+                          0, &rx0_mix_mux, wsa_macro_enable_mix_path,
                           SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD),
        SND_SOC_DAPM_MUX("WSA_RX1 INP0", SND_SOC_NOPM, 0, 0, &rx1_prim_inp0_mux),
        SND_SOC_DAPM_MUX("WSA_RX1 INP1", SND_SOC_NOPM, 0, 0, &rx1_prim_inp1_mux),
        SND_SOC_DAPM_MUX("WSA_RX1 INP2", SND_SOC_NOPM, 0, 0, &rx1_prim_inp2_mux),
-       SND_SOC_DAPM_MUX_E("WSA_RX1 MIX INP", CDC_WSA_RX1_RX_PATH_MIX_CTL,
-                          0, 0, &rx1_mix_mux, wsa_macro_enable_mix_path,
+       SND_SOC_DAPM_MUX_E("WSA_RX1 MIX INP", SND_SOC_NOPM, WSA_MACRO_RX_MIX1,
+                          0, &rx1_mix_mux, wsa_macro_enable_mix_path,
                           SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD),
 
        SND_SOC_DAPM_MIXER_E("WSA_RX INT0 MIX", SND_SOC_NOPM, 0, 0, NULL, 0,
index 37b5795..844e407 100644 (file)
@@ -209,6 +209,7 @@ static bool rt1015_volatile_register(struct device *dev, unsigned int reg)
        case RT1015_VENDOR_ID:
        case RT1015_DEVICE_ID:
        case RT1015_PRO_ALT:
+       case RT1015_MAN_I2C:
        case RT1015_DAC3:
        case RT1015_VBAT_TEST_OUT1:
        case RT1015_VBAT_TEST_OUT2:
@@ -513,6 +514,7 @@ static void rt1015_calibrate(struct rt1015_priv *rt1015)
        msleep(300);
        regmap_write(regmap, RT1015_PWR_STATE_CTRL, 0x0008);
        regmap_write(regmap, RT1015_SYS_RST1, 0x05F5);
+       regmap_write(regmap, RT1015_CLK_DET, 0x8000);
 
        regcache_cache_bypass(regmap, false);
        regcache_mark_dirty(regmap);
index 1414ad1..a5674c2 100644 (file)
@@ -339,9 +339,9 @@ static bool rt5640_readable_register(struct device *dev, unsigned int reg)
 }
 
 static const DECLARE_TLV_DB_SCALE(out_vol_tlv, -4650, 150, 0);
-static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -65625, 375, 0);
+static const DECLARE_TLV_DB_MINMAX(dac_vol_tlv, -6562, 0);
 static const DECLARE_TLV_DB_SCALE(in_vol_tlv, -3450, 150, 0);
-static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -17625, 375, 0);
+static const DECLARE_TLV_DB_MINMAX(adc_vol_tlv, -1762, 3000);
 static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0);
 
 /* {0, +20, +24, +30, +35, +40, +44, +50, +52} dB */
index d198e19..e59fdc8 100644 (file)
@@ -285,9 +285,9 @@ static bool rt5651_readable_register(struct device *dev, unsigned int reg)
 }
 
 static const DECLARE_TLV_DB_SCALE(out_vol_tlv, -4650, 150, 0);
-static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -65625, 375, 0);
+static const DECLARE_TLV_DB_MINMAX(dac_vol_tlv, -6562, 0);
 static const DECLARE_TLV_DB_SCALE(in_vol_tlv, -3450, 150, 0);
-static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -17625, 375, 0);
+static const DECLARE_TLV_DB_MINMAX(adc_vol_tlv, -1762, 3000);
 static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0);
 
 /* {0, +20, +24, +30, +35, +40, +44, +50, +52} dB */
index 41e5917..91a4ef7 100644 (file)
@@ -3426,12 +3426,17 @@ static int rt5659_set_component_sysclk(struct snd_soc_component *component, int
 {
        struct rt5659_priv *rt5659 = snd_soc_component_get_drvdata(component);
        unsigned int reg_val = 0;
+       int ret;
 
        if (freq == rt5659->sysclk && clk_id == rt5659->sysclk_src)
                return 0;
 
        switch (clk_id) {
        case RT5659_SCLK_S_MCLK:
+               ret = clk_set_rate(rt5659->mclk, freq);
+               if (ret)
+                       return ret;
+
                reg_val |= RT5659_SCLK_SRC_MCLK;
                break;
        case RT5659_SCLK_S_PLL1:
index c29317e..4063aac 100644 (file)
@@ -629,21 +629,69 @@ static SOC_ENUM_SINGLE_DECL(rt5670_if2_dac_enum, RT5670_DIG_INF1_DATA,
 static SOC_ENUM_SINGLE_DECL(rt5670_if2_adc_enum, RT5670_DIG_INF1_DATA,
                                RT5670_IF2_ADC_SEL_SFT, rt5670_data_select);
 
+/*
+ * For reliable output-mute LED control we need a "DAC1 Playback Switch" control.
+ * We emulate this by only clearing the RT5670_M_DAC1_L/_R AD_DA_MIXER register
+ * bits when both our emulated DAC1 Playback Switch control and the DAC1 MIXL/R
+ * DAPM-mixer DAC1 input are enabled.
+ */
+static void rt5670_update_ad_da_mixer_dac1_m_bits(struct rt5670_priv *rt5670)
+{
+       int val = RT5670_M_DAC1_L | RT5670_M_DAC1_R;
+
+       if (rt5670->dac1_mixl_dac1_switch && rt5670->dac1_playback_switch_l)
+               val &= ~RT5670_M_DAC1_L;
+
+       if (rt5670->dac1_mixr_dac1_switch && rt5670->dac1_playback_switch_r)
+               val &= ~RT5670_M_DAC1_R;
+
+       regmap_update_bits(rt5670->regmap, RT5670_AD_DA_MIXER,
+                          RT5670_M_DAC1_L | RT5670_M_DAC1_R, val);
+}
+
+static int rt5670_dac1_playback_switch_get(struct snd_kcontrol *kcontrol,
+                                          struct snd_ctl_elem_value *ucontrol)
+{
+       struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol);
+       struct rt5670_priv *rt5670 = snd_soc_component_get_drvdata(component);
+
+       ucontrol->value.integer.value[0] = rt5670->dac1_playback_switch_l;
+       ucontrol->value.integer.value[1] = rt5670->dac1_playback_switch_r;
+
+       return 0;
+}
+
+static int rt5670_dac1_playback_switch_put(struct snd_kcontrol *kcontrol,
+                                          struct snd_ctl_elem_value *ucontrol)
+{
+       struct snd_soc_component *component = snd_soc_kcontrol_component(kcontrol);
+       struct rt5670_priv *rt5670 = snd_soc_component_get_drvdata(component);
+
+       if (rt5670->dac1_playback_switch_l == ucontrol->value.integer.value[0] &&
+           rt5670->dac1_playback_switch_r == ucontrol->value.integer.value[1])
+               return 0;
+
+       rt5670->dac1_playback_switch_l = ucontrol->value.integer.value[0];
+       rt5670->dac1_playback_switch_r = ucontrol->value.integer.value[1];
+
+       rt5670_update_ad_da_mixer_dac1_m_bits(rt5670);
+
+       return 1;
+}
+
 static const struct snd_kcontrol_new rt5670_snd_controls[] = {
        /* Headphone Output Volume */
-       SOC_DOUBLE("HP Playback Switch", RT5670_HP_VOL,
-               RT5670_L_MUTE_SFT, RT5670_R_MUTE_SFT, 1, 1),
        SOC_DOUBLE_TLV("HP Playback Volume", RT5670_HP_VOL,
                RT5670_L_VOL_SFT, RT5670_R_VOL_SFT,
                39, 1, out_vol_tlv),
        /* OUTPUT Control */
-       SOC_DOUBLE("OUT Channel Switch", RT5670_LOUT1,
-               RT5670_VOL_L_SFT, RT5670_VOL_R_SFT, 1, 1),
        SOC_DOUBLE_TLV("OUT Playback Volume", RT5670_LOUT1,
                RT5670_L_VOL_SFT, RT5670_R_VOL_SFT, 39, 1, out_vol_tlv),
        /* DAC Digital Volume */
        SOC_DOUBLE("DAC2 Playback Switch", RT5670_DAC_CTRL,
                RT5670_M_DAC_L2_VOL_SFT, RT5670_M_DAC_R2_VOL_SFT, 1, 1),
+       SOC_DOUBLE_EXT("DAC1 Playback Switch", SND_SOC_NOPM, 0, 1, 1, 0,
+                       rt5670_dac1_playback_switch_get, rt5670_dac1_playback_switch_put),
        SOC_DOUBLE_TLV("DAC1 Playback Volume", RT5670_DAC1_DIG_VOL,
                        RT5670_L_VOL_SFT, RT5670_R_VOL_SFT,
                        175, 0, dac_vol_tlv),
@@ -913,18 +961,44 @@ static const struct snd_kcontrol_new rt5670_mono_adc_r_mix[] = {
                        RT5670_M_MONO_ADC_R2_SFT, 1, 1),
 };
 
+/* See comment above rt5670_update_ad_da_mixer_dac1_m_bits() */
+static int rt5670_put_dac1_mix_dac1_switch(struct snd_kcontrol *kcontrol,
+                                          struct snd_ctl_elem_value *ucontrol)
+{
+       struct soc_mixer_control *mc = (struct soc_mixer_control *)kcontrol->private_value;
+       struct snd_soc_component *component = snd_soc_dapm_kcontrol_component(kcontrol);
+       struct rt5670_priv *rt5670 = snd_soc_component_get_drvdata(component);
+       int ret;
+
+       if (mc->shift == 0)
+               rt5670->dac1_mixl_dac1_switch = ucontrol->value.integer.value[0];
+       else
+               rt5670->dac1_mixr_dac1_switch = ucontrol->value.integer.value[0];
+
+       /* Apply the update (if any) */
+       ret = snd_soc_dapm_put_volsw(kcontrol, ucontrol);
+       if (ret == 0)
+               return 0;
+
+       rt5670_update_ad_da_mixer_dac1_m_bits(rt5670);
+
+       return 1;
+}
+
+#define SOC_DAPM_SINGLE_RT5670_DAC1_SW(name, shift) \
+       SOC_SINGLE_EXT(name, SND_SOC_NOPM, shift, 1, 0, \
+                      snd_soc_dapm_get_volsw, rt5670_put_dac1_mix_dac1_switch)
+
 static const struct snd_kcontrol_new rt5670_dac_l_mix[] = {
        SOC_DAPM_SINGLE("Stereo ADC Switch", RT5670_AD_DA_MIXER,
                        RT5670_M_ADCMIX_L_SFT, 1, 1),
-       SOC_DAPM_SINGLE("DAC1 Switch", RT5670_AD_DA_MIXER,
-                       RT5670_M_DAC1_L_SFT, 1, 1),
+       SOC_DAPM_SINGLE_RT5670_DAC1_SW("DAC1 Switch", 0),
 };
 
 static const struct snd_kcontrol_new rt5670_dac_r_mix[] = {
        SOC_DAPM_SINGLE("Stereo ADC Switch", RT5670_AD_DA_MIXER,
                        RT5670_M_ADCMIX_R_SFT, 1, 1),
-       SOC_DAPM_SINGLE("DAC1 Switch", RT5670_AD_DA_MIXER,
-                       RT5670_M_DAC1_R_SFT, 1, 1),
+       SOC_DAPM_SINGLE_RT5670_DAC1_SW("DAC1 Switch", 1),
 };
 
 static const struct snd_kcontrol_new rt5670_sto_dac_l_mix[] = {
@@ -1656,12 +1730,10 @@ static const struct snd_soc_dapm_widget rt5670_dapm_widgets[] = {
                            RT5670_PWR_ADC_S1F_BIT, 0, NULL, 0),
        SND_SOC_DAPM_SUPPLY("ADC Stereo2 Filter", RT5670_PWR_DIG2,
                            RT5670_PWR_ADC_S2F_BIT, 0, NULL, 0),
-       SND_SOC_DAPM_MIXER("Sto1 ADC MIXL", RT5670_STO1_ADC_DIG_VOL,
-                          RT5670_L_MUTE_SFT, 1, rt5670_sto1_adc_l_mix,
-                          ARRAY_SIZE(rt5670_sto1_adc_l_mix)),
-       SND_SOC_DAPM_MIXER("Sto1 ADC MIXR", RT5670_STO1_ADC_DIG_VOL,
-                          RT5670_R_MUTE_SFT, 1, rt5670_sto1_adc_r_mix,
-                          ARRAY_SIZE(rt5670_sto1_adc_r_mix)),
+       SND_SOC_DAPM_MIXER("Sto1 ADC MIXL", SND_SOC_NOPM, 0, 0,
+                          rt5670_sto1_adc_l_mix, ARRAY_SIZE(rt5670_sto1_adc_l_mix)),
+       SND_SOC_DAPM_MIXER("Sto1 ADC MIXR", SND_SOC_NOPM, 0, 0,
+                          rt5670_sto1_adc_r_mix, ARRAY_SIZE(rt5670_sto1_adc_r_mix)),
        SND_SOC_DAPM_MIXER("Sto2 ADC MIXL", SND_SOC_NOPM, 0, 0,
                           rt5670_sto2_adc_l_mix,
                           ARRAY_SIZE(rt5670_sto2_adc_l_mix)),
@@ -2999,6 +3071,16 @@ static int rt5670_i2c_probe(struct i2c_client *i2c,
                dev_info(&i2c->dev, "quirk JD mode 3\n");
        }
 
+       /*
+        * Enable the emulated "DAC1 Playback Switch" by default to avoid
+        * muting the output with older UCM profiles.
+        */
+       rt5670->dac1_playback_switch_l = true;
+       rt5670->dac1_playback_switch_r = true;
+       /* The Power-On-Reset values for the DAC1 mixer have the DAC1 input enabled. */
+       rt5670->dac1_mixl_dac1_switch = true;
+       rt5670->dac1_mixr_dac1_switch = true;
+
        rt5670->regmap = devm_regmap_init_i2c(i2c, &rt5670_regmap);
        if (IS_ERR(rt5670->regmap)) {
                ret = PTR_ERR(rt5670->regmap);
index 56b13fe..6fb3c36 100644 (file)
 /* global definition */
 #define RT5670_L_MUTE                          (0x1 << 15)
 #define RT5670_L_MUTE_SFT                      15
-#define RT5670_VOL_L_MUTE                      (0x1 << 14)
-#define RT5670_VOL_L_SFT                       14
 #define RT5670_R_MUTE                          (0x1 << 7)
 #define RT5670_R_MUTE_SFT                      7
-#define RT5670_VOL_R_MUTE                      (0x1 << 6)
-#define RT5670_VOL_R_SFT                       6
 #define RT5670_L_VOL_MASK                      (0x3f << 8)
 #define RT5670_L_VOL_SFT                       8
 #define RT5670_R_VOL_MASK                      (0x3f)
@@ -2017,6 +2013,11 @@ struct rt5670_priv {
        int dsp_rate;
        int jack_type;
        int jack_type_saved;
+
+       bool dac1_mixl_dac1_switch;
+       bool dac1_mixr_dac1_switch;
+       bool dac1_playback_switch_l;
+       bool dac1_playback_switch_r;
 };
 
 void rt5670_jack_suspend(struct snd_soc_component *component);
index 85f7441..047f4e6 100644 (file)
@@ -895,6 +895,13 @@ static int rt711_probe(struct snd_soc_component *component)
        return 0;
 }
 
+static void rt711_remove(struct snd_soc_component *component)
+{
+       struct rt711_priv *rt711 = snd_soc_component_get_drvdata(component);
+
+       regcache_cache_only(rt711->regmap, true);
+}
+
 static const struct snd_soc_component_driver soc_codec_dev_rt711 = {
        .probe = rt711_probe,
        .set_bias_level = rt711_set_bias_level,
@@ -905,6 +912,7 @@ static const struct snd_soc_component_driver soc_codec_dev_rt711 = {
        .dapm_routes = rt711_audio_map,
        .num_dapm_routes = ARRAY_SIZE(rt711_audio_map),
        .set_jack = rt711_set_jack_detect,
+       .remove = rt711_remove,
 };
 
 static int rt711_set_sdw_stream(struct snd_soc_dai *dai, void *sdw_stream,
index 73551e3..6d9bb25 100644 (file)
@@ -71,7 +71,7 @@ static const struct reg_default sgtl5000_reg_defaults[] = {
        { SGTL5000_DAP_EQ_BASS_BAND4,           0x002f },
        { SGTL5000_DAP_MAIN_CHAN,               0x8000 },
        { SGTL5000_DAP_MIX_CHAN,                0x0000 },
-       { SGTL5000_DAP_AVC_CTRL,                0x0510 },
+       { SGTL5000_DAP_AVC_CTRL,                0x5100 },
        { SGTL5000_DAP_AVC_THRESHOLD,           0x1473 },
        { SGTL5000_DAP_AVC_ATTACK,              0x0028 },
        { SGTL5000_DAP_AVC_DECAY,               0x0050 },
diff --git a/sound/soc/codecs/sirf-audio-codec.h b/sound/soc/codecs/sirf-audio-codec.h
deleted file mode 100644 (file)
index a7fe268..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * SiRF inner codec controllers define
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#ifndef _SIRF_AUDIO_CODEC_H
-#define _SIRF_AUDIO_CODEC_H
-
-
-#define AUDIO_IC_CODEC_PWR                     (0x00E0)
-#define AUDIO_IC_CODEC_CTRL0                   (0x00E4)
-#define AUDIO_IC_CODEC_CTRL1                   (0x00E8)
-#define AUDIO_IC_CODEC_CTRL2                   (0x00EC)
-#define AUDIO_IC_CODEC_CTRL3                   (0x00F0)
-
-#define MICBIASEN              (1 << 3)
-
-#define IC_RDACEN              (1 << 0)
-#define IC_LDACEN              (1 << 1)
-#define IC_HSREN               (1 << 2)
-#define IC_HSLEN               (1 << 3)
-#define IC_SPEN                        (1 << 4)
-#define IC_CPEN                        (1 << 5)
-
-#define IC_HPRSELR             (1 << 6)
-#define IC_HPLSELR             (1 << 7)
-#define IC_HPRSELL             (1 << 8)
-#define IC_HPLSELL             (1 << 9)
-#define IC_SPSELR              (1 << 10)
-#define IC_SPSELL              (1 << 11)
-
-#define IC_MONOR               (1 << 12)
-#define IC_MONOL               (1 << 13)
-
-#define IC_RXOSRSEL            (1 << 28)
-#define IC_CPFREQ              (1 << 29)
-#define IC_HSINVEN             (1 << 30)
-
-#define IC_MICINREN            (1 << 0)
-#define IC_MICINLEN            (1 << 1)
-#define IC_MICIN1SEL           (1 << 2)
-#define IC_MICIN2SEL           (1 << 3)
-#define IC_MICDIFSEL           (1 << 4)
-#define        IC_LINEIN1SEL           (1 << 5)
-#define        IC_LINEIN2SEL           (1 << 6)
-#define        IC_RADCEN               (1 << 7)
-#define        IC_LADCEN               (1 << 8)
-#define        IC_ALM                  (1 << 9)
-
-#define IC_DIGMICEN             (1 << 22)
-#define IC_DIGMICFREQ           (1 << 23)
-#define IC_ADC14B_12            (1 << 24)
-#define IC_FIRDAC_HSL_EN        (1 << 25)
-#define IC_FIRDAC_HSR_EN        (1 << 26)
-#define IC_FIRDAC_LOUT_EN       (1 << 27)
-#define IC_POR                  (1 << 28)
-#define IC_CODEC_CLK_EN         (1 << 29)
-#define IC_HP_3DB_BOOST         (1 << 30)
-
-#define IC_ADC_LEFT_GAIN_SHIFT 16
-#define IC_ADC_RIGHT_GAIN_SHIFT 10
-#define IC_ADC_GAIN_MASK       0x3F
-#define IC_MIC_MAX_GAIN                0x39
-
-#define IC_RXPGAR_MASK         0x3F
-#define IC_RXPGAR_SHIFT                14
-#define IC_RXPGAL_MASK         0x3F
-#define IC_RXPGAL_SHIFT                21
-#define IC_RXPGAR              0x7B
-#define IC_RXPGAL              0x7B
-
-#define AUDIO_PORT_TX_FIFO_LEVEL_CHECK_MASK     0x3F
-#define AUDIO_PORT_TX_FIFO_SC_OFFSET    0
-#define AUDIO_PORT_TX_FIFO_LC_OFFSET    10
-#define AUDIO_PORT_TX_FIFO_HC_OFFSET    20
-
-#define TX_FIFO_SC(x)           (((x) & AUDIO_PORT_TX_FIFO_LEVEL_CHECK_MASK) \
-                               << AUDIO_PORT_TX_FIFO_SC_OFFSET)
-#define TX_FIFO_LC(x)           (((x) & AUDIO_PORT_TX_FIFO_LEVEL_CHECK_MASK) \
-                               << AUDIO_PORT_TX_FIFO_LC_OFFSET)
-#define TX_FIFO_HC(x)           (((x) & AUDIO_PORT_TX_FIFO_LEVEL_CHECK_MASK) \
-                               << AUDIO_PORT_TX_FIFO_HC_OFFSET)
-
-#define AUDIO_PORT_RX_FIFO_LEVEL_CHECK_MASK     0x0F
-#define AUDIO_PORT_RX_FIFO_SC_OFFSET    0
-#define AUDIO_PORT_RX_FIFO_LC_OFFSET    10
-#define AUDIO_PORT_RX_FIFO_HC_OFFSET    20
-
-#define RX_FIFO_SC(x)           (((x) & AUDIO_PORT_RX_FIFO_LEVEL_CHECK_MASK) \
-                               << AUDIO_PORT_RX_FIFO_SC_OFFSET)
-#define RX_FIFO_LC(x)           (((x) & AUDIO_PORT_RX_FIFO_LEVEL_CHECK_MASK) \
-                               << AUDIO_PORT_RX_FIFO_LC_OFFSET)
-#define RX_FIFO_HC(x)           (((x) & AUDIO_PORT_RX_FIFO_LEVEL_CHECK_MASK) \
-                               << AUDIO_PORT_RX_FIFO_HC_OFFSET)
-#define AUDIO_PORT_IC_CODEC_TX_CTRL            (0x00F4)
-#define AUDIO_PORT_IC_CODEC_RX_CTRL            (0x00F8)
-
-#define AUDIO_PORT_IC_TXFIFO_OP                        (0x00FC)
-#define AUDIO_PORT_IC_TXFIFO_LEV_CHK           (0x0100)
-#define AUDIO_PORT_IC_TXFIFO_STS               (0x0104)
-#define AUDIO_PORT_IC_TXFIFO_INT               (0x0108)
-#define AUDIO_PORT_IC_TXFIFO_INT_MSK           (0x010C)
-
-#define AUDIO_PORT_IC_RXFIFO_OP                        (0x0110)
-#define AUDIO_PORT_IC_RXFIFO_LEV_CHK           (0x0114)
-#define AUDIO_PORT_IC_RXFIFO_STS               (0x0118)
-#define AUDIO_PORT_IC_RXFIFO_INT               (0x011C)
-#define AUDIO_PORT_IC_RXFIFO_INT_MSK           (0x0120)
-
-#define AUDIO_FIFO_START               (1 << 0)
-#define AUDIO_FIFO_RESET               (1 << 1)
-
-#define AUDIO_FIFO_FULL                        (1 << 0)
-#define AUDIO_FIFO_EMPTY               (1 << 1)
-#define AUDIO_FIFO_OFLOW               (1 << 2)
-#define AUDIO_FIFO_UFLOW               (1 << 3)
-
-#define IC_TX_ENABLE           (0x03)
-#define IC_RX_ENABLE_MONO      (0x01)
-#define IC_RX_ENABLE_STEREO    (0x03)
-
-#endif /*__SIRF_AUDIO_CODEC_H*/
index 40f682f..d18ae5e 100644 (file)
@@ -1873,6 +1873,12 @@ static int wcd934x_set_channel_map(struct snd_soc_dai *dai,
 
        wcd = snd_soc_component_get_drvdata(dai->component);
 
+       if (tx_num > WCD934X_TX_MAX || rx_num > WCD934X_RX_MAX) {
+               dev_err(wcd->dev, "Invalid tx %d or rx %d channel count\n",
+                       tx_num, rx_num);
+               return -EINVAL;
+       }
+
        if (!tx_slot || !rx_slot) {
                dev_err(wcd->dev, "Invalid tx_slot=%p, rx_slot=%p\n",
                        tx_slot, rx_slot);
index 5781174..ad8af3f 100644 (file)
@@ -878,6 +878,7 @@ static int fsl_ssi_hw_free(struct snd_pcm_substream *substream,
 static int _fsl_ssi_set_dai_fmt(struct fsl_ssi *ssi, unsigned int fmt)
 {
        u32 strcr = 0, scr = 0, stcr, srcr, mask;
+       unsigned int slots;
 
        ssi->dai_fmt = fmt;
 
@@ -909,10 +910,11 @@ static int _fsl_ssi_set_dai_fmt(struct fsl_ssi *ssi, unsigned int fmt)
                        return -EINVAL;
                }
 
+               slots = ssi->slots ? : 2;
                regmap_update_bits(ssi->regs, REG_SSI_STCCR,
-                                  SSI_SxCCR_DC_MASK, SSI_SxCCR_DC(2));
+                                  SSI_SxCCR_DC_MASK, SSI_SxCCR_DC(slots));
                regmap_update_bits(ssi->regs, REG_SSI_SRCCR,
-                                  SSI_SxCCR_DC_MASK, SSI_SxCCR_DC(2));
+                                  SSI_SxCCR_DC_MASK, SSI_SxCCR_DC(slots));
 
                /* Data on rising edge of bclk, frame low, 1clk before data */
                strcr |= SSI_STCR_TFSI | SSI_STCR_TSCKP | SSI_STCR_TEFS;
index ab31045..6cada4c 100644 (file)
@@ -172,15 +172,16 @@ int asoc_simple_parse_clk(struct device *dev,
         *  or device's module clock.
         */
        clk = devm_get_clk_from_child(dev, node, NULL);
-       if (IS_ERR(clk))
-               clk = devm_get_clk_from_child(dev, dlc->of_node, NULL);
-
        if (!IS_ERR(clk)) {
-               simple_dai->clk = clk;
                simple_dai->sysclk = clk_get_rate(clk);
-       } else if (!of_property_read_u32(node, "system-clock-frequency",
-                                        &val)) {
+
+               simple_dai->clk = clk;
+       } else if (!of_property_read_u32(node, "system-clock-frequency", &val)) {
                simple_dai->sysclk = val;
+       } else {
+               clk = devm_get_clk_from_child(dev, dlc->of_node, NULL);
+               if (!IS_ERR(clk))
+                       simple_dai->sysclk = clk_get_rate(clk);
        }
 
        if (of_property_read_bool(node, "system-clock-direction-out"))
index 782f2b4..5d48cc3 100644 (file)
@@ -581,7 +581,7 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = {
                },
                .driver_data = (void *)(BYT_RT5640_DMIC1_MAP |
                                        BYT_RT5640_JD_SRC_JD1_IN4P |
-                                       BYT_RT5640_OVCD_TH_1500UA |
+                                       BYT_RT5640_OVCD_TH_2000UA |
                                        BYT_RT5640_OVCD_SF_0P75 |
                                        BYT_RT5640_MCLK_EN),
        },
index f5de1d7..f3bebed 100644 (file)
@@ -555,7 +555,9 @@ static int mtk_dai_tdm_hw_params(struct snd_pcm_substream *substream,
 
        /* set tdm */
        if (tdm_priv->bck_invert)
-               tdm_con |= 1 << BCK_INVERSE_SFT;
+               regmap_update_bits(afe->regmap, AUDIO_TOP_CON3,
+                                  BCK_INVERSE_MASK_SFT,
+                                  0x1 << BCK_INVERSE_SFT);
 
        if (tdm_priv->lck_invert)
                tdm_con |= 1 << LRCK_INVERSE_SFT;
index 562f25c..b9fb80d 100644 (file)
@@ -21,6 +21,11 @@ enum {
 /*****************************************************************************
  *                  R E G I S T E R       D E F I N I T I O N
  *****************************************************************************/
+/* AUDIO_TOP_CON3 */
+#define BCK_INVERSE_SFT                              3
+#define BCK_INVERSE_MASK                             0x1
+#define BCK_INVERSE_MASK_SFT                         (0x1 << 3)
+
 /* AFE_DAC_CON0 */
 #define VUL12_ON_SFT                                   31
 #define VUL12_ON_MASK                                  0x1
@@ -2079,9 +2084,6 @@ enum {
 #define TDM_EN_SFT                                     0
 #define TDM_EN_MASK                                    0x1
 #define TDM_EN_MASK_SFT                                (0x1 << 0)
-#define BCK_INVERSE_SFT                                1
-#define BCK_INVERSE_MASK                               0x1
-#define BCK_INVERSE_MASK_SFT                           (0x1 << 1)
 #define LRCK_INVERSE_SFT                               2
 #define LRCK_INVERSE_MASK                              0x1
 #define LRCK_INVERSE_MASK_SFT                          (0x1 << 2)
index c642e5f..be360a4 100644 (file)
@@ -739,7 +739,7 @@ static void of_lpass_cpu_parse_dai_data(struct device *dev,
 
        for_each_child_of_node(dev->of_node, node) {
                ret = of_property_read_u32(node, "reg", &id);
-               if (ret || id < 0 || id >= data->variant->num_dai) {
+               if (ret || id < 0) {
                        dev_err(dev, "valid dai id not found: %d\n", ret);
                        continue;
                }
index 6c2760e..153e9b2 100644 (file)
 #define SPK_TDM_RX_MASK         0x03
 #define NUM_TDM_SLOTS           8
 #define SLIM_MAX_TX_PORTS 16
-#define SLIM_MAX_RX_PORTS 16
+#define SLIM_MAX_RX_PORTS 13
 #define WCD934X_DEFAULT_MCLK_RATE      9600000
 
 struct sdm845_snd_data {
        struct snd_soc_jack jack;
        bool jack_setup;
-       bool stream_prepared[SLIM_MAX_RX_PORTS];
+       bool stream_prepared[AFE_PORT_MAX];
        struct snd_soc_card *card;
        uint32_t pri_mi2s_clk_count;
        uint32_t sec_mi2s_clk_count;
        uint32_t quat_tdm_clk_count;
-       struct sdw_stream_runtime *sruntime[SLIM_MAX_RX_PORTS];
+       struct sdw_stream_runtime *sruntime[AFE_PORT_MAX];
 };
 
 static unsigned int tdm_slot_offset[8] = {0, 4, 8, 12, 16, 20, 24, 28};
index f6d4e99..0cffc95 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/of.h>
 #include <linux/of_graph.h>
 #include <linux/dmi.h>
+#include <linux/acpi.h>
 #include <sound/core.h>
 #include <sound/pcm.h>
 #include <sound/pcm_params.h>
@@ -1573,6 +1574,9 @@ int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour)
        if (card->long_name)
                return 0; /* long name already set by driver or from DMI */
 
+       if (!is_acpi_device_node(card->dev->fwnode))
+               return 0;
+
        /* make up dmi long name as: vendor-product-version-board */
        vendor = dmi_get_system_info(DMI_BOARD_VENDOR);
        if (!vendor || !is_dmi_valid(vendor)) {
index 5788fe3..c3b757c 100644 (file)
@@ -207,7 +207,7 @@ int hda_dsp_core_power_down(struct snd_sof_dev *sdev, unsigned int core_mask)
 
        ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR,
                                HDA_DSP_REG_ADSPCS, adspcs,
-                               !(adspcs & HDA_DSP_ADSPCS_SPA_MASK(core_mask)),
+                               !(adspcs & HDA_DSP_ADSPCS_CPA_MASK(core_mask)),
                                HDA_DSP_REG_POLL_INTERVAL_US,
                                HDA_DSP_PD_TIMEOUT * USEC_PER_MSEC);
        if (ret < 0)
index 1d29b1f..0c096db 100644 (file)
@@ -897,6 +897,7 @@ free_streams:
 /* dsp_unmap: not currently used */
        iounmap(sdev->bar[HDA_DSP_BAR]);
 hdac_bus_unmap:
+       platform_device_unregister(hdev->dmic_dev);
        iounmap(bus->remap_addr);
        hda_codec_i915_exit(sdev);
 err:
index 9d0da5f..d24ae00 100644 (file)
@@ -62,7 +62,6 @@ MODULE_PARM_DESC(enable, "Enable Sun AMD7930 soundcard.");
 MODULE_AUTHOR("Thomas K. Dyas and David S. Miller");
 MODULE_DESCRIPTION("Sun AMD7930");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sun,AMD7930}}");
 
 /* Device register layout.  */
 
index 0eed5f7..35c1780 100644 (file)
@@ -52,7 +52,6 @@ MODULE_PARM_DESC(enable, "Enable Sun CS4231 soundcard.");
 MODULE_AUTHOR("Jaroslav Kysela, Derrick J. Brashear and David S. Miller");
 MODULE_DESCRIPTION("Sun CS4231");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sun,CS4231}}");
 
 #ifdef SBUS_SUPPORT
 struct sbus_dma_info {
index 5a6fb66..b055f58 100644 (file)
@@ -76,7 +76,6 @@
 MODULE_AUTHOR("Rudolf Koenig, Brent Baccala and Martin Habets");
 MODULE_DESCRIPTION("Sun DBRI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sun,DBRI}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index 08c6e6a..33e9621 100644 (file)
@@ -26,7 +26,6 @@
 MODULE_AUTHOR("Torsten Schenk <torsten.schenk@zoho.com>");
 MODULE_DESCRIPTION("TerraTec DMX 6Fire USB audio driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{TerraTec,DMX 6Fire USB}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for card */
index e03481c..49f63f8 100644 (file)
 MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
 MODULE_DESCRIPTION("caiaq USB audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Native Instruments,RigKontrol2},"
-                        "{Native Instruments,RigKontrol3},"
-                        "{Native Instruments,Kore Controller},"
-                        "{Native Instruments,Kore Controller 2},"
-                        "{Native Instruments,Audio Kontrol 1},"
-                        "{Native Instruments,Audio 2 DJ},"
-                        "{Native Instruments,Audio 4 DJ},"
-                        "{Native Instruments,Audio 8 DJ},"
-                        "{Native Instruments,Traktor Audio 2},"
-                        "{Native Instruments,Session I/O},"
-                        "{Native Instruments,GuitarRig mobile},"
-                        "{Native Instruments,Traktor Kontrol X1},"
-                        "{Native Instruments,Traktor Kontrol S4},"
-                        "{Native Instruments,Maschine Controller}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char* id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for this card */
index b6f4c08..0826a43 100644 (file)
@@ -58,8 +58,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("USB Audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Generic,USB Audio}}");
-
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;      /* ID for this card */
index c282418..95385e9 100644 (file)
@@ -21,23 +21,6 @@ MODULE_AUTHOR("Michael Trimarchi <michael@amarulasolutions.com>");
 MODULE_AUTHOR("Antonio Ospite <ao2@amarulasolutions.com>");
 MODULE_DESCRIPTION("M2Tech hiFace USB-SPDIF audio driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{M2Tech,Young},"
-                        "{M2Tech,hiFace},"
-                        "{M2Tech,North Star},"
-                        "{M2Tech,W4S Young},"
-                        "{M2Tech,Corrson},"
-                        "{M2Tech,AUDIA},"
-                        "{M2Tech,SL Audio},"
-                        "{M2Tech,Empirical},"
-                        "{M2Tech,Rockna},"
-                        "{M2Tech,Pathos},"
-                        "{M2Tech,Metronome},"
-                        "{M2Tech,CAD},"
-                        "{M2Tech,Audio Esclusive},"
-                        "{M2Tech,Rotel},"
-                        "{M2Tech,Eeaudio},"
-                        "{The Chord Company,CHORD},"
-                        "{AVA Group A/S,Vitus}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for card */
index 6b30155..5834d1d 100644 (file)
@@ -19,7 +19,6 @@
 MODULE_DESCRIPTION("Edirol UA-101/1000 driver");
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Edirol,UA-101},{Edirol,UA-1000}}");
 
 /*
  * Should not be lower than the minimum scheduling delay of the host
index 08873d2..ffd9223 100644 (file)
@@ -2883,7 +2883,7 @@ static int snd_djm_controls_put(struct snd_kcontrol *kctl, struct snd_ctl_elem_v
        u8 group = (private_value & SND_DJM_GROUP_MASK) >> SND_DJM_GROUP_SHIFT;
        u16 value = elem->value.enumerated.item[0];
 
-       kctl->private_value = ((device << SND_DJM_DEVICE_SHIFT) |
+       kctl->private_value = (((unsigned long)device << SND_DJM_DEVICE_SHIFT) |
                              (group << SND_DJM_GROUP_SHIFT) |
                              value);
 
@@ -2921,7 +2921,7 @@ static int snd_djm_controls_create(struct usb_mixer_interface *mixer,
                value = device->controls[i].default_value;
                knew.name = device->controls[i].name;
                knew.private_value = (
-                       (device_idx << SND_DJM_DEVICE_SHIFT) |
+                       ((unsigned long)device_idx << SND_DJM_DEVICE_SHIFT) |
                        (i << SND_DJM_GROUP_SHIFT) |
                        value);
                err = snd_djm_controls_update(mixer, device_idx, i, value);
index d3001fb..176437a 100644 (file)
@@ -1521,6 +1521,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
        case USB_ID(0x21b4, 0x0081): /* AudioQuest DragonFly */
        case USB_ID(0x2912, 0x30c8): /* Audioengine D1 */
        case USB_ID(0x413c, 0xa506): /* Dell AE515 sound bar */
+       case USB_ID(0x046d, 0x084c): /* Logitech ConferenceCam Connect */
                return true;
        }
 
index c541581..3cd28d2 100644 (file)
 MODULE_AUTHOR("Karsten Wiese <annabellesgarden@yahoo.de>");
 MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.2");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{TASCAM(0x1604),"NAME_ALLCAPS"(0x8001)(0x8005)(0x8007)}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char* id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for this card */
index 1d66c3a..33b12aa 100644 (file)
@@ -1887,4 +1887,3 @@ MODULE_AUTHOR("Vaibhav Agarwal <vaibhav.agarwal@intel.com>");
 MODULE_AUTHOR("Jerome Anand <jerome.anand@intel.com>");
 MODULE_DESCRIPTION("Intel HDMI Audio driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{Intel,Intel_HAD}");
index 228d820..2cb0a19 100644 (file)
@@ -391,4 +391,3 @@ module_exit(xen_drv_fini);
 MODULE_DESCRIPTION("Xen virtual sound device frontend");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("xen:" XENSND_DRIVER_NAME);
-MODULE_SUPPORTED_DEVICE("{{ALSA,Virtual soundcard}}");
index e4732d3..4f3d5aa 100644 (file)
 #define HUGETLB_FLAG_ENCODE_SHIFT      26
 #define HUGETLB_FLAG_ENCODE_MASK       0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB       (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB       (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB      (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB                (20 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB                (21 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB                (23 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB       (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB       (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB      (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB      (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB                (30 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB                (31 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB       (34 << HUGETLB_FLAG_ENCODE_SHIFT)
index 8b281f7..f6afee2 100644 (file)
@@ -1154,6 +1154,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR       (1 << 0)
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL     (1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO         (1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE            (1 << 3)
 
 struct kvm_xen_hvm_config {
        __u32 flags;
@@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr {
        union {
                __u64 gpa;
                __u64 pad[8];
+               struct {
+                       __u64 state;
+                       __u64 state_entry_time;
+                       __u64 time_running;
+                       __u64 time_runnable;
+                       __u64 time_blocked;
+                       __u64 time_offline;
+               } runstate;
        } u;
 };
 
 /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO       0x0
 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO  0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR   0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT        0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA   0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
index 71aabaf..8f13b84 100644 (file)
@@ -9,6 +9,7 @@ Type=simple
 ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv
 ExecReload=/bin/kill -HUP $MAINPID
 Restart=always
+RestartSec=60s
 SyslogIdentifier=kvm_stat
 SyslogLevel=debug
 
index 887a494..e9eb6a6 100644 (file)
@@ -215,7 +215,7 @@ define do_install
        if [ ! -d '$(DESTDIR_SQ)$2' ]; then             \
                $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
        fi;                                             \
-       $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2'
+       $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2'
 endef
 
 install_lib: all_cmd
index 2f9d685..0911aea 100644 (file)
@@ -462,7 +462,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
                return err;
 
        case BTF_KIND_ARRAY:
-               return btf_dump_order_type(d, btf_array(t)->type, through_ptr);
+               return btf_dump_order_type(d, btf_array(t)->type, false);
 
        case BTF_KIND_STRUCT:
        case BTF_KIND_UNION: {
index d43cc3f..4181d17 100644 (file)
@@ -1181,7 +1181,8 @@ static int bpf_object__elf_init(struct bpf_object *obj)
        if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) {
                pr_warn("elf: failed to get section names strings from %s: %s\n",
                        obj->path, elf_errmsg(-1));
-               return -LIBBPF_ERRNO__FORMAT;
+               err = -LIBBPF_ERRNO__FORMAT;
+               goto errout;
        }
 
        /* Old LLVM set e_machine to EM_NONE */
index 4dd73de..d2cb28e 100644 (file)
@@ -40,7 +40,7 @@ static int libbpf_netlink_open(__u32 *nl_pid)
        memset(&sa, 0, sizeof(sa));
        sa.nl_family = AF_NETLINK;
 
-       sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+       sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
        if (sock < 0)
                return -errno;
 
index ace8772..7c4a9d4 100644 (file)
@@ -402,35 +402,42 @@ static pid_t handle_signalfd(struct daemon *daemon)
        int status;
        pid_t pid;
 
+       /*
+        * Treat signal fd data as a pure notification and check the state
+        * of every session. The reason is that multiple signals can get
+        * coalesced in the kernel, so we may receive only a single signal
+        * even if multiple SIGCHLDs were generated.
+        */
        err = read(daemon->signal_fd, &si, sizeof(struct signalfd_siginfo));
-       if (err != sizeof(struct signalfd_siginfo))
+       if (err != sizeof(struct signalfd_siginfo)) {
+               pr_err("failed to read signal fd\n");
                return -1;
+       }
 
        list_for_each_entry(session, &daemon->sessions, list) {
+               if (session->pid == -1)
+                       continue;
 
-               if (session->pid != (int) si.ssi_pid)
+               pid = waitpid(session->pid, &status, WNOHANG);
+               if (pid <= 0)
                        continue;
 
-               pid = waitpid(session->pid, &status, 0);
-               if (pid == session->pid) {
-                       if (WIFEXITED(status)) {
-                               pr_info("session '%s' exited, status=%d\n",
-                                       session->name, WEXITSTATUS(status));
-                       } else if (WIFSIGNALED(status)) {
-                               pr_info("session '%s' killed (signal %d)\n",
-                                       session->name, WTERMSIG(status));
-                       } else if (WIFSTOPPED(status)) {
-                               pr_info("session '%s' stopped (signal %d)\n",
-                                       session->name, WSTOPSIG(status));
-                       } else {
-                               pr_info("session '%s' Unexpected status (0x%x)\n",
-                                       session->name, status);
-                       }
+               if (WIFEXITED(status)) {
+                       pr_info("session '%s' exited, status=%d\n",
+                               session->name, WEXITSTATUS(status));
+               } else if (WIFSIGNALED(status)) {
+                       pr_info("session '%s' killed (signal %d)\n",
+                               session->name, WTERMSIG(status));
+               } else if (WIFSTOPPED(status)) {
+                       pr_info("session '%s' stopped (signal %d)\n",
+                               session->name, WSTOPSIG(status));
+               } else {
+                       pr_info("session '%s' Unexpected status (0x%x)\n",
+                               session->name, status);
                }
 
                session->state = KILL;
                session->pid = -1;
-               return pid;
        }
 
        return 0;
@@ -443,7 +450,6 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d
                .fd     = daemon->signal_fd,
                .events = POLLIN,
        };
-       pid_t wpid = 0, pid = session->pid;
        time_t start;
 
        start = time(NULL);
@@ -452,7 +458,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d
                int err = poll(&pollfd, 1, 1000);
 
                if (err > 0) {
-                       wpid = handle_signalfd(daemon);
+                       handle_signalfd(daemon);
                } else if (err < 0) {
                        perror("failed: poll\n");
                        return -1;
@@ -460,7 +466,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d
 
                if (start + secs < time(NULL))
                        return -1;
-       } while (wpid != pid);
+       } while (session->pid != -1);
 
        return 0;
 }
@@ -902,7 +908,9 @@ static void daemon_session__kill(struct daemon_session *session,
                        daemon_session__signal(session, SIGKILL);
                        break;
                default:
-                       break;
+                       pr_err("failed to wait for session %s\n",
+                              session->name);
+                       return;
                }
                how++;
 
@@ -955,7 +963,8 @@ static void daemon__kill(struct daemon *daemon)
                        daemon__signal(daemon, SIGKILL);
                        break;
                default:
-                       break;
+                       pr_err("failed to wait for sessions\n");
+                       return;
                }
                how++;
 
@@ -1344,7 +1353,7 @@ out:
                close(sock_fd);
        if (conf_fd != -1)
                close(conf_fd);
-       if (conf_fd != -1)
+       if (signal_fd != -1)
                close(signal_fd);
 
        pr_info("daemon exited\n");
index f57e075..c72adbd 100644 (file)
@@ -86,7 +86,7 @@ static struct {
                .msg_load_fail    = "check your vmlinux setting?",
                .target_func      = &epoll_pwait_loop,
                .expect_result    = (NR_ITERS + 1) / 2,
-               .pin              = true,
+               .pin              = true,
        },
 #ifdef HAVE_BPF_PROLOGUE
        {
@@ -99,13 +99,6 @@ static struct {
                .expect_result    = (NR_ITERS + 1) / 4,
        },
 #endif
-       {
-               .prog_id          = LLVM_TESTCASE_BPF_RELOCATION,
-               .desc             = "BPF relocation checker",
-               .name             = "[bpf_relocation_test]",
-               .msg_compile_fail = "fix 'perf test LLVM' first",
-               .msg_load_fail    = "libbpf error when dealing with relocation",
-       },
 };
 
 static int do_test(struct bpf_object *obj, int (*func)(void),
index 5ad3ca8..5898438 100755 (executable)
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # daemon operations
 # SPDX-License-Identifier: GPL-2.0
 
index 953f4af..5b6ccb9 100644 (file)
@@ -298,10 +298,6 @@ static int auxtrace_queues__queue_buffer(struct auxtrace_queues *queues,
                queue->set = true;
                queue->tid = buffer->tid;
                queue->cpu = buffer->cpu;
-       } else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) {
-               pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n",
-                      queue->cpu, queue->tid, buffer->cpu, buffer->tid);
-               return -EINVAL;
        }
 
        buffer->buffer_nr = queues->next_buffer_nr++;
index 57d58c8..cdecda1 100644 (file)
@@ -196,25 +196,32 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session,
        }
 
        if (info_linear->info_len < offsetof(struct bpf_prog_info, prog_tags)) {
+               free(info_linear);
                pr_debug("%s: the kernel is too old, aborting\n", __func__);
                return -2;
        }
 
        info = &info_linear->info;
+       if (!info->jited_ksyms) {
+               free(info_linear);
+               return -1;
+       }
 
        /* number of ksyms, func_lengths, and tags should match */
        sub_prog_cnt = info->nr_jited_ksyms;
        if (sub_prog_cnt != info->nr_prog_tags ||
-           sub_prog_cnt != info->nr_jited_func_lens)
+           sub_prog_cnt != info->nr_jited_func_lens) {
+               free(info_linear);
                return -1;
+       }
 
        /* check BTF func info support */
        if (info->btf_id && info->nr_func_info && info->func_info_rec_size) {
                /* btf func info number should be same as sub_prog_cnt */
                if (sub_prog_cnt != info->nr_func_info) {
                        pr_debug("%s: mismatch in BPF sub program count and BTF function info count, aborting\n", __func__);
-                       err = -1;
-                       goto out;
+                       free(info_linear);
+                       return -1;
                }
                if (btf__get_from_id(info->btf_id, &btf)) {
                        pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id);
index 42c84ad..c0c0fab 100644 (file)
@@ -356,6 +356,9 @@ __add_event(struct list_head *list, int *idx,
        struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) :
                               cpu_list ? perf_cpu_map__new(cpu_list) : NULL;
 
+       if (pmu && attr->type == PERF_TYPE_RAW)
+               perf_pmu__warn_invalid_config(pmu, attr->config, name);
+
        if (init_attr)
                event_attr_init(attr);
 
index 44ef283..46fd0f9 100644 (file)
@@ -1812,3 +1812,36 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu)
 
        return nr_caps;
 }
+
+void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
+                                  char *name)
+{
+       struct perf_pmu_format *format;
+       __u64 masks = 0, bits;
+       char buf[100];
+       unsigned int i;
+
+       list_for_each_entry(format, &pmu->format, list) {
+               if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG)
+                       continue;
+
+               for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS)
+                       masks |= 1ULL << i;
+       }
+
+       /*
+        * Kernel doesn't export any valid format bits.
+        */
+       if (masks == 0)
+               return;
+
+       bits = config & ~masks;
+       if (bits == 0)
+               return;
+
+       bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf));
+
+       pr_warning("WARNING: event '%s' not valid (bits %s of config "
+                  "'%llx' not supported by kernel)!\n",
+                  name ?: "N/A", buf, config);
+}
index 8164388..160b0f5 100644 (file)
@@ -123,4 +123,7 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval);
 
 int perf_pmu__caps_parse(struct perf_pmu *pmu);
 
+void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
+                                  char *name);
+
 #endif /* __PMU_H */
index b698046..dff1781 100644 (file)
@@ -424,7 +424,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
 
        while (!io.eof) {
                static const char anonstr[] = "//anon";
-               size_t size;
+               size_t size, aligned_size;
 
                /* ensure null termination since stack will be reused. */
                event->mmap2.filename[0] = '\0';
@@ -484,11 +484,12 @@ out:
                }
 
                size = strlen(event->mmap2.filename) + 1;
-               size = PERF_ALIGN(size, sizeof(u64));
+               aligned_size = PERF_ALIGN(size, sizeof(u64));
                event->mmap2.len -= event->mmap.start;
                event->mmap2.header.size = (sizeof(event->mmap2) -
-                                       (sizeof(event->mmap2.filename) - size));
-               memset(event->mmap2.filename + size, 0, machine->id_hdr_size);
+                                       (sizeof(event->mmap2.filename) - aligned_size));
+               memset(event->mmap2.filename + size, 0, machine->id_hdr_size +
+                       (aligned_size - size));
                event->mmap2.header.size += machine->id_hdr_size;
                event->mmap2.pid = tgid;
                event->mmap2.tid = pid;
@@ -758,7 +759,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
        for (i = 0; i < n; i++) {
                char *end;
                pid_t _pid;
-               bool kernel_thread;
+               bool kernel_thread = false;
 
                _pid = strtol(dirent[i]->d_name, &end, 10);
                if (*end)
index 3cc91ad..43beb16 100644 (file)
@@ -133,6 +133,8 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s
        if (dso != NULL) {
                __dsos__add(&machine->dsos, dso);
                dso__set_long_name(dso, long_name, false);
+               /* Put dso here because __dsos__add already got it */
+               dso__put(dso);
        }
 
        return dso;
index a7f0603..6908700 100644 (file)
@@ -40,3 +40,5 @@
 # CONFIG_RESET_BRCMSTB_RESCAL is not set
 # CONFIG_RESET_INTEL_GW is not set
 # CONFIG_ADI_AXI_ADC is not set
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_PAGE_POISONING is not set
index 0b550cb..1e2683d 100644 (file)
@@ -13,7 +13,7 @@ from typing import List, Set
 CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$'
 CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$'
 
-KconfigEntryBase = collections.namedtuple('KconfigEntry', ['name', 'value'])
+KconfigEntryBase = collections.namedtuple('KconfigEntryBase', ['name', 'value'])
 
 class KconfigEntry(KconfigEntryBase):
 
index 3b796dd..ca24f68 100644 (file)
@@ -296,21 +296,34 @@ static void *idr_throbber(void *arg)
        return NULL;
 }
 
+/*
+ * There are always either 1 or 2 objects in the IDR.  If we find nothing,
+ * or we find something at an ID we didn't expect, that's a bug.
+ */
 void idr_find_test_1(int anchor_id, int throbber_id)
 {
        pthread_t throbber;
        time_t start = time(NULL);
 
-       pthread_create(&throbber, NULL, idr_throbber, &throbber_id);
-
        BUG_ON(idr_alloc(&find_idr, xa_mk_value(anchor_id), anchor_id,
                                anchor_id + 1, GFP_KERNEL) != anchor_id);
 
+       pthread_create(&throbber, NULL, idr_throbber, &throbber_id);
+
+       rcu_read_lock();
        do {
                int id = 0;
                void *entry = idr_get_next(&find_idr, &id);
-               BUG_ON(entry != xa_mk_value(id));
+               rcu_read_unlock();
+               if ((id != anchor_id && id != throbber_id) ||
+                   entry != xa_mk_value(id)) {
+                       printf("%s(%d, %d): %p at %d\n", __func__, anchor_id,
+                               throbber_id, entry, id);
+                       abort();
+               }
+               rcu_read_lock();
        } while (time(NULL) < start + 11);
+       rcu_read_unlock();
 
        pthread_join(throbber, NULL);
 
@@ -577,6 +590,7 @@ void ida_tests(void)
 
 int __weak main(void)
 {
+       rcu_register_thread();
        radix_tree_init();
        idr_checks();
        ida_tests();
@@ -584,5 +598,6 @@ int __weak main(void)
        rcu_barrier();
        if (nr_allocated)
                printf("nr_allocated = %d\n", nr_allocated);
+       rcu_unregister_thread();
        return 0;
 }
diff --git a/tools/testing/radix-tree/linux/compiler_types.h b/tools/testing/radix-tree/linux/compiler_types.h
deleted file mode 100644 (file)
index e69de29..0000000
index 9eae0fb..e00520c 100644 (file)
@@ -224,7 +224,9 @@ void multiorder_checks(void)
 
 int __weak main(void)
 {
+       rcu_register_thread();
        radix_tree_init();
        multiorder_checks();
+       rcu_unregister_thread();
        return 0;
 }
index e61e43e..f20e12c 100644 (file)
@@ -25,11 +25,13 @@ void xarray_tests(void)
 
 int __weak main(void)
 {
+       rcu_register_thread();
        radix_tree_init();
        xarray_tests();
        radix_tree_cpu_dead(1);
        rcu_barrier();
        if (nr_allocated)
                printf("nr_allocated = %d\n", nr_allocated);
+       rcu_unregister_thread();
        return 0;
 }
index 9210691..e3e08d9 100644 (file)
@@ -284,16 +284,28 @@ endfunction
 // Set up test pattern in the FFR
 // x0: pid
 // x2: generation
+//
+// We need to generate a canonical FFR value, which consists of a number of
+// low "1" bits, followed by a number of zeros. This gives us 17 unique values
+// per 16 bits of FFR, so we create a 4 bit signature out of the PID and
+// generation, and use that as the initial number of ones in the pattern.
+// We fill the upper lanes of FFR with zeros.
 // Beware: corrupts P0.
 function setup_ffr
        mov     x4, x30
 
-       bl      pattern
+       and     w0, w0, #0x3
+       bfi     w0, w2, #2, #2
+       mov     w1, #1
+       lsl     w1, w1, w0
+       sub     w1, w1, #1
+
        ldr     x0, =ffrref
-       ldr     x1, =scratch
-       rdvl    x2, #1
-       lsr     x2, x2, #3
-       bl      memcpy
+       strh    w1, [x0], 2
+       rdvl    x1, #1
+       lsr     x1, x1, #3
+       sub     x1, x1, #2
+       bl      memclr
 
        mov     x0, #0
        ldr     x1, =ffrref
index 36af1c1..b62a393 100644 (file)
@@ -128,6 +128,8 @@ static void test_check_mtu_xdp(__u32 mtu, __u32 ifindex)
        test_check_mtu_run_xdp(skel, skel->progs.xdp_use_helper, mtu);
        test_check_mtu_run_xdp(skel, skel->progs.xdp_exceed_mtu, mtu);
        test_check_mtu_run_xdp(skel, skel->progs.xdp_minus_delta, mtu);
+       test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len, mtu);
+       test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len_exceed, mtu);
 
 cleanup:
        test_check_mtu__destroy(skel);
@@ -187,6 +189,8 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex)
        test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu, mtu);
        test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu_da, mtu);
        test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu);
+       test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu);
+       test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu);
 cleanup:
        test_check_mtu__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c
new file mode 100644 (file)
index 0000000..6c4d42a
--- /dev/null
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <test_progs.h>
+#include <time.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include "fexit_sleep.skel.h"
+
+/*
+ * Child entry point handed to clone(): store our pid in the skeleton's bss,
+ * then issue two raw nanosleep syscalls - 1ns first, 10s second.
+ */
+static int do_sleep(void *skel)
+{
+       struct timespec brief = { .tv_nsec = 1 };
+       struct timespec lengthy = { .tv_sec = 10 };
+       struct fexit_sleep *s = skel;
+
+       s->bss->pid = getpid();
+
+       (void)syscall(__NR_nanosleep, &brief, NULL);
+       (void)syscall(__NR_nanosleep, &lengthy, NULL);
+
+       return 0;
+}
+
+#define STACK_SIZE (1024 * 1024)
+static char child_stack[STACK_SIZE];
+
+/*
+ * Attach the nanosleep fentry/fexit programs, let a cloned child block in its
+ * second nanosleep, then close and detach both programs while the child is
+ * still sleeping.  After the child is killed, fexit_cnt must still be 1, i.e.
+ * the bypassed fexit program must not have run again.
+ */
+void test_fexit_sleep(void)
+{
+       struct fexit_sleep *fexit_skel = NULL;
+       int wstatus, duration = 0;
+       pid_t cpid;
+       int err, fexit_cnt;
+
+       fexit_skel = fexit_sleep__open_and_load();
+       if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n"))
+               goto cleanup;
+
+       err = fexit_sleep__attach(fexit_skel);
+       if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err))
+               goto cleanup;
+
+       cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel);
+       /* strerror() text is passed as an argument, never as the format string */
+       if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno)))
+               goto cleanup;
+
+       /* wait until first sys_nanosleep ends and second sys_nanosleep starts */
+       while (READ_ONCE(fexit_skel->bss->fentry_cnt) != 2);
+       fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt);
+       if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt))
+               goto cleanup;
+
+       /* close progs and detach them. That will trigger two nop5->jmp5 rewrites
+        * in the trampolines to skip nanosleep_fexit prog.
+        * The nanosleep_fentry prog will get detached first.
+        * The nanosleep_fexit prog will get detached second.
+        * Detaching will trigger freeing of both progs JITed images.
+        * There will be two dying bpf_tramp_image-s, but only the initial
+        * bpf_tramp_image (with both _fentry and _fexit progs) will be stuck
+        * waiting for percpu_ref_kill to confirm. The other one
+        * will be freed quickly.
+        */
+       close(bpf_program__fd(fexit_skel->progs.nanosleep_fentry));
+       close(bpf_program__fd(fexit_skel->progs.nanosleep_fexit));
+       fexit_sleep__detach(fexit_skel);
+
+       /* kill the thread to unwind sys_nanosleep stack through the trampoline */
+       kill(cpid, 9);
+
+       if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", "%s\n", strerror(errno)))
+               goto cleanup;
+       if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed"))
+               goto cleanup;
+
+       /* The bypassed nanosleep_fexit prog shouldn't have executed.
+        * Unlike progs the maps were not freed and directly accessible.
+        */
+       fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt);
+       if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt))
+               goto cleanup;
+
+cleanup:
+       fexit_sleep__destroy(fexit_skel);
+}
index 31975c9..3ac0c9a 100644 (file)
@@ -174,6 +174,12 @@ struct struct_in_struct {
        };
 };
 
+struct struct_in_array {};
+
+struct struct_in_array_typed {};
+
+typedef struct struct_in_array_typed struct_in_array_t[2];
+
 struct struct_with_embedded_stuff {
        int a;
        struct {
@@ -203,6 +209,8 @@ struct struct_with_embedded_stuff {
        } r[5];
        struct struct_in_struct s[10];
        int t[11];
+       struct struct_in_array (*u)[2];
+       struct_in_array_t *v;
 };
 
 struct root_struct {
diff --git a/tools/testing/selftests/bpf/progs/fexit_sleep.c b/tools/testing/selftests/bpf/progs/fexit_sleep.c
new file mode 100644 (file)
index 0000000..03a672d
--- /dev/null
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+int pid = 0;
+int fentry_cnt = 0;
+int fexit_cnt = 0;
+
+SEC("fentry/__x64_sys_nanosleep")
+int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs)
+{
+       /* Count only nanosleep entries made by the task whose id matches pid */
+       if ((int)bpf_get_current_pid_tgid() == pid)
+               fentry_cnt++;
+
+       return 0;
+}
+
+SEC("fexit/__x64_sys_nanosleep")
+int BPF_PROG(nanosleep_fexit, const struct pt_regs *regs, int ret)
+{
+       /* Count only nanosleep exits made by the task whose id matches pid */
+       if ((int)bpf_get_current_pid_tgid() == pid)
+               fexit_cnt++;
+
+       return 0;
+}
index b7787b4..c4a9bae 100644 (file)
@@ -105,6 +105,54 @@ int xdp_minus_delta(struct xdp_md *ctx)
        return retval;
 }
 
+SEC("xdp")
+int xdp_input_len(struct xdp_md *ctx)
+{
+       void *pkt_end = (void *)(long)ctx->data_end;
+       void *pkt = (void *)(long)ctx->data;
+       __u32 ifindex = GLOBAL_USER_IFINDEX;
+       int retval = XDP_ABORTED; /* Stays aborted unless the check passes */
+
+       /* The length to check is handed in through mtu_len; the helper still
+        * writes the resulting MTU value back into mtu_len on return.
+        *
+        * The input length is L3 (like MTU and iph->tot_len), whereas the
+        * XDP data length is L2, hence the ETH_HLEN subtraction.
+        */
+       __u32 mtu_len = (__u32)(pkt_end - pkt) - ETH_HLEN;
+
+       if (!bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0))
+               retval = XDP_PASS; /* Expected retval on successful test */
+
+       global_bpf_mtu_xdp = mtu_len;
+       return retval;
+}
+
+SEC("xdp")
+int xdp_input_len_exceed(struct xdp_md *ctx)
+{
+       __u32 ifindex = GLOBAL_USER_IFINDEX;
+
+       /* The length to check is handed in through mtu_len; the helper still
+        * writes the resulting MTU value back into mtu_len on return.
+        *
+        * The input length is an L3 size, like the MTU.
+        */
+       __u32 mtu_len = GLOBAL_USER_MTU;
+       int rc;
+
+       mtu_len++; /* Exceed with 1 */
+
+       rc = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0);
+       global_bpf_mtu_xdp = mtu_len;
+
+       /* Exceeding the MTU is the success case; anything else fails */
+       return rc == BPF_MTU_CHK_RET_FRAG_NEEDED ? XDP_PASS : XDP_ABORTED;
+}
+
 SEC("classifier")
 int tc_use_helper(struct __sk_buff *ctx)
 {
@@ -196,3 +244,47 @@ int tc_minus_delta(struct __sk_buff *ctx)
        global_bpf_mtu_xdp = mtu_len;
        return retval;
 }
+
+SEC("classifier")
+int tc_input_len(struct __sk_buff *ctx)
+{
+       __u32 ifindex = GLOBAL_USER_IFINDEX;
+       int retval = BPF_DROP; /* Stays a drop unless the check passes */
+
+       /* The length to check is handed in through mtu_len; the helper still
+        * writes the resulting MTU value back into mtu_len on return.
+        *
+        * The input length is an L3 size.
+        */
+       __u32 mtu_len = GLOBAL_USER_MTU;
+
+       if (!bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0))
+               retval = BPF_OK; /* Expected retval on successful test */
+
+       global_bpf_mtu_xdp = mtu_len;
+       return retval;
+}
+
+SEC("classifier")
+int tc_input_len_exceed(struct __sk_buff *ctx)
+{
+       __u32 ifindex = GLOBAL_USER_IFINDEX;
+
+       /* The length to check is handed in through mtu_len; the helper still
+        * writes the resulting MTU value back into mtu_len on return.
+        *
+        * The input length is an L3 size, like the MTU.
+        */
+       __u32 mtu_len = GLOBAL_USER_MTU;
+       int rc;
+
+       mtu_len++; /* Exceed with 1 */
+
+       rc = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0);
+       global_bpf_mtu_xdp = mtu_len;
+
+       /* Exceeding the MTU is the success case; anything else fails */
+       return rc == BPF_MTU_CHK_RET_FRAG_NEEDED ? BPF_OK : BPF_DROP;
+}
index 9afe947..ba6eadf 100644 (file)
@@ -508,10 +508,8 @@ int _ip6geneve_get_tunnel(struct __sk_buff *skb)
        }
 
        ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
-       if (ret < 0) {
-               ERROR(ret);
-               return TC_ACT_SHOT;
-       }
+       if (ret < 0)
+               gopt.opt_class = 0;
 
        bpf_trace_printk(fmt, sizeof(fmt),
                        key.tunnel_id, key.remote_ipv4, gopt.opt_class);
index 1fd07a4..c162498 100644 (file)
@@ -6,8 +6,9 @@
                BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types",
        .errstr = "R0 tried to subtract pointer from scalar",
+       .result = REJECT,
 },
 {
        "check deducing bounds from const, 2",
@@ -20,6 +21,8 @@
                BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0),
                BPF_EXIT_INSN(),
        },
+       .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types",
+       .result_unpriv = REJECT,
        .result = ACCEPT,
        .retval = 1,
 },
@@ -31,8 +34,9 @@
                BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types",
        .errstr = "R0 tried to subtract pointer from scalar",
+       .result = REJECT,
 },
 {
        "check deducing bounds from const, 4",
@@ -45,6 +49,8 @@
                BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0),
                BPF_EXIT_INSN(),
        },
+       .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types",
+       .result_unpriv = REJECT,
        .result = ACCEPT,
 },
 {
@@ -55,8 +61,9 @@
                BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types",
        .errstr = "R0 tried to subtract pointer from scalar",
+       .result = REJECT,
 },
 {
        "check deducing bounds from const, 6",
@@ -67,8 +74,9 @@
                BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types",
        .errstr = "R0 tried to subtract pointer from scalar",
+       .result = REJECT,
 },
 {
        "check deducing bounds from const, 7",
@@ -80,8 +88,9 @@
                            offsetof(struct __sk_buff, mark)),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types",
        .errstr = "dereference of modified ctx ptr",
+       .result = REJECT,
        .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
                            offsetof(struct __sk_buff, mark)),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types",
        .errstr = "dereference of modified ctx ptr",
+       .result = REJECT,
        .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
                BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
+       .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types",
        .errstr = "R0 tried to subtract pointer from scalar",
+       .result = REJECT,
 },
 {
        "check deducing bounds from const, 10",
                BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
                BPF_EXIT_INSN(),
        },
-       .result = REJECT,
        .errstr = "math between ctx pointer and register with unbounded min value is not allowed",
+       .result = REJECT,
 },
index b117bdd..6f610cf 100644 (file)
@@ -75,6 +75,8 @@
        BPF_EXIT_INSN(),
        },
        .fixup_map_hash_16b = { 4 },
+       .result_unpriv = REJECT,
+       .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types",
        .result = ACCEPT,
 },
 {
@@ -91,5 +93,7 @@
        BPF_EXIT_INSN(),
        },
        .fixup_map_hash_16b = { 4 },
+       .result_unpriv = REJECT,
+       .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types",
        .result = ACCEPT,
 },
index b018ad7..3e32400 100644 (file)
        .result = ACCEPT,
 },
 {
-       "unpriv: adding of fp",
+       "unpriv: adding of fp, reg",
        .insns = {
        BPF_MOV64_IMM(BPF_REG_0, 0),
        BPF_MOV64_IMM(BPF_REG_1, 0),
        BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8),
        BPF_EXIT_INSN(),
        },
+       .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types",
+       .result_unpriv = REJECT,
+       .result = ACCEPT,
+},
+{
+       "unpriv: adding of fp, imm",
+       .insns = {
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0),
+       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8),
+       BPF_EXIT_INSN(),
+       },
        .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
        .result_unpriv = REJECT,
        .result = ACCEPT,
index ed4e76b..feb9126 100644 (file)
        .fixup_map_array_48b = { 1 },
        .result = ACCEPT,
        .result_unpriv = REJECT,
-       .errstr_unpriv = "R2 tried to add from different maps or paths",
+       .errstr_unpriv = "R2 tried to add from different maps, paths, or prohibited types",
        .retval = 0,
 },
 {
        .retval = 0xabcdef12,
 },
 {
+       "map access: value_ptr += N, value_ptr -= N known scalar",
+       .insns = {
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+       BPF_MOV32_IMM(BPF_REG_1, 0x12345678),
+       BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2),
+       BPF_MOV64_IMM(BPF_REG_1, 2),
+       BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
+       BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_array_48b = { 3 },
+       .result = ACCEPT,
+       .retval = 0x12345678,
+},
+{
        "map access: unknown scalar += value_ptr, 1",
        .insns = {
        BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
index e65d557..bd83158 100644 (file)
@@ -9,10 +9,13 @@
 /x86_64/debug_regs
 /x86_64/evmcs_test
 /x86_64/get_cpuid_test
+/x86_64/get_msr_index_features
 /x86_64/kvm_pv_test
+/x86_64/hyperv_clock
 /x86_64/hyperv_cpuid
 /x86_64/mmio_warning_test
 /x86_64/platform_info_test
+/x86_64/set_boot_cpu_id
 /x86_64/set_sregs_test
 /x86_64/smm_test
 /x86_64/state_test
@@ -36,6 +39,7 @@
 /dirty_log_perf_test
 /hardware_disable_test
 /kvm_create_max_vcpus
+/kvm_page_table_test
 /memslot_modification_stress_test
 /set_memory_region_test
 /steal_time
index 4e548d7..ea5c428 100644 (file)
@@ -39,12 +39,15 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
 
 TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smm_test
 TEST_GEN_PROGS_x86_64 += x86_64/state_test
@@ -69,6 +72,7 @@ TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
@@ -80,6 +84,7 @@ TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 
@@ -89,6 +94,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
index bb2752d..81edbd2 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <asm/barrier.h>
+#include <linux/atomic.h>
 
 #include "kvm_util.h"
 #include "test_util.h"
@@ -137,12 +138,20 @@ static uint64_t host_clear_count;
 static uint64_t host_track_next_count;
 
 /* Whether dirty ring reset is requested, or finished */
-static sem_t dirty_ring_vcpu_stop;
-static sem_t dirty_ring_vcpu_cont;
+static sem_t sem_vcpu_stop;
+static sem_t sem_vcpu_cont;
+/*
+ * This is only set by main thread, and only cleared by vcpu thread.  It is
+ * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC
+ * is the only place that we'll guarantee both "dirty bit" and "dirty data"
+ * will match.  E.g., SIG_IPI won't guarantee that if the vcpu is interrupted
+ * after setting dirty bit but before the data is written.
+ */
+static atomic_t vcpu_sync_stop_requested;
 /*
  * This is updated by the vcpu thread to tell the host whether it's a
  * ring-full event.  It should only be read until a sem_wait() of
- * dirty_ring_vcpu_stop and before vcpu continues to run.
+ * sem_vcpu_stop and before vcpu continues to run.
  */
 static bool dirty_ring_vcpu_ring_full;
 /*
@@ -234,6 +243,17 @@ static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
        kvm_vm_clear_dirty_log(vm, slot, bitmap, 0, num_pages);
 }
 
+/*
+ * Park the vcpu thread if the main thread has requested a stop.
+ * Should only be called after a GUEST_SYNC.
+ */
+static void vcpu_handle_sync_stop(void)
+{
+       if (!atomic_read(&vcpu_sync_stop_requested))
+               return;
+
+       /* The main thread is sleeping, waiting: ack, then block until resumed */
+       atomic_set(&vcpu_sync_stop_requested, false);
+       sem_post(&sem_vcpu_stop);
+       sem_wait_until(&sem_vcpu_cont);
+}
+
 static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 {
        struct kvm_run *run = vcpu_state(vm, VCPU_ID);
@@ -244,6 +264,8 @@ static void default_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
        TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
                    "Invalid guest sync status: exit_reason=%s\n",
                    exit_reason_str(run->exit_reason));
+
+       vcpu_handle_sync_stop();
 }
 
 static bool dirty_ring_supported(void)
@@ -301,13 +323,13 @@ static void dirty_ring_wait_vcpu(void)
 {
        /* This makes sure that hardware PML cache flushed */
        vcpu_kick();
-       sem_wait_until(&dirty_ring_vcpu_stop);
+       sem_wait_until(&sem_vcpu_stop);
 }
 
 static void dirty_ring_continue_vcpu(void)
 {
        pr_info("Notifying vcpu to continue\n");
-       sem_post(&dirty_ring_vcpu_cont);
+       sem_post(&sem_vcpu_cont);
 }
 
 static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
@@ -361,11 +383,11 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
                /* Update the flag first before pause */
                WRITE_ONCE(dirty_ring_vcpu_ring_full,
                           run->exit_reason == KVM_EXIT_DIRTY_RING_FULL);
-               sem_post(&dirty_ring_vcpu_stop);
+               sem_post(&sem_vcpu_stop);
                pr_info("vcpu stops because %s...\n",
                        dirty_ring_vcpu_ring_full ?
                        "dirty ring is full" : "vcpu is kicked out");
-               sem_wait_until(&dirty_ring_vcpu_cont);
+               sem_wait_until(&sem_vcpu_cont);
                pr_info("vcpu continues now.\n");
        } else {
                TEST_ASSERT(false, "Invalid guest sync status: "
@@ -377,7 +399,7 @@ static void dirty_ring_after_vcpu_run(struct kvm_vm *vm, int ret, int err)
 static void dirty_ring_before_vcpu_join(void)
 {
        /* Kick another round of vcpu just to make sure it will quit */
-       sem_post(&dirty_ring_vcpu_cont);
+       sem_post(&sem_vcpu_cont);
 }
 
 struct log_mode {
@@ -505,9 +527,8 @@ static void *vcpu_worker(void *data)
         */
        sigmask->len = 8;
        pthread_sigmask(0, NULL, sigset);
+       sigdelset(sigset, SIG_IPI);
        vcpu_ioctl(vm, VCPU_ID, KVM_SET_SIGNAL_MASK, sigmask);
-       sigaddset(sigset, SIG_IPI);
-       pthread_sigmask(SIG_BLOCK, sigset, NULL);
 
        sigemptyset(sigset);
        sigaddset(sigset, SIG_IPI);
@@ -768,7 +789,25 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                usleep(p->interval * 1000);
                log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
                                             bmap, host_num_pages);
+
+               /*
+                * See the vcpu_sync_stop_requested definition for details on
+                * why we need to stop the vcpu when verifying data.
+                */
+               atomic_set(&vcpu_sync_stop_requested, true);
+               sem_wait_until(&sem_vcpu_stop);
+               /*
+                * NOTE: for dirty ring, it's possible that we didn't stop at
+                * GUEST_SYNC but instead we stopped because ring is full;
+                * that's okay too because ring full means we're only missing
+                * the flush of the last page, and since we handle the last
+                * page specially verification will succeed anyway.
+                */
+               assert(host_log_mode == LOG_MODE_DIRTY_RING ||
+                      atomic_read(&vcpu_sync_stop_requested) == false);
                vm_dirty_log_verify(mode, bmap);
+               sem_post(&sem_vcpu_cont);
+
                iteration++;
                sync_global_to_guest(vm, iteration);
        }
@@ -818,9 +857,10 @@ int main(int argc, char *argv[])
                .interval = TEST_HOST_LOOP_INTERVAL,
        };
        int opt, i;
+       sigset_t sigset;
 
-       sem_init(&dirty_ring_vcpu_stop, 0, 0);
-       sem_init(&dirty_ring_vcpu_cont, 0, 0);
+       sem_init(&sem_vcpu_stop, 0, 0);
+       sem_init(&sem_vcpu_cont, 0, 0);
 
        guest_modes_append_default();
 
@@ -876,6 +916,11 @@ int main(int argc, char *argv[])
 
        srandom(time(0));
 
+       /* Ensure that vCPU threads start with SIG_IPI blocked.  */
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIG_IPI);
+       pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
        if (host_log_mode_option == LOG_MODE_ALL) {
                /* Run each log mode */
                for (i = 0; i < LOG_MODE_NUM; i++) {
index 2f2eeb8..5aadf84 100644 (file)
@@ -108,7 +108,7 @@ static void run_test(uint32_t run)
        kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
        vm_create_irqchip(vm);
 
-       fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run);
+       pr_debug("%s: [%d] start vcpus\n", __func__, run);
        for (i = 0; i < VCPU_NUM; ++i) {
                vm_vcpu_add_default(vm, i, guest_code);
                payloads[i].vm = vm;
@@ -124,7 +124,7 @@ static void run_test(uint32_t run)
                        check_set_affinity(throw_away, &cpu_set);
                }
        }
-       fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run);
+       pr_debug("%s: [%d] all threads launched\n", __func__, run);
        sem_post(sem);
        for (i = 0; i < VCPU_NUM; ++i)
                check_join(threads[i], &b);
@@ -147,16 +147,16 @@ int main(int argc, char **argv)
                if (pid == 0)
                        run_test(i); /* This function always exits */
 
-               fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i);
+               pr_debug("%s: [%d] waiting semaphore\n", __func__, i);
                sem_wait(sem);
                r = (rand() % DELAY_US_MAX) + 1;
-               fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r);
+               pr_debug("%s: [%d] waiting %dus\n", __func__, i, r);
                usleep(r);
                r = waitpid(pid, &s, WNOHANG);
                TEST_ASSERT(r != pid,
                            "%s: [%d] child exited unexpectedly status: [%d]",
                            __func__, i, s);
-               fprintf(stderr, "%s: [%d] killing child\n", __func__, i);
+               pr_debug("%s: [%d] killing child\n", __func__, i);
                kill(pid, SIGKILL);
        }
 
index bea4644..a8f0227 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "sparsebit.h"
 
+#define KVM_DEV_PATH "/dev/kvm"
 #define KVM_MAX_VCPUS 512
 
 /*
@@ -68,9 +69,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE          (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE      ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
        unsigned int pa_bits;
        unsigned int va_bits;
@@ -84,6 +82,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
                    struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
 void kvm_vm_free(struct kvm_vm *vmp);
@@ -133,6 +132,7 @@ void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
 int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
                void *arg);
 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg);
 void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
index b7f4139..fade313 100644 (file)
@@ -71,13 +71,32 @@ enum vm_mem_backing_src_type {
        VM_MEM_SRC_ANONYMOUS,
        VM_MEM_SRC_ANONYMOUS_THP,
        VM_MEM_SRC_ANONYMOUS_HUGETLB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+       NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
        const char *name;
-       enum vm_mem_backing_src_type type;
+       uint32_t flag;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644 (file)
index 0000000..1c4753f
--- /dev/null
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX             1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE          (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM         0xc0000000
+
+/*
+ * Stages of guest memory access, in the order the host drives them.
+ * NUM_TEST_STAGES only counts the stages; the guest asserts if it ever
+ * observes it as the current stage.
+ */
+enum test_stage {
+       KVM_BEFORE_MAPPINGS,
+       KVM_CREATE_MAPPINGS,
+       KVM_UPDATE_MAPPINGS,
+       KVM_ADJUST_MAPPINGS,
+       NUM_TEST_STAGES,
+};
+
+/* Printable name of every stage, indexed by enum test_stage */
+static const char * const test_stage_string[] = {
+       [KVM_BEFORE_MAPPINGS] = "KVM_BEFORE_MAPPINGS",
+       [KVM_CREATE_MAPPINGS] = "KVM_CREATE_MAPPINGS",
+       [KVM_UPDATE_MAPPINGS] = "KVM_UPDATE_MAPPINGS",
+       [KVM_ADJUST_MAPPINGS] = "KVM_ADJUST_MAPPINGS",
+};
+
+/* Per-vCPU parameters; one entry of test_args.vcpu_args[] per vCPU */
+struct vcpu_args {
+       /* Index of this entry; the guest checks it against its own vCPU id */
+       int vcpu_id;
+       /* true: this vCPU writes during KVM_CREATE_MAPPINGS, false: it reads */
+       bool vcpu_write;
+};
+
+/*
+ * Test configuration filled in by pre_init_before_test() and copied into
+ * the guest with sync_global_to_guest().
+ */
+struct test_args {
+       /* VM under test (used by the host-side vCPU worker threads) */
+       struct kvm_vm *vm;
+       /* Guest virtual address where the test memory region is mapped */
+       uint64_t guest_test_virt_mem;
+       /* Host page size as returned by getpagesize() */
+       uint64_t host_page_size;
+       /* Aligned test memory size divided by host_page_size */
+       uint64_t host_num_pages;
+       /* Page size of the selected backing source */
+       uint64_t large_page_size;
+       /* Aligned test memory size divided by large_page_size */
+       uint64_t large_num_pages;
+       /* large_page_size divided by host_page_size */
+       uint64_t host_pages_per_lpage;
+       /* Backing source type of the test memory slot */
+       enum vm_mem_backing_src_type src_type;
+       struct vcpu_args vcpu_args[KVM_MAX_VCPUS];
+};
+
+/*
+ * Guest variables. Use addr_gva2hva() if these variables need
+ * to be changed in host.
+ */
+static enum test_stage guest_test_stage;
+
+/* Host variables */
+static uint32_t nr_vcpus = 1;
+static struct test_args test_args;
+static enum test_stage *current_stage;
+static bool host_quit;
+
+/* Whether the test stage is updated, or completed */
+static sem_t test_stage_updated;
+static sem_t test_stage_completed;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+/*
+ * Guest entry point, run by every vCPU.
+ *
+ * Loops forever: on each iteration it reads the current test stage, touches
+ * the test memory region in the pattern that stage calls for, and then
+ * issues GUEST_SYNC(1) to hand control back to the host.
+ *
+ * @vcpu_id: index of this vCPU's entry in test_args.vcpu_args[].
+ */
+static void guest_code(int vcpu_id)
+{
+       struct test_args *p = &test_args;
+       struct vcpu_args *vcpu_args = &p->vcpu_args[vcpu_id];
+       enum test_stage *current_stage = &guest_test_stage;
+       uint64_t addr;
+       /*
+        * The loop bounds are uint64_t page counts, so the counters are
+        * uint64_t too: a signed int would overflow for very large regions.
+        */
+       uint64_t i, j;
+
+       /* Make sure vCPU args data structure is not corrupt */
+       GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
+
+       while (true) {
+               /* Every stage starts again from the base of the region */
+               addr = p->guest_test_virt_mem;
+
+               switch (READ_ONCE(*current_stage)) {
+               /*
+                * All vCPU threads will be started in this stage,
+                * where guest code of each vCPU will do nothing.
+                */
+               case KVM_BEFORE_MAPPINGS:
+                       break;
+
+               /*
+                * Before dirty logging, vCPUs concurrently access the first
+                * 8 bytes of each page (host page/large page) within the same
+                * memory region with different accessing types (read/write).
+                * Then KVM will create normal page mappings or huge block
+                * mappings for them.
+                */
+               case KVM_CREATE_MAPPINGS:
+                       for (i = 0; i < p->large_num_pages; i++) {
+                               if (vcpu_args->vcpu_write)
+                                       *(uint64_t *)addr = 0x0123456789ABCDEF;
+                               else
+                                       READ_ONCE(*(uint64_t *)addr);
+
+                               addr += p->large_page_size;
+                       }
+                       break;
+
+               /*
+                * During dirty logging, KVM will only update attributes of the
+                * normal page mappings from RO to RW if memory backing src type
+                * is anonymous. In other cases, KVM will split the huge block
+                * mappings into normal page mappings if memory backing src type
+                * is THP or HUGETLB.
+                */
+               case KVM_UPDATE_MAPPINGS:
+                       if (p->src_type == VM_MEM_SRC_ANONYMOUS) {
+                               for (i = 0; i < p->host_num_pages; i++) {
+                                       *(uint64_t *)addr = 0x0123456789ABCDEF;
+                                       addr += p->host_page_size;
+                               }
+                               break;
+                       }
+
+                       for (i = 0; i < p->large_num_pages; i++) {
+                               /*
+                                * Write to the first host page in each large
+                                * page region, and trigger break of large pages.
+                                */
+                               *(uint64_t *)addr = 0x0123456789ABCDEF;
+
+                               /*
+                                * Access the middle host pages in each large
+                                * page region. Since dirty logging is enabled,
+                                * this will create new mappings at the smallest
+                                * granularity.
+                                *
+                                * Skipping half a large page and then reading
+                                * the remaining half of its host pages leaves
+                                * addr at the start of the next large page.
+                                */
+                               addr += p->large_page_size / 2;
+                               for (j = 0; j < p->host_pages_per_lpage / 2; j++) {
+                                       READ_ONCE(*(uint64_t *)addr);
+                                       addr += p->host_page_size;
+                               }
+                       }
+                       break;
+
+               /*
+                * After dirty logging is stopped, vCPUs concurrently read
+                * from every single host page. Then KVM will coalesce the
+                * split page mappings back to block mappings. And a TLB
+                * conflict abort could occur here if TLB entries of the
+                * page mappings are not fully invalidated.
+                */
+               case KVM_ADJUST_MAPPINGS:
+                       for (i = 0; i < p->host_num_pages; i++) {
+                               READ_ONCE(*(uint64_t *)addr);
+                               addr += p->host_page_size;
+                       }
+                       break;
+
+               default:
+                       GUEST_ASSERT(0);
+               }
+
+               /* Report completion of this stage to the host */
+               GUEST_SYNC(1);
+       }
+}
+
+/*
+ * Host thread body that drives one vCPU through the test stages.
+ *
+ * For every stage it waits on test_stage_updated, runs the vCPU until the
+ * guest syncs, and then posts test_stage_completed. It exits once host_quit
+ * is observed.
+ *
+ * @data: pointer to this vCPU's struct vcpu_args.
+ *
+ * Always returns NULL.
+ */
+static void *vcpu_worker(void *data)
+{
+       int ret;
+       struct vcpu_args *vcpu_args = data;
+       struct kvm_vm *vm = test_args.vm;
+       int vcpu_id = vcpu_args->vcpu_id;
+       struct kvm_run *run;
+       struct timespec start;
+       struct timespec ts_diff;
+       enum test_stage stage;
+
+       /* Presumably passes vcpu_id as the single argument of guest_code() */
+       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+       run = vcpu_state(vm, vcpu_id);
+
+       while (!READ_ONCE(host_quit)) {
+               /* Block until the host announces a new stage (or quit) */
+               ret = sem_wait(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_wait");
+
+               /* The wake-up may have been the quit notification */
+               if (READ_ONCE(host_quit))
+                       return NULL;
+
+               /* Time one pass of the guest over the current stage */
+               clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+               ret = _vcpu_run(vm, vcpu_id);
+               ts_diff = timespec_elapsed(start);
+
+               TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+               TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
+                           "Invalid guest sync status: exit_reason=%s\n",
+                           exit_reason_str(run->exit_reason));
+
+               pr_debug("Got sync event from vCPU %d\n", vcpu_id);
+               stage = READ_ONCE(*current_stage);
+
+               /*
+                * Here we can know the execution time of every
+                * single vcpu running in different test stages.
+                */
+               pr_debug("vCPU %d has completed stage %s\n"
+                        "execution time is: %ld.%.9lds\n\n",
+                        vcpu_id, test_stage_string[stage],
+                        ts_diff.tv_sec, ts_diff.tv_nsec);
+
+               /* Tell the host this vCPU is done with the stage */
+               ret = sem_post(&test_stage_completed);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+
+       return NULL;
+}
+
+/* Command-line parameters handed to run_test() for every guest mode */
+struct test_params {
+       /* -p: guest physical offset of the test memory; 0 selects it automatically */
+       uint64_t phys_offset;
+       /* -b: requested size of the test memory region, before alignment */
+       uint64_t test_mem_size;
+       /* -s: backing source type of the test memory region */
+       enum vm_mem_backing_src_type src_type;
+};
+
+/*
+ * Create the VM and prepare everything the test stages rely on.
+ *
+ * Sizes and aligns the test memory, creates the VM with nr_vcpus vCPUs,
+ * adds and maps the test memory slot, fills in and exports test_args to the
+ * guest, initialises the two stage semaphores and resolves the host pointer
+ * to the guest's stage variable.
+ *
+ * @mode: guest mode to create the VM with.
+ * @arg:  pointer to the struct test_params built by main().
+ *
+ * Returns the newly created VM.
+ */
+static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
+{
+       int ret;
+       struct test_params *p = arg;
+       struct vcpu_args *vcpu_args;
+       enum vm_mem_backing_src_type src_type = p->src_type;
+       uint64_t large_page_size = get_backing_src_pagesz(src_type);
+       uint64_t guest_page_size = vm_guest_mode_params[mode].page_size;
+       uint64_t host_page_size = getpagesize();
+       uint64_t test_mem_size = p->test_mem_size;
+       uint64_t guest_num_pages;
+       uint64_t alignment;
+       void *host_test_mem;
+       struct kvm_vm *vm;
+       int vcpu_id;
+
+       /* Align up the test memory size */
+       alignment = max(large_page_size, guest_page_size);
+       test_mem_size = (test_mem_size + alignment - 1) & ~(alignment - 1);
+
+       /* Create a VM with enough guest pages */
+       guest_num_pages = test_mem_size / guest_page_size;
+       vm = vm_create_with_vcpus(mode, nr_vcpus,
+                                 guest_num_pages, 0, guest_code, NULL);
+
+       /*
+        * Align down GPA of the testing memslot. Without a user-supplied
+        * offset, the slot is placed guest_num_pages below the maximum
+        * guest frame number.
+        */
+       if (!p->phys_offset)
+               guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+                                      guest_page_size;
+       else
+               guest_test_phys_mem = p->phys_offset;
+#ifdef __s390x__
+       /* s390x: never align to less than 0x100000 (1 MiB) */
+       alignment = max(0x100000, alignment);
+#endif
+       guest_test_phys_mem &= ~(alignment - 1);
+
+       /* Set up the shared data structure test_args */
+       test_args.vm = vm;
+       test_args.guest_test_virt_mem = guest_test_virt_mem;
+       test_args.host_page_size = host_page_size;
+       test_args.host_num_pages = test_mem_size / host_page_size;
+       test_args.large_page_size = large_page_size;
+       test_args.large_num_pages = test_mem_size / large_page_size;
+       test_args.host_pages_per_lpage = large_page_size / host_page_size;
+       test_args.src_type = src_type;
+
+       /* Even-numbered vCPUs are writers, odd-numbered vCPUs are readers */
+       for (vcpu_id = 0; vcpu_id < KVM_MAX_VCPUS; vcpu_id++) {
+               vcpu_args = &test_args.vcpu_args[vcpu_id];
+               vcpu_args->vcpu_id = vcpu_id;
+               vcpu_args->vcpu_write = !(vcpu_id % 2);
+       }
+
+       /* Add an extra memory slot with specified backing src type */
+       vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem,
+                                   TEST_MEM_SLOT_INDEX, guest_num_pages, 0);
+
+       /* Do mapping(GVA->GPA) for the testing memory slot */
+       virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+       /* Cache the HVA pointer of the region */
+       host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
+
+       /* Export shared structure test_args to guest */
+       ucall_init(vm, NULL);
+       sync_global_to_guest(vm, test_args);
+
+       /* Both semaphores start at 0 so that waiters block until posted */
+       ret = sem_init(&test_stage_updated, 0, 0);
+       TEST_ASSERT(ret == 0, "Error in sem_init");
+
+       ret = sem_init(&test_stage_completed, 0, 0);
+       TEST_ASSERT(ret == 0, "Error in sem_init");
+
+       /*
+        * Host view of the guest's stage variable. It holds a value the
+        * guest would assert on until run_test() selects the first stage.
+        */
+       current_stage = addr_gva2hva(vm, (vm_vaddr_t)(&guest_test_stage));
+       *current_stage = NUM_TEST_STAGES;
+
+       pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+       pr_info("Testing memory backing src type: %s\n",
+               vm_mem_backing_src_alias(src_type)->name);
+       pr_info("Testing memory backing src granularity: 0x%lx\n",
+               large_page_size);
+       pr_info("Testing memory size(aligned): 0x%lx\n", test_mem_size);
+       pr_info("Guest physical test memory offset: 0x%lx\n",
+               guest_test_phys_mem);
+       pr_info("Host  virtual  test memory offset: 0x%lx\n",
+               (uint64_t)host_test_mem);
+       pr_info("Number of testing vCPUs: %d\n", nr_vcpus);
+
+       return vm;
+}
+
+/*
+ * Run one test stage on all vCPUs: release every vCPU worker once, then
+ * block until each of them has reported the stage as completed.
+ *
+ * @stage: stage being run; only used for the debug output.
+ */
+static void vcpus_complete_new_stage(enum test_stage stage)
+{
+       int ret;
+       int woken = 0;
+       int finished = 0;
+
+       /* One post per vCPU worker waiting on test_stage_updated */
+       while (woken < nr_vcpus) {
+               ret = sem_post(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+               woken++;
+       }
+       pr_debug("All vcpus have been notified to continue\n");
+
+       /* Collect one completion per vCPU worker */
+       while (finished < nr_vcpus) {
+               ret = sem_wait(&test_stage_completed);
+               TEST_ASSERT(ret == 0, "Error in sem_wait");
+               finished++;
+
+               pr_debug("%d vcpus have completed stage %s\n",
+                        finished, test_stage_string[stage]);
+       }
+
+       pr_debug("All vcpus have completed stage %s\n",
+                test_stage_string[stage]);
+}
+
+/*
+ * Run the whole page table test for one guest mode.
+ *
+ * Creates the VM, starts one worker thread per vCPU and then drives the
+ * stages in order: KVM_BEFORE_MAPPINGS, KVM_CREATE_MAPPINGS,
+ * KVM_UPDATE_MAPPINGS (with dirty logging enabled on the test slot) and
+ * KVM_ADJUST_MAPPINGS (with dirty logging disabled again). The total
+ * execution time of each of the last three stages is printed. Finally the
+ * workers are told to quit and all resources are released.
+ *
+ * @mode: guest mode to test.
+ * @arg:  pointer to the struct test_params built by main().
+ */
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+       int ret;
+       pthread_t *vcpu_threads;
+       struct kvm_vm *vm;
+       int vcpu_id;
+       struct timespec start;
+       struct timespec ts_diff;
+
+       /* Create VM with vCPUs and make some pre-initialization */
+       vm = pre_init_before_test(mode, arg);
+
+       vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+       TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+       host_quit = false;
+       *current_stage = KVM_BEFORE_MAPPINGS;
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               /*
+                * A worker that fails to start would never post
+                * test_stage_completed and the host would wait forever,
+                * so fail loudly here instead.
+                */
+               ret = pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+                                    &test_args.vcpu_args[vcpu_id]);
+               TEST_ASSERT(ret == 0, "pthread_create failed for vCPU %d: %d",
+                           vcpu_id, ret);
+       }
+
+       vcpus_complete_new_stage(*current_stage);
+       pr_info("Started all vCPUs successfully\n");
+
+       /* Test the stage of KVM creating mappings */
+       *current_stage = KVM_CREATE_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_CREATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Test the stage of KVM updating mappings */
+       vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
+                               KVM_MEM_LOG_DIRTY_PAGES);
+
+       *current_stage = KVM_UPDATE_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_UPDATE_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Test the stage of KVM adjusting mappings */
+       vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
+
+       *current_stage = KVM_ADJUST_MAPPINGS;
+
+       clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+       vcpus_complete_new_stage(*current_stage);
+       ts_diff = timespec_elapsed(start);
+
+       pr_info("KVM_ADJUST_MAPPINGS: total execution time: %ld.%.9lds\n\n",
+               ts_diff.tv_sec, ts_diff.tv_nsec);
+
+       /* Tell the vcpu thread to quit */
+       host_quit = true;
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               ret = sem_post(&test_stage_updated);
+               TEST_ASSERT(ret == 0, "Error in sem_post");
+       }
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               ret = pthread_join(vcpu_threads[vcpu_id], NULL);
+               TEST_ASSERT(ret == 0, "pthread_join failed for vCPU %d: %d",
+                           vcpu_id, ret);
+       }
+
+       ret = sem_destroy(&test_stage_updated);
+       TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+       ret = sem_destroy(&test_stage_completed);
+       TEST_ASSERT(ret == 0, "Error in sem_destroy");
+
+       free(vcpu_threads);
+       ucall_uninit(vm);
+       kvm_vm_free(vm);
+}
+
+/*
+ * Print the command-line usage of the test to stdout.
+ *
+ * @name: program name shown in the usage line.
+ */
+static void help(char *name)
+{
+       putchar('\n');
+       printf("usage: %s [-h] [-p offset] [-m mode] "
+              "[-b mem-size] [-v vcpus] [-s mem-type]\n", name);
+       putchar('\n');
+
+       fputs(" -p: specify guest physical test memory offset\n"
+             "     Warning: a low offset can conflict with the loaded test code.\n",
+             stdout);
+       guest_modes_help();
+       fputs(" -b: specify size of the memory region for testing. e.g. 10M or 3G.\n"
+             "     (default: 1G)\n",
+             stdout);
+       fputs(" -v: specify the number of vCPUs to run\n"
+             "     (default: 1)\n",
+             stdout);
+       fputs(" -s: specify the type of memory that should be used to\n"
+             "     back the guest data region.\n"
+             "     (default: anonymous)\n\n",
+             stdout);
+       backing_src_help();
+       putchar('\n');
+}
+
+/*
+ * Parse the command line and run the test once per selected guest mode.
+ *
+ * Options: -p guest physical offset, -m guest mode, -b test memory size,
+ * -v number of vCPUs, -s backing source type, -h help. -h and any unknown
+ * option print the usage and exit with status 0.
+ *
+ * Returns 0 once every guest mode has been run.
+ */
+int main(int argc, char *argv[])
+{
+       int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+       struct test_params p = {
+               .test_mem_size = DEFAULT_TEST_MEM_SIZE,
+               .src_type = VM_MEM_SRC_ANONYMOUS,
+       };
+       int opt;
+
+       /*
+        * test_args.vcpu_args[] only has KVM_MAX_VCPUS entries, so never
+        * accept more vCPUs than that, whatever the host reports.
+        */
+       if (max_vcpus > KVM_MAX_VCPUS)
+               max_vcpus = KVM_MAX_VCPUS;
+
+       guest_modes_append_default();
+
+       while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) {
+               switch (opt) {
+               case 'p':
+                       p.phys_offset = strtoull(optarg, NULL, 0);
+                       break;
+               case 'm':
+                       guest_modes_cmdline(optarg);
+                       break;
+               case 'b':
+                       p.test_mem_size = parse_size(optarg);
+                       break;
+               case 'v': {
+                       char *end;
+                       long vcpus = strtol(optarg, &end, 10);
+
+                       /*
+                        * Validate as a signed value before storing it in
+                        * the unsigned nr_vcpus, and reject empty input or
+                        * trailing garbage, which atoi() cannot detect.
+                        */
+                       TEST_ASSERT(end != optarg && *end == '\0' &&
+                                   vcpus > 0 && vcpus <= max_vcpus,
+                                   "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+                       nr_vcpus = vcpus;
+                       break;
+               }
+               case 's':
+                       p.src_type = parse_backing_src_type(optarg);
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       exit(0);
+               }
+       }
+
+       for_each_guest_mode(run_test, &p);
+
+       return 0;
+}
index 5ebbd0d..71ade61 100644 (file)
@@ -71,9 +71,9 @@ test_assert(bool exp, const char *exp_str,
 
                fprintf(stderr, "==== Test Assertion Failure ====\n"
                        "  %s:%u: %s\n"
-                       "  pid=%d tid=%d - %s\n",
+                       "  pid=%d tid=%d errno=%d - %s\n",
                        file, line, exp_str, getpid(), _gettid(),
-                       strerror(errno));
+                       errno, strerror(errno));
                test_dump_stack();
                if (fmt) {
                        fputs("  ", stderr);
index 5067d04..8926f91 100644 (file)
@@ -18,7 +18,6 @@
 #include <unistd.h>
 #include <linux/kernel.h>
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN       2
 
 static int vcpu_mmap_sz(void);
@@ -143,17 +142,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
                "rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-       "PA-bits:52,  VA-bits:48,  4K pages",
-       "PA-bits:52,  VA-bits:48, 64K pages",
-       "PA-bits:48,  VA-bits:48,  4K pages",
-       "PA-bits:48,  VA-bits:48, 64K pages",
-       "PA-bits:40,  VA-bits:48,  4K pages",
-       "PA-bits:40,  VA-bits:48, 64K pages",
-       "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-              "Missing new mode strings?");
+/*
+ * Return the human-readable description of a guest mode.
+ *
+ * @i: guest mode ID; asserted to be below NUM_VM_MODES.
+ *
+ * Returns a pointer to a static string describing the mode.
+ */
+const char *vm_guest_mode_string(uint32_t i)
+{
+       static const char * const strings[] = {
+               [VM_MODE_P52V48_4K]     = "PA-bits:52,  VA-bits:48,  4K pages",
+               [VM_MODE_P52V48_64K]    = "PA-bits:52,  VA-bits:48, 64K pages",
+               [VM_MODE_P48V48_4K]     = "PA-bits:48,  VA-bits:48,  4K pages",
+               [VM_MODE_P48V48_64K]    = "PA-bits:48,  VA-bits:48, 64K pages",
+               [VM_MODE_P40V48_4K]     = "PA-bits:40,  VA-bits:48,  4K pages",
+               [VM_MODE_P40V48_64K]    = "PA-bits:40,  VA-bits:48, 64K pages",
+               [VM_MODE_PXXV48_4K]     = "PA-bits:ANY, VA-bits:48,  4K pages",
+       };
+       _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+                      "Missing new mode strings?");
+
+       /* i is unsigned, so it is printed with %u */
+       TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %u too big", i);
+
+       return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
        { 52, 48,  0x1000, 12 },
@@ -681,7 +687,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
        int ret;
        struct userspace_mem_region *region;
-       size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+       size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
        size_t alignment;
 
        TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -743,7 +749,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
        if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-               alignment = max(huge_page_size, alignment);
+               alignment = max(backing_src_pagesz, alignment);
 
        /* Add enough memory to align up if necessary */
        if (alignment > 1)
@@ -752,7 +758,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region->mmap_start = mmap(NULL, region->mmap_size,
                                  PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANONYMOUS
-                                 | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+                                 | vm_mem_backing_src_alias(src_type)->flag,
                                  -1, 0);
        TEST_ASSERT(region->mmap_start != MAP_FAILED,
                    "test_malloc failed, mmap_start: %p errno: %i",
@@ -762,22 +768,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
        region->host_mem = align(region->mmap_start, alignment);
 
        /* As needed perform madvise */
-       if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
-               struct stat statbuf;
-
-               ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-               TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-                           "stat /sys/kernel/mm/transparent_hugepage");
-
-               TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-                           "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
-
-               if (ret == 0) {
-                       ret = madvise(region->host_mem, npages * vm->page_size,
-                                     src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-                       TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
-                                   region->host_mem, npages * vm->page_size, src_type);
-               }
+       if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+            src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+               ret = madvise(region->host_mem, npages * vm->page_size,
+                             src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+               TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
+                           region->host_mem, npages * vm->page_size,
+                           vm_mem_backing_src_alias(src_type)->name);
        }
 
        region->unused_phy_pages = sparsebit_alloc();
@@ -1697,11 +1694,16 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
 {
        int ret;
 
-       ret = ioctl(vm->fd, cmd, arg);
+       ret = _vm_ioctl(vm, cmd, arg);
        TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)",
                cmd, ret, errno, strerror(errno));
 }
 
+/*
+ * Issue an ioctl on the VM file descriptor without asserting on the
+ * result; the return value of ioctl() is handed back to the caller.
+ */
+int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
+{
+       int ret = ioctl(vm->fd, cmd, arg);
+
+       return ret;
+}
+
 /*
  * KVM system ioctl
  *
index 34465dc..91ce1b5 100644 (file)
@@ -10,8 +10,6 @@
 
 #include "sparsebit.h"
 
-#define KVM_DEV_PATH           "/dev/kvm"
-
 struct userspace_mem_region {
        struct kvm_userspace_memory_region region;
        struct sparsebit *unused_phy_pages;
index 906c955..63d2bc7 100644 (file)
@@ -10,6 +10,8 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <time.h>
+#include <sys/stat.h>
+#include <linux/mman.h>
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -111,28 +113,169 @@ void print_skip(const char *fmt, ...)
        puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-       {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-       {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-       {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
+/*
+ * Report whether /sys/kernel/mm/transparent_hugepage exists.
+ *
+ * Any stat() failure other than ENOENT is treated as a test failure.
+ */
+bool thp_configured(void)
+{
+       struct stat st;
+       int ret = stat("/sys/kernel/mm/transparent_hugepage", &st);
+
+       TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+                   "Error in stating /sys/kernel/mm/transparent_hugepage");
+
+       if (ret != 0)
+               return false;
+
+       return true;
+}
+
+/*
+ * Return the value stored in
+ * /sys/kernel/mm/transparent_hugepage/hpage_pmd_size.
+ *
+ * Asserts that THP is configured, that the file can be opened and that a
+ * number could actually be parsed from it.
+ */
+size_t get_trans_hugepagesz(void)
+{
+       size_t size;
+       FILE *f;
+       int ret;
+
+       TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+       f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+       TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");
+
+       /*
+        * %zu matches size_t. Without checking the conversion count, a
+        * failed read would return an uninitialised size.
+        */
+       ret = fscanf(f, "%zu", &size);
+       TEST_ASSERT(ret == 1, "Error in reading transparent_hugepage/hpage_pmd_size");
+       fclose(f);
+
+       return size;
+}
+
+/*
+ * Return the size in bytes given by the "Hugepagesize:" line of
+ * /proc/meminfo (the number on that line is shifted left by 10).
+ *
+ * Fails the test if the file cannot be opened or read, or if it contains
+ * no such line.
+ */
+size_t get_def_hugetlb_pagesz(void)
+{
+       const char *tag = "Hugepagesize:";
+       char buf[64];
+       FILE *f;
+
+       f = fopen("/proc/meminfo", "r");
+       TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo");
+
+       for (;;) {
+               if (fgets(buf, sizeof(buf), f) == NULL)
+                       break;
+
+               /* Only a line that starts with the tag is of interest */
+               if (strncmp(buf, tag, strlen(tag)) != 0)
+                       continue;
+
+               fclose(f);
+               return strtoull(buf + strlen(tag), NULL, 10) << 10;
+       }
+
+       /* No matching line: tell end-of-file apart from a read error */
+       if (feof(f))
+               TEST_FAIL("HUGETLB is not configured in host kernel");
+       else
+               TEST_FAIL("Error in reading /proc/meminfo");
+
+       fclose(f);
+       return 0;
+}
+
+/*
+ * Look up the command-line name and the extra mmap() flags of a backing
+ * source type.
+ *
+ * @i: backing source type ID; asserted to be below NUM_SRC_TYPES.
+ *
+ * Returns a pointer to the static table entry for that type.
+ */
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+       static const struct vm_mem_backing_src_alias aliases[] = {
+               [VM_MEM_SRC_ANONYMOUS] = {
+                       .name = "anonymous",
+                       .flag = 0,
+               },
+               [VM_MEM_SRC_ANONYMOUS_THP] = {
+                       .name = "anonymous_thp",
+                       .flag = 0,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
+                       .name = "anonymous_hugetlb",
+                       .flag = MAP_HUGETLB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
+                       .name = "anonymous_hugetlb_16kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
+                       .name = "anonymous_hugetlb_64kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_64KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = {
+                       .name = "anonymous_hugetlb_512kb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_512KB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = {
+                       .name = "anonymous_hugetlb_1mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_1MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = {
+                       .name = "anonymous_hugetlb_2mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_2MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = {
+                       .name = "anonymous_hugetlb_8mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_8MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = {
+                       .name = "anonymous_hugetlb_16mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = {
+                       .name = "anonymous_hugetlb_32mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_32MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = {
+                       .name = "anonymous_hugetlb_256mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_256MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = {
+                       .name = "anonymous_hugetlb_512mb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_512MB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = {
+                       .name = "anonymous_hugetlb_1gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_1GB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = {
+                       .name = "anonymous_hugetlb_2gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_2GB,
+               },
+               [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = {
+                       .name = "anonymous_hugetlb_16gb",
+                       .flag = MAP_HUGETLB | MAP_HUGE_16GB,
+               },
+       };
+       _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
+                      "Missing new backing src types?");
+
+       /* i is unsigned, so it is printed with %u */
+       TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %u too big", i);
+
+       return &aliases[i];
+}
+
+/*
+ * Decode the page size from mmap() flags: the field at MAP_HUGE_SHIFT holds
+ * log2 of the size. The argument is parenthesised so that compound
+ * expressions such as (a | b) expand correctly.
+ */
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << (((x) >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
+/*
+ * Return the page size, in bytes, of a backing source type.
+ *
+ * @i: backing source type ID (enum vm_mem_backing_src_type).
+ *
+ * Anonymous memory uses the host page size, THP the transparent huge page
+ * size and plain HUGETLB the default hugetlb page size; every explicitly
+ * sized HUGETLB type is decoded from its mmap() flags.
+ */
+size_t get_backing_src_pagesz(uint32_t i)
+{
+       uint32_t flag = vm_mem_backing_src_alias(i)->flag;
+
+       switch (i) {
+       case VM_MEM_SRC_ANONYMOUS:
+               return getpagesize();
+       case VM_MEM_SRC_ANONYMOUS_THP:
+               return get_trans_hugepagesz();
+       case VM_MEM_SRC_ANONYMOUS_HUGETLB:
+               return get_def_hugetlb_pagesz();
+       default:
+               return MAP_HUGE_PAGE_SIZE(flag);
+       }
+}
 
 void backing_src_help(void)
 {
        int i;
 
        printf("Available backing src types:\n");
-       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
-               printf("\t%s\n", backing_src_aliases[i].name);
+       for (i = 0; i < NUM_SRC_TYPES; i++)
+               printf("\t%s\n", vm_mem_backing_src_alias(i)->name);
 }
 
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
 {
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
-               if (!strcmp(type_name, backing_src_aliases[i].name))
-                       return backing_src_aliases[i].type;
+       for (i = 0; i < NUM_SRC_TYPES; i++)
+               if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name))
+                       return i;
 
        backing_src_help();
        TEST_FAIL("Unknown backing src type: %s", type_name);
diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
new file mode 100644 (file)
index 0000000..cb953df
--- /dev/null
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that KVM_GET_MSR_INDEX_LIST and
+ * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/*
+ * Probe KVM_GET_MSR_INDEX_LIST with a buffer sized for @nmsrs entries and
+ * return the value left in list->nmsrs afterwards.  The probe is expected to
+ * fail with E2BIG, i.e. @nmsrs must be smaller than the real MSR count.
+ */
+static int kvm_num_index_msrs(int kvm_fd, int nmsrs)
+{
+       struct kvm_msr_list *list;
+       int r;
+
+       list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+       /* Fail the test cleanly instead of dereferencing NULL below. */
+       TEST_ASSERT(list != NULL, "Failed to allocate MSR list for %d entries", nmsrs);
+       list->nmsrs = nmsrs;
+       r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+       TEST_ASSERT(r == -1 && errno == E2BIG,
+                               "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
+                               r);
+
+       r = list->nmsrs;
+       free(list);
+       return r;
+}
+
+/*
+ * Check KVM_GET_MSR_INDEX_LIST: probes with undersized buffers (0 and 1
+ * entries) must report the same MSR count, and a call with a buffer of exactly
+ * that size must succeed and leave the count unchanged.  Exits with KSFT_SKIP
+ * if KVM_DEV_PATH cannot be opened.
+ */
+static void test_get_msr_index(void)
+{
+       int old_res, res, kvm_fd, r;
+       struct kvm_msr_list *list;
+
+       kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+       if (kvm_fd < 0)
+               exit(KSFT_SKIP);
+
+       old_res = kvm_num_index_msrs(kvm_fd, 0);
+       TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
+
+       /* A one-entry probe can only fail with E2BIG if more than one MSR exists. */
+       if (old_res != 1) {
+               res = kvm_num_index_msrs(kvm_fd, 1);
+               TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1");
+               TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical");
+       }
+
+       list = malloc(sizeof(*list) + old_res * sizeof(list->indices[0]));
+       /* Fail the test cleanly instead of dereferencing NULL below. */
+       TEST_ASSERT(list != NULL, "Failed to allocate MSR list for %d entries", old_res);
+       list->nmsrs = old_res;
+       r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+
+       /* Name the ioctl that was actually issued in the failure message. */
+       TEST_ASSERT(r == 0,
+                   "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+                   r);
+       TEST_ASSERT(list->nmsrs == old_res, "Expecting nmsrs to be identical");
+       free(list);
+
+       close(kvm_fd);
+}
+
+/*
+ * Probe KVM_GET_MSR_FEATURE_INDEX_LIST with a buffer sized for @nmsrs entries
+ * and return the value left in list->nmsrs afterwards.  The probe is expected
+ * to fail with E2BIG, i.e. @nmsrs must be smaller than the real MSR count.
+ */
+static int kvm_num_feature_msrs(int kvm_fd, int nmsrs)
+{
+       struct kvm_msr_list *list;
+       int r;
+
+       list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+       /* Fail the test cleanly instead of dereferencing NULL below. */
+       TEST_ASSERT(list != NULL, "Failed to allocate MSR list for %d entries", nmsrs);
+       list->nmsrs = nmsrs;
+       r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);
+       TEST_ASSERT(r == -1 && errno == E2BIG,
+               "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST probe, r: %i",
+                               r);
+
+       r = list->nmsrs;
+       free(list);
+       return r;
+}
+
+/*
+ * Fetch the feature MSR index list into a freshly allocated buffer with room
+ * for @nmsrs entries.  Asserts that KVM_GET_MSR_FEATURE_INDEX_LIST succeeds.
+ * The caller owns the returned list and must free() it.
+ */
+struct kvm_msr_list *kvm_get_msr_feature_list(int kvm_fd, int nmsrs)
+{
+       struct kvm_msr_list *list;
+       int r;
+
+       list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+       /* Fail the test cleanly instead of dereferencing NULL below. */
+       TEST_ASSERT(list != NULL, "Failed to allocate MSR list for %d entries", nmsrs);
+       list->nmsrs = nmsrs;
+       r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);
+
+       TEST_ASSERT(r == 0,
+               "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i",
+               r);
+
+       return list;
+}
+
+/*
+ * Check KVM_GET_MSR_FEATURE_INDEX_LIST: probes with undersized buffers must
+ * report a consistent MSR count, a correctly sized call must return that many
+ * indices, and every returned index is passed to kvm_get_feature_msr().
+ * Exits with KSFT_SKIP if KVM_DEV_PATH cannot be opened.
+ */
+static void test_get_msr_feature(void)
+{
+       int res, old_res, i, kvm_fd;
+       struct kvm_msr_list *feature_list;
+
+       kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+       if (kvm_fd < 0)
+               exit(KSFT_SKIP);
+
+       old_res = kvm_num_feature_msrs(kvm_fd, 0);
+       TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0");
+
+       /* A one-entry probe can only fail with E2BIG if more than one MSR exists. */
+       if (old_res != 1) {
+               res = kvm_num_feature_msrs(kvm_fd, 1);
+               TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1");
+               TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical");
+       }
+
+       feature_list = kvm_get_msr_feature_list(kvm_fd, old_res);
+       TEST_ASSERT(old_res == feature_list->nmsrs,
+                               "Unmatching number of msr indexes");
+
+       /* Read each listed feature MSR; the values themselves are not checked here. */
+       for (i = 0; i < feature_list->nmsrs; i++)
+               kvm_get_feature_msr(feature_list->indices[i]);
+
+       free(feature_list);
+       close(kvm_fd);
+}
+
+/*
+ * Run the feature MSR list test only when KVM_CAP_GET_MSR_FEATURES is
+ * reported; the plain MSR index list test always runs.
+ */
+int main(int argc, char *argv[])
+{
+       if (kvm_check_cap(KVM_CAP_GET_MSR_FEATURES))
+               test_get_msr_feature();
+
+       test_get_msr_index();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
new file mode 100644 (file)
index 0000000..7f1d276
--- /dev/null
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat, Inc.
+ *
+ * Tests for Hyper-V clocksources
+ */
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/*
+ * Reference TSC page fields as accessed by this test.  A timestamp is derived
+ * as ((tsc * tsc_scale) >> 64) + tsc_offset, see get_tscpage_ts(); the guest
+ * expects tsc_sequence to be non-zero once the page has been enabled.
+ */
+struct ms_hyperv_tsc_page {
+       volatile u32 tsc_sequence;
+       u32 reserved1;
+       volatile u64 tsc_scale;
+       volatile s64 tsc_offset;
+} __packed;
+
+#define HV_X64_MSR_GUEST_OS_ID                 0x40000000
+#define HV_X64_MSR_TIME_REF_COUNT              0x40000020
+#define HV_X64_MSR_REFERENCE_TSC               0x40000021
+#define HV_X64_MSR_TSC_FREQUENCY               0x40000022
+#define HV_X64_MSR_REENLIGHTENMENT_CONTROL     0x40000106
+#define HV_X64_MSR_TSC_EMULATION_CONTROL       0x40000107
+
+/*
+ * Simplified mul_u64_u64_shr(): return (approximately) the upper 64 bits of
+ * the 128-bit product @a * @b.  The low x low partial product and the low
+ * halves of the cross products are not accumulated, so carries out of the
+ * lowest 64 bits are ignored and the result can be slightly below the exact
+ * value.  The low/high union layout assumes a little-endian host.
+ */
+static inline u64 mul_u64_u64_shr64(u64 a, u64 b)
+{
+       union {
+               u64 ll;
+               struct {
+                       u32 low, high;
+               } l;
+       } rm, rn, rh, a0, b0;
+       u64 c;
+
+       a0.ll = a;
+       b0.ll = b;
+
+       rm.ll = (u64)a0.l.low * b0.l.high;
+       rn.ll = (u64)a0.l.high * b0.l.low;
+       rh.ll = (u64)a0.l.high * b0.l.high;
+
+       /*
+        * Add in 64 bits: without the cast the three u32 operands are summed
+        * in 32-bit arithmetic and the carry into the high word is dropped.
+        */
+       rh.l.low = c = (u64)rm.l.high + rn.l.high + rh.l.low;
+       rh.l.high = (c >> 32) + rh.l.high;
+
+       return rh.ll;
+}
+
+/* Burn some time by executing one million NOPs; used to separate clock samples. */
+static inline void nop_loop(void)
+{
+       int i;
+
+       /* asm volatile keeps the loop body from being optimized away. */
+       for (i = 0; i < 1000000; i++)
+               asm volatile("nop");
+}
+
+/*
+ * Guest-side check: the time elapsed according to HV_X64_MSR_TIME_REF_COUNT
+ * must match, within 1%, the elapsed TSC cycles converted to nanoseconds with
+ * the frequency read from HV_X64_MSR_TSC_FREQUENCY.
+ */
+static inline void check_tsc_msr_rdtsc(void)
+{
+       u64 tsc_freq, r1, r2, t1, t2;
+       s64 delta_ns;
+
+       tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
+       GUEST_ASSERT(tsc_freq > 0);
+
+       /* First, check MSR-based clocksource */
+       r1 = rdtsc();
+       t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+       nop_loop();
+       r2 = rdtsc();
+       t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+
+       GUEST_ASSERT(r2 > r1 && t2 > t1);
+
+       /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */
+       delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq);
+       if (delta_ns < 0)
+               delta_ns = -delta_ns;
+
+       /* 1% tolerance: (t2 - t1) * 100 is the elapsed time in ns */
+       GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100);
+}
+
+/* Compute a timestamp from the TSC page: ((rdtsc() * tsc_scale) >> 64) + tsc_offset. */
+static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page)
+{
+       return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset;
+}
+
+/*
+ * Compare the TSC page clocksource against HV_X64_MSR_TIME_REF_COUNT twice,
+ * before and after a busy loop.  In each sample pair the MSR is read second,
+ * so it must not be behind the TSC page value and must be within the
+ * tolerance of it.
+ */
+static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page)
+{
+       u64 r1, r2, t1, t2;
+
+       /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */
+       t1 = get_tscpage_ts(tsc_page);
+       r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+
+       /* 10 ms tolerance */
+       GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000);
+       nop_loop();
+
+       t2 = get_tscpage_ts(tsc_page);
+       r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+       /* Second pair: r2 must be checked against t2, not the earlier t1. */
+       GUEST_ASSERT(r2 >= t2 && r2 - t2 < 100000);
+}
+
+/*
+ * Guest code.  @tsc_page is the guest virtual address of the TSC page and
+ * @tsc_page_gpa its guest physical address, which is what gets written to
+ * HV_X64_MSR_REFERENCE_TSC.  Each GUEST_SYNC(n) hands control to the host;
+ * the host issues KVM_SET_CLOCK after stages 7, 8 and 10 (see main()).
+ */
+static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa)
+{
+       u64 tsc_scale, tsc_offset;
+
+       /* Set Guest OS id to enable Hyper-V emulation */
+       GUEST_SYNC(1);
+       wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48);
+       GUEST_SYNC(2);
+
+       check_tsc_msr_rdtsc();
+
+       GUEST_SYNC(3);
+
+       /* Set up TSC page in disabled state, check that it's clean */
+       wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa);
+       GUEST_ASSERT(tsc_page->tsc_sequence == 0);
+       GUEST_ASSERT(tsc_page->tsc_scale == 0);
+       GUEST_ASSERT(tsc_page->tsc_offset == 0);
+
+       GUEST_SYNC(4);
+
+       /* Set up TSC page in enabled state (bit 0 of the MSR value is set) */
+       wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1);
+       GUEST_ASSERT(tsc_page->tsc_sequence != 0);
+
+       GUEST_SYNC(5);
+
+       check_tsc_msr_tsc_page(tsc_page);
+
+       GUEST_SYNC(6);
+
+       /* Remember the offset so the update after KVM_SET_CLOCK can be detected. */
+       tsc_offset = tsc_page->tsc_offset;
+       /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */
+
+       GUEST_SYNC(7);
+       /* Sanity check TSC page timestamp, it should be close to 0 */
+       GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000);
+
+       GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset);
+
+       nop_loop();
+
+       /*
+        * Enable Re-enlightenment and check that TSC page stays constant across
+        * KVM_SET_CLOCK.
+        */
+       wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff);
+       wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1);
+       tsc_offset = tsc_page->tsc_offset;
+       tsc_scale = tsc_page->tsc_scale;
+       GUEST_SYNC(8);
+       GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset);
+       GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale);
+
+       GUEST_SYNC(9);
+
+       check_tsc_msr_tsc_page(tsc_page);
+
+       /*
+        * Disable re-enlightenment and TSC page, check that KVM doesn't update
+        * it anymore.
+        */
+       wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
+       wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
+       wrmsr(HV_X64_MSR_REFERENCE_TSC, 0);
+       /* Clear the page so any later write by KVM would show up as non-zero. */
+       memset(tsc_page, 0, sizeof(*tsc_page));
+
+       GUEST_SYNC(10);
+       GUEST_ASSERT(tsc_page->tsc_sequence == 0);
+       GUEST_ASSERT(tsc_page->tsc_offset == 0);
+       GUEST_ASSERT(tsc_page->tsc_scale == 0);
+
+       GUEST_DONE();
+}
+
+#define VCPU_ID 0
+
+/*
+ * Host-side check: the time elapsed according to the vCPU's
+ * HV_X64_MSR_TIME_REF_COUNT must match, within 1%, the elapsed host TSC
+ * cycles converted to nanoseconds with HV_X64_MSR_TSC_FREQUENCY.
+ */
+static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm)
+{
+       u64 tsc_freq, r1, r2, t1, t2;
+       s64 delta_ns;
+
+       tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY);
+       TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero");
+
+       /* First, check MSR-based clocksource */
+       r1 = rdtsc();
+       t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
+       nop_loop();
+       r2 = rdtsc();
+       t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT);
+
+       /* On failure t2 <= t1 holds, so print the values in that order. */
+       TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t2, t1);
+
+       /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */
+       delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq);
+       if (delta_ns < 0)
+               delta_ns = -delta_ns;
+
+       /* 1% tolerance */
+       TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100,
+                   "Elapsed time does not match (MSR=%ld, TSC=%ld)",
+                   (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq);
+}
+
+/*
+ * Create a one-vCPU VM with Hyper-V CPUID, allocate and zero a page for the
+ * TSC page, then run guest_main() stage by stage.  Every guest exit must be a
+ * ucall; after the syncs for stages 7, 8 and 10 the host issues KVM_SET_CLOCK
+ * with a zeroed kvm_clock_data.
+ */
+int main(void)
+{
+       struct kvm_vm *vm;
+       struct kvm_run *run;
+       struct ucall uc;
+       vm_vaddr_t tsc_page_gva;
+       int stage;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_main);
+       run = vcpu_state(vm, VCPU_ID);
+
+       vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+       tsc_page_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+       /* tsc_page_gva is a guest virtual address, so translate it as one. */
+       memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize());
+       TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0,
+               "TSC page has to be page aligned\n");
+       /* Guest args: the page's virtual address and its physical address. */
+       vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva));
+
+       host_check_tsc_msr_rdtsc(vm);
+
+       for (stage = 1;; stage++) {
+               _vcpu_run(vm, VCPU_ID);
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                           "Stage %d: unexpected exit reason: %u (%s),\n",
+                           stage, run->exit_reason,
+                           exit_reason_str(run->exit_reason));
+
+               switch (get_ucall(vm, VCPU_ID, &uc)) {
+               case UCALL_ABORT:
+                       TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+                                 __FILE__, uc.args[1]);
+                       /* NOT REACHED */
+               case UCALL_SYNC:
+                       break;
+               case UCALL_DONE:
+                       /* Keep in sync with guest_main() */
+                       TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d\n",
+                                   stage);
+                       goto out;
+               default:
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
+               }
+
+               TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+                           uc.args[1] == stage,
+                           "Stage %d: Unexpected register values vmexit, got %lx",
+                           stage, (ulong)uc.args[1]);
+
+               /* Reset kvmclock triggering TSC page update */
+               if (stage == 7 || stage == 8 || stage == 10) {
+                       struct kvm_clock_data clock = {0};
+
+                       vm_ioctl(vm, KVM_SET_CLOCK, &clock);
+               }
+       }
+
+out:
+       kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
new file mode 100644 (file)
index 0000000..12c558f
--- /dev/null
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that KVM_SET_BOOT_CPU_ID works as intended
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE /* for program_invocation_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define N_VCPU 2
+#define VCPU_ID0 0
+#define VCPU_ID1 1
+
+/*
+ * Guest helper: return MSR_IA32_APICBASE masked with MSR_IA32_APICBASE_BSP,
+ * i.e. non-zero when the BSP bit is set for the executing vCPU.
+ */
+static uint32_t get_bsp_flag(void)
+{
+       return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP;
+}
+
+/* Guest code for the vCPU expected to be the BSP: sync once, then assert the BSP flag is set. */
+static void guest_bsp_vcpu(void *arg)
+{
+       GUEST_SYNC(1);
+
+       GUEST_ASSERT(get_bsp_flag() != 0);
+
+       GUEST_DONE();
+}
+
+/* Guest code for a vCPU expected not to be the BSP: sync once, then assert the BSP flag is clear. */
+static void guest_not_bsp_vcpu(void *arg)
+{
+       GUEST_SYNC(1);
+
+       GUEST_ASSERT(get_bsp_flag() == 0);
+
+       GUEST_DONE();
+}
+
+/*
+ * Assert that KVM_SET_BOOT_CPU_ID is rejected with EBUSY.  Called from
+ * run_vcpu() after a vCPU has already run.
+ */
+static void test_set_boot_busy(struct kvm_vm *vm)
+{
+       int res;
+
+       res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0);
+       TEST_ASSERT(res == -1 && errno == EBUSY,
+                       "KVM_SET_BOOT_CPU_ID set while running vm");
+}
+
+/*
+ * Run @vcpuid through its two expected exits: a GUEST_SYNC whose argument
+ * equals stage + 1, followed by GUEST_DONE in the second iteration.  On each
+ * sync, also check that KVM_SET_BOOT_CPU_ID now fails with EBUSY.
+ */
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct ucall uc;
+       int stage;
+
+       for (stage = 0; stage < 2; stage++) {
+
+               vcpu_run(vm, vcpuid);
+
+               switch (get_ucall(vm, vcpuid, &uc)) {
+               case UCALL_SYNC:
+                       TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+                                       uc.args[1] == stage + 1,
+                                       "Stage %d: Unexpected register values vmexit, got %lx",
+                                       stage + 1, (ulong)uc.args[1]);
+                       test_set_boot_busy(vm);
+                       break;
+               case UCALL_DONE:
+                       TEST_ASSERT(stage == 1,
+                                       "Expected GUEST_DONE in stage 2, got stage %d",
+                                       stage);
+                       break;
+               case UCALL_ABORT:
+                       /* No break: the assertion below is unconditional (false). */
+                       TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx",
+                                               (const char *)uc.args[0], __FILE__,
+                                               uc.args[1], uc.args[2], uc.args[3]);
+               default:
+                       TEST_ASSERT(false, "Unexpected exit: %s",
+                                       exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+               }
+       }
+}
+
+/*
+ * Create a VM in VM_MODE_DEFAULT without any vCPUs, load this test binary
+ * (program_invocation_name) into it and create the irqchip.  vCPUs are added
+ * afterwards by the callers.
+ */
+static struct kvm_vm *create_vm(void)
+{
+       struct kvm_vm *vm;
+       /* Page budget: default guest pages plus per-vCPU pages and page-table pages. */
+       uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * 2;
+       uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU;
+       uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages;
+
+       pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages);
+       vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR);
+
+       kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+       vm_create_irqchip(vm);
+
+       return vm;
+}
+
+/*
+ * Add vCPU @vcpuid running guest_bsp_vcpu() when @bsp_code is true and
+ * guest_not_bsp_vcpu() otherwise, then give it the supported CPUID.
+ */
+static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code)
+{
+       if (bsp_code)
+               vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu);
+       else
+               vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu);
+
+       vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());
+}
+
+/*
+ * Build a two-vCPU VM in which @bsp_vcpu is expected to be the BSP and run
+ * both vCPUs.  KVM_SET_BOOT_CPU_ID is only issued when the BSP should be
+ * VCPU_ID1; for VCPU_ID0 the ioctl is not called at all.
+ */
+static void run_vm_bsp(uint32_t bsp_vcpu)
+{
+       struct kvm_vm *vm;
+       bool is_bsp_vcpu1 = bsp_vcpu == VCPU_ID1;
+
+       vm = create_vm();
+
+       /* Must happen before any vCPU is added, see check_set_bsp_busy(). */
+       if (is_bsp_vcpu1)
+               vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1);
+
+       add_x86_vcpu(vm, VCPU_ID0, !is_bsp_vcpu1);
+       add_x86_vcpu(vm, VCPU_ID1, is_bsp_vcpu1);
+
+       run_vcpu(vm, VCPU_ID0);
+       run_vcpu(vm, VCPU_ID1);
+
+       kvm_vm_free(vm);
+}
+
+/*
+ * Check that KVM_SET_BOOT_CPU_ID fails with EBUSY once vCPUs have been
+ * added, both before the vCPUs run and after they have finished.
+ */
+static void check_set_bsp_busy(void)
+{
+       struct kvm_vm *vm;
+       int res;
+
+       vm = create_vm();
+
+       add_x86_vcpu(vm, VCPU_ID0, true);
+       add_x86_vcpu(vm, VCPU_ID1, false);
+
+       res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1);
+       TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set after adding vcpu");
+
+       run_vcpu(vm, VCPU_ID0);
+       run_vcpu(vm, VCPU_ID1);
+
+       res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1);
+       TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set to a terminated vcpu");
+
+       kvm_vm_free(vm);
+}
+
+/*
+ * Skip when KVM_CAP_SET_BOOT_CPU_ID is unavailable; otherwise run the BSP
+ * selection test with vCPU 0, vCPU 1 and vCPU 0 again, followed by the EBUSY
+ * checks.
+ */
+int main(int argc, char *argv[])
+{
+       if (!kvm_check_cap(KVM_CAP_SET_BOOT_CPU_ID)) {
+               print_skip("set_boot_cpu_id not available");
+               /* Report a skip to kselftest rather than a pass (exit status 0). */
+               exit(KSFT_SKIP);
+       }
+
+       run_vm_bsp(VCPU_ID0);
+       run_vm_bsp(VCPU_ID1);
+       run_vm_bsp(VCPU_ID0);
+
+       check_set_bsp_busy();
+}
index 804ff5f..1f4a059 100644 (file)
@@ -186,7 +186,7 @@ int main(int argc, char *argv[])
                vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
        }
 
-       struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);;
+       struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
        rs->state = 0x5a;
 
        for (;;) {
index ce6bea9..0ccb1dd 100755 (executable)
@@ -658,7 +658,7 @@ test_ecn_decap()
        # In accordance with INET_ECN_decapsulate()
        __test_ecn_decap 00 00 0x00
        __test_ecn_decap 01 01 0x01
-       __test_ecn_decap 02 01 0x02
+       __test_ecn_decap 02 01 0x01
        __test_ecn_decap 01 03 0x03
        __test_ecn_decap 02 03 0x03
        test_ecn_decap_error
index 964db9e..ad32240 100755 (executable)
@@ -11,6 +11,7 @@ ksft_skip=4
 timeout=30
 mptcp_connect=""
 capture=0
+do_all_tests=1
 
 TEST_COUNT=0
 
@@ -121,12 +122,6 @@ reset_with_add_addr_timeout()
                -j DROP
 }
 
-for arg in "$@"; do
-       if [ "$arg" = "-c" ]; then
-               capture=1
-       fi
-done
-
 ip -Version > /dev/null 2>&1
 if [ $? -ne 0 ];then
        echo "SKIP: Could not run test without ip tool"
@@ -1221,7 +1216,8 @@ usage()
        echo "  -4 v4mapped_tests"
        echo "  -b backup_tests"
        echo "  -p add_addr_ports_tests"
-       echo "  -c syncookies_tests"
+       echo "  -k syncookies_tests"
+       echo "  -c capture pcap files"
        echo "  -h help"
 }
 
@@ -1235,12 +1231,24 @@ make_file "$cin" "client" 1
 make_file "$sin" "server" 1
 trap cleanup EXIT
 
-if [ -z $1 ]; then
+for arg in "$@"; do
+       # check for "capture" arg before launching tests
+       if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then
+               capture=1
+       fi
+
+       # exception for the capture option, the rest means: a part of the tests
+       if [ "${arg}" != "-c" ]; then
+               do_all_tests=0
+       fi
+done
+
+if [ $do_all_tests -eq 1 ]; then
        all_tests
        exit $ret
 fi
 
-while getopts 'fsltra64bpch' opt; do
+while getopts 'fsltra64bpkch' opt; do
        case $opt in
                f)
                        subflows_tests
@@ -1272,9 +1280,11 @@ while getopts 'fsltra64bpch' opt; do
                p)
                        add_addr_ports_tests
                        ;;
-               c)
+               k)
                        syncookies_tests
                        ;;
+               c)
+                       ;;
                h | *)
                        usage
                        ;;
index 7b01b7c..066efd3 100644 (file)
@@ -30,25 +30,25 @@ struct reuse_opts {
 };
 
 struct reuse_opts unreusable_opts[12] = {
-       {0, 0, 0, 0},
-       {0, 0, 0, 1},
-       {0, 0, 1, 0},
-       {0, 0, 1, 1},
-       {0, 1, 0, 0},
-       {0, 1, 0, 1},
-       {0, 1, 1, 0},
-       {0, 1, 1, 1},
-       {1, 0, 0, 0},
-       {1, 0, 0, 1},
-       {1, 0, 1, 0},
-       {1, 0, 1, 1},
+       {{0, 0}, {0, 0}},
+       {{0, 0}, {0, 1}},
+       {{0, 0}, {1, 0}},
+       {{0, 0}, {1, 1}},
+       {{0, 1}, {0, 0}},
+       {{0, 1}, {0, 1}},
+       {{0, 1}, {1, 0}},
+       {{0, 1}, {1, 1}},
+       {{1, 0}, {0, 0}},
+       {{1, 0}, {0, 1}},
+       {{1, 0}, {1, 0}},
+       {{1, 0}, {1, 1}},
 };
 
 struct reuse_opts reusable_opts[4] = {
-       {1, 1, 0, 0},
-       {1, 1, 0, 1},
-       {1, 1, 1, 0},
-       {1, 1, 1, 1},
+       {{1, 1}, {0, 0}},
+       {{1, 1}, {0, 1}},
+       {{1, 1}, {1, 0}},
+       {{1, 1}, {1, 1}},
 };
 
 int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport)
index 592c1cc..0bd7342 100644 (file)
@@ -14,7 +14,7 @@
 #define __aligned(x) __attribute__((__aligned__(x)))
 #define __packed __attribute__((packed))
 
-#include "../../../../arch/x86/kernel/cpu/sgx/arch.h"
+#include "../../../../arch/x86/include/asm/sgx.h"
 #include "../../../../arch/x86/include/asm/enclu.h"
 #include "../../../../arch/x86/include/uapi/asm/sgx.h"
 
index 9d43b75..f441ac3 100644 (file)
@@ -45,19 +45,19 @@ static bool encl_map_bin(const char *path, struct encl *encl)
 
        fd = open(path, O_RDONLY);
        if (fd == -1)  {
-               perror("open()");
+               perror("enclave executable open()");
                return false;
        }
 
        ret = stat(path, &sb);
        if (ret) {
-               perror("stat()");
+               perror("enclave executable stat()");
                goto err;
        }
 
        bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (bin == MAP_FAILED) {
-               perror("mmap()");
+               perror("enclave executable mmap()");
                goto err;
        }
 
@@ -90,8 +90,7 @@ static bool encl_ioc_create(struct encl *encl)
        ioc.src = (unsigned long)secs;
        rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc);
        if (rc) {
-               fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n",
-                       errno);
+               perror("SGX_IOC_ENCLAVE_CREATE failed");
                munmap((void *)secs->base, encl->encl_size);
                return false;
        }
@@ -116,31 +115,72 @@ static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg)
 
        rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc);
        if (rc < 0) {
-               fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n",
-                       errno);
+               perror("SGX_IOC_ENCLAVE_ADD_PAGES failed");
                return false;
        }
 
        return true;
 }
 
+
+
 bool encl_load(const char *path, struct encl *encl)
 {
+       const char device_path[] = "/dev/sgx_enclave";
        Elf64_Phdr *phdr_tbl;
        off_t src_offset;
        Elf64_Ehdr *ehdr;
+       struct stat sb;
+       void *ptr;
        int i, j;
        int ret;
+       int fd = -1;
 
        memset(encl, 0, sizeof(*encl));
 
-       ret = open("/dev/sgx_enclave", O_RDWR);
-       if (ret < 0) {
-               fprintf(stderr, "Unable to open /dev/sgx_enclave\n");
+       fd = open(device_path, O_RDWR);
+       if (fd < 0) {
+               perror("Unable to open /dev/sgx_enclave");
+               goto err;
+       }
+
+       ret = stat(device_path, &sb);
+       if (ret) {
+               perror("device file stat()");
+               goto err;
+       }
+
+       /*
+        * This just checks if the /dev file has these permission
+        * bits set.  It does not check that the current user is
+        * the owner or in the owning group.
+        */
+       if (!(sb.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+               fprintf(stderr, "no execute permissions on device file %s\n", device_path);
+               goto err;
+       }
+
+       ptr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0);
+       if (ptr == (void *)-1) {
+               perror("mmap for read");
+               goto err;
+       }
+       munmap(ptr, PAGE_SIZE);
+
+#define ERR_MSG \
+"mmap() succeeded for PROT_READ, but failed for PROT_EXEC.\n" \
+" Check that current user has execute permissions on %s and \n" \
+" that /dev does not have noexec set: mount | grep \"/dev .*noexec\"\n" \
+" If so, remount it executable: mount -o remount,exec /dev\n\n"
+
+       ptr = mmap(NULL, PAGE_SIZE, PROT_EXEC, MAP_SHARED, fd, 0);
+       if (ptr == (void *)-1) {
+               fprintf(stderr, ERR_MSG, device_path);
                goto err;
        }
+       munmap(ptr, PAGE_SIZE);
 
-       encl->fd = ret;
+       encl->fd = fd;
 
        if (!encl_map_bin(path, encl))
                goto err;
@@ -217,6 +257,8 @@ bool encl_load(const char *path, struct encl *encl)
        return true;
 
 err:
+       if (fd != -1)
+               close(fd);
        encl_delete(encl);
        return false;
 }
@@ -229,7 +271,7 @@ static bool encl_map_area(struct encl *encl)
        area = mmap(NULL, encl_size * 2, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (area == MAP_FAILED) {
-               perror("mmap");
+               perror("reservation mmap()");
                return false;
        }
 
@@ -268,8 +310,7 @@ bool encl_build(struct encl *encl)
        ioc.sigstruct = (uint64_t)&encl->sigstruct;
        ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc);
        if (ret) {
-               fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n",
-                       errno);
+               perror("SGX_IOC_ENCLAVE_INIT failed");
                return false;
        }
 
index 724cec7..d304a40 100644 (file)
@@ -15,6 +15,7 @@
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <sys/auxv.h>
 #include "defines.h"
 #include "main.h"
 #include "../kselftest.h"
@@ -28,24 +29,6 @@ struct vdso_symtab {
        Elf64_Word *elf_hashtab;
 };
 
-static void *vdso_get_base_addr(char *envp[])
-{
-       Elf64_auxv_t *auxv;
-       int i;
-
-       for (i = 0; envp[i]; i++)
-               ;
-
-       auxv = (Elf64_auxv_t *)&envp[i + 1];
-
-       for (i = 0; auxv[i].a_type != AT_NULL; i++) {
-               if (auxv[i].a_type == AT_SYSINFO_EHDR)
-                       return (void *)auxv[i].a_un.a_val;
-       }
-
-       return NULL;
-}
-
 static Elf64_Dyn *vdso_get_dyntab(void *addr)
 {
        Elf64_Ehdr *ehdr = addr;
@@ -162,7 +145,7 @@ static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r
        return 0;
 }
 
-int main(int argc, char *argv[], char *envp[])
+int main(int argc, char *argv[])
 {
        struct sgx_enclave_run run;
        struct vdso_symtab symtab;
@@ -195,7 +178,7 @@ int main(int argc, char *argv[], char *envp[])
                addr = mmap((void *)encl.encl_base + seg->offset, seg->size,
                            seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0);
                if (addr == MAP_FAILED) {
-                       fprintf(stderr, "mmap() failed, errno=%d.\n", errno);
+                       perror("mmap() segment failed");
                        exit(KSFT_FAIL);
                }
        }
@@ -203,7 +186,8 @@ int main(int argc, char *argv[], char *envp[])
        memset(&run, 0, sizeof(run));
        run.tcs = encl.encl_base;
 
-       addr = vdso_get_base_addr(envp);
+       /* Get vDSO base address */
+       addr = (void *)getauxval(AT_SYSINFO_EHDR);
        if (!addr)
                goto err;
 
index d42115e..8b0cd42 100644 (file)
@@ -101,7 +101,7 @@ endef
 ifeq ($(CAN_BUILD_I386),1)
 $(BINARIES_32): CFLAGS += -m32
 $(BINARIES_32): LDLIBS += -lrt -ldl -lm
-$(BINARIES_32): %_32: %.c
+$(BINARIES_32): $(OUTPUT)/%_32: %.c
        $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
 $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t))))
 endif
@@ -109,7 +109,7 @@ endif
 ifeq ($(CAN_BUILD_X86_64),1)
 $(BINARIES_64): CFLAGS += -m64
 $(BINARIES_64): LDLIBS += -lrt -ldl
-$(BINARIES_64): %_64: %.c
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
        $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
 $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t))))
 endif
index 62bd908..f08f5e8 100644 (file)
@@ -174,21 +174,36 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                                           struct kvm_coalesced_mmio_zone *zone)
 {
        struct kvm_coalesced_mmio_dev *dev, *tmp;
+       int r;
 
        if (zone->pio != 1 && zone->pio != 0)
                return -EINVAL;
 
        mutex_lock(&kvm->slots_lock);
 
-       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+       list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
                if (zone->pio == dev->zone.pio &&
                    coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
-                       kvm_io_bus_unregister_dev(kvm,
+                       r = kvm_io_bus_unregister_dev(kvm,
                                zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
                        kvm_iodevice_destructor(&dev->dev);
+
+                       /*
+                        * On failure, unregister destroys all devices on the
+                        * bus _except_ the target device, i.e. coalesced_zones
+                        * has been modified.  No need to restart the walk as
+                        * there aren't any zones left.
+                        */
+                       if (r)
+                               break;
                }
+       }
 
        mutex_unlock(&kvm->slots_lock);
 
+       /*
+        * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's
+        * perspective, the coalesced MMIO is most definitely unregistered.
+        */
        return 0;
 }
index 383df23..2799c66 100644 (file)
@@ -451,35 +451,170 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
-                                       struct mm_struct *mm,
-                                       unsigned long address,
-                                       pte_t pte)
+/*
+ * Per-memslot callback used by __kvm_handle_hva_range().  It is called once
+ * for every memslot that overlaps the HVA range, with the overlap already
+ * translated to a GFN range; the boolean results are OR-ed together.
+ */
+typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+/*
+ * Callback invoked by __kvm_handle_hva_range() right after it takes the MMU
+ * lock and before any handler runs.  It receives the untranslated HVA range.
+ */
+typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
+                            unsigned long end);
+
+/*
+ * Describes one walk performed by __kvm_handle_hva_range():
+ *
+ * @start, @end:  host virtual address range, treated as [start, end) when it
+ *                is clamped against each memslot.
+ * @pte:          copied verbatim into the kvm_gfn_range given to @handler.
+ * @handler:      per-memslot callback, or kvm_null_fn for "no handler"
+ *                (only valid when @on_lock is provided).
+ * @on_lock:      callback run under the MMU lock before the memslot walk, or
+ *                kvm_null_fn for "none".
+ * @flush_on_ret: if true, remote TLBs are flushed after the walk when any
+ *                handler returned true or kvm->tlbs_dirty is set.
+ * @may_block:    copied verbatim into the kvm_gfn_range given to @handler.
+ */
+struct kvm_hva_range {
+       unsigned long start;
+       unsigned long end;
+       pte_t pte;
+       hva_handler_t handler;
+       on_lock_fn_t on_lock;
+       bool flush_on_ret;
+       bool may_block;
+};
+
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler.  The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int idx;
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
+
+static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
+                                                 const struct kvm_hva_range *range)
+{
+       bool ret = false, locked = false;
+       struct kvm_gfn_range gfn_range;
+       struct kvm_memory_slot *slot;
+       struct kvm_memslots *slots;
+       int i, idx;
+
+       /* A null handler is allowed if and only if on_lock() is provided. */
+       if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
+                        IS_KVM_NULL_FN(range->handler)))
+               return 0;
 
        idx = srcu_read_lock(&kvm->srcu);
 
-       KVM_MMU_LOCK(kvm);
+       /* The on_lock() path does not yet support lock elision. */
+       if (!IS_KVM_NULL_FN(range->on_lock)) {
+               locked = true;
+               KVM_MMU_LOCK(kvm);
 
-       kvm->mmu_notifier_seq++;
+               range->on_lock(kvm, range->start, range->end);
+
+               if (IS_KVM_NULL_FN(range->handler))
+                       goto out_unlock;
+       }
 
-       if (kvm_set_spte_hva(kvm, address, pte))
+       for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot(slot, slots) {
+                       unsigned long hva_start, hva_end;
+
+                       hva_start = max(range->start, slot->userspace_addr);
+                       hva_end = min(range->end, slot->userspace_addr +
+                                                 (slot->npages << PAGE_SHIFT));
+                       if (hva_start >= hva_end)
+                               continue;
+
+                       /*
+                        * To optimize for the likely case where the address
+                        * range is covered by zero or one memslots, don't
+                        * bother making these conditional (to avoid writes on
+                        * the second or later invocation of the handler).
+                        */
+                       gfn_range.pte = range->pte;
+                       gfn_range.may_block = range->may_block;
+
+                       /*
+                        * {gfn(page) | page intersects with [hva_start, hva_end)} =
+                        * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+                        */
+                       gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+                       gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+                       gfn_range.slot = slot;
+
+                       if (!locked) {
+                               locked = true;
+                               KVM_MMU_LOCK(kvm);
+                       }
+                       ret |= range->handler(kvm, &gfn_range);
+               }
+       }
+
+       if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);
 
-       KVM_MMU_UNLOCK(kvm);
+out_unlock:
+       if (locked)
+               KVM_MMU_UNLOCK(kvm);
+
        srcu_read_unlock(&kvm->srcu, idx);
+
+       /* The notifiers are averse to booleans. :-( */
+       return (int)ret;
 }
 
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-                                       const struct mmu_notifier_range *range)
+static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
+                                               unsigned long start,
+                                               unsigned long end,
+                                               pte_t pte,
+                                               hva_handler_t handler)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int need_tlb_flush = 0, idx;
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = pte,
+               .handler        = handler,
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = true,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+
+/*
+ * Run @handler over every memslot overlapping [start, end) without requesting
+ * a remote TLB flush afterwards.  The walk is set up with a zero pte, no
+ * on_lock() callback and may_block = false.
+ *
+ * Returns whatever __kvm_handle_hva_range() returns, i.e. the OR of the
+ * handler results converted to int.
+ */
+static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
+                                                        unsigned long start,
+                                                        unsigned long end,
+                                                        hva_handler_t handler)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range range = {
+               .start          = start,
+               .end            = end,
+               .pte            = __pte(0),
+               .handler        = handler,
+               /* Stub, not NULL: marks "no on_lock() callback". */
+               .on_lock        = (void *)kvm_null_fn,
+               .flush_on_ret   = false,
+               .may_block      = false,
+       };
+
+       return __kvm_handle_hva_range(kvm, &range);
+}
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long address,
+                                       pte_t pte)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+       trace_kvm_set_spte_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
+       /*
+        * .change_pte() must be surrounded by .invalidate_range_{start,end}(),
+        * and so always runs with an elevated notifier count.  This obviates
+        * the need to bump the sequence count.
+        */
+       WARN_ON_ONCE(!kvm->mmu_notifier_count);
+
+       kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+}
+
+static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
@@ -487,8 +622,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         */
        kvm->mmu_notifier_count++;
        if (likely(kvm->mmu_notifier_count == 1)) {
-               kvm->mmu_notifier_range_start = range->start;
-               kvm->mmu_notifier_range_end = range->end;
+               kvm->mmu_notifier_range_start = start;
+               kvm->mmu_notifier_range_end = end;
        } else {
                /*
                 * Fully tracking multiple concurrent ranges has diminishing
@@ -500,28 +635,36 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                 * complete.
                 */
                kvm->mmu_notifier_range_start =
-                       min(kvm->mmu_notifier_range_start, range->start);
+                       min(kvm->mmu_notifier_range_start, start);
                kvm->mmu_notifier_range_end =
-                       max(kvm->mmu_notifier_range_end, range->end);
+                       max(kvm->mmu_notifier_range_end, end);
        }
-       need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
-                                            range->flags);
-       /* we've to flush the tlb before the pages can be freed */
-       if (need_tlb_flush || kvm->tlbs_dirty)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return 0;
 }
 
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               .handler        = kvm_unmap_gfn_range,
+               .on_lock        = kvm_inc_notifier_count,
+               .flush_on_ret   = true,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
 
-       KVM_MMU_LOCK(kvm);
+       trace_kvm_unmap_hva_range(range->start, range->end);
+
+       __kvm_handle_hva_range(kvm, &hva_range);
+
+       return 0;
+}
+
+static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
+                                  unsigned long end)
+{
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
@@ -535,7 +678,23 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
-       KVM_MMU_UNLOCK(kvm);
+}
+
+/*
+ * MMU notifier hook for the end of an invalidation.  No per-memslot work is
+ * done here: the handler is the null stub, so __kvm_handle_hva_range() only
+ * takes the MMU lock, runs kvm_dec_notifier_count() and unlocks again.
+ */
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                       const struct mmu_notifier_range *range)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       const struct kvm_hva_range hva_range = {
+               .start          = range->start,
+               .end            = range->end,
+               .pte            = __pte(0),
+               /* Null handler: the memslot walk is skipped entirely. */
+               .handler        = (void *)kvm_null_fn,
+               .on_lock        = kvm_dec_notifier_count,
+               .flush_on_ret   = false,
+               .may_block      = mmu_notifier_range_blockable(range),
+       };
+
+       __kvm_handle_hva_range(kvm, &hva_range);
 
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -545,20 +704,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              unsigned long start,
                                              unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-
-       young = kvm_age_hva(kvm, start, end);
-       if (young)
-               kvm_flush_remote_tlbs(kvm);
-
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -566,11 +714,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
                                        unsigned long start,
                                        unsigned long end)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_age_hva(start, end);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
        /*
         * Even though we do not flush TLB, this will still adversely
         * affect performance on pre-Haswell Intel EPT, where there is
@@ -584,27 +729,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
         * cadence. If we find this inaccurate, we might come up with a
         * more sophisticated heuristic later.
         */
-       young = kvm_age_hva(kvm, start, end);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
 {
-       struct kvm *kvm = mmu_notifier_to_kvm(mn);
-       int young, idx;
+       trace_kvm_test_age_hva(address);
 
-       idx = srcu_read_lock(&kvm->srcu);
-       KVM_MMU_LOCK(kvm);
-       young = kvm_test_age_hva(kvm, address);
-       KVM_MMU_UNLOCK(kvm);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return young;
+       return kvm_handle_hva_range_no_flush(mn, address, address + 1,
+                                            kvm_test_age_gfn);
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -3002,6 +3137,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
        return false;
 }
 
+/*
+ * Weak default: report that @vcpu has no pending interrupt.  Being __weak,
+ * this definition can be replaced by an architecture-provided one.
+ *
+ * kvm_vcpu_on_spin() consults it so that a preempted vCPU which is not in
+ * kernel mode is skipped as a yield target only if this also returns false.
+ */
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
        struct kvm *kvm = me->kvm;
@@ -3035,7 +3175,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
                            !vcpu_dy_runnable(vcpu))
                                continue;
                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-                               !kvm_arch_vcpu_in_kernel(vcpu))
+                           !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+                           !kvm_arch_vcpu_in_kernel(vcpu))
                                continue;
                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
@@ -3182,7 +3323,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        if (r)
                goto vcpu_decrement;
 
-       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
@@ -4062,6 +4203,12 @@ static struct file_operations kvm_vm_fops = {
        KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+/*
+ * Return true if @file is non-NULL and its file operations are kvm_vm_fops,
+ * false otherwise (including for a NULL @file).
+ */
+bool file_is_kvm(struct file *file)
+{
+       return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
        int r;
@@ -4485,24 +4632,26 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
        return 0;
 }
 
-/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                              struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                             struct kvm_io_device *dev)
 {
        int i, j;
        struct kvm_io_bus *new_bus, *bus;
 
+       lockdep_assert_held(&kvm->slots_lock);
+
        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
-               return;
+               return 0;
 
-       for (i = 0; i < bus->dev_count; i++)
+       for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev) {
                        break;
                }
+       }
 
        if (i == bus->dev_count)
-               return;
+               return 0;
 
        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
@@ -4511,7 +4660,13 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                                flex_array_size(new_bus, range, new_bus->dev_count - i));
-       } else {
+       }
+
+       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+       synchronize_srcu_expedited(&kvm->srcu);
+
+       /* Destroy the old bus _after_ installing the (null) bus. */
+       if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                for (j = 0; j < bus->dev_count; j++) {
                        if (j == i)
@@ -4520,10 +4675,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                }
        }
 
-       rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-       synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
-       return;
+       return new_bus ? 0 : -ENOMEM;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,