Merge tag 'zonefs-6.5-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 12 Aug 2023 01:35:56 +0000 (18:35 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 12 Aug 2023 01:35:56 +0000 (18:35 -0700)
Pull zonefs fix from Damien Le Moal:

 - The switch to using iomap for executing a direct synchronous write to
   sequential files using a zone append BIO overlooked cases where the
   BIO built by iomap is too large and needs splitting, which is not
   allowed with zone append.

   Fix this by using regular write commands instead. The use of zone
   append commands will be reintroduced later with proper support from
   iomap.

* tag 'zonefs-6.5-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs:
  zonefs: fix synchronous direct writes to sequential files
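The fix described in the pull message above boils down to one constraint: a zone append BIO must be issued as a single BIO, so a synchronous direct write larger than the device's zone-append limit cannot be submitted that way and has to fall back to a regular write issued at the zone's write pointer. The following is a minimal, self-contained userspace sketch of that size check only; the helper name and the 64 KiB limit are assumptions made for the example and are not taken from the zonefs code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Illustrative only: the block layer cannot split a zone append BIO, so
 * a write larger than the (hypothetical) per-BIO zone-append limit below
 * must be issued as a regular write at the zone write pointer instead.
 */
#define SECTOR_SIZE            512UL
#define MAX_ZONE_APPEND_BYTES  (128UL * SECTOR_SIZE)   /* assumed queue limit */

static bool can_use_zone_append(unsigned long write_bytes)
{
	/* Zone append is only safe if the whole write fits in one BIO. */
	return write_bytes <= MAX_ZONE_APPEND_BYTES;
}

int main(void)
{
	unsigned long sizes[] = { 4096, 65536, 1048576 };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%8lu bytes -> %s\n", sizes[i],
		       can_use_zone_append(sizes[i]) ?
		       "zone append" : "regular write (append would need a split)");
	return 0;
}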

261 files changed:
Documentation/ABI/testing/sysfs-devices-system-cpu
Documentation/ABI/testing/sysfs-platform-hidma
Documentation/ABI/testing/sysfs-platform-hidma-mgmt
Documentation/admin-guide/hw-vuln/gather_data_sampling.rst [new file with mode: 0644]
Documentation/admin-guide/hw-vuln/index.rst
Documentation/admin-guide/hw-vuln/srso.rst [new file with mode: 0644]
Documentation/admin-guide/kernel-parameters.txt
MAINTAINERS
arch/arm64/include/asm/el2_setup.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_emulate.h
arch/arm64/kvm/arm.c
arch/arm64/kvm/hyp/include/hyp/switch.h
arch/arm64/kvm/hyp/nvhe/ffa.c
arch/arm64/kvm/hyp/nvhe/switch.c
arch/parisc/Kconfig.debug
arch/parisc/boot/compressed/misc.c
arch/parisc/include/asm/dma.h
arch/parisc/include/asm/ftrace.h
arch/parisc/include/asm/spinlock.h
arch/parisc/include/asm/spinlock_types.h
arch/parisc/kernel/firmware.c
arch/parisc/kernel/ftrace.c
arch/parisc/kernel/parisc_ksyms.c
arch/parisc/kernel/pci-dma.c
arch/parisc/kernel/pdt.c
arch/parisc/kernel/perf.c
arch/parisc/kernel/processor.c
arch/parisc/kernel/setup.c
arch/parisc/kernel/signal.c
arch/parisc/kernel/sys_parisc.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/unaligned.c
arch/parisc/lib/ucmpdi2.c
arch/parisc/mm/fault.c
arch/parisc/mm/init.c
arch/parisc/mm/ioremap.c
arch/riscv/include/asm/cacheflush.h
arch/riscv/include/asm/mmio.h
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/vmalloc.h
arch/riscv/kernel/cpu.c
arch/riscv/kernel/elf_kexec.c
arch/riscv/kernel/smp.c
arch/riscv/mm/init.c
arch/riscv/mm/kasan_init.c
arch/x86/Kconfig
arch/x86/include/asm/acpi.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/nospec-branch.h
arch/x86/include/asm/processor.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cpu.h
arch/x86/kernel/traps.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/cpuid.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/vmenter.S
arch/x86/kvm/x86.c
arch/x86/lib/retpoline.S
block/blk-core.c
block/blk-iocost.c
block/fops.c
drivers/accel/ivpu/ivpu_gem.c
drivers/acpi/resource.c
drivers/base/cpu.c
drivers/block/zram/zram_drv.c
drivers/char/tpm/tpm-chip.c
drivers/char/tpm/tpm_crb.c
drivers/char/tpm/tpm_tis.c
drivers/cpufreq/amd-pstate.c
drivers/cpuidle/cpuidle-psci-domain.c
drivers/cpuidle/dt_idle_genpd.c
drivers/cpuidle/dt_idle_genpd.h
drivers/dma/Kconfig
drivers/dma/idxd/device.c
drivers/dma/mcf-edma.c
drivers/dma/owl-dma.c
drivers/dma/pl330.c
drivers/dma/xilinx/xdma.c
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
drivers/gpu/drm/amd/amdkfd/kfd_crat.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
drivers/gpu/drm/bridge/ite-it6505.c
drivers/gpu/drm/bridge/lontium-lt9611.c
drivers/gpu/drm/drm_gem_shmem_helper.c
drivers/gpu/drm/nouveau/nouveau_connector.c
drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h
drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk110b.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk208.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c
drivers/gpu/drm/rockchip/rockchip_drm_vop.c
drivers/hwmon/aquacomputer_d5next.c
drivers/hwmon/pmbus/bel-pfe.c
drivers/isdn/mISDN/dsp.h
drivers/isdn/mISDN/dsp_cmx.c
drivers/isdn/mISDN/dsp_core.c
drivers/media/platform/qcom/venus/hfi_cmds.c
drivers/mmc/host/moxart-mmc.c
drivers/mmc/host/sdhci_f_sdh30.c
drivers/net/bonding/bond_main.c
drivers/net/dsa/ocelot/felix.c
drivers/net/ethernet/freescale/enetc/enetc_pf.c
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
drivers/net/ethernet/ibm/ibmvnic.c
drivers/net/ethernet/intel/iavf/iavf_ethtool.c
drivers/net/ethernet/intel/iavf/iavf_fdir.c
drivers/net/ethernet/intel/igc/igc.h
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/marvell/prestera/prestera_router.c
drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c
drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
drivers/net/ethernet/mellanox/mlx5/core/sriov.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c
drivers/net/ethernet/microsoft/mana/mana_en.c
drivers/net/ethernet/pensando/ionic/ionic_lif.c
drivers/net/macsec.c
drivers/net/phy/at803x.c
drivers/net/tun.c
drivers/net/vxlan/vxlan_vnifilter.c
drivers/net/wireguard/allowedips.c
drivers/net/wireguard/selftest/allowedips.c
drivers/net/wireless/ath/ath12k/wmi.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
drivers/net/wireless/realtek/rtw89/mac.c
drivers/net/xen-netback/netback.c
drivers/nvme/host/core.c
drivers/nvme/host/ioctl.c
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/parisc/sba_iommu.c
drivers/pci/bus.c
drivers/pci/controller/Kconfig
drivers/pci/controller/dwc/pcie-designware-host.c
drivers/pci/controller/dwc/pcie-designware.c
drivers/pci/controller/dwc/pcie-designware.h
drivers/pci/hotplug/acpiphp_glue.c
drivers/pci/of.c
fs/gfs2/file.c
fs/gfs2/trans.c
fs/nilfs2/inode.c
fs/nilfs2/segment.c
fs/nilfs2/the_nilfs.h
fs/proc/kcore.c
fs/smb/server/smb2misc.c
fs/smb/server/smb2pdu.c
fs/vboxsf/shfl_hostintf.h
include/linux/bio.h
include/linux/blkdev.h
include/linux/cpu.h
include/linux/skmsg.h
include/linux/tpm.h
include/net/cfg80211.h
include/net/netfilter/nf_tables.h
include/trace/events/tcp.h
io_uring/io_uring.c
io_uring/openclose.c
kernel/power/hibernate.c
kernel/workqueue.c
lib/Kconfig.debug
lib/scatterlist.c
mm/compaction.c
mm/damon/core.c
mm/hugetlb.c
mm/ksm.c
mm/memory-failure.c
mm/memory.c
mm/swapfile.c
mm/zsmalloc.c
net/8021q/vlan.c
net/core/filter.c
net/core/skmsg.c
net/core/sock.c
net/core/sock_map.c
net/dccp/output.c
net/dccp/proto.c
net/ipv4/ip_tunnel_core.c
net/ipv4/nexthop.c
net/ipv6/ndisc.c
net/mptcp/protocol.c
net/mptcp/protocol.h
net/mptcp/subflow.c
net/netfilter/nf_tables_api.c
net/netfilter/nft_set_hash.c
net/netfilter/nft_set_pipapo.c
net/netfilter/nft_set_rbtree.c
net/packet/af_packet.c
net/smc/af_smc.c
net/smc/smc.h
net/smc/smc_clc.c
net/smc/smc_core.c
net/smc/smc_sysctl.c
net/tls/tls_device.c
net/tls/tls_main.c
net/wireless/nl80211.c
net/xdp/xsk.c
security/keys/sysctl.c
tools/arch/x86/include/asm/cpufeatures.h
tools/arch/x86/include/asm/msr-index.h
tools/objtool/arch/x86/decode.c
tools/perf/util/machine.c
tools/perf/util/stat-display.c
tools/testing/radix-tree/regression1.c
tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
tools/testing/selftests/bpf/progs/test_sockmap_listen.c
tools/testing/selftests/cgroup/test_kmem.c
tools/testing/selftests/mm/ksm_tests.c
tools/testing/selftests/net/fib_nexthops.sh
tools/testing/selftests/net/forwarding/bridge_mdb.sh
tools/testing/selftests/net/forwarding/bridge_mdb_max.sh
tools/testing/selftests/net/forwarding/ethtool.sh
tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
tools/testing/selftests/net/forwarding/ethtool_mm.sh
tools/testing/selftests/net/forwarding/hw_stats_l3_gre.sh
tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh
tools/testing/selftests/net/forwarding/lib.sh
tools/testing/selftests/net/forwarding/settings [new file with mode: 0644]
tools/testing/selftests/net/forwarding/tc_actions.sh
tools/testing/selftests/net/forwarding/tc_flower.sh
tools/testing/selftests/net/forwarding/tc_flower_l2_miss.sh
tools/testing/selftests/net/forwarding/tc_tunnel_key.sh
tools/testing/selftests/net/mptcp/mptcp_join.sh
tools/testing/selftests/net/pmtu.sh
tools/testing/selftests/rseq/Makefile
tools/testing/selftests/rseq/rseq.c

index ecd585c..77942ee 100644 (file)
@@ -513,17 +513,18 @@ Description:      information about CPUs heterogeneity.
                cpu_capacity: capacity of cpuX.
 
 What:          /sys/devices/system/cpu/vulnerabilities
+               /sys/devices/system/cpu/vulnerabilities/gather_data_sampling
+               /sys/devices/system/cpu/vulnerabilities/itlb_multihit
+               /sys/devices/system/cpu/vulnerabilities/l1tf
+               /sys/devices/system/cpu/vulnerabilities/mds
                /sys/devices/system/cpu/vulnerabilities/meltdown
+               /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+               /sys/devices/system/cpu/vulnerabilities/retbleed
+               /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
                /sys/devices/system/cpu/vulnerabilities/spectre_v1
                /sys/devices/system/cpu/vulnerabilities/spectre_v2
-               /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
-               /sys/devices/system/cpu/vulnerabilities/l1tf
-               /sys/devices/system/cpu/vulnerabilities/mds
                /sys/devices/system/cpu/vulnerabilities/srbds
                /sys/devices/system/cpu/vulnerabilities/tsx_async_abort
-               /sys/devices/system/cpu/vulnerabilities/itlb_multihit
-               /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
-               /sys/devices/system/cpu/vulnerabilities/retbleed
 Date:          January 2018
 Contact:       Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:   Information about CPU vulnerabilities
index fca40a5..a80aeda 100644 (file)
@@ -2,7 +2,7 @@ What:           /sys/devices/platform/hidma-*/chid
                /sys/devices/platform/QCOM8061:*/chid
 Date:          Dec 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains the ID of the channel within the HIDMA instance.
                It is used to associate a given HIDMA channel with the
index 3b6c5c9..0373745 100644 (file)
@@ -2,7 +2,7 @@ What:           /sys/devices/platform/hidma-mgmt*/chanops/chan*/priority
                /sys/devices/platform/QCOM8060:*/chanops/chan*/priority
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains either 0 or 1 and indicates if the DMA channel is a
                low priority (0) or high priority (1) channel.
@@ -11,7 +11,7 @@ What:         /sys/devices/platform/hidma-mgmt*/chanops/chan*/weight
                /sys/devices/platform/QCOM8060:*/chanops/chan*/weight
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains 0..15 and indicates the weight of the channel among
                equal priority channels during round robin scheduling.
@@ -20,7 +20,7 @@ What:         /sys/devices/platform/hidma-mgmt*/chreset_timeout_cycles
                /sys/devices/platform/QCOM8060:*/chreset_timeout_cycles
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains the platform specific cycle value to wait after a
                reset command is issued. If the value is chosen too short,
@@ -32,7 +32,7 @@ What:         /sys/devices/platform/hidma-mgmt*/dma_channels
                /sys/devices/platform/QCOM8060:*/dma_channels
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains the number of dma channels supported by one instance
                of HIDMA hardware. The value may change from chip to chip.
@@ -41,7 +41,7 @@ What:         /sys/devices/platform/hidma-mgmt*/hw_version_major
                /sys/devices/platform/QCOM8060:*/hw_version_major
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Version number major for the hardware.
 
@@ -49,7 +49,7 @@ What:         /sys/devices/platform/hidma-mgmt*/hw_version_minor
                /sys/devices/platform/QCOM8060:*/hw_version_minor
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Version number minor for the hardware.
 
@@ -57,7 +57,7 @@ What:         /sys/devices/platform/hidma-mgmt*/max_rd_xactions
                /sys/devices/platform/QCOM8060:*/max_rd_xactions
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains a value between 0 and 31. Maximum number of
                read transactions that can be issued back to back.
@@ -69,7 +69,7 @@ What:         /sys/devices/platform/hidma-mgmt*/max_read_request
                /sys/devices/platform/QCOM8060:*/max_read_request
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Size of each read request. The value needs to be a power
                of two and can be between 128 and 1024.
@@ -78,7 +78,7 @@ What:         /sys/devices/platform/hidma-mgmt*/max_wr_xactions
                /sys/devices/platform/QCOM8060:*/max_wr_xactions
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Contains a value between 0 and 31. Maximum number of
                write transactions that can be issued back to back.
@@ -91,7 +91,7 @@ What:         /sys/devices/platform/hidma-mgmt*/max_write_request
                /sys/devices/platform/QCOM8060:*/max_write_request
 Date:          Nov 2015
 KernelVersion: 4.4
-Contact:       "Sinan Kaya <okaya@codeaurora.org>"
+Contact:       "Sinan Kaya <okaya@kernel.org>"
 Description:
                Size of each write request. The value needs to be a power
                of two and can be between 128 and 1024.
diff --git a/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst b/Documentation/admin-guide/hw-vuln/gather_data_sampling.rst
new file mode 100644 (file)
index 0000000..264bfa9
--- /dev/null
@@ -0,0 +1,109 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+GDS - Gather Data Sampling
+==========================
+
+Gather Data Sampling is a hardware vulnerability which allows unprivileged
+speculative access to data which was previously stored in vector registers.
+
+Problem
+-------
+When a gather instruction performs loads from memory, different data elements
+are merged into the destination vector register. However, when a gather
+instruction that is transiently executed encounters a fault, stale data from
+architectural or internal vector registers may get transiently forwarded to the
+destination vector register instead. This will allow a malicious attacker to
+infer stale data using typical side channel techniques like cache timing
+attacks. GDS is a purely sampling-based attack.
+
+The attacker uses gather instructions to infer the stale vector register data.
+The victim does not need to do anything special other than use the vector
+registers. The victim does not need to use gather instructions to be
+vulnerable.
+
+Because the buffers are shared between Hyper-Threads, cross Hyper-Thread attacks
+are possible.
+
+Attack scenarios
+----------------
+Without mitigation, GDS can infer stale data across virtually all
+permission boundaries:
+
+       Non-enclaves can infer SGX enclave data
+       Userspace can infer kernel data
+       Guests can infer data from hosts
+       Guest can infer guest from other guests
+       Users can infer data from other users
+
+Because of this, it is important to ensure that the mitigation stays enabled in
+lower-privilege contexts like guests and when running outside SGX enclaves.
+
+The hardware enforces the mitigation for SGX. Likewise, VMMs should ensure
+that guests are not allowed to disable the GDS mitigation. If a host erred and
+allowed this, a guest could theoretically disable GDS mitigation, mount an
+attack, and re-enable it.
+
+Mitigation mechanism
+--------------------
+This issue is mitigated in microcode. The microcode defines the following new
+bits:
+
+ ================================   ===   ============================
+ IA32_ARCH_CAPABILITIES[GDS_CTRL]   R/O   Enumerates GDS vulnerability
+                                          and mitigation support.
+ IA32_ARCH_CAPABILITIES[GDS_NO]     R/O   Processor is not vulnerable.
+ IA32_MCU_OPT_CTRL[GDS_MITG_DIS]    R/W   Disables the mitigation
+                                          0 by default.
+ IA32_MCU_OPT_CTRL[GDS_MITG_LOCK]   R/W   Locks GDS_MITG_DIS=0. Writes
+                                          to GDS_MITG_DIS are ignored
+                                          Can't be cleared once set.
+ ================================   ===   ============================
+
+GDS can also be mitigated on systems that don't have updated microcode by
+disabling AVX. This can be done by setting gather_data_sampling="force" or
+"clearcpuid=avx" on the kernel command-line.
+
+If used, these options will disable AVX use by turning off XSAVE YMM support.
+However, the processor will still enumerate AVX support.  Userspace that
+does not follow proper AVX enumeration to check both AVX *and* XSAVE YMM
+support will break.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The mitigation can be disabled by setting "gather_data_sampling=off" or
+"mitigations=off" on the kernel command line. Not specifying either will default
+to the mitigation being enabled. Specifying "gather_data_sampling=force" will
+use the microcode mitigation when available or disable AVX on affected systems
+where the microcode hasn't been updated to include the mitigation.
+
+GDS System Information
+------------------------
+The kernel provides vulnerability status information through sysfs. For
+GDS this can be accessed by the following sysfs file:
+
+/sys/devices/system/cpu/vulnerabilities/gather_data_sampling
+
+The possible values contained in this file are:
+
+ ============================== =============================================
+ Not affected                   Processor not vulnerable.
+ Vulnerable                     Processor vulnerable and mitigation disabled.
+ Vulnerable: No microcode       Processor vulnerable and microcode is missing
+                                mitigation.
+ Mitigation: AVX disabled,
+ no microcode                   Processor is vulnerable and microcode is missing
+                                mitigation. AVX disabled as mitigation.
+ Mitigation: Microcode          Processor is vulnerable and mitigation is in
+                                effect.
+ Mitigation: Microcode (locked) Processor is vulnerable and mitigation is in
+                                effect and cannot be disabled.
+ Unknown: Dependent on
+ hypervisor status              Running on a virtual guest processor that is
+                                affected but with no way to know if host
+                                processor is mitigated or vulnerable.
+ ============================== =============================================
+
+GDS Default mitigation
+----------------------
+The updated microcode will enable the mitigation by default. The kernel's
+default action is to leave the mitigation enabled.
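The document above warns that the AVX-disable fallback only turns off XSAVE YMM state while CPUID keeps enumerating AVX, so userspace must check both the AVX feature bit and the OS-enabled YMM state. The sketch below is generic x86 enumeration code written for this note, not part of the merge; it uses the standard CPUID leaf 1 bits plus XGETBV to read XCR0.

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Proper AVX enumeration: the AVX CPUID bit alone is not enough, because
 * the kernel may have disabled XSAVE YMM state (e.g. with
 * gather_data_sampling=force).  Check OSXSAVE + AVX in CPUID.1:ECX and
 * then confirm XMM and YMM state are enabled in XCR0.
 */
static bool avx_usable(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int xcr0_lo, xcr0_hi;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	if (!(ecx & bit_OSXSAVE) || !(ecx & bit_AVX))
		return false;

	/* XGETBV with ECX=0 reads XCR0; bits 1 and 2 cover XMM and YMM. */
	__asm__ volatile("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
	return (xcr0_lo & 0x6) == 0x6;
}

int main(void)
{
	printf("AVX usable: %s\n", avx_usable() ? "yes" : "no");
	return 0;
}

The mitigation state itself is reported through the sysfs file quoted in the document, /sys/devices/system/cpu/vulnerabilities/gather_data_sampling.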
index e061476..a7d37e1 100644 (file)
@@ -19,3 +19,5 @@ are configurable at compile, boot or run time.
    l1d_flush.rst
    processor_mmio_stale_data.rst
    cross-thread-rsb.rst
+   srso
+   gather_data_sampling.rst
diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst
new file mode 100644 (file)
index 0000000..32eb5e6
--- /dev/null
@@ -0,0 +1,133 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Speculative Return Stack Overflow (SRSO)
+========================================
+
+This is a mitigation for the speculative return stack overflow (SRSO)
+vulnerability found on AMD processors. The mechanism is by now the well
+known scenario of poisoning CPU functional units - the Branch Target
+Buffer (BTB) and Return Address Predictor (RAP) in this case - and then
+tricking the elevated privilege domain (the kernel) into leaking
+sensitive data.
+
+AMD CPUs predict RET instructions using a Return Address Predictor (aka
+Return Address Stack/Return Stack Buffer). In some cases, a non-architectural
+CALL instruction (i.e., an instruction predicted to be a CALL but is
+not actually a CALL) can create an entry in the RAP which may be used
+to predict the target of a subsequent RET instruction.
+
+The specific circumstances that lead to this varies by microarchitecture
+but the concern is that an attacker can mis-train the CPU BTB to predict
+non-architectural CALL instructions in kernel space and use this to
+control the speculative target of a subsequent kernel RET, potentially
+leading to information disclosure via a speculative side-channel.
+
+The issue is tracked under CVE-2023-20569.
+
+Affected processors
+-------------------
+
+AMD Zen, generations 1-4. That is, all families 0x17 and 0x19. Older
+processors have not been investigated.
+
+System information and options
+------------------------------
+
+First of all, it is required that the latest microcode be loaded for
+mitigations to be effective.
+
+The sysfs file showing SRSO mitigation status is:
+
+  /sys/devices/system/cpu/vulnerabilities/spec_rstack_overflow
+
+The possible values in this file are:
+
+ - 'Not affected'               The processor is not vulnerable
+
+ - 'Vulnerable: no microcode'   The processor is vulnerable, no
+                                microcode extending IBPB functionality
+                                to address the vulnerability has been
+                                applied.
+
+ - 'Mitigation: microcode'      Extended IBPB functionality microcode
+                                patch has been applied. It does not
+                                address User->Kernel and Guest->Host
+                                transitions protection but it does
+                                address User->User and VM->VM attack
+                                vectors.
+
+                                (spec_rstack_overflow=microcode)
+
+ - 'Mitigation: safe RET'       Software-only mitigation. It complements
+                                the extended IBPB microcode patch
+                                functionality by addressing User->Kernel 
+                                and Guest->Host transitions protection.
+
+                                Selected by default or by
+                                spec_rstack_overflow=safe-ret
+
+ - 'Mitigation: IBPB'           Similar protection as "safe RET" above
+                                but employs an IBPB barrier on privilege
+                                domain crossings (User->Kernel,
+                                Guest->Host).
+
+                                (spec_rstack_overflow=ibpb)
+
+ - 'Mitigation: IBPB on VMEXIT' Mitigation addressing the cloud provider
+                                scenario - the Guest->Host transitions
+                                only.
+
+                                (spec_rstack_overflow=ibpb-vmexit)
+
+In order to exploit the vulnerability, an attacker needs to:
+
+ - gain local access on the machine
+
+ - break kASLR
+
+ - find gadgets in the running kernel in order to use them in the exploit
+
+ - potentially create and pin an additional workload on the sibling
+   thread, depending on the microarchitecture (not necessary on fam 0x19)
+
+ - run the exploit
+
+Considering the performance implications of each mitigation type, the
+default one is 'Mitigation: safe RET' which should take care of most
+attack vectors, including the local User->Kernel one.
+
+As always, the user is advised to keep her/his system up-to-date by
+applying software updates regularly.
+
+The default setting will be reevaluated when needed and especially when
+new attack vectors appear.
+
+As one can surmise, 'Mitigation: safe RET' does come at the cost of some
+performance depending on the workload. If one trusts her/his userspace
+and does not want to suffer the performance impact, one can always
+disable the mitigation with spec_rstack_overflow=off.
+
+Similarly, 'Mitigation: IBPB' is another full mitigation type employing
+an indirect branch prediction barrier after having applied the required
+microcode patch for one's system. This mitigation comes also at
+a performance cost.
+
+Mitigation: safe RET
+--------------------
+
+The mitigation works by ensuring all RET instructions speculate to
+a controlled location, similar to how speculation is controlled in the
+retpoline sequence.  To accomplish this, the __x86_return_thunk forces
+the CPU to mispredict every function return using a 'safe return'
+sequence.
+
+To ensure the safety of this mitigation, the kernel must ensure that the
+safe return sequence is itself free from attacker interference.  In Zen3
+and Zen4, this is accomplished by creating a BTB alias between the
+untraining function srso_untrain_ret_alias() and the safe return
+function srso_safe_ret_alias() which results in evicting a potentially
+poisoned BTB entry and using that safe one for all function returns.
+
+In older Zen1 and Zen2, this is accomplished using a reinterpretation
+technique similar to Retbleed one: srso_untrain_ret() and
+srso_safe_ret().
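As a small usage note for the sysfs interface the SRSO document introduces, the status string can be read with plain file I/O; the sketch below assumes nothing beyond the path quoted above and keeps error handling minimal.

#include <stdio.h>

/*
 * Print the SRSO mitigation status string.  The path comes from the
 * documentation above; everything else is ordinary C file I/O used
 * purely for illustration.
 */
int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/vulnerabilities/spec_rstack_overflow";
	char status[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(status, sizeof(status), f))
		printf("SRSO status: %s", status);
	fclose(f);
	return 0;
}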
index a145799..722b6ec 100644 (file)
                        Format: off | on
                        default: on
 
+       gather_data_sampling=
+                       [X86,INTEL] Control the Gather Data Sampling (GDS)
+                       mitigation.
+
+                       Gather Data Sampling is a hardware vulnerability which
+                       allows unprivileged speculative access to data which was
+                       previously stored in vector registers.
+
+                       This issue is mitigated by default in updated microcode.
+                       The mitigation may have a performance impact but can be
+                       disabled. On systems without the microcode mitigation
+                       disabling AVX serves as a mitigation.
+
+                       force:  Disable AVX to mitigate systems without
+                               microcode mitigation. No effect if the microcode
+                               mitigation is present. Known to cause crashes in
+                               userspace with buggy AVX enumeration.
+
+                       off:    Disable GDS mitigation.
+
        gcov_persist=   [GCOV] When non-zero (default), profiling data for
                        kernel modules is saved and remains accessible via
                        debugfs, even when the module is unloaded/reloaded.
                                Disable all optional CPU mitigations.  This
                                improves system performance, but it may also
                                expose users to several CPU vulnerabilities.
-                               Equivalent to: nopti [X86,PPC]
-                                              if nokaslr then kpti=0 [ARM64]
-                                              nospectre_v1 [X86,PPC]
-                                              nobp=0 [S390]
-                                              nospectre_v2 [X86,PPC,S390,ARM64]
-                                              spectre_v2_user=off [X86]
-                                              spec_store_bypass_disable=off [X86,PPC]
-                                              ssbd=force-off [ARM64]
-                                              nospectre_bhb [ARM64]
+                               Equivalent to: if nokaslr then kpti=0 [ARM64]
+                                              gather_data_sampling=off [X86]
+                                              kvm.nx_huge_pages=off [X86]
                                               l1tf=off [X86]
                                               mds=off [X86]
-                                              tsx_async_abort=off [X86]
-                                              kvm.nx_huge_pages=off [X86]
-                                              srbds=off [X86,INTEL]
+                                              mmio_stale_data=off [X86]
                                               no_entry_flush [PPC]
                                               no_uaccess_flush [PPC]
-                                              mmio_stale_data=off [X86]
+                                              nobp=0 [S390]
+                                              nopti [X86,PPC]
+                                              nospectre_bhb [ARM64]
+                                              nospectre_v1 [X86,PPC]
+                                              nospectre_v2 [X86,PPC,S390,ARM64]
                                               retbleed=off [X86]
+                                              spec_store_bypass_disable=off [X86,PPC]
+                                              spectre_v2_user=off [X86]
+                                              srbds=off [X86,INTEL]
+                                              ssbd=force-off [ARM64]
+                                              tsx_async_abort=off [X86]
 
                                Exceptions:
                                               This does not have any effect on
                        Not specifying this option is equivalent to
                        spectre_v2_user=auto.
 
+       spec_rstack_overflow=
+                       [X86] Control RAS overflow mitigation on AMD Zen CPUs
+
+                       off             - Disable mitigation
+                       microcode       - Enable microcode mitigation only
+                       safe-ret        - Enable sw-only safe RET mitigation (default)
+                       ibpb            - Enable mitigation by issuing IBPB on
+                                         kernel entry
+                       ibpb-vmexit     - Issue IBPB only on VMEXIT
+                                         (cloud-specific mitigation)
+
        spec_store_bypass_disable=
                        [HW] Control Speculative Store Bypass (SSB) Disable mitigation
                        (Speculative Store Bypass vulnerability)
index 0f966f0..8635305 100644 (file)
@@ -2339,7 +2339,7 @@ F:        drivers/phy/mediatek/
 ARM/MICROCHIP (ARM64) SoC support
 M:     Conor Dooley <conor@kernel.org>
 M:     Nicolas Ferre <nicolas.ferre@microchip.com>
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Supported
 T:     git https://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git
@@ -2348,7 +2348,7 @@ F:        arch/arm64/boot/dts/microchip/
 ARM/Microchip (AT91) SoC support
 M:     Nicolas Ferre <nicolas.ferre@microchip.com>
 M:     Alexandre Belloni <alexandre.belloni@bootlin.com>
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Supported
 W:     http://www.linux4sam.org
@@ -3250,7 +3250,7 @@ F:        include/uapi/linux/atm*
 
 ATMEL MACB ETHERNET DRIVER
 M:     Nicolas Ferre <nicolas.ferre@microchip.com>
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 S:     Supported
 F:     drivers/net/ethernet/cadence/
 
@@ -12480,6 +12480,7 @@ F:      net/mctp/
 
 MAPLE TREE
 M:     Liam R. Howlett <Liam.Howlett@oracle.com>
+L:     maple-tree@lists.infradead.org
 L:     linux-mm@kvack.org
 S:     Supported
 F:     Documentation/core-api/maple_tree.rst
@@ -13786,7 +13787,7 @@ F:      Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml
 F:     drivers/spi/spi-at91-usart.c
 
 MICROCHIP AUDIO ASOC DRIVERS
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:     Supported
 F:     Documentation/devicetree/bindings/sound/atmel*
@@ -13809,7 +13810,7 @@ S:      Maintained
 F:     drivers/crypto/atmel-ecc.*
 
 MICROCHIP EIC DRIVER
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Supported
 F:     Documentation/devicetree/bindings/interrupt-controller/microchip,sama7g5-eic.yaml
@@ -13882,7 +13883,7 @@ F:      drivers/video/fbdev/atmel_lcdfb.c
 F:     include/video/atmel_lcdc.h
 
 MICROCHIP MCP16502 PMIC DRIVER
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Supported
 F:     Documentation/devicetree/bindings/regulator/mcp16502-regulator.txt
@@ -13909,7 +13910,7 @@ F:      Documentation/devicetree/bindings/mtd/atmel-nand.txt
 F:     drivers/mtd/nand/raw/atmel/*
 
 MICROCHIP OTPC DRIVER
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Supported
 F:     Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml
@@ -13948,7 +13949,7 @@ F:      Documentation/devicetree/bindings/fpga/microchip,mpf-spi-fpga-mgr.yaml
 F:     drivers/fpga/microchip-spi.c
 
 MICROCHIP PWM DRIVER
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 L:     linux-pwm@vger.kernel.org
 S:     Supported
@@ -13964,7 +13965,7 @@ F:      drivers/iio/adc/at91-sama5d2_adc.c
 F:     include/dt-bindings/iio/adc/at91-sama5d2_adc.h
 
 MICROCHIP SAMA5D2-COMPATIBLE SHUTDOWN CONTROLLER
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 S:     Supported
 F:     Documentation/devicetree/bindings/power/reset/atmel,sama5d2-shdwc.yaml
 F:     drivers/power/reset/at91-sama5d2_shdwc.c
@@ -13981,7 +13982,7 @@ S:      Supported
 F:     drivers/spi/spi-atmel.*
 
 MICROCHIP SSC DRIVER
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Supported
 F:     Documentation/devicetree/bindings/misc/atmel-ssc.txt
@@ -14010,7 +14011,7 @@ F:      drivers/usb/gadget/udc/atmel_usba_udc.*
 
 MICROCHIP WILC1000 WIFI DRIVER
 M:     Ajay Singh <ajay.kathat@microchip.com>
-M:     Claudiu Beznea <claudiu.beznea@microchip.com>
+M:     Claudiu Beznea <claudiu.beznea@tuxon.dev>
 L:     linux-wireless@vger.kernel.org
 S:     Supported
 F:     drivers/net/wireless/microchip/wilc1000/
@@ -16293,6 +16294,7 @@ F:      drivers/pci/controller/dwc/pci-exynos.c
 PCI DRIVER FOR SYNOPSYS DESIGNWARE
 M:     Jingoo Han <jingoohan1@gmail.com>
 M:     Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+M:     Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
 L:     linux-pci@vger.kernel.org
 S:     Maintained
 F:     Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml
@@ -18508,17 +18510,14 @@ RTL8180 WIRELESS DRIVER
 L:     linux-wireless@vger.kernel.org
 S:     Orphan
 W:     https://wireless.wiki.kernel.org/
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
 F:     drivers/net/wireless/realtek/rtl818x/rtl8180/
 
 RTL8187 WIRELESS DRIVER
-M:     Herton Ronaldo Krzesinski <herton@canonical.com>
-M:     Hin-Tak Leung <htl10@users.sourceforge.net>
+M:     Hin-Tak Leung <hintak.leung@gmail.com>
 M:     Larry Finger <Larry.Finger@lwfinger.net>
 L:     linux-wireless@vger.kernel.org
 S:     Maintained
 W:     https://wireless.wiki.kernel.org/
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
 F:     drivers/net/wireless/realtek/rtl818x/rtl8187/
 
 RTL8XXXU WIRELESS DRIVER (rtl8xxxu)
index 8e5ffb5..b7afaa0 100644 (file)
 .Lskip_hcrx_\@:
 .endm
 
+/* Check if running in host at EL2 mode, i.e., (h)VHE. Jump to fail if not. */
+.macro __check_hvhe fail, tmp
+       mrs     \tmp, hcr_el2
+       and     \tmp, \tmp, #HCR_E2H
+       cbz     \tmp, \fail
+.endm
+
 /*
  * Allow Non-secure EL1 and EL0 to access physical timer and counter.
  * This is not necessary for VHE, since the host kernel runs in EL2,
@@ -43,9 +50,7 @@
  */
 .macro __init_el2_timers
        mov     x0, #3                          // Enable EL1 physical timers
-       mrs     x1, hcr_el2
-       and     x1, x1, #HCR_E2H
-       cbz     x1, .LnVHE_\@
+       __check_hvhe .LnVHE_\@, x1
        lsl     x0, x0, #10
 .LnVHE_\@:
        msr     cnthctl_el2, x0
 
 /* Coprocessor traps */
 .macro __init_el2_cptr
-       mrs     x1, hcr_el2
-       and     x1, x1, #HCR_E2H
-       cbz     x1, .LnVHE_\@
+       __check_hvhe .LnVHE_\@, x1
        mov     x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN)
-       b       .Lset_cptr_\@
+       msr     cpacr_el1, x0
+       b       .Lskip_set_cptr_\@
 .LnVHE_\@:
        mov     x0, #0x33ff
-.Lset_cptr_\@:
        msr     cptr_el2, x0                    // Disable copro. traps to EL2
+.Lskip_set_cptr_\@:
 .endm
 
 /* Disable any fine grained traps */
        check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2
 
 .Linit_sve_\@: /* SVE register access */
-       mrs     x0, cptr_el2                    // Disable SVE traps
-       mrs     x1, hcr_el2
-       and     x1, x1, #HCR_E2H
-       cbz     x1, .Lcptr_nvhe_\@
+       __check_hvhe .Lcptr_nvhe_\@, x1
 
-       // VHE case
+       // (h)VHE case
+       mrs     x0, cpacr_el1                   // Disable SVE traps
        orr     x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN)
-       b       .Lset_cptr_\@
+       msr     cpacr_el1, x0
+       b       .Lskip_set_cptr_\@
 
 .Lcptr_nvhe_\@: // nVHE case
+       mrs     x0, cptr_el2                    // Disable SVE traps
        bic     x0, x0, #CPTR_EL2_TZ
-.Lset_cptr_\@:
        msr     cptr_el2, x0
+.Lskip_set_cptr_\@:
        isb
        mov     x1, #ZCR_ELx_LEN_MASK           // SVE: Enable full vector
        msr_s   SYS_ZCR_EL2, x1                 // length for EL1.
        check_override id_aa64pfr1, ID_AA64PFR1_EL1_SME_SHIFT, .Linit_sme_\@, .Lskip_sme_\@, x1, x2
 
 .Linit_sme_\@: /* SME register access and priority mapping */
+       __check_hvhe .Lcptr_nvhe_sme_\@, x1
+
+       // (h)VHE case
+       mrs     x0, cpacr_el1                   // Disable SME traps
+       orr     x0, x0, #(CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN)
+       msr     cpacr_el1, x0
+       b       .Lskip_set_cptr_sme_\@
+
+.Lcptr_nvhe_sme_\@: // nVHE case
        mrs     x0, cptr_el2                    // Disable SME traps
        bic     x0, x0, #CPTR_EL2_TSM
        msr     cptr_el2, x0
+.Lskip_set_cptr_sme_\@:
        isb
 
        mrs     x1, sctlr_el2
index 7d170aa..24e28bb 100644 (file)
@@ -278,7 +278,7 @@ asmlinkage void __noreturn hyp_panic_bad_stack(void);
 asmlinkage void kvm_unexpected_el2_exception(void);
 struct kvm_cpu_context;
 void handle_trap(struct kvm_cpu_context *host_ctxt);
-asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on);
+asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on);
 void __noreturn __pkvm_init_finalise(void);
 void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
 void kvm_patch_vector_branch(struct alt_instr *alt,
index efc0b45..3d6725f 100644 (file)
@@ -571,6 +571,14 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature)
        return test_bit(feature, vcpu->arch.features);
 }
 
+static __always_inline void kvm_write_cptr_el2(u64 val)
+{
+       if (has_vhe() || has_hvhe())
+               write_sysreg(val, cpacr_el1);
+       else
+               write_sysreg(val, cptr_el2);
+}
+
 static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu)
 {
        u64 val;
@@ -578,8 +586,16 @@ static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu)
        if (has_vhe()) {
                val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
                       CPACR_EL1_ZEN_EL1EN);
+               if (cpus_have_final_cap(ARM64_SME))
+                       val |= CPACR_EL1_SMEN_EL1EN;
        } else if (has_hvhe()) {
                val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN);
+
+               if (!vcpu_has_sve(vcpu) ||
+                   (vcpu->arch.fp_state != FP_STATE_GUEST_OWNED))
+                       val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN;
+               if (cpus_have_final_cap(ARM64_SME))
+                       val |= CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN;
        } else {
                val = CPTR_NVHE_EL2_RES1;
 
@@ -597,9 +613,6 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu)
 {
        u64 val = kvm_get_reset_cptr_el2(vcpu);
 
-       if (has_vhe() || has_hvhe())
-               write_sysreg(val, cpacr_el1);
-       else
-               write_sysreg(val, cptr_el2);
+       kvm_write_cptr_el2(val);
 }
 #endif /* __ARM64_KVM_EMULATE_H__ */
index 72dc53a..d1cb298 100644 (file)
@@ -55,7 +55,7 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
 
 static bool vgic_present, kvm_arm_initialised;
 
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
 DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
 bool is_kvm_arm_initialised(void)
@@ -1864,18 +1864,24 @@ static void cpu_hyp_reinit(void)
        cpu_hyp_init_features();
 }
 
-static void _kvm_arch_hardware_enable(void *discard)
+static void cpu_hyp_init(void *discard)
 {
-       if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+       if (!__this_cpu_read(kvm_hyp_initialized)) {
                cpu_hyp_reinit();
-               __this_cpu_write(kvm_arm_hardware_enabled, 1);
+               __this_cpu_write(kvm_hyp_initialized, 1);
        }
 }
 
-int kvm_arch_hardware_enable(void)
+static void cpu_hyp_uninit(void *discard)
 {
-       int was_enabled;
+       if (__this_cpu_read(kvm_hyp_initialized)) {
+               cpu_hyp_reset();
+               __this_cpu_write(kvm_hyp_initialized, 0);
+       }
+}
 
+int kvm_arch_hardware_enable(void)
+{
        /*
         * Most calls to this function are made with migration
         * disabled, but not with preemption disabled. The former is
@@ -1884,36 +1890,23 @@ int kvm_arch_hardware_enable(void)
         */
        preempt_disable();
 
-       was_enabled = __this_cpu_read(kvm_arm_hardware_enabled);
-       _kvm_arch_hardware_enable(NULL);
+       cpu_hyp_init(NULL);
 
-       if (!was_enabled) {
-               kvm_vgic_cpu_up();
-               kvm_timer_cpu_up();
-       }
+       kvm_vgic_cpu_up();
+       kvm_timer_cpu_up();
 
        preempt_enable();
 
        return 0;
 }
 
-static void _kvm_arch_hardware_disable(void *discard)
-{
-       if (__this_cpu_read(kvm_arm_hardware_enabled)) {
-               cpu_hyp_reset();
-               __this_cpu_write(kvm_arm_hardware_enabled, 0);
-       }
-}
-
 void kvm_arch_hardware_disable(void)
 {
-       if (__this_cpu_read(kvm_arm_hardware_enabled)) {
-               kvm_timer_cpu_down();
-               kvm_vgic_cpu_down();
-       }
+       kvm_timer_cpu_down();
+       kvm_vgic_cpu_down();
 
        if (!is_protected_kvm_enabled())
-               _kvm_arch_hardware_disable(NULL);
+               cpu_hyp_uninit(NULL);
 }
 
 #ifdef CONFIG_CPU_PM
@@ -1922,16 +1915,16 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
                                    void *v)
 {
        /*
-        * kvm_arm_hardware_enabled is left with its old value over
+        * kvm_hyp_initialized is left with its old value over
         * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
         * re-enable hyp.
         */
        switch (cmd) {
        case CPU_PM_ENTER:
-               if (__this_cpu_read(kvm_arm_hardware_enabled))
+               if (__this_cpu_read(kvm_hyp_initialized))
                        /*
-                        * don't update kvm_arm_hardware_enabled here
-                        * so that the hardware will be re-enabled
+                        * don't update kvm_hyp_initialized here
+                        * so that the hyp will be re-enabled
                         * when we resume. See below.
                         */
                        cpu_hyp_reset();
@@ -1939,8 +1932,8 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
                return NOTIFY_OK;
        case CPU_PM_ENTER_FAILED:
        case CPU_PM_EXIT:
-               if (__this_cpu_read(kvm_arm_hardware_enabled))
-                       /* The hardware was enabled before suspend. */
+               if (__this_cpu_read(kvm_hyp_initialized))
+                       /* The hyp was enabled before suspend. */
                        cpu_hyp_reinit();
 
                return NOTIFY_OK;
@@ -2021,7 +2014,7 @@ static int __init init_subsystems(void)
        /*
         * Enable hardware so that subsystem initialisation can access EL2.
         */
-       on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+       on_each_cpu(cpu_hyp_init, NULL, 1);
 
        /*
         * Register CPU lower-power notifier
@@ -2059,7 +2052,7 @@ out:
                hyp_cpu_pm_exit();
 
        if (err || !is_protected_kvm_enabled())
-               on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+               on_each_cpu(cpu_hyp_uninit, NULL, 1);
 
        return err;
 }
@@ -2097,7 +2090,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
         * The stub hypercalls are now disabled, so set our local flag to
         * prevent a later re-init attempt in kvm_arch_hardware_enable().
         */
-       __this_cpu_write(kvm_arm_hardware_enabled, 1);
+       __this_cpu_write(kvm_hyp_initialized, 1);
        preempt_enable();
 
        return ret;
index 4bddb85..34f222a 100644 (file)
@@ -457,6 +457,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
         */
        val &= ~(TCR_HD | TCR_HA);
        write_sysreg_el1(val, SYS_TCR);
+       __kvm_skip_instr(vcpu);
        return true;
 }
 
index 58dcd92..ab4f5d1 100644 (file)
@@ -705,7 +705,20 @@ int hyp_ffa_init(void *pages)
        if (res.a0 == FFA_RET_NOT_SUPPORTED)
                return 0;
 
-       if (res.a0 != FFA_VERSION_1_0)
+       /*
+        * Firmware returns the maximum supported version of the FF-A
+        * implementation. Check that the returned version is
+        * backwards-compatible with the hyp according to the rules in DEN0077A
+        * v1.1 REL0 13.2.1.
+        *
+        * Of course, things are never simple when dealing with firmware. v1.1
+        * broke ABI with v1.0 on several structures, which is itself
+        * incompatible with the aforementioned versioning scheme. The
+        * expectation is that v1.x implementations that do not support the v1.0
+        * ABI return NOT_SUPPORTED rather than a version number, according to
+        * DEN0077A v1.1 REL0 18.6.4.
+        */
+       if (FFA_MAJOR_VERSION(res.a0) != 1)
                return -EOPNOTSUPP;
 
        arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
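The comment added in the hunk above follows the FF-A versioning rule from DEN0077A: the 32-bit version word carries the major number in bits 30:16 and the minor in bits 15:0, and a caller written against major version 1 should accept any 1.x answer (with v1.0-incompatible implementations returning NOT_SUPPORTED instead of a version). Below is a standalone sketch of that check, with macro and function names chosen for the example rather than taken from the hyp code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Version word layout per the FF-A spec: bits 30:16 major, bits 15:0 minor. */
#define FFA_MAJOR_VERSION(v)	(((v) >> 16) & 0x7fff)
#define FFA_MINOR_VERSION(v)	((v) & 0xffff)

/* A major-version-1 caller accepts any 1.x version reported by firmware. */
static bool ffa_version_compatible(uint32_t fw_version)
{
	return FFA_MAJOR_VERSION(fw_version) == 1;
}

int main(void)
{
	uint32_t v1_0 = (1u << 16) | 0;		/* 1.0 */
	uint32_t v1_1 = (1u << 16) | 1;		/* 1.1 */
	uint32_t v2_0 = (2u << 16) | 0;		/* hypothetical 2.0 */

	printf("1.0 compatible: %d\n", ffa_version_compatible(v1_0));
	printf("1.1 compatible: %d\n", ffa_version_compatible(v1_1));
	printf("2.0 compatible: %d\n", ffa_version_compatible(v2_0));
	return 0;
}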
index 0a62710..e89a231 100644 (file)
@@ -63,7 +63,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
                __activate_traps_fpsimd32(vcpu);
        }
 
-       write_sysreg(val, cptr_el2);
+       kvm_write_cptr_el2(val);
        write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
 
        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
index 1401e4c..bf2b21b 100644 (file)
@@ -2,7 +2,7 @@
 #
 config LIGHTWEIGHT_SPINLOCK_CHECK
        bool "Enable lightweight spinlock checks"
-       depends on SMP && !DEBUG_SPINLOCK
+       depends on DEBUG_KERNEL && SMP && !DEBUG_SPINLOCK
        default y
        help
          Add checks with low performance impact to the spinlock functions
index 7ee49f5..d389359 100644 (file)
@@ -117,7 +117,7 @@ char *strchr(const char *s, int c)
        return NULL;
 }
 
-int puts(const char *s)
+static int puts(const char *s)
 {
        const char *nuline = s;
 
@@ -172,7 +172,7 @@ static int print_num(unsigned long num, int base)
        return 0;
 }
 
-int printf(const char *fmt, ...)
+static int printf(const char *fmt, ...)
 {
        va_list args;
        int i = 0;
@@ -204,13 +204,13 @@ void abort(void)
 }
 
 #undef malloc
-void *malloc(size_t size)
+static void *malloc(size_t size)
 {
        return malloc_gzip(size);
 }
 
 #undef free
-void free(void *ptr)
+static void free(void *ptr)
 {
        return free_gzip(ptr);
 }
@@ -278,7 +278,7 @@ static void parse_elf(void *output)
        free(phdrs);
 }
 
-unsigned long decompress_kernel(unsigned int started_wide,
+asmlinkage unsigned long __visible decompress_kernel(unsigned int started_wide,
                unsigned int command_line,
                const unsigned int rd_start,
                const unsigned int rd_end)
index 9e8c101..582fb5d 100644 (file)
@@ -14,6 +14,8 @@
 #define dma_outb       outb
 #define dma_inb                inb
 
+extern unsigned long pcxl_dma_start;
+
 /*
 ** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
 ** (or rather not merge) DMAs into manageable chunks.
index a7cf0d0..f1cc1ee 100644 (file)
@@ -12,6 +12,10 @@ extern void mcount(void);
 extern unsigned long sys_call_table[];
 
 extern unsigned long return_address(unsigned int);
+struct ftrace_regs;
+extern void ftrace_function_trampoline(unsigned long parent,
+               unsigned long self_addr, unsigned long org_sp_gr3,
+               struct ftrace_regs *fregs);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern void ftrace_caller(void);
index edfcb98..0b326e5 100644 (file)
@@ -7,8 +7,6 @@
 #include <asm/processor.h>
 #include <asm/spinlock_types.h>
 
-#define SPINLOCK_BREAK_INSN    0x0000c006      /* break 6,6 */
-
 static inline void arch_spin_val_check(int lock_val)
 {
        if (IS_ENABLED(CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK))
index d659340..efd06a8 100644 (file)
@@ -4,6 +4,10 @@
 
 #define __ARCH_SPIN_LOCK_UNLOCKED_VAL  0x1a46
 
+#define SPINLOCK_BREAK_INSN    0x0000c006      /* break 6,6 */
+
+#ifndef __ASSEMBLY__
+
 typedef struct {
 #ifdef CONFIG_PA20
        volatile unsigned int slock;
@@ -27,6 +31,8 @@ typedef struct {
        volatile unsigned int   counter;
 } arch_rwlock_t;
 
+#endif /* __ASSEMBLY__ */
+
 #define __ARCH_RW_LOCK_UNLOCKED__       0x01000000
 #define __ARCH_RW_LOCK_UNLOCKED         { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \
                                        .counter = __ARCH_RW_LOCK_UNLOCKED__ }
index 6d1c781..8f37e75 100644 (file)
@@ -74,8 +74,8 @@
 static DEFINE_SPINLOCK(pdc_lock);
 #endif
 
-unsigned long pdc_result[NUM_PDC_RESULT]  __aligned(8);
-unsigned long pdc_result2[NUM_PDC_RESULT] __aligned(8);
+static unsigned long pdc_result[NUM_PDC_RESULT]  __aligned(8);
+static unsigned long pdc_result2[NUM_PDC_RESULT] __aligned(8);
 
 #ifdef CONFIG_64BIT
 #define WIDE_FIRMWARE 0x1
@@ -334,7 +334,7 @@ int __pdc_cpu_rendezvous(void)
 /**
  * pdc_cpu_rendezvous_lock - Lock PDC while transitioning to rendezvous state
  */
-void pdc_cpu_rendezvous_lock(void)
+void pdc_cpu_rendezvous_lock(void) __acquires(&pdc_lock)
 {
        spin_lock(&pdc_lock);
 }
@@ -342,7 +342,7 @@ void pdc_cpu_rendezvous_lock(void)
 /**
  * pdc_cpu_rendezvous_unlock - Unlock PDC after reaching rendezvous state
  */
-void pdc_cpu_rendezvous_unlock(void)
+void pdc_cpu_rendezvous_unlock(void) __releases(&pdc_lock)
 {
        spin_unlock(&pdc_lock);
 }
index 4d392e4..d1defb9 100644 (file)
@@ -53,7 +53,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent,
 
 static ftrace_func_t ftrace_func;
 
-void notrace __hot ftrace_function_trampoline(unsigned long parent,
+asmlinkage void notrace __hot ftrace_function_trampoline(unsigned long parent,
                                unsigned long self_addr,
                                unsigned long org_sp_gr3,
                                struct ftrace_regs *fregs)
index 00297e8..6f0c92e 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
+#include <linux/libgcc.h>
 
 #include <linux/string.h>
 EXPORT_SYMBOL(memset);
@@ -92,12 +93,6 @@ EXPORT_SYMBOL($$divI_12);
 EXPORT_SYMBOL($$divI_14);
 EXPORT_SYMBOL($$divI_15);
 
-extern void __ashrdi3(void);
-extern void __ashldi3(void);
-extern void __lshrdi3(void);
-extern void __muldi3(void);
-extern void __ucmpdi2(void);
-
 EXPORT_SYMBOL(__ashrdi3);
 EXPORT_SYMBOL(__ashldi3);
 EXPORT_SYMBOL(__lshrdi3);
index 3f6b507..bf9f192 100644 (file)
@@ -39,7 +39,7 @@ static struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
 static unsigned long pcxl_used_bytes __read_mostly;
 static unsigned long pcxl_used_pages __read_mostly;
 
-extern unsigned long pcxl_dma_start; /* Start of pcxl dma mapping area */
+unsigned long pcxl_dma_start __ro_after_init; /* pcxl dma mapping area start */
 static DEFINE_SPINLOCK(pcxl_res_lock);
 static char    *pcxl_res_map;
 static int     pcxl_res_hint;
@@ -381,7 +381,7 @@ pcxl_dma_init(void)
        pcxl_res_map = (char *)__get_free_pages(GFP_KERNEL,
                                            get_order(pcxl_res_size));
        memset(pcxl_res_map, 0, pcxl_res_size);
-       proc_gsc_root = proc_mkdir("gsc", NULL);
+       proc_gsc_root = proc_mkdir("bus/gsc", NULL);
        if (!proc_gsc_root)
                printk(KERN_WARNING
                        "pcxl_dma_init: Unable to create gsc /proc dir entry\n");
index 0d24735..0f9b3b5 100644 (file)
@@ -354,10 +354,8 @@ static int __init pdt_initcall(void)
                return -ENODEV;
 
        kpdtd_task = kthread_run(pdt_mainloop, NULL, "kpdtd");
-       if (IS_ERR(kpdtd_task))
-               return PTR_ERR(kpdtd_task);
 
-       return 0;
+       return PTR_ERR_OR_ZERO(kpdtd_task);
 }
 
 late_initcall(pdt_initcall);
index 90b04d8..b0f0816 100644 (file)
@@ -57,7 +57,7 @@ struct rdr_tbl_ent {
 static int perf_processor_interface __read_mostly = UNKNOWN_INTF;
 static int perf_enabled __read_mostly;
 static DEFINE_SPINLOCK(perf_lock);
-struct parisc_device *cpu_device __read_mostly;
+static struct parisc_device *cpu_device __read_mostly;
 
 /* RDRs to write for PCX-W */
 static const int perf_rdrs_W[] =
index 00b0df9..762289b 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/pdc.h>
+#include <asm/smp.h>
 #include <asm/pdcpat.h>
 #include <asm/irq.h>           /* for struct irq_region */
 #include <asm/parisc-device.h>
index 573f830..211a4af 100644 (file)
 
 static char __initdata command_line[COMMAND_LINE_SIZE];
 
-/* Intended for ccio/sba/cpu statistics under /proc/bus/{runway|gsc} */
-struct proc_dir_entry * proc_runway_root __read_mostly = NULL;
-struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
-struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL;
-
 static void __init setup_cmdline(char **cmdline_p)
 {
        extern unsigned int boot_args[];
@@ -196,48 +191,6 @@ const struct seq_operations cpuinfo_op = {
        .show   = show_cpuinfo
 };
 
-static void __init parisc_proc_mkdir(void)
-{
-       /*
-       ** Can't call proc_mkdir() until after proc_root_init() has been
-       ** called by start_kernel(). In other words, this code can't
-       ** live in arch/.../setup.c because start_parisc() calls
-       ** start_kernel().
-       */
-       switch (boot_cpu_data.cpu_type) {
-       case pcxl:
-       case pcxl2:
-               if (NULL == proc_gsc_root)
-               {
-                       proc_gsc_root = proc_mkdir("bus/gsc", NULL);
-               }
-               break;
-        case pcxt_:
-        case pcxu:
-        case pcxu_:
-        case pcxw:
-        case pcxw_:
-        case pcxw2:
-                if (NULL == proc_runway_root)
-                {
-                        proc_runway_root = proc_mkdir("bus/runway", NULL);
-                }
-                break;
-       case mako:
-       case mako2:
-                if (NULL == proc_mckinley_root)
-                {
-                        proc_mckinley_root = proc_mkdir("bus/mckinley", NULL);
-                }
-                break;
-       default:
-               /* FIXME: this was added to prevent the compiler 
-                * complaining about missing pcx, pcxs and pcxt
-                * I'm assuming they have neither gsc nor runway */
-               break;
-       }
-}
-
 static struct resource central_bus = {
        .name   = "Central Bus",
        .start  = F_EXTEND(0xfff80000),
@@ -294,7 +247,6 @@ static int __init parisc_init(void)
 {
        u32 osid = (OS_ID_LINUX << 16);
 
-       parisc_proc_mkdir();
        parisc_init_resources();
        do_device_inventory();                  /* probe for hardware */
 
index f886ff0..e8d27de 100644 (file)
@@ -423,7 +423,7 @@ static void check_syscallno_in_delay_branch(struct pt_regs *regs)
        regs->gr[31] -= 8; /* delayed branching */
 
        /* Get assembler opcode of code in delay branch */
-       uaddr = (unsigned int *) ((regs->gr[31] & ~3) + 4);
+       uaddr = (u32 __user *) ((regs->gr[31] & ~3) + 4);
        err = get_user(opcode, uaddr);
        if (err)
                return;
index ca2d537..9915062 100644 (file)
 #include <linux/elf-randomize.h>
 
 /*
- * Construct an artificial page offset for the mapping based on the virtual
+ * Construct an artificial page offset for the mapping based on the physical
  * address of the kernel file mapping variable.
- * If filp is zero the calculated pgoff value aliases the memory of the given
- * address. This is useful for io_uring where the mapping shall alias a kernel
- * address and a userspace adress where both the kernel and the userspace
- * access the same memory region.
  */
-#define GET_FILP_PGOFF(filp, addr)             \
-       ((filp ? (((unsigned long) filp->f_mapping) >> 8)       \
-                & ((SHM_COLOUR-1) >> PAGE_SHIFT) : 0UL)        \
-         + (addr >> PAGE_SHIFT))
+#define GET_FILP_PGOFF(filp)           \
+       (filp ? (((unsigned long) filp->f_mapping) >> 8)        \
+                & ((SHM_COLOUR-1) >> PAGE_SHIFT) : 0UL)
 
 static unsigned long shared_align_offset(unsigned long filp_pgoff,
                                         unsigned long pgoff)
@@ -117,7 +112,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
        do_color_align = 0;
        if (filp || (flags & MAP_SHARED))
                do_color_align = 1;
-       filp_pgoff = GET_FILP_PGOFF(filp, addr);
+       filp_pgoff = GET_FILP_PGOFF(filp);
 
        if (flags & MAP_FIXED) {
                /* Even MAP_FIXED mappings must reside within TASK_SIZE */
index 1373e51..1f51aa9 100644 (file)
@@ -39,6 +39,7 @@ registers).
 #include <asm/assembly.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
+#include <asm/spinlock_types.h>
 
 #include <linux/linkage.h>
 
@@ -66,6 +67,16 @@ registers).
        stw     \reg1, 0(%sr2,\reg2)
        .endm
 
+       /* raise exception if spinlock content is not zero or
+        * __ARCH_SPIN_LOCK_UNLOCKED_VAL */
+       .macro  spinlock_check spin_val,tmpreg
+#ifdef CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK
+       ldi     __ARCH_SPIN_LOCK_UNLOCKED_VAL, \tmpreg
+       andcm,= \spin_val, \tmpreg, %r0
+       .word   SPINLOCK_BREAK_INSN
+#endif
+       .endm
+
        .text
 
        .import syscall_exit,code
@@ -508,7 +519,8 @@ lws_start:
 
 lws_exit_noerror:
        lws_pagefault_enable    %r1,%r21
-       stw,ma  %r20, 0(%sr2,%r20)
+       ldi     __ARCH_SPIN_LOCK_UNLOCKED_VAL, %r21
+       stw,ma  %r21, 0(%sr2,%r20)
        ssm     PSW_SM_I, %r0
        b       lws_exit
        copy    %r0, %r21
@@ -521,7 +533,8 @@ lws_wouldblock:
 
 lws_pagefault:
        lws_pagefault_enable    %r1,%r21
-       stw,ma  %r20, 0(%sr2,%r20)
+       ldi     __ARCH_SPIN_LOCK_UNLOCKED_VAL, %r21
+       stw,ma  %r21, 0(%sr2,%r20)
        ssm     PSW_SM_I, %r0
        ldo     3(%r0),%r28
        b       lws_exit
@@ -619,6 +632,7 @@ lws_compare_and_swap:
 
        /* Try to acquire the lock */
        LDCW    0(%sr2,%r20), %r28
+       spinlock_check  %r28, %r21
        comclr,<>       %r0, %r28, %r0
        b,n     lws_wouldblock
 
@@ -772,6 +786,7 @@ cas2_lock_start:
 
        /* Try to acquire the lock */
        LDCW    0(%sr2,%r20), %r28
+       spinlock_check  %r28, %r21
        comclr,<>       %r0, %r28, %r0
        b,n     lws_wouldblock
 
@@ -1001,6 +1016,7 @@ atomic_xchg_start:
 
        /* Try to acquire the lock */
        LDCW    0(%sr2,%r20), %r28
+       spinlock_check  %r28, %r21
        comclr,<>       %r0, %r28, %r0
        b,n     lws_wouldblock
 
@@ -1199,6 +1215,7 @@ atomic_store_start:
 
        /* Try to acquire the lock */
        LDCW    0(%sr2,%r20), %r28
+       spinlock_check  %r28, %r21
        comclr,<>       %r0, %r28, %r0
        b,n     lws_wouldblock
 
@@ -1330,7 +1347,7 @@ ENTRY(lws_lock_start)
        /* lws locks */
        .rept 256
        /* Keep locks aligned at 16-bytes */
-       .word 1
+       .word __ARCH_SPIN_LOCK_UNLOCKED_VAL
        .word 0 
        .word 0
        .word 0
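
A C paraphrase of the spinlock_check assembler macro introduced above, sketched under the assumption that CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK is enabled; the __builtin_trap() merely stands in for the SPINLOCK_BREAK_INSN break word:

    #include <asm/spinlock_types.h>

    /* andcm,= computes spin_val & ~__ARCH_SPIN_LOCK_UNLOCKED_VAL and
     * nullifies the following break word when that result is zero, so
     * the trap fires only when the lock word has bits set outside the
     * expected unlocked pattern -- a cheap approximation of "neither 0
     * nor __ARCH_SPIN_LOCK_UNLOCKED_VAL". */
    static inline void spinlock_check_sketch(unsigned long spin_val)
    {
            if (spin_val & ~__ARCH_SPIN_LOCK_UNLOCKED_VAL)
                    __builtin_trap();
    }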
index 8130627..170d0dd 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/signal.h>
 #include <linux/ratelimit.h>
 #include <linux/uaccess.h>
+#include <linux/sysctl.h>
 #include <asm/unaligned.h>
 #include <asm/hardirq.h>
 #include <asm/traps.h>
index 8e6014a..9d8b4db 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/module.h>
+#include <linux/libgcc.h>
 
 union ull_union {
        unsigned long long ull;
@@ -9,7 +10,7 @@ union ull_union {
        } ui;
 };
 
-int __ucmpdi2(unsigned long long a, unsigned long long b)
+word_type __ucmpdi2(unsigned long long a, unsigned long long b)
 {
        union ull_union au = {.ull = a};
        union ull_union bu = {.ull = b};
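
The ucmpdi2.c hunk above only shows the prototype switching to the word_type provided by the linux/libgcc.h header it now includes. For context, __ucmpdi2() follows the standard libgcc convention of returning 0, 1 or 2 for less-than, equal and greater-than; a hypothetical stand-alone equivalent, not the parisc implementation itself:

    #include <linux/libgcc.h>

    /* Reference semantics only: unsigned 64-bit three-way compare,
     * 0 if a < b, 1 if a == b, 2 if a > b. */
    static word_type ucmpdi2_reference(unsigned long long a, unsigned long long b)
    {
            if (a < b)
                    return 0;
            if (a > b)
                    return 2;
            return 1;
    }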
index a4c7c76..2fe5b44 100644 (file)
@@ -192,31 +192,31 @@ int fixup_exception(struct pt_regs *regs)
  * For implementation see handle_interruption() in traps.c
  */
 static const char * const trap_description[] = {
-       [1] "High-priority machine check (HPMC)",
-       [2] "Power failure interrupt",
-       [3] "Recovery counter trap",
-       [5] "Low-priority machine check",
-       [6] "Instruction TLB miss fault",
-       [7] "Instruction access rights / protection trap",
-       [8] "Illegal instruction trap",
-       [9] "Break instruction trap",
-       [10] "Privileged operation trap",
-       [11] "Privileged register trap",
-       [12] "Overflow trap",
-       [13] "Conditional trap",
-       [14] "FP Assist Exception trap",
-       [15] "Data TLB miss fault",
-       [16] "Non-access ITLB miss fault",
-       [17] "Non-access DTLB miss fault",
-       [18] "Data memory protection/unaligned access trap",
-       [19] "Data memory break trap",
-       [20] "TLB dirty bit trap",
-       [21] "Page reference trap",
-       [22] "Assist emulation trap",
-       [25] "Taken branch trap",
-       [26] "Data memory access rights trap",
-       [27] "Data memory protection ID trap",
-       [28] "Unaligned data reference trap",
+       [1] =   "High-priority machine check (HPMC)",
+       [2] =   "Power failure interrupt",
+       [3] =   "Recovery counter trap",
+       [5] =   "Low-priority machine check",
+       [6] =   "Instruction TLB miss fault",
+       [7] =   "Instruction access rights / protection trap",
+       [8] =   "Illegal instruction trap",
+       [9] =   "Break instruction trap",
+       [10] =  "Privileged operation trap",
+       [11] =  "Privileged register trap",
+       [12] =  "Overflow trap",
+       [13] =  "Conditional trap",
+       [14] =  "FP Assist Exception trap",
+       [15] =  "Data TLB miss fault",
+       [16] =  "Non-access ITLB miss fault",
+       [17] =  "Non-access DTLB miss fault",
+       [18] =  "Data memory protection/unaligned access trap",
+       [19] =  "Data memory break trap",
+       [20] =  "TLB dirty bit trap",
+       [21] =  "Page reference trap",
+       [22] =  "Assist emulation trap",
+       [25] =  "Taken branch trap",
+       [26] =  "Data memory access rights trap",
+       [27] =  "Data memory protection ID trap",
+       [28] =  "Unaligned data reference trap",
 };
 
 const char *trap_name(unsigned long code)
index 389941c..a088c24 100644 (file)
@@ -523,10 +523,6 @@ void mark_rodata_ro(void)
 void *parisc_vmalloc_start __ro_after_init;
 EXPORT_SYMBOL(parisc_vmalloc_start);
 
-#ifdef CONFIG_PA11
-unsigned long pcxl_dma_start __ro_after_init;
-#endif
-
 void __init mem_init(void)
 {
        /* Do sanity checks on IPC (compat) structures */
index 345ff0b..d7ee1f4 100644 (file)
@@ -27,7 +27,7 @@
  */
 void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
 {
-       void __iomem *addr;
+       uintptr_t addr;
        struct vm_struct *area;
        unsigned long offset, last_addr;
        pgprot_t pgprot;
@@ -79,10 +79,9 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
        if (!area)
                return NULL;
 
-       addr = (void __iomem *) area->addr;
-       if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-                              phys_addr, pgprot)) {
-               vunmap(addr);
+       addr = (uintptr_t) area->addr;
+       if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) {
+               vunmap(area->addr);
                return NULL;
        }
 
index 8091b8b..b93ffdd 100644 (file)
@@ -37,6 +37,10 @@ static inline void flush_dcache_page(struct page *page)
 #define flush_icache_user_page(vma, pg, addr, len) \
        flush_icache_mm(vma->vm_mm, 0)
 
+#ifdef CONFIG_64BIT
+#define flush_cache_vmap(start, end)   flush_tlb_kernel_range(start, end)
+#endif
+
 #ifndef CONFIG_SMP
 
 #define flush_icache_all() local_flush_icache_all()
index aff6c33..4c58ee7 100644 (file)
@@ -101,9 +101,9 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
  * Relaxed I/O memory access primitives. These follow the Device memory
  * ordering rules but do not guarantee any ordering relative to Normal memory
  * accesses.  These are defined to order the indicated access (either a read or
- * write) with all other I/O memory accesses. Since the platform specification
- * defines that all I/O regions are strongly ordered on channel 2, no explicit
- * fences are required to enforce this ordering.
+ * write) with all other I/O memory accesses to the same peripheral. Since the
+ * platform specification defines that all I/O regions are strongly ordered on
+ * channel 0, no explicit fences are required to enforce this ordering.
  */
 /* FIXME: These are now the same as asm-generic */
 #define __io_rbr()             do {} while (0)
@@ -125,14 +125,14 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
 #endif
 
 /*
- * I/O memory access primitives. Reads are ordered relative to any
- * following Normal memory access. Writes are ordered relative to any prior
- * Normal memory access.  The memory barriers here are necessary as RISC-V
+ * I/O memory access primitives.  Reads are ordered relative to any following
+ * Normal memory read and delay() loop.  Writes are ordered relative to any
+ * prior Normal memory write.  The memory barriers here are necessary as RISC-V
  * doesn't define any ordering between the memory space and the I/O space.
  */
 #define __io_br()      do {} while (0)
-#define __io_ar(v)     __asm__ __volatile__ ("fence i,r" : : : "memory")
-#define __io_bw()      __asm__ __volatile__ ("fence w,o" : : : "memory")
+#define __io_ar(v)     ({ __asm__ __volatile__ ("fence i,ir" : : : "memory"); })
+#define __io_bw()      ({ __asm__ __volatile__ ("fence w,o" : : : "memory"); })
 #define __io_aw()      mmiowb_set_pending()
 
 #define readb(c)       ({ u8  __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; })
index 75970ee..b5680c9 100644 (file)
@@ -188,6 +188,8 @@ extern struct pt_alloc_ops pt_ops __initdata;
 #define PAGE_KERNEL_IO         __pgprot(_PAGE_IOREMAP)
 
 extern pgd_t swapper_pg_dir[];
+extern pgd_t trampoline_pg_dir[];
+extern pgd_t early_pg_dir[];
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline int pmd_present(pmd_t pmd)
index 58d3e44..924d01b 100644 (file)
@@ -3,12 +3,14 @@
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 
+extern bool pgtable_l4_enabled, pgtable_l5_enabled;
+
 #define IOREMAP_MAX_ORDER (PUD_SHIFT)
 
 #define arch_vmap_pud_supported arch_vmap_pud_supported
 static inline bool arch_vmap_pud_supported(pgprot_t prot)
 {
-       return true;
+       return pgtable_l4_enabled || pgtable_l5_enabled;
 }
 
 #define arch_vmap_pmd_supported arch_vmap_pmd_supported
index a2fc952..35b854c 100644 (file)
 #include <asm/smp.h>
 #include <asm/pgtable.h>
 
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+       return phys_id == cpuid_to_hartid_map(cpu);
+}
+
 /*
  * Returns the hart ID of the given device tree node, or -ENODEV if the node
  * isn't an enabled and valid RISC-V hart node.
index 5372b70..c08bb5c 100644 (file)
@@ -281,7 +281,7 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
                kbuf.buffer = initrd;
                kbuf.bufsz = kbuf.memsz = initrd_len;
                kbuf.buf_align = PAGE_SIZE;
-               kbuf.top_down = false;
+               kbuf.top_down = true;
                kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
                ret = kexec_add_buffer(&kbuf);
                if (ret)
@@ -425,6 +425,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
                 * sym, instead of searching the whole relsec.
                 */
                case R_RISCV_PCREL_HI20:
+               case R_RISCV_CALL_PLT:
                case R_RISCV_CALL:
                        *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) |
                                 ENCODE_UJTYPE_IMM(val - addr);
index 85bbce0..40420af 100644 (file)
@@ -61,11 +61,6 @@ int riscv_hartid_to_cpuid(unsigned long hartid)
        return -ENOENT;
 }
 
-bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
-{
-       return phys_id == cpuid_to_hartid_map(cpu);
-}
-
 static void ipi_stop(void)
 {
        set_cpu_online(smp_processor_id(), false);
index 9ce5047..e4c35ac 100644 (file)
 #include <linux/kfence.h>
 
 #include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-#include <asm/sections.h>
-#include <asm/soc.h>
 #include <asm/io.h>
-#include <asm/ptdump.h>
 #include <asm/numa.h>
+#include <asm/pgtable.h>
+#include <asm/ptdump.h>
+#include <asm/sections.h>
+#include <asm/soc.h>
+#include <asm/tlbflush.h>
 
 #include "../kernel/head.h"
 
@@ -214,8 +215,13 @@ static void __init setup_bootmem(void)
        memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
 
        phys_ram_end = memblock_end_of_DRAM();
+
+       /*
+        * Make sure we align the start of the memory on a PMD boundary so that
+        * at worst, we map the linear mapping with PMD mappings.
+        */
        if (!IS_ENABLED(CONFIG_XIP_KERNEL))
-               phys_ram_base = memblock_start_of_DRAM();
+               phys_ram_base = memblock_start_of_DRAM() & PMD_MASK;
 
        /*
         * In 64-bit, any use of __va/__pa before this point is wrong as we
index 8fc0efc..a01bc15 100644 (file)
@@ -22,7 +22,6 @@
  * region is not and then we have to go down to the PUD level.
  */
 
-extern pgd_t early_pg_dir[PTRS_PER_PGD];
 pgd_t tmp_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
 p4d_t tmp_p4d[PTRS_PER_P4D] __page_aligned_bss;
 pud_t tmp_pud[PTRS_PER_PUD] __page_aligned_bss;
index 7422db4..e36261b 100644 (file)
@@ -2593,6 +2593,13 @@ config CPU_IBRS_ENTRY
          This mitigates both spectre_v2 and retbleed at great cost to
          performance.
 
+config CPU_SRSO
+       bool "Mitigate speculative RAS overflow on AMD"
+       depends on CPU_SUP_AMD && X86_64 && RETHUNK
+       default y
+       help
+         Enable the SRSO mitigation needed on AMD Zen1-4 machines.
+
 config SLS
        bool "Mitigate Straight-Line-Speculation"
        depends on CC_HAS_SLS && X86_64
@@ -2603,6 +2610,25 @@ config SLS
          against straight line speculation. The kernel image might be slightly
          larger.
 
+config GDS_FORCE_MITIGATION
+       bool "Force GDS Mitigation"
+       depends on CPU_SUP_INTEL
+       default n
+       help
+         Gather Data Sampling (GDS) is a hardware vulnerability which allows
+         unprivileged speculative access to data which was previously stored in
+         vector registers.
+
+         This option is equivalent to setting gather_data_sampling=force on the
+         command line. The microcode mitigation is used if present, otherwise
+         AVX is disabled as a mitigation. On affected systems that are missing
+         the microcode any userspace code that unconditionally uses AVX will
+         break with this option set.
+
+         Setting this option on systems not vulnerable to GDS has no effect.
+
+         If in doubt, say N.
+
 endif
 
 config ARCH_HAS_ADD_PAGES
index 8eb74cf..2888c0e 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/mpspec.h>
 #include <asm/x86_init.h>
 #include <asm/cpufeature.h>
+#include <asm/irq_vectors.h>
 
 #ifdef CONFIG_ACPI_APEI
 # include <asm/pgtable_types.h>
@@ -31,6 +32,7 @@ extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
 extern int acpi_fix_pin2_polarity;
 extern int acpi_disable_cmcff;
+extern bool acpi_int_src_ovr[NR_IRQS_LEGACY];
 
 extern u8 acpi_sci_flags;
 extern u32 acpi_sci_override_gsi;
index cb8ca46..b69b0d7 100644 (file)
@@ -14,7 +14,7 @@
  * Defines x86 CPU feature bits
  */
 #define NCAPINTS                       21         /* N 32-bit words worth of info */
-#define NBUGINTS                       1          /* N 32-bit bug flags */
+#define NBUGINTS                       2          /* N 32-bit bug flags */
 
 /*
  * Note: If the comment begins with a quoted string, that string is used
 #define X86_FEATURE_SMBA               (11*32+21) /* "" Slow Memory Bandwidth Allocation */
 #define X86_FEATURE_BMEC               (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
 
+#define X86_FEATURE_SRSO               (11*32+24) /* "" AMD BTB untrain RETs */
+#define X86_FEATURE_SRSO_ALIAS         (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
+#define X86_FEATURE_IBPB_ON_VMEXIT     (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX_VNNI           (12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16                (12*32+ 5) /* AVX512 BFLOAT16 instructions */
 #define X86_FEATURE_AUTOIBRS           (20*32+ 8) /* "" Automatic IBRS */
 #define X86_FEATURE_NO_SMM_CTL_MSR     (20*32+ 9) /* "" SMM_CTL MSR is not present */
 
+#define X86_FEATURE_SBPB               (20*32+27) /* "" Selective Branch Prediction Barrier */
+#define X86_FEATURE_IBPB_BRTYPE                (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */
+#define X86_FEATURE_SRSO_NO            (20*32+29) /* "" CPU is not affected by SRSO */
+
 /*
  * BUG word(s)
  */
 #define X86_BUG_RETBLEED               X86_BUG(27) /* CPU is affected by RETBleed */
 #define X86_BUG_EIBRS_PBRSB            X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
 #define X86_BUG_SMT_RSB                        X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
+#define X86_BUG_GDS                    X86_BUG(30) /* CPU is affected by Gather Data Sampling */
 
+/* BUG word 2 */
+#define X86_BUG_SRSO                   X86_BUG(1*32 + 0) /* AMD SRSO bug */
+#define X86_BUG_DIV0                   X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
 #endif /* _ASM_X86_CPUFEATURES_H */
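
The NBUGINTS bump from 1 to 2 is what makes room for the new "BUG word 2" entries; assuming the usual X86_BUG() and x86_capability layout from this header, the arithmetic works out as in this sketch:

    /* Assumptions sketched from cpufeatures.h: the per-CPU capability
     * bitmap has (NCAPINTS + NBUGINTS) 32-bit words and bug bit x lives
     * at position NCAPINTS*32 + x.  X86_BUG_SRSO = X86_BUG(1*32 + 0) is
     * bit 0 of the second bug word, which only exists once NBUGINTS is 2. */
    #define NCAPINTS_SKETCH   21
    #define NBUGINTS_SKETCH    2
    #define X86_BUG_SKETCH(x) (NCAPINTS_SKETCH * 32 + (x))

    _Static_assert(X86_BUG_SKETCH(1*32 + 0) / 32 < NCAPINTS_SKETCH + NBUGINTS_SKETCH,
                   "bug word 2 fits in the capability bitmap");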
index a00a53e..1d11135 100644 (file)
@@ -57,6 +57,7 @@
 
 #define MSR_IA32_PRED_CMD              0x00000049 /* Prediction Command */
 #define PRED_CMD_IBPB                  BIT(0)     /* Indirect Branch Prediction Barrier */
+#define PRED_CMD_SBPB                  BIT(7)     /* Selective Branch Prediction Barrier */
 
 #define MSR_PPIN_CTL                   0x0000004e
 #define MSR_PPIN                       0x0000004f
                                                 * Not susceptible to Post-Barrier
                                                 * Return Stack Buffer Predictions.
                                                 */
+#define ARCH_CAP_GDS_CTRL              BIT(25) /*
+                                                * CPU is vulnerable to Gather
+                                                * Data Sampling (GDS) and
+                                                * has controls for mitigation.
+                                                */
+#define ARCH_CAP_GDS_NO                        BIT(26) /*
+                                                * CPU is not vulnerable to Gather
+                                                * Data Sampling (GDS).
+                                                */
 
 #define ARCH_CAP_XAPIC_DISABLE         BIT(21) /*
                                                 * IA32_XAPIC_DISABLE_STATUS MSR
 #define RNGDS_MITG_DIS                 BIT(0)  /* SRBDS support */
 #define RTM_ALLOW                      BIT(1)  /* TSX development mode */
 #define FB_CLEAR_DIS                   BIT(3)  /* CPU Fill buffer clear disable */
+#define GDS_MITG_DIS                   BIT(4)  /* Disable GDS mitigation */
+#define GDS_MITG_LOCKED                        BIT(5)  /* GDS mitigation locked */
 
 #define MSR_IA32_SYSENTER_CS           0x00000174
 #define MSR_IA32_SYSENTER_ESP          0x00000175
index 1a65cf4..3faf044 100644 (file)
  * eventually turn into it's own annotation.
  */
 .macro VALIDATE_UNRET_END
-#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+       (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
        ANNOTATE_RETPOLINE_SAFE
        nop
 #endif
  */
 .macro UNTRAIN_RET
 #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
-       defined(CONFIG_CALL_DEPTH_TRACKING)
+       defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
        VALIDATE_UNRET_END
        ALTERNATIVE_3 "",                                               \
                      CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET,          \
                      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,        \
                      __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
 #endif
+
+#ifdef CONFIG_CPU_SRSO
+       ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \
+                         "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS
+#endif
 .endm
 
 .macro UNTRAIN_RET_FROM_CALL
                      "call entry_ibpb", X86_FEATURE_ENTRY_IBPB,        \
                      __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
 #endif
+
+#ifdef CONFIG_CPU_SRSO
+       ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \
+                         "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS
+#endif
 .endm
 
 
@@ -332,6 +343,8 @@ extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
 
 extern void __x86_return_thunk(void);
 extern void zen_untrain_ret(void);
+extern void srso_untrain_ret(void);
+extern void srso_untrain_ret_alias(void);
 extern void entry_ibpb(void);
 
 #ifdef CONFIG_CALL_THUNKS
@@ -479,11 +492,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
                : "memory");
 }
 
+extern u64 x86_pred_cmd;
+
 static inline void indirect_branch_prediction_barrier(void)
 {
-       u64 val = PRED_CMD_IBPB;
-
-       alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
+       alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
 }
 
 /* The Intel SPEC CTRL MSR base value cache */
index d46300e..973db04 100644 (file)
@@ -682,9 +682,13 @@ extern u16 get_llc_id(unsigned int cpu);
 #ifdef CONFIG_CPU_SUP_AMD
 extern u32 amd_get_nodes_per_socket(void);
 extern u32 amd_get_highest_perf(void);
+extern bool cpu_has_ibpb_brtype_microcode(void);
+extern void amd_clear_divider(void);
 #else
 static inline u32 amd_get_nodes_per_socket(void)       { return 0; }
 static inline u32 amd_get_highest_perf(void)           { return 0; }
+static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; }
+static inline void amd_clear_divider(void)             { }
 #endif
 
 extern unsigned long arch_align_stack(unsigned long sp);
index 21b542a..53369c5 100644 (file)
@@ -52,6 +52,7 @@ int acpi_lapic;
 int acpi_ioapic;
 int acpi_strict;
 int acpi_disable_cmcff;
+bool acpi_int_src_ovr[NR_IRQS_LEGACY];
 
 /* ACPI SCI override configuration */
 u8 acpi_sci_flags __initdata;
@@ -588,6 +589,9 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
 
        acpi_table_print_madt_entry(&header->common);
 
+       if (intsrc->source_irq < NR_IRQS_LEGACY)
+               acpi_int_src_ovr[intsrc->source_irq] = true;
+
        if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
                acpi_sci_ioapic_setup(intsrc->source_irq,
                                      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
index 26ad7ca..b55d8f8 100644 (file)
@@ -75,6 +75,10 @@ static const int amd_zenbleed[] =
                           AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
                           AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
 
+static const int amd_div0[] =
+       AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
+                          AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
+
 static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
 {
        int osvw_id = *erratum++;
@@ -1130,6 +1134,11 @@ static void init_amd(struct cpuinfo_x86 *c)
                WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
 
        zenbleed_check(c);
+
+       if (cpu_has_amd_erratum(c, amd_div0)) {
+               pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+               setup_force_cpu_bug(X86_BUG_DIV0);
+       }
 }
 
 #ifdef CONFIG_X86_32
@@ -1290,3 +1299,32 @@ void amd_check_microcode(void)
 {
        on_each_cpu(zenbleed_check_cpu, NULL, 1);
 }
+
+bool cpu_has_ibpb_brtype_microcode(void)
+{
+       switch (boot_cpu_data.x86) {
+       /* Zen1/2 IBPB flushes branch type predictions too. */
+       case 0x17:
+               return boot_cpu_has(X86_FEATURE_AMD_IBPB);
+       case 0x19:
+               /* Poke the MSR bit on Zen3/4 to check its presence. */
+               if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+                       setup_force_cpu_cap(X86_FEATURE_SBPB);
+                       return true;
+               } else {
+                       return false;
+               }
+       default:
+               return false;
+       }
+}
+
+/*
+ * Issue a DIV 0/1 insn to clear any division data from previous DIV
+ * operations.
+ */
+void noinstr amd_clear_divider(void)
+{
+       asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
+                    :: "a" (0), "d" (0), "r" (1));
+}
index 9550744..d02f73c 100644 (file)
@@ -47,6 +47,8 @@ static void __init taa_select_mitigation(void);
 static void __init mmio_select_mitigation(void);
 static void __init srbds_select_mitigation(void);
 static void __init l1d_flush_select_mitigation(void);
+static void __init srso_select_mitigation(void);
+static void __init gds_select_mitigation(void);
 
 /* The base value of the SPEC_CTRL MSR without task-specific bits set */
 u64 x86_spec_ctrl_base;
@@ -56,6 +58,9 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
 DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
 
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+EXPORT_SYMBOL_GPL(x86_pred_cmd);
+
 static DEFINE_MUTEX(spec_ctrl_mutex);
 
 /* Update SPEC_CTRL MSR and its cached copy unconditionally */
@@ -160,6 +165,8 @@ void __init cpu_select_mitigations(void)
        md_clear_select_mitigation();
        srbds_select_mitigation();
        l1d_flush_select_mitigation();
+       srso_select_mitigation();
+       gds_select_mitigation();
 }
 
 /*
@@ -646,6 +653,149 @@ static int __init l1d_flush_parse_cmdline(char *str)
 early_param("l1d_flush", l1d_flush_parse_cmdline);
 
 #undef pr_fmt
+#define pr_fmt(fmt)    "GDS: " fmt
+
+enum gds_mitigations {
+       GDS_MITIGATION_OFF,
+       GDS_MITIGATION_UCODE_NEEDED,
+       GDS_MITIGATION_FORCE,
+       GDS_MITIGATION_FULL,
+       GDS_MITIGATION_FULL_LOCKED,
+       GDS_MITIGATION_HYPERVISOR,
+};
+
+#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION)
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
+#else
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
+#endif
+
+static const char * const gds_strings[] = {
+       [GDS_MITIGATION_OFF]            = "Vulnerable",
+       [GDS_MITIGATION_UCODE_NEEDED]   = "Vulnerable: No microcode",
+       [GDS_MITIGATION_FORCE]          = "Mitigation: AVX disabled, no microcode",
+       [GDS_MITIGATION_FULL]           = "Mitigation: Microcode",
+       [GDS_MITIGATION_FULL_LOCKED]    = "Mitigation: Microcode (locked)",
+       [GDS_MITIGATION_HYPERVISOR]     = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+       return (gds_mitigation == GDS_MITIGATION_FULL ||
+               gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+       u64 mcu_ctrl_after;
+       u64 mcu_ctrl;
+
+       switch (gds_mitigation) {
+       case GDS_MITIGATION_OFF:
+               rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+               mcu_ctrl |= GDS_MITG_DIS;
+               break;
+       case GDS_MITIGATION_FULL_LOCKED:
+               /*
+                * The LOCKED state comes from the boot CPU. APs might not have
+                * the same state. Make sure the mitigation is enabled on all
+                * CPUs.
+                */
+       case GDS_MITIGATION_FULL:
+               rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+               mcu_ctrl &= ~GDS_MITG_DIS;
+               break;
+       case GDS_MITIGATION_FORCE:
+       case GDS_MITIGATION_UCODE_NEEDED:
+       case GDS_MITIGATION_HYPERVISOR:
+               return;
+       };
+
+       wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+       /*
+        * Check to make sure that the WRMSR value was not ignored. Writes to
+        * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+        * processor was not.
+        */
+       rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+       WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
+static void __init gds_select_mitigation(void)
+{
+       u64 mcu_ctrl;
+
+       if (!boot_cpu_has_bug(X86_BUG_GDS))
+               return;
+
+       if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+               gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+               goto out;
+       }
+
+       if (cpu_mitigations_off())
+               gds_mitigation = GDS_MITIGATION_OFF;
+       /* Will verify below that mitigation _can_ be disabled */
+
+       /* No microcode */
+       if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+               if (gds_mitigation == GDS_MITIGATION_FORCE) {
+                       /*
+                        * This only needs to be done on the boot CPU so do it
+                        * here rather than in update_gds_msr()
+                        */
+                       setup_clear_cpu_cap(X86_FEATURE_AVX);
+                       pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+               } else {
+                       gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+               }
+               goto out;
+       }
+
+       /* Microcode has mitigation, use it */
+       if (gds_mitigation == GDS_MITIGATION_FORCE)
+               gds_mitigation = GDS_MITIGATION_FULL;
+
+       rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+       if (mcu_ctrl & GDS_MITG_LOCKED) {
+               if (gds_mitigation == GDS_MITIGATION_OFF)
+                       pr_warn("Mitigation locked. Disable failed.\n");
+
+               /*
+                * The mitigation is selected from the boot CPU. All other CPUs
+                * _should_ have the same state. If the boot CPU isn't locked
+                * but others are then update_gds_msr() will WARN() of the state
+                * mismatch. If the boot CPU is locked update_gds_msr() will
+                * ensure the other CPUs have the mitigation enabled.
+                */
+               gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+       }
+
+       update_gds_msr();
+out:
+       pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+       if (!str)
+               return -EINVAL;
+
+       if (!boot_cpu_has_bug(X86_BUG_GDS))
+               return 0;
+
+       if (!strcmp(str, "off"))
+               gds_mitigation = GDS_MITIGATION_OFF;
+       else if (!strcmp(str, "force"))
+               gds_mitigation = GDS_MITIGATION_FORCE;
+
+       return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V1 : " fmt
 
 enum spectre_v1_mitigation {
@@ -2188,6 +2338,165 @@ static int __init l1tf_cmdline(char *str)
 early_param("l1tf", l1tf_cmdline);
 
 #undef pr_fmt
+#define pr_fmt(fmt)    "Speculative Return Stack Overflow: " fmt
+
+enum srso_mitigation {
+       SRSO_MITIGATION_NONE,
+       SRSO_MITIGATION_MICROCODE,
+       SRSO_MITIGATION_SAFE_RET,
+       SRSO_MITIGATION_IBPB,
+       SRSO_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+enum srso_mitigation_cmd {
+       SRSO_CMD_OFF,
+       SRSO_CMD_MICROCODE,
+       SRSO_CMD_SAFE_RET,
+       SRSO_CMD_IBPB,
+       SRSO_CMD_IBPB_ON_VMEXIT,
+};
+
+static const char * const srso_strings[] = {
+       [SRSO_MITIGATION_NONE]           = "Vulnerable",
+       [SRSO_MITIGATION_MICROCODE]      = "Mitigation: microcode",
+       [SRSO_MITIGATION_SAFE_RET]       = "Mitigation: safe RET",
+       [SRSO_MITIGATION_IBPB]           = "Mitigation: IBPB",
+       [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
+static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET;
+
+static int __init srso_parse_cmdline(char *str)
+{
+       if (!str)
+               return -EINVAL;
+
+       if (!strcmp(str, "off"))
+               srso_cmd = SRSO_CMD_OFF;
+       else if (!strcmp(str, "microcode"))
+               srso_cmd = SRSO_CMD_MICROCODE;
+       else if (!strcmp(str, "safe-ret"))
+               srso_cmd = SRSO_CMD_SAFE_RET;
+       else if (!strcmp(str, "ibpb"))
+               srso_cmd = SRSO_CMD_IBPB;
+       else if (!strcmp(str, "ibpb-vmexit"))
+               srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT;
+       else
+               pr_err("Ignoring unknown SRSO option (%s).", str);
+
+       return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+       bool has_microcode;
+
+       if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+               goto pred_cmd;
+
+       /*
+        * The first check is for the kernel running as a guest in order
+        * for guests to verify whether IBPB is a viable mitigation.
+        */
+       has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
+       if (!has_microcode) {
+               pr_warn("IBPB-extending microcode not applied!\n");
+               pr_warn(SRSO_NOTICE);
+       } else {
+               /*
+                * Enable the synthetic (even if in a real CPUID leaf)
+                * flags for guests.
+                */
+               setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+
+               /*
+                * Zen1/2 with SMT off aren't vulnerable after the right
+                * IBPB microcode has been applied.
+                */
+               if ((boot_cpu_data.x86 < 0x19) &&
+                   (!cpu_smt_possible() || (cpu_smt_control == CPU_SMT_DISABLED)))
+                       setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+       }
+
+       if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+               if (has_microcode) {
+                       pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n");
+                       srso_mitigation = SRSO_MITIGATION_IBPB;
+                       goto pred_cmd;
+               }
+       }
+
+       switch (srso_cmd) {
+       case SRSO_CMD_OFF:
+               return;
+
+       case SRSO_CMD_MICROCODE:
+               if (has_microcode) {
+                       srso_mitigation = SRSO_MITIGATION_MICROCODE;
+                       pr_warn(SRSO_NOTICE);
+               }
+               break;
+
+       case SRSO_CMD_SAFE_RET:
+               if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+                       /*
+                        * Enable the return thunk for generated code
+                        * like ftrace, static_call, etc.
+                        */
+                       setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+
+                       if (boot_cpu_data.x86 == 0x19)
+                               setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+                       else
+                               setup_force_cpu_cap(X86_FEATURE_SRSO);
+                       srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+               } else {
+                       pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+                       goto pred_cmd;
+               }
+               break;
+
+       case SRSO_CMD_IBPB:
+               if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+                       if (has_microcode) {
+                               setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+                               srso_mitigation = SRSO_MITIGATION_IBPB;
+                       }
+               } else {
+                       pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+                       goto pred_cmd;
+               }
+               break;
+
+       case SRSO_CMD_IBPB_ON_VMEXIT:
+               if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+                       if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
+                               setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+                               srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+                       }
+               } else {
+                       pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+                       goto pred_cmd;
+                }
+               break;
+
+       default:
+               break;
+       }
+
+       pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode"));
+
+pred_cmd:
+       if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) &&
+            boot_cpu_has(X86_FEATURE_SBPB))
+               x86_pred_cmd = PRED_CMD_SBPB;
+}
+
+#undef pr_fmt
 #define pr_fmt(fmt) fmt
 
 #ifdef CONFIG_SYSFS
@@ -2385,6 +2694,18 @@ static ssize_t retbleed_show_state(char *buf)
        return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
 }
 
+static ssize_t srso_show_state(char *buf)
+{
+       return sysfs_emit(buf, "%s%s\n",
+                         srso_strings[srso_mitigation],
+                         (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+       return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
                               char *buf, unsigned int bug)
 {
@@ -2434,6 +2755,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
        case X86_BUG_RETBLEED:
                return retbleed_show_state(buf);
 
+       case X86_BUG_SRSO:
+               return srso_show_state(buf);
+
+       case X86_BUG_GDS:
+               return gds_show_state(buf);
+
        default:
                break;
        }
@@ -2498,4 +2825,14 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha
 {
        return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
 }
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
 #endif
index 0ba1067..e3a65e9 100644 (file)
@@ -1250,6 +1250,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 #define RETBLEED       BIT(3)
 /* CPU is affected by SMT (cross-thread) return predictions */
 #define SMT_RSB                BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO           BIT(5)
+/* CPU is affected by GDS */
+#define GDS            BIT(6)
 
 static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
        VULNBL_INTEL_STEPPINGS(IVYBRIDGE,       X86_STEPPING_ANY,               SRBDS),
@@ -1262,27 +1266,30 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
        VULNBL_INTEL_STEPPINGS(BROADWELL_X,     X86_STEPPING_ANY,               MMIO),
        VULNBL_INTEL_STEPPINGS(BROADWELL,       X86_STEPPING_ANY,               SRBDS),
        VULNBL_INTEL_STEPPINGS(SKYLAKE_L,       X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
-       VULNBL_INTEL_STEPPINGS(SKYLAKE_X,       X86_STEPPING_ANY,               MMIO | RETBLEED),
+       VULNBL_INTEL_STEPPINGS(SKYLAKE_X,       X86_STEPPING_ANY,               MMIO | RETBLEED | GDS),
        VULNBL_INTEL_STEPPINGS(SKYLAKE,         X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
-       VULNBL_INTEL_STEPPINGS(KABYLAKE_L,      X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
-       VULNBL_INTEL_STEPPINGS(KABYLAKE,        X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED),
+       VULNBL_INTEL_STEPPINGS(KABYLAKE_L,      X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED | GDS),
+       VULNBL_INTEL_STEPPINGS(KABYLAKE,        X86_STEPPING_ANY,               SRBDS | MMIO | RETBLEED | GDS),
        VULNBL_INTEL_STEPPINGS(CANNONLAKE_L,    X86_STEPPING_ANY,               RETBLEED),
-       VULNBL_INTEL_STEPPINGS(ICELAKE_L,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
-       VULNBL_INTEL_STEPPINGS(ICELAKE_D,       X86_STEPPING_ANY,               MMIO),
-       VULNBL_INTEL_STEPPINGS(ICELAKE_X,       X86_STEPPING_ANY,               MMIO),
-       VULNBL_INTEL_STEPPINGS(COMETLAKE,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
+       VULNBL_INTEL_STEPPINGS(ICELAKE_L,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED | GDS),
+       VULNBL_INTEL_STEPPINGS(ICELAKE_D,       X86_STEPPING_ANY,               MMIO | GDS),
+       VULNBL_INTEL_STEPPINGS(ICELAKE_X,       X86_STEPPING_ANY,               MMIO | GDS),
+       VULNBL_INTEL_STEPPINGS(COMETLAKE,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED | GDS),
        VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPINGS(0x0, 0x0),        MMIO | RETBLEED),
-       VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
+       VULNBL_INTEL_STEPPINGS(COMETLAKE_L,     X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED | GDS),
+       VULNBL_INTEL_STEPPINGS(TIGERLAKE_L,     X86_STEPPING_ANY,               GDS),
+       VULNBL_INTEL_STEPPINGS(TIGERLAKE,       X86_STEPPING_ANY,               GDS),
        VULNBL_INTEL_STEPPINGS(LAKEFIELD,       X86_STEPPING_ANY,               MMIO | MMIO_SBDS | RETBLEED),
-       VULNBL_INTEL_STEPPINGS(ROCKETLAKE,      X86_STEPPING_ANY,               MMIO | RETBLEED),
+       VULNBL_INTEL_STEPPINGS(ROCKETLAKE,      X86_STEPPING_ANY,               MMIO | RETBLEED | GDS),
        VULNBL_INTEL_STEPPINGS(ATOM_TREMONT,    X86_STEPPING_ANY,               MMIO | MMIO_SBDS),
        VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D,  X86_STEPPING_ANY,               MMIO),
        VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L,  X86_STEPPING_ANY,               MMIO | MMIO_SBDS),
 
        VULNBL_AMD(0x15, RETBLEED),
        VULNBL_AMD(0x16, RETBLEED),
-       VULNBL_AMD(0x17, RETBLEED | SMT_RSB),
+       VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
        VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
+       VULNBL_AMD(0x19, SRSO),
        {}
 };
 
@@ -1406,6 +1413,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
        if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
                setup_force_cpu_bug(X86_BUG_SMT_RSB);
 
+       if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+               if (cpu_matches(cpu_vuln_blacklist, SRSO))
+                       setup_force_cpu_bug(X86_BUG_SRSO);
+       }
+
+       /*
+        * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+        * an affected processor, the VMM may have disabled the use of GATHER by
+        * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+        * which means that AVX will be disabled.
+        */
+       if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+           boot_cpu_has(X86_FEATURE_AVX))
+               setup_force_cpu_bug(X86_BUG_GDS);
+
        if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
                return;
 
@@ -1962,6 +1984,8 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
        validate_apic_and_package_id(c);
        x86_spec_ctrl_setup_ap();
        update_srbds_msr();
+       if (boot_cpu_has_bug(X86_BUG_GDS))
+               update_gds_msr();
 
        tsx_ap_init();
 }
index 1c44630..1dcd7d4 100644 (file)
@@ -83,6 +83,7 @@ void cpu_select_mitigations(void);
 
 extern void x86_spec_ctrl_setup_ap(void);
 extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
 
 extern enum spectre_v2_mitigation spectre_v2_enabled;
 
index 4a817d2..1885326 100644 (file)
@@ -206,6 +206,8 @@ DEFINE_IDTENTRY(exc_divide_error)
 {
        do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
                      FPE_INTDIV, error_get_trap_addr(regs));
+
+       amd_clear_divider();
 }
 
 DEFINE_IDTENTRY(exc_overflow)
index 03c885d..e768132 100644 (file)
@@ -134,13 +134,27 @@ SECTIONS
                SOFTIRQENTRY_TEXT
 #ifdef CONFIG_RETPOLINE
                __indirect_thunk_start = .;
-               *(.text.__x86.*)
+               *(.text.__x86.indirect_thunk)
+               *(.text.__x86.return_thunk)
                __indirect_thunk_end = .;
 #endif
                STATIC_CALL_TEXT
 
                ALIGN_ENTRY_TEXT_BEGIN
+#ifdef CONFIG_CPU_SRSO
+               *(.text.__x86.rethunk_untrain)
+#endif
+
                ENTRY_TEXT
+
+#ifdef CONFIG_CPU_SRSO
+               /*
+                * See the comment above srso_untrain_ret_alias()'s
+                * definition.
+                */
+               . = srso_untrain_ret_alias | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
+               *(.text.__x86.rethunk_safe)
+#endif
                ALIGN_ENTRY_TEXT_END
                *(.gnu.warning)
 
@@ -509,7 +523,18 @@ INIT_PER_CPU(irq_stack_backing_store);
 #endif
 
 #ifdef CONFIG_RETHUNK
-. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned");
+. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned");
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
+#endif
+
+#ifdef CONFIG_CPU_SRSO
+/*
+ * GNU ld cannot do XOR so do: (A | B) - (A & B) in order to compute the XOR
+ * of the two function addresses:
+ */
+. = ASSERT(((srso_untrain_ret_alias | srso_safe_ret_alias) -
+               (srso_untrain_ret_alias & srso_safe_ret_alias)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+               "SRSO function pair won't alias");
 #endif
 
 #endif /* CONFIG_X86_64 */
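
The linker-script assertion above leans on the identity spelled out in its comment: for any a and b, (a | b) - (a & b) == a ^ b, because OR sets every bit present in either operand, AND keeps the bits present in both, and subtracting the latter leaves exactly the differing bits. A minimal stand-alone check of that identity (the address below is made up for illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t a = 0xffffffff81a00000ULL;  /* made-up example address */
            uint64_t b = a ^ ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20));

            /* (a | b) - (a & b) reproduces a ^ b, which here is exactly
             * the bit pattern the SRSO alias assertion demands. */
            assert(((a | b) - (a & b)) == (a ^ b));
            assert((a ^ b) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)));
            return 0;
    }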
index 7f4d133..d343268 100644 (file)
@@ -729,6 +729,9 @@ void kvm_set_cpu_caps(void)
                F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
        );
 
+       if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
+               kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+
        kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
                F(PERFMON_V2)
        );
index 07756b7..d3aec1f 100644 (file)
@@ -2417,15 +2417,18 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
         */
        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
 
-       vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb);
-       vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb);
-       vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb);
-       vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb);
-       vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb);
+       BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+       memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
 
-       svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb);
+       vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
+       vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
+       vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
+       vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
+       vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
 
-       if (ghcb_xcr0_is_valid(ghcb)) {
+       svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
+
+       if (kvm_ghcb_xcr0_is_valid(svm)) {
                vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
                kvm_update_cpuid_runtime(vcpu);
        }
@@ -2436,84 +2439,88 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
        control->exit_code_hi = upper_32_bits(exit_code);
        control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
        control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+       svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
 
        /* Clear the valid entries fields */
        memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
 }
 
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+       return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
 static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 {
-       struct kvm_vcpu *vcpu;
-       struct ghcb *ghcb;
+       struct vmcb_control_area *control = &svm->vmcb->control;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        u64 exit_code;
        u64 reason;
 
-       ghcb = svm->sev_es.ghcb;
-
        /*
         * Retrieve the exit code now even though it may not be marked valid
         * as it could help with debugging.
         */
-       exit_code = ghcb_get_sw_exit_code(ghcb);
+       exit_code = kvm_ghcb_get_sw_exit_code(control);
 
        /* Only GHCB Usage code 0 is supported */
-       if (ghcb->ghcb_usage) {
+       if (svm->sev_es.ghcb->ghcb_usage) {
                reason = GHCB_ERR_INVALID_USAGE;
                goto vmgexit_err;
        }
 
        reason = GHCB_ERR_MISSING_INPUT;
 
-       if (!ghcb_sw_exit_code_is_valid(ghcb) ||
-           !ghcb_sw_exit_info_1_is_valid(ghcb) ||
-           !ghcb_sw_exit_info_2_is_valid(ghcb))
+       if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+           !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+           !kvm_ghcb_sw_exit_info_2_is_valid(svm))
                goto vmgexit_err;
 
-       switch (ghcb_get_sw_exit_code(ghcb)) {
+       switch (exit_code) {
        case SVM_EXIT_READ_DR7:
                break;
        case SVM_EXIT_WRITE_DR7:
-               if (!ghcb_rax_is_valid(ghcb))
+               if (!kvm_ghcb_rax_is_valid(svm))
                        goto vmgexit_err;
                break;
        case SVM_EXIT_RDTSC:
                break;
        case SVM_EXIT_RDPMC:
-               if (!ghcb_rcx_is_valid(ghcb))
+               if (!kvm_ghcb_rcx_is_valid(svm))
                        goto vmgexit_err;
                break;
        case SVM_EXIT_CPUID:
-               if (!ghcb_rax_is_valid(ghcb) ||
-                   !ghcb_rcx_is_valid(ghcb))
+               if (!kvm_ghcb_rax_is_valid(svm) ||
+                   !kvm_ghcb_rcx_is_valid(svm))
                        goto vmgexit_err;
-               if (ghcb_get_rax(ghcb) == 0xd)
-                       if (!ghcb_xcr0_is_valid(ghcb))
+               if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+                       if (!kvm_ghcb_xcr0_is_valid(svm))
                                goto vmgexit_err;
                break;
        case SVM_EXIT_INVD:
                break;
        case SVM_EXIT_IOIO:
-               if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) {
-                       if (!ghcb_sw_scratch_is_valid(ghcb))
+               if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+                       if (!kvm_ghcb_sw_scratch_is_valid(svm))
                                goto vmgexit_err;
                } else {
-                       if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK))
-                               if (!ghcb_rax_is_valid(ghcb))
+                       if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+                               if (!kvm_ghcb_rax_is_valid(svm))
                                        goto vmgexit_err;
                }
                break;
        case SVM_EXIT_MSR:
-               if (!ghcb_rcx_is_valid(ghcb))
+               if (!kvm_ghcb_rcx_is_valid(svm))
                        goto vmgexit_err;
-               if (ghcb_get_sw_exit_info_1(ghcb)) {
-                       if (!ghcb_rax_is_valid(ghcb) ||
-                           !ghcb_rdx_is_valid(ghcb))
+               if (control->exit_info_1) {
+                       if (!kvm_ghcb_rax_is_valid(svm) ||
+                           !kvm_ghcb_rdx_is_valid(svm))
                                goto vmgexit_err;
                }
                break;
        case SVM_EXIT_VMMCALL:
-               if (!ghcb_rax_is_valid(ghcb) ||
-                   !ghcb_cpl_is_valid(ghcb))
+               if (!kvm_ghcb_rax_is_valid(svm) ||
+                   !kvm_ghcb_cpl_is_valid(svm))
                        goto vmgexit_err;
                break;
        case SVM_EXIT_RDTSCP:
@@ -2521,19 +2528,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
        case SVM_EXIT_WBINVD:
                break;
        case SVM_EXIT_MONITOR:
-               if (!ghcb_rax_is_valid(ghcb) ||
-                   !ghcb_rcx_is_valid(ghcb) ||
-                   !ghcb_rdx_is_valid(ghcb))
+               if (!kvm_ghcb_rax_is_valid(svm) ||
+                   !kvm_ghcb_rcx_is_valid(svm) ||
+                   !kvm_ghcb_rdx_is_valid(svm))
                        goto vmgexit_err;
                break;
        case SVM_EXIT_MWAIT:
-               if (!ghcb_rax_is_valid(ghcb) ||
-                   !ghcb_rcx_is_valid(ghcb))
+               if (!kvm_ghcb_rax_is_valid(svm) ||
+                   !kvm_ghcb_rcx_is_valid(svm))
                        goto vmgexit_err;
                break;
        case SVM_VMGEXIT_MMIO_READ:
        case SVM_VMGEXIT_MMIO_WRITE:
-               if (!ghcb_sw_scratch_is_valid(ghcb))
+               if (!kvm_ghcb_sw_scratch_is_valid(svm))
                        goto vmgexit_err;
                break;
        case SVM_VMGEXIT_NMI_COMPLETE:
@@ -2549,11 +2556,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
        return 0;
 
 vmgexit_err:
-       vcpu = &svm->vcpu;
-
        if (reason == GHCB_ERR_INVALID_USAGE) {
                vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
-                           ghcb->ghcb_usage);
+                           svm->sev_es.ghcb->ghcb_usage);
        } else if (reason == GHCB_ERR_INVALID_EVENT) {
                vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
                            exit_code);
@@ -2563,11 +2568,8 @@ vmgexit_err:
                dump_ghcb(svm);
        }
 
-       /* Clear the valid entries fields */
-       memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
-
-       ghcb_set_sw_exit_info_1(ghcb, 2);
-       ghcb_set_sw_exit_info_2(ghcb, reason);
+       ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+       ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason);
 
        /* Resume the guest to "return" the error code. */
        return 1;
@@ -2586,7 +2588,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
                 */
                if (svm->sev_es.ghcb_sa_sync) {
                        kvm_write_guest(svm->vcpu.kvm,
-                                       ghcb_get_sw_scratch(svm->sev_es.ghcb),
+                                       svm->sev_es.sw_scratch,
                                        svm->sev_es.ghcb_sa,
                                        svm->sev_es.ghcb_sa_len);
                        svm->sev_es.ghcb_sa_sync = false;
@@ -2632,12 +2634,11 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
 static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
 {
        struct vmcb_control_area *control = &svm->vmcb->control;
-       struct ghcb *ghcb = svm->sev_es.ghcb;
        u64 ghcb_scratch_beg, ghcb_scratch_end;
        u64 scratch_gpa_beg, scratch_gpa_end;
        void *scratch_va;
 
-       scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
+       scratch_gpa_beg = svm->sev_es.sw_scratch;
        if (!scratch_gpa_beg) {
                pr_err("vmgexit: scratch gpa not provided\n");
                goto e_scratch;
@@ -2708,8 +2709,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
        return 0;
 
 e_scratch:
-       ghcb_set_sw_exit_info_1(ghcb, 2);
-       ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+       ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+       ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
 
        return 1;
 }
@@ -2822,7 +2823,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        u64 ghcb_gpa, exit_code;
-       struct ghcb *ghcb;
        int ret;
 
        /* Validate the GHCB */
@@ -2847,20 +2847,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
        }
 
        svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
-       ghcb = svm->sev_es.ghcb_map.hva;
 
-       trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
-
-       exit_code = ghcb_get_sw_exit_code(ghcb);
+       trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
 
+       sev_es_sync_from_ghcb(svm);
        ret = sev_es_validate_vmgexit(svm);
        if (ret)
                return ret;
 
-       sev_es_sync_from_ghcb(svm);
-       ghcb_set_sw_exit_info_1(ghcb, 0);
-       ghcb_set_sw_exit_info_2(ghcb, 0);
+       ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0);
+       ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0);
 
+       exit_code = kvm_ghcb_get_sw_exit_code(control);
        switch (exit_code) {
        case SVM_VMGEXIT_MMIO_READ:
                ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
@@ -2898,13 +2896,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
                        break;
                case 1:
                        /* Get AP jump table address */
-                       ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table);
+                       ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table);
                        break;
                default:
                        pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
                               control->exit_info_1);
-                       ghcb_set_sw_exit_info_1(ghcb, 2);
-                       ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
+                       ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+                       ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
                }
 
                ret = 1;
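
The reshuffle in this file, with sev_es_sync_from_ghcb() now running before sev_es_validate_vmgexit() and the validity and exit-code reads going through svm->sev_es instead of the mapped GHCB, reads as a snapshot-then-validate pattern on guest-shared memory. A minimal sketch of that pattern follows; the struct layout and field names are purely illustrative, not the actual KVM structures.

#include <stdint.h>

/* Illustrative only: validate and dispatch on a private snapshot so a guest
 * rewriting the shared page concurrently cannot change what was checked. */
struct shared_ghcb { volatile uint64_t exit_code, exit_info_1, sw_scratch; };
struct ghcb_snapshot { uint64_t exit_code, exit_info_1, sw_scratch; };

static void snapshot_ghcb(struct ghcb_snapshot *s, const struct shared_ghcb *g)
{
	s->exit_code   = g->exit_code;	/* each shared field is read once */
	s->exit_info_1 = g->exit_info_1;
	s->sw_scratch  = g->sw_scratch;
}

static int handle_vmgexit(const struct ghcb_snapshot *s)
{
	if (!s->sw_scratch)		/* validation sees ...            */
		return -1;
	switch (s->exit_code) {		/* ... exactly what dispatch sees */
	default:
		return 0;
	}
}
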
index 956726d..03e852d 100644 (file)
@@ -1498,7 +1498,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (sd->current_vmcb != svm->vmcb) {
                sd->current_vmcb = svm->vmcb;
-               indirect_branch_prediction_barrier();
+
+               if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+                       indirect_branch_prediction_barrier();
        }
        if (kvm_vcpu_apicv_active(vcpu))
                avic_vcpu_load(vcpu, cpu);
index 18af7e7..8239c8d 100644 (file)
@@ -190,10 +190,12 @@ struct vcpu_sev_es_state {
        /* SEV-ES support */
        struct sev_es_save_area *vmsa;
        struct ghcb *ghcb;
+       u8 valid_bitmap[16];
        struct kvm_host_map ghcb_map;
        bool received_first_sipi;
 
        /* SEV-ES scratch area support */
+       u64 sw_scratch;
        void *ghcb_sa;
        u32 ghcb_sa_len;
        bool ghcb_sa_sync;
@@ -744,4 +746,28 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
 void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
 void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
 
+#define DEFINE_KVM_GHCB_ACCESSORS(field)                                               \
+       static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+       {                                                                       \
+               return test_bit(GHCB_BITMAP_IDX(field),                         \
+                               (unsigned long *)&svm->sev_es.valid_bitmap);    \
+       }                                                                       \
+                                                                               \
+       static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+       {                                                                       \
+               return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+       }                                                                       \
+
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+
 #endif
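
For reference, DEFINE_KVM_GHCB_ACCESSORS(rax) above expands to roughly the following; the point is that the validity test consults KVM's cached valid_bitmap rather than the guest-shared GHCB page.

static __always_inline bool kvm_ghcb_rax_is_valid(const struct vcpu_svm *svm)
{
	return test_bit(GHCB_BITMAP_IDX(rax),
			(unsigned long *)&svm->sev_es.valid_bitmap);
}

static __always_inline u64 kvm_ghcb_get_rax_if_valid(struct vcpu_svm *svm,
						     struct ghcb *ghcb)
{
	return kvm_ghcb_rax_is_valid(svm) ? ghcb->save.rax : 0;
}
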
index 8e8295e..265452f 100644 (file)
@@ -224,6 +224,9 @@ SYM_FUNC_START(__svm_vcpu_run)
         */
        UNTRAIN_RET
 
+       /* SRSO */
+       ALTERNATIVE "", "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT
+
        /*
         * Clear all general purpose registers except RSP and RAX to prevent
         * speculative use of the guest's values, even those that are reloaded
index 278dbd3..19d9ff9 100644 (file)
@@ -314,6 +314,8 @@ u64 __read_mostly host_xcr0;
 
 static struct kmem_cache *x86_emulator_cache;
 
+extern bool gds_ucode_mitigated(void);
+
 /*
  * When called, it means the previous get/set msr reached an invalid msr.
  * Return true if we want to ignore/silent this failed msr access.
@@ -1616,7 +1618,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
         ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
         ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
         ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
-        ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+        ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
 
 static u64 kvm_get_arch_capabilities(void)
 {
@@ -1673,6 +1675,9 @@ static u64 kvm_get_arch_capabilities(void)
                 */
        }
 
+       if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+               data |= ARCH_CAP_GDS_NO;
+
        return data;
 }
 
index 3fd066d..2cff585 100644 (file)
@@ -11,6 +11,7 @@
 #include <asm/unwind_hints.h>
 #include <asm/percpu.h>
 #include <asm/frame.h>
+#include <asm/nops.h>
 
        .section .text.__x86.indirect_thunk
 
@@ -131,6 +132,46 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
  */
 #ifdef CONFIG_RETHUNK
 
+/*
+ * srso_untrain_ret_alias() and srso_safe_ret_alias() are placed at
+ * special addresses:
+ *
+ * - srso_untrain_ret_alias() is 2M aligned
+ * - srso_safe_ret_alias() is also in the same 2M page but bits 2, 8, 14
+ * and 20 in its virtual address are set (while those bits in the
+ * srso_untrain_ret_alias() function are cleared).
+ *
+ * This guarantees that those two addresses will alias in the branch
+ * target buffer of Zen3/4 generations, leading to any potential
+ * poisoned entries at that BTB slot to get evicted.
+ *
+ * As a result, srso_safe_ret_alias() becomes a safe return.
+ */
+#ifdef CONFIG_CPU_SRSO
+       .section .text.__x86.rethunk_untrain
+
+SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+       ANNOTATE_NOENDBR
+       ASM_NOP2
+       lfence
+       jmp __x86_return_thunk
+SYM_FUNC_END(srso_untrain_ret_alias)
+__EXPORT_THUNK(srso_untrain_ret_alias)
+
+       .section .text.__x86.rethunk_safe
+#endif
+
+/* Needs a definition for the __x86_return_thunk alternative below. */
+SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+#ifdef CONFIG_CPU_SRSO
+       add $8, %_ASM_SP
+       UNWIND_HINT_FUNC
+#endif
+       ANNOTATE_UNRET_SAFE
+       ret
+       int3
+SYM_FUNC_END(srso_safe_ret_alias)
+
        .section .text.__x86.return_thunk
 
 /*
@@ -143,7 +184,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
  *    from re-poisioning the BTB prediction.
  */
        .align 64
-       .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc
+       .skip 64 - (__ret - zen_untrain_ret), 0xcc
 SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
        ANNOTATE_NOENDBR
        /*
@@ -175,10 +216,10 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
         * evicted, __x86_return_thunk will suffer Straight Line Speculation
         * which will be contained safely by the INT3.
         */
-SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL)
+SYM_INNER_LABEL(__ret, SYM_L_GLOBAL)
        ret
        int3
-SYM_CODE_END(__x86_return_thunk)
+SYM_CODE_END(__ret)
 
        /*
         * Ensure the TEST decoding / BTB invalidation is complete.
@@ -189,11 +230,45 @@ SYM_CODE_END(__x86_return_thunk)
         * Jump back and execute the RET in the middle of the TEST instruction.
         * INT3 is for SLS protection.
         */
-       jmp __x86_return_thunk
+       jmp __ret
        int3
 SYM_FUNC_END(zen_untrain_ret)
 __EXPORT_THUNK(zen_untrain_ret)
 
+/*
+ * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
+ * above. On kernel entry, srso_untrain_ret() is executed which is a
+ *
+ * movabs $0xccccccc308c48348,%rax
+ *
+ * and when the return thunk executes the inner label srso_safe_ret()
+ * later, it is a stack manipulation and a RET which is mispredicted and
+ * thus a "safe" one to use.
+ */
+       .align 64
+       .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
+SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+       ANNOTATE_NOENDBR
+       .byte 0x48, 0xb8
+
+SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
+       add $8, %_ASM_SP
+       ret
+       int3
+       int3
+       int3
+       lfence
+       call srso_safe_ret
+       int3
+SYM_CODE_END(srso_safe_ret)
+SYM_FUNC_END(srso_untrain_ret)
+__EXPORT_THUNK(srso_untrain_ret)
+
+SYM_FUNC_START(__x86_return_thunk)
+       ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \
+                       "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS
+       int3
+SYM_CODE_END(__x86_return_thunk)
 EXPORT_SYMBOL(__x86_return_thunk)
 
 #endif /* CONFIG_RETHUNK */
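
The aliasing constraint described in the comment block above (same 2M page, with bits 2, 8, 14 and 20 set in one symbol and clear in the other) can be stated as a simple address check. The sketch below only restates that comment; the real placement is enforced by the kernel linker script, which is not shown in this hunk.

#include <assert.h>
#include <stdint.h>

#define SRSO_ALIAS_BITS	((1UL << 2) | (1UL << 8) | (1UL << 14) | (1UL << 20))
#define PMD_MASK_2M	(~((1UL << 21) - 1))

/* Illustrative restatement of the layout the comment describes. */
static void check_srso_alias(uintptr_t untrain_alias, uintptr_t safe_alias)
{
	assert((untrain_alias & PMD_MASK_2M) == (safe_alias & PMD_MASK_2M));
	assert((untrain_alias & SRSO_ALIAS_BITS) == 0);
	assert((safe_alias & SRSO_ALIAS_BITS) == SRSO_ALIAS_BITS);
}
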
index 90de500..9866468 100644 (file)
@@ -722,14 +722,9 @@ void submit_bio_noacct(struct bio *bio)
        struct block_device *bdev = bio->bi_bdev;
        struct request_queue *q = bdev_get_queue(bdev);
        blk_status_t status = BLK_STS_IOERR;
-       struct blk_plug *plug;
 
        might_sleep();
 
-       plug = blk_mq_plug(bio);
-       if (plug && plug->nowait)
-               bio->bi_opf |= REQ_NOWAIT;
-
        /*
         * For a REQ_NOWAIT based request, return -EOPNOTSUPP
         * if queue does not support NOWAIT.
@@ -1059,7 +1054,6 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
        plug->rq_count = 0;
        plug->multiple_queues = false;
        plug->has_elevator = false;
-       plug->nowait = false;
        INIT_LIST_HEAD(&plug->cb_list);
 
        /*
index dd64e20..089fcb9 100644 (file)
@@ -3301,11 +3301,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
        if (qos[QOS_MIN] > qos[QOS_MAX])
                goto einval;
 
-       if (enable) {
+       if (enable && !ioc->enabled) {
                blk_stat_enable_accounting(disk->queue);
                blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
                ioc->enabled = true;
-       } else {
+       } else if (!enable && ioc->enabled) {
+               blk_stat_disable_accounting(disk->queue);
                blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
                ioc->enabled = false;
        }
index a286bf3..838ffad 100644 (file)
@@ -358,13 +358,14 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
                task_io_account_write(bio->bi_iter.bi_size);
        }
 
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               bio->bi_opf |= REQ_NOWAIT;
+
        if (iocb->ki_flags & IOCB_HIPRI) {
-               bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+               bio->bi_opf |= REQ_POLLED;
                submit_bio(bio);
                WRITE_ONCE(iocb->private, bio);
        } else {
-               if (iocb->ki_flags & IOCB_NOWAIT)
-                       bio->bi_opf |= REQ_NOWAIT;
                submit_bio(bio);
        }
        return -EIOCBQUEUED;
index 52b339a..9967fcf 100644 (file)
@@ -173,6 +173,9 @@ static void internal_free_pages_locked(struct ivpu_bo *bo)
 {
        unsigned int i, npages = bo->base.size >> PAGE_SHIFT;
 
+       if (ivpu_bo_cache_mode(bo) != DRM_IVPU_BO_CACHED)
+               set_pages_array_wb(bo->pages, bo->base.size >> PAGE_SHIFT);
+
        for (i = 0; i < npages; i++)
                put_page(bo->pages[i]);
 
@@ -587,6 +590,11 @@ ivpu_bo_alloc_internal(struct ivpu_device *vdev, u64 vpu_addr, u64 size, u32 fla
        if (ivpu_bo_cache_mode(bo) != DRM_IVPU_BO_CACHED)
                drm_clflush_pages(bo->pages, bo->base.size >> PAGE_SHIFT);
 
+       if (bo->flags & DRM_IVPU_BO_WC)
+               set_pages_array_wc(bo->pages, bo->base.size >> PAGE_SHIFT);
+       else if (bo->flags & DRM_IVPU_BO_UNCACHED)
+               set_pages_array_uc(bo->pages, bo->base.size >> PAGE_SHIFT);
+
        prot = ivpu_bo_pgprot(bo, PAGE_KERNEL);
        bo->kvaddr = vmap(bo->pages, bo->base.size >> PAGE_SHIFT, VM_MAP, prot);
        if (!bo->kvaddr) {
index 1dd8d5a..a4d9f14 100644 (file)
@@ -470,6 +470,45 @@ static const struct dmi_system_id asus_laptop[] = {
        { }
 };
 
+static const struct dmi_system_id tongfang_gm_rg[] = {
+       {
+               .ident = "TongFang GMxRGxx/XMG CORE 15 (M22)/TUXEDO Stellaris 15 Gen4 AMD",
+               .matches = {
+                       DMI_MATCH(DMI_BOARD_NAME, "GMxRGxx"),
+               },
+       },
+       { }
+};
+
+static const struct dmi_system_id maingear_laptop[] = {
+       {
+               .ident = "MAINGEAR Vector Pro 2 15",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Micro Electronics Inc"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MG-VCP2-15A3070T"),
+               }
+       },
+       {
+               .ident = "MAINGEAR Vector Pro 2 17",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Micro Electronics Inc"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MG-VCP2-17A3070T"),
+               },
+       },
+       { }
+};
+
+static const struct dmi_system_id pcspecialist_laptop[] = {
+       {
+               .ident = "PCSpecialist Elimina Pro 16 M",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "PCSpecialist"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Elimina Pro 16 M"),
+               },
+       },
+       { }
+};
+
 static const struct dmi_system_id lg_laptop[] = {
        {
                .ident = "LG Electronics 17U70P",
@@ -493,6 +532,9 @@ struct irq_override_cmp {
 static const struct irq_override_cmp override_table[] = {
        { medion_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false },
        { asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false },
+       { tongfang_gm_rg, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true },
+       { maingear_laptop, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true },
+       { pcspecialist_laptop, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true },
        { lg_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false },
 };
 
@@ -512,6 +554,28 @@ static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity,
                        return entry->override;
        }
 
+#ifdef CONFIG_X86
+       /*
+        * Always use the MADT override info, except for the i8042 PS/2 ctrl
+        * IRQs (1 and 12). For these the DSDT IRQ settings should sometimes
+        * be used otherwise PS/2 keyboards / mice will not work.
+        */
+       if (gsi != 1 && gsi != 12)
+               return true;
+
+       /* If the override comes from an INT_SRC_OVR MADT entry, honor it. */
+       if (acpi_int_src_ovr[gsi])
+               return true;
+
+       /*
+        * IRQ override isn't needed on modern AMD Zen systems and
+        * this override breaks active low IRQs on AMD Ryzen 6000 and
+        * newer systems. Skip it.
+        */
+       if (boot_cpu_has(X86_FEATURE_ZEN))
+               return false;
+#endif
+
        return true;
 }
 
index c1815b9..52df435 100644 (file)
@@ -577,6 +577,18 @@ ssize_t __weak cpu_show_retbleed(struct device *dev,
        return sysfs_emit(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_spec_rstack_overflow(struct device *dev,
+                                            struct device_attribute *attr, char *buf)
+{
+       return sysfs_emit(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_gds(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       return sysfs_emit(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
@@ -588,6 +600,8 @@ static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
 static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL);
 static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL);
 static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL);
+static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL);
+static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
        &dev_attr_meltdown.attr,
@@ -601,6 +615,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
        &dev_attr_srbds.attr,
        &dev_attr_mmio_stale_data.attr,
        &dev_attr_retbleed.attr,
+       &dev_attr_spec_rstack_overflow.attr,
+       &dev_attr_gather_data_sampling.attr,
        NULL
 };
 
index 5676e6d..06673c6 100644 (file)
@@ -1870,15 +1870,16 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio)
 
 static void zram_bio_read(struct zram *zram, struct bio *bio)
 {
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       unsigned long start_time;
+       unsigned long start_time = bio_start_io_acct(bio);
+       struct bvec_iter iter = bio->bi_iter;
 
-       start_time = bio_start_io_acct(bio);
-       bio_for_each_segment(bv, bio, iter) {
+       do {
                u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
                u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
                                SECTOR_SHIFT;
+               struct bio_vec bv = bio_iter_iovec(bio, iter);
+
+               bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
 
                if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
                        atomic64_inc(&zram->stats.failed_reads);
@@ -1890,22 +1891,26 @@ static void zram_bio_read(struct zram *zram, struct bio *bio)
                zram_slot_lock(zram, index);
                zram_accessed(zram, index);
                zram_slot_unlock(zram, index);
-       }
+
+               bio_advance_iter_single(bio, &iter, bv.bv_len);
+       } while (iter.bi_size);
+
        bio_end_io_acct(bio, start_time);
        bio_endio(bio);
 }
 
 static void zram_bio_write(struct zram *zram, struct bio *bio)
 {
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       unsigned long start_time;
+       unsigned long start_time = bio_start_io_acct(bio);
+       struct bvec_iter iter = bio->bi_iter;
 
-       start_time = bio_start_io_acct(bio);
-       bio_for_each_segment(bv, bio, iter) {
+       do {
                u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
                u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
                                SECTOR_SHIFT;
+               struct bio_vec bv = bio_iter_iovec(bio, iter);
+
+               bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
 
                if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) {
                        atomic64_inc(&zram->stats.failed_writes);
@@ -1916,7 +1921,10 @@ static void zram_bio_write(struct zram *zram, struct bio *bio)
                zram_slot_lock(zram, index);
                zram_accessed(zram, index);
                zram_slot_unlock(zram, index);
-       }
+
+               bio_advance_iter_single(bio, &iter, bv.bv_len);
+       } while (iter.bi_size);
+
        bio_end_io_acct(bio, start_time);
        bio_endio(bio);
 }
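
Both loops above clamp bv_len so one iteration never crosses a zram page: a bio segment that straddles a page boundary is processed in two passes, with bio_advance_iter_single() moving the iterator by the clamped amount each time. A standalone sketch of the clamping arithmetic, assuming 4 KiB pages for the example:

#include <stdint.h>

#define EXAMPLE_PAGE_SIZE 4096u	/* stand-in for PAGE_SIZE */

/* Bytes handled in one pass: whatever is left of the segment, limited to
 * what is left of the current page. */
static uint32_t bytes_this_pass(uint32_t offset_in_page, uint32_t seg_len)
{
	uint32_t room = EXAMPLE_PAGE_SIZE - offset_in_page;

	return seg_len < room ? seg_len : room;
}
/* e.g. offset 3072, segment 2048 -> 1024 bytes now; the remaining 1024 are
 * picked up on the next iteration against the following page. */
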
index cf5499e..ea6b401 100644 (file)
@@ -510,70 +510,6 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip)
        return 0;
 }
 
-/*
- * Some AMD fTPM versions may cause stutter
- * https://www.amd.com/en/support/kb/faq/pa-410
- *
- * Fixes are available in two series of fTPM firmware:
- * 6.x.y.z series: 6.0.18.6 +
- * 3.x.y.z series: 3.57.y.5 +
- */
-#ifdef CONFIG_X86
-static bool tpm_amd_is_rng_defective(struct tpm_chip *chip)
-{
-       u32 val1, val2;
-       u64 version;
-       int ret;
-
-       if (!(chip->flags & TPM_CHIP_FLAG_TPM2))
-               return false;
-
-       ret = tpm_request_locality(chip);
-       if (ret)
-               return false;
-
-       ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL);
-       if (ret)
-               goto release;
-       if (val1 != 0x414D4400U /* AMD */) {
-               ret = -ENODEV;
-               goto release;
-       }
-       ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL);
-       if (ret)
-               goto release;
-       ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL);
-
-release:
-       tpm_relinquish_locality(chip);
-
-       if (ret)
-               return false;
-
-       version = ((u64)val1 << 32) | val2;
-       if ((version >> 48) == 6) {
-               if (version >= 0x0006000000180006ULL)
-                       return false;
-       } else if ((version >> 48) == 3) {
-               if (version >= 0x0003005700000005ULL)
-                       return false;
-       } else {
-               return false;
-       }
-
-       dev_warn(&chip->dev,
-                "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n",
-                version);
-
-       return true;
-}
-#else
-static inline bool tpm_amd_is_rng_defective(struct tpm_chip *chip)
-{
-       return false;
-}
-#endif /* CONFIG_X86 */
-
 static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait)
 {
        struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng);
@@ -585,10 +521,20 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait)
        return tpm_get_random(chip, data, max);
 }
 
+static bool tpm_is_hwrng_enabled(struct tpm_chip *chip)
+{
+       if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM))
+               return false;
+       if (tpm_is_firmware_upgrade(chip))
+               return false;
+       if (chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED)
+               return false;
+       return true;
+}
+
 static int tpm_add_hwrng(struct tpm_chip *chip)
 {
-       if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) ||
-           tpm_amd_is_rng_defective(chip))
+       if (!tpm_is_hwrng_enabled(chip))
                return 0;
 
        snprintf(chip->hwrng_name, sizeof(chip->hwrng_name),
@@ -693,7 +639,7 @@ int tpm_chip_register(struct tpm_chip *chip)
        return 0;
 
 out_hwrng:
-       if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip))
+       if (tpm_is_hwrng_enabled(chip))
                hwrng_unregister(&chip->hwrng);
 out_ppi:
        tpm_bios_log_teardown(chip);
@@ -718,8 +664,7 @@ EXPORT_SYMBOL_GPL(tpm_chip_register);
 void tpm_chip_unregister(struct tpm_chip *chip)
 {
        tpm_del_legacy_sysfs(chip);
-       if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip) &&
-           !tpm_amd_is_rng_defective(chip))
+       if (tpm_is_hwrng_enabled(chip))
                hwrng_unregister(&chip->hwrng);
        tpm_bios_log_teardown(chip);
        if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip))
index 1a5d09b..9eb1a18 100644 (file)
@@ -463,6 +463,28 @@ static bool crb_req_canceled(struct tpm_chip *chip, u8 status)
        return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE;
 }
 
+static int crb_check_flags(struct tpm_chip *chip)
+{
+       u32 val;
+       int ret;
+
+       ret = crb_request_locality(chip, 0);
+       if (ret)
+               return ret;
+
+       ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL);
+       if (ret)
+               goto release;
+
+       if (val == 0x414D4400U /* AMD */)
+               chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED;
+
+release:
+       crb_relinquish_locality(chip, 0);
+
+       return ret;
+}
+
 static const struct tpm_class_ops tpm_crb = {
        .flags = TPM_OPS_AUTO_STARTUP,
        .status = crb_status,
@@ -800,6 +822,14 @@ static int crb_acpi_add(struct acpi_device *device)
        chip->acpi_dev_handle = device->handle;
        chip->flags = TPM_CHIP_FLAG_TPM2;
 
+       rc = tpm_chip_bootstrap(chip);
+       if (rc)
+               goto out;
+
+       rc = crb_check_flags(chip);
+       if (rc)
+               goto out;
+
        rc = tpm_chip_register(chip);
 
 out:
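
The magic constant in crb_check_flags() is the TPM manufacturer property, a vendor string packed into a u32; decoded byte by byte from the most significant byte down, 0x414D4400 is simply "AMD" plus a trailing NUL:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t id = 0x414D4400u;	/* TPM2_PT_MANUFACTURER value checked above */

	printf("%c%c%c\n", (int)(id >> 24), (int)((id >> 16) & 0xff),
	       (int)((id >> 8) & 0xff));	/* prints "AMD" */
	return 0;
}
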
index cc42cf3..ac4daaf 100644 (file)
@@ -164,6 +164,22 @@ static const struct dmi_system_id tpm_tis_dmi_table[] = {
        },
        {
                .callback = tpm_tis_disable_irq,
+               .ident = "ThinkStation P620",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkStation P620"),
+               },
+       },
+       {
+               .callback = tpm_tis_disable_irq,
+               .ident = "TUXEDO InfinityBook S 15/17 Gen7",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "TUXEDO InfinityBook S 15/17 Gen7"),
+               },
+       },
+       {
+               .callback = tpm_tis_disable_irq,
                .ident = "UPX-TGL",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "AAEON"),
index 81fba0d..9a1e194 100644 (file)
@@ -1012,8 +1012,8 @@ static int amd_pstate_update_status(const char *buf, size_t size)
        return 0;
 }
 
-static ssize_t show_status(struct kobject *kobj,
-                          struct kobj_attribute *attr, char *buf)
+static ssize_t status_show(struct device *dev,
+                          struct device_attribute *attr, char *buf)
 {
        ssize_t ret;
 
@@ -1024,7 +1024,7 @@ static ssize_t show_status(struct kobject *kobj,
        return ret;
 }
 
-static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
+static ssize_t status_store(struct device *a, struct device_attribute *b,
                            const char *buf, size_t count)
 {
        char *p = memchr(buf, '\n', count);
@@ -1043,7 +1043,7 @@ cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq);
 cpufreq_freq_attr_ro(amd_pstate_highest_perf);
 cpufreq_freq_attr_rw(energy_performance_preference);
 cpufreq_freq_attr_ro(energy_performance_available_preferences);
-define_one_global_rw(status);
+static DEVICE_ATTR_RW(status);
 
 static struct freq_attr *amd_pstate_attr[] = {
        &amd_pstate_max_freq,
@@ -1062,7 +1062,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = {
 };
 
 static struct attribute *pstate_global_attributes[] = {
-       &status.attr,
+       &dev_attr_status.attr,
        NULL
 };
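
The show/store callbacks above are renamed because DEVICE_ATTR_RW() derives the callback names from the attribute name. Roughly, the macro line in the hunk unfolds as below; the stub bodies exist only to make the sketch compile on its own.

#include <linux/device.h>

static ssize_t status_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	return 0;	/* stub for the sketch */
}

static ssize_t status_store(struct device *dev, struct device_attribute *attr,
			    const char *buf, size_t count)
{
	return count;	/* stub for the sketch */
}

/* DEVICE_ATTR_RW(status) then unfolds, roughly, to: */
static struct device_attribute dev_attr_status =
	__ATTR(status, 0644, status_show, status_store);
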
 
index c2d6d9c..b88af12 100644 (file)
@@ -120,20 +120,6 @@ static void psci_pd_remove(void)
        }
 }
 
-static bool psci_pd_try_set_osi_mode(void)
-{
-       int ret;
-
-       if (!psci_has_osi_support())
-               return false;
-
-       ret = psci_set_osi_mode(true);
-       if (ret)
-               return false;
-
-       return true;
-}
-
 static void psci_cpuidle_domain_sync_state(struct device *dev)
 {
        /*
@@ -152,15 +138,12 @@ static int psci_cpuidle_domain_probe(struct platform_device *pdev)
 {
        struct device_node *np = pdev->dev.of_node;
        struct device_node *node;
-       bool use_osi;
+       bool use_osi = psci_has_osi_support();
        int ret = 0, pd_count = 0;
 
        if (!np)
                return -ENODEV;
 
-       /* If OSI mode is supported, let's try to enable it. */
-       use_osi = psci_pd_try_set_osi_mode();
-
        /*
         * Parse child nodes for the "#power-domain-cells" property and
         * initialize a genpd/genpd-of-provider pair when it's found.
@@ -170,33 +153,37 @@ static int psci_cpuidle_domain_probe(struct platform_device *pdev)
                        continue;
 
                ret = psci_pd_init(node, use_osi);
-               if (ret)
-                       goto put_node;
+               if (ret) {
+                       of_node_put(node);
+                       goto exit;
+               }
 
                pd_count++;
        }
 
        /* Bail out if not using the hierarchical CPU topology. */
        if (!pd_count)
-               goto no_pd;
+               return 0;
 
        /* Link genpd masters/subdomains to model the CPU topology. */
        ret = dt_idle_pd_init_topology(np);
        if (ret)
                goto remove_pd;
 
+       /* let's try to enable OSI. */
+       ret = psci_set_osi_mode(use_osi);
+       if (ret)
+               goto remove_pd;
+
        pr_info("Initialized CPU PM domain topology using %s mode\n",
                use_osi ? "OSI" : "PC");
        return 0;
 
-put_node:
-       of_node_put(node);
 remove_pd:
+       dt_idle_pd_remove_topology(np);
        psci_pd_remove();
+exit:
        pr_err("failed to create CPU PM domains ret=%d\n", ret);
-no_pd:
-       if (use_osi)
-               psci_set_osi_mode(false);
        return ret;
 }
 
index b371655..1af63c1 100644 (file)
@@ -152,6 +152,30 @@ int dt_idle_pd_init_topology(struct device_node *np)
        return 0;
 }
 
+int dt_idle_pd_remove_topology(struct device_node *np)
+{
+       struct device_node *node;
+       struct of_phandle_args child, parent;
+       int ret;
+
+       for_each_child_of_node(np, node) {
+               if (of_parse_phandle_with_args(node, "power-domains",
+                                       "#power-domain-cells", 0, &parent))
+                       continue;
+
+               child.np = node;
+               child.args_count = 0;
+               ret = of_genpd_remove_subdomain(&parent, &child);
+               of_node_put(parent.np);
+               if (ret) {
+                       of_node_put(node);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 struct device *dt_idle_attach_cpu(int cpu, const char *name)
 {
        struct device *dev;
index a95483d..3be1f70 100644 (file)
@@ -14,6 +14,8 @@ struct generic_pm_domain *dt_idle_pd_alloc(struct device_node *np,
 
 int dt_idle_pd_init_topology(struct device_node *np);
 
+int dt_idle_pd_remove_topology(struct device_node *np);
+
 struct device *dt_idle_attach_cpu(int cpu, const char *name);
 
 void dt_idle_detach_cpu(struct device *dev);
@@ -36,6 +38,11 @@ static inline int dt_idle_pd_init_topology(struct device_node *np)
        return 0;
 }
 
+static inline int dt_idle_pd_remove_topology(struct device_node *np)
+{
+       return 0;
+}
+
 static inline struct device *dt_idle_attach_cpu(int cpu, const char *name)
 {
        return NULL;
index 644c188..08fdd0e 100644 (file)
@@ -211,6 +211,7 @@ config FSL_DMA
 config FSL_EDMA
        tristate "Freescale eDMA engine support"
        depends on OF
+       depends on HAS_IOMEM
        select DMA_ENGINE
        select DMA_VIRTUAL_CHANNELS
        help
@@ -280,6 +281,7 @@ config IMX_SDMA
 
 config INTEL_IDMA64
        tristate "Intel integrated DMA 64-bit support"
+       depends on HAS_IOMEM
        select DMA_ENGINE
        select DMA_VIRTUAL_CHANNELS
        help
index 5abbcc6..9a15f0d 100644 (file)
@@ -384,9 +384,7 @@ static void idxd_wq_disable_cleanup(struct idxd_wq *wq)
        wq->threshold = 0;
        wq->priority = 0;
        wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
-       clear_bit(WQ_FLAG_DEDICATED, &wq->flags);
-       clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags);
-       clear_bit(WQ_FLAG_ATS_DISABLE, &wq->flags);
+       wq->flags = 0;
        memset(wq->name, 0, WQ_NAME_SIZE);
        wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER;
        idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH);
index ebd8733..9413fad 100644 (file)
@@ -190,7 +190,13 @@ static int mcf_edma_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       chans = pdata->dma_channels;
+       if (!pdata->dma_channels) {
+               dev_info(&pdev->dev, "setting default channel number to 64");
+               chans = 64;
+       } else {
+               chans = pdata->dma_channels;
+       }
+
        len = sizeof(*mcf_edma) + sizeof(*mcf_chan) * chans;
        mcf_edma = devm_kzalloc(&pdev->dev, len, GFP_KERNEL);
        if (!mcf_edma)
@@ -202,11 +208,6 @@ static int mcf_edma_probe(struct platform_device *pdev)
        mcf_edma->drvdata = &mcf_data;
        mcf_edma->big_endian = 1;
 
-       if (!mcf_edma->n_chans) {
-               dev_info(&pdev->dev, "setting default channel number to 64");
-               mcf_edma->n_chans = 64;
-       }
-
        mutex_init(&mcf_edma->fsl_edma_mutex);
 
        mcf_edma->membase = devm_platform_ioremap_resource(pdev, 0);
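
Moving the default ahead of the allocation matters because chans feeds the devm_kzalloc() size: with the old ordering a zero pdata->dma_channels produced a buffer sized for zero channels, and the default of 64 was only applied afterwards. A standalone sketch of the sizing dependency, with made-up struct sizes:

#include <stddef.h>
#include <stdio.h>

struct fake_edma_engine { char hdr[64]; };
struct fake_edma_chan   { char regs[128]; };

static size_t edma_alloc_size(unsigned int chans)
{
	return sizeof(struct fake_edma_engine) +
	       sizeof(struct fake_edma_chan) * chans;
}

int main(void)
{
	/* old order: allocate for 0 channels, then use 64 of them */
	printf("sized for 0: %zu bytes, needed for 64: %zu bytes\n",
	       edma_alloc_size(0), edma_alloc_size(64));
	return 0;
}
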
index 95a462a..b6e0ac8 100644 (file)
@@ -192,7 +192,7 @@ struct owl_dma_pchan {
 };
 
 /**
- * struct owl_dma_pchan - Wrapper for DMA ENGINE channel
+ * struct owl_dma_vchan - Wrapper for DMA ENGINE channel
  * @vc: wrapped virtual channel
  * @pchan: the physical channel utilized by this channel
  * @txd: active transaction on this channel
index b4731fe..3cf0b38 100644 (file)
@@ -404,6 +404,12 @@ enum desc_status {
         */
        BUSY,
        /*
+        * Pause was called while descriptor was BUSY. Due to hardware
+        * limitations, only termination is possible for descriptors
+        * that have been paused.
+        */
+       PAUSED,
+       /*
         * Sitting on the channel work_list but xfer done
         * by PL330 core
         */
@@ -2041,7 +2047,7 @@ static inline void fill_queue(struct dma_pl330_chan *pch)
        list_for_each_entry(desc, &pch->work_list, node) {
 
                /* If already submitted */
-               if (desc->status == BUSY)
+               if (desc->status == BUSY || desc->status == PAUSED)
                        continue;
 
                ret = pl330_submit_req(pch->thread, desc);
@@ -2326,6 +2332,7 @@ static int pl330_pause(struct dma_chan *chan)
 {
        struct dma_pl330_chan *pch = to_pchan(chan);
        struct pl330_dmac *pl330 = pch->dmac;
+       struct dma_pl330_desc *desc;
        unsigned long flags;
 
        pm_runtime_get_sync(pl330->ddma.dev);
@@ -2335,6 +2342,10 @@ static int pl330_pause(struct dma_chan *chan)
        _stop(pch->thread);
        spin_unlock(&pl330->lock);
 
+       list_for_each_entry(desc, &pch->work_list, node) {
+               if (desc->status == BUSY)
+                       desc->status = PAUSED;
+       }
        spin_unlock_irqrestore(&pch->lock, flags);
        pm_runtime_mark_last_busy(pl330->ddma.dev);
        pm_runtime_put_autosuspend(pl330->ddma.dev);
@@ -2425,7 +2436,7 @@ pl330_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                else if (running && desc == running)
                        transferred =
                                pl330_get_current_xferred_count(pch, desc);
-               else if (desc->status == BUSY)
+               else if (desc->status == BUSY || desc->status == PAUSED)
                        /*
                         * Busy but not running means either just enqueued,
                         * or finished and not yet marked done
@@ -2442,6 +2453,9 @@ pl330_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
                        case DONE:
                                ret = DMA_COMPLETE;
                                break;
+                       case PAUSED:
+                               ret = DMA_PAUSED;
+                               break;
                        case PREP:
                        case BUSY:
                                ret = DMA_IN_PROGRESS;
index 93ee298..e0bfd12 100644 (file)
@@ -668,6 +668,8 @@ static int xdma_set_vector_reg(struct xdma_device *xdev, u32 vec_tbl_start,
                        val |= irq_start << shift;
                        irq_start++;
                        irq_num--;
+                       if (!irq_num)
+                               break;
                }
 
                /* write IRQ register */
@@ -715,7 +717,7 @@ static int xdma_irq_init(struct xdma_device *xdev)
                ret = request_irq(irq, xdma_channel_isr, 0,
                                  "xdma-c2h-channel", &xdev->c2h_chans[j]);
                if (ret) {
-                       xdma_err(xdev, "H2C channel%d request irq%d failed: %d",
+                       xdma_err(xdev, "C2H channel%d request irq%d failed: %d",
                                 j, irq, ret);
                        goto failed_init_c2h;
                }
@@ -892,7 +894,7 @@ static int xdma_probe(struct platform_device *pdev)
        }
 
        reg_base = devm_ioremap_resource(&pdev->dev, res);
-       if (!reg_base) {
+       if (IS_ERR(reg_base)) {
                xdma_err(xdev, "ioremap failed");
                goto failed;
        }
index a3b86b8..6dc950c 100644 (file)
@@ -1296,6 +1296,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
 int amdgpu_device_pci_reset(struct amdgpu_device *adev);
 bool amdgpu_device_need_post(struct amdgpu_device *adev);
+bool amdgpu_sg_display_supported(struct amdgpu_device *adev);
 bool amdgpu_device_pcie_dynamic_switching_supported(void);
 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev);
 bool amdgpu_device_aspm_support_quirk(void);
index 040f4cb..fb78a8f 100644 (file)
@@ -295,7 +295,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
 
        if (!p->gang_size) {
                ret = -EINVAL;
-               goto free_partial_kdata;
+               goto free_all_kdata;
        }
 
        for (i = 0; i < p->gang_size; ++i) {
index a2cdde0..45e9d73 100644 (file)
@@ -1459,6 +1459,32 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
 }
 
 /*
+ * On APUs with >= 64GB white flickering has been observed w/ SG enabled.
+ * Disable S/G on such systems until we have a proper fix.
+ * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
+ * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
+ */
+bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
+{
+       switch (amdgpu_sg_display) {
+       case -1:
+               break;
+       case 0:
+               return false;
+       case 1:
+               return true;
+       default:
+               return false;
+       }
+       if ((totalram_pages() << (PAGE_SHIFT - 10)) +
+           (adev->gmc.real_vram_size / 1024) >= 64000000) {
+               DRM_WARN("Disabling S/G due to >=64GB RAM\n");
+               return false;
+       }
+       return true;
+}
+
+/*
  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
  * speed switching. Until we have confirmation from Intel that a specific host
  * supports it, it's safer that we keep it disabled for all.
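
The threshold check in amdgpu_sg_display_supported() adds system RAM and VRAM in KiB: totalram_pages() << (PAGE_SHIFT - 10) converts pages to KiB, real_vram_size / 1024 converts bytes to KiB, and 64000000 acts as the rounded 64 GB cutoff. A standalone sketch of the same arithmetic, assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* 4 KiB pages assumed for the sketch */

static int over_64gb_cutoff(uint64_t nr_ram_pages, uint64_t vram_bytes)
{
	uint64_t ram_kib  = nr_ram_pages << (EXAMPLE_PAGE_SHIFT - 10);
	uint64_t vram_kib = vram_bytes / 1024;

	return (ram_kib + vram_kib) >= 64000000ULL;
}

int main(void)
{
	/* 60 GiB of RAM plus 8 GiB of VRAM trips the cutoff ... */
	printf("%d\n", over_64gb_cutoff(60ULL << 18, 8ULL << 30));
	/* ... 32 GiB plus 4 GiB does not. */
	printf("%d\n", over_64gb_cutoff(32ULL << 18, 4ULL << 30));
	return 0;
}
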
index 3a7af59..0451533 100644 (file)
@@ -471,8 +471,12 @@ static void gfx_v11_0_check_fw_cp_gfx_shadow(struct amdgpu_device *adev)
        case IP_VERSION(11, 0, 3):
                if ((adev->gfx.me_fw_version >= 1505) &&
                    (adev->gfx.pfp_fw_version >= 1600) &&
-                   (adev->gfx.mec_fw_version >= 512))
-                       adev->gfx.cp_gfx_shadow = true;
+                   (adev->gfx.mec_fw_version >= 512)) {
+                       if (amdgpu_sriov_vf(adev))
+                               adev->gfx.cp_gfx_shadow = true;
+                       else
+                               adev->gfx.cp_gfx_shadow = false;
+               }
                break;
        default:
                adev->gfx.cp_gfx_shadow = false;
index e1a392b..af5685f 100644 (file)
@@ -137,14 +137,15 @@ static int psp_v13_0_wait_for_bootloader(struct psp_context *psp)
        int ret;
        int retry_loop;
 
+       /* Wait for bootloader to signify that it is ready having bit 31 of
+        * C2PMSG_35 set to 1. All other bits are expected to be cleared.
+        * If there is an error in processing command, bits[7:0] will be set.
+        * This is applicable for PSP v13.0.6 and newer.
+        */
        for (retry_loop = 0; retry_loop < 10; retry_loop++) {
-               /* Wait for bootloader to signify that is
-                   ready having bit 31 of C2PMSG_35 set to 1 */
-               ret = psp_wait_for(psp,
-                                  SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_35),
-                                  0x80000000,
-                                  0x80000000,
-                                  false);
+               ret = psp_wait_for(
+                       psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_35),
+                       0x80000000, 0xffffffff, false);
 
                if (ret == 0)
                        return 0;
index 49f40d9..f5a6f56 100644 (file)
@@ -1543,11 +1543,7 @@ static bool kfd_ignore_crat(void)
        if (ignore_crat)
                return true;
 
-#ifndef KFD_SUPPORT_IOMMU_V2
        ret = true;
-#else
-       ret = false;
-#endif
 
        return ret;
 }
index 0b3dc75..a53e075 100644 (file)
@@ -194,11 +194,6 @@ static void kfd_device_info_init(struct kfd_dev *kfd,
 
                kfd_device_info_set_event_interrupt_class(kfd);
 
-               /* Raven */
-               if (gc_version == IP_VERSION(9, 1, 0) ||
-                   gc_version == IP_VERSION(9, 2, 2))
-                       kfd->device_info.needs_iommu_device = true;
-
                if (gc_version < IP_VERSION(11, 0, 0)) {
                        /* Navi2x+, Navi1x+ */
                        if (gc_version == IP_VERSION(10, 3, 6))
@@ -233,10 +228,6 @@ static void kfd_device_info_init(struct kfd_dev *kfd,
                    asic_type != CHIP_TONGA)
                        kfd->device_info.supports_cwsr = true;
 
-               if (asic_type == CHIP_KAVERI ||
-                   asic_type == CHIP_CARRIZO)
-                       kfd->device_info.needs_iommu_device = true;
-
                if (asic_type != CHIP_HAWAII && !vf)
                        kfd->device_info.needs_pci_atomics = true;
        }
@@ -249,7 +240,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
        uint32_t gfx_target_version = 0;
 
        switch (adev->asic_type) {
-#ifdef KFD_SUPPORT_IOMMU_V2
 #ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_KAVERI:
                gfx_target_version = 70000;
@@ -262,7 +252,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
                if (!vf)
                        f2g = &gfx_v8_kfd2kgd;
                break;
-#endif
 #ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_HAWAII:
                gfx_target_version = 70001;
@@ -298,7 +287,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
                        gfx_target_version = 90000;
                        f2g = &gfx_v9_kfd2kgd;
                        break;
-#ifdef KFD_SUPPORT_IOMMU_V2
                /* Raven */
                case IP_VERSION(9, 1, 0):
                case IP_VERSION(9, 2, 2):
@@ -306,7 +294,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
                        if (!vf)
                                f2g = &gfx_v9_kfd2kgd;
                        break;
-#endif
                /* Vega12 */
                case IP_VERSION(9, 2, 1):
                        gfx_target_version = 90004;
index 2df1538..01192f5 100644 (file)
@@ -2538,18 +2538,12 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
        }
 
        switch (dev->adev->asic_type) {
-       case CHIP_CARRIZO:
-               device_queue_manager_init_vi(&dqm->asic_ops);
-               break;
-
        case CHIP_KAVERI:
-               device_queue_manager_init_cik(&dqm->asic_ops);
-               break;
-
        case CHIP_HAWAII:
                device_queue_manager_init_cik_hawaii(&dqm->asic_ops);
                break;
 
+       case CHIP_CARRIZO:
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
index 0fa739f..e5554a3 100644 (file)
@@ -1638,9 +1638,8 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
                }
                break;
        }
-       if (init_data.flags.gpu_vm_support &&
-           (amdgpu_sg_display == 0))
-               init_data.flags.gpu_vm_support = false;
+       if (init_data.flags.gpu_vm_support)
+               init_data.flags.gpu_vm_support = amdgpu_sg_display_supported(adev);
 
        if (init_data.flags.gpu_vm_support)
                adev->mode_info.gpu_vm_support = true;
index 9bc86de..b885c39 100644 (file)
@@ -1320,7 +1320,7 @@ int compute_mst_dsc_configs_for_state(struct drm_atomic_state *state,
                if (computed_streams[i])
                        continue;
 
-               if (!res_pool->funcs->remove_stream_from_ctx ||
+               if (res_pool->funcs->remove_stream_from_ctx &&
                    res_pool->funcs->remove_stream_from_ctx(stream->ctx->dc, dc_state, stream) != DC_OK)
                        return -EINVAL;
 
index 20d4d08..6966420 100644 (file)
@@ -777,7 +777,8 @@ void dce110_edp_wait_for_hpd_ready(
        dal_gpio_destroy_irq(&hpd);
 
        /* ensure that the panel is detected */
-       ASSERT(edp_hpd_high);
+       if (!edp_hpd_high)
+               DC_LOG_DC("%s: wait timed out!\n", __func__);
 }
 
 void dce110_edp_power_control(
index e5b7ef7..50dc834 100644 (file)
@@ -357,8 +357,11 @@ void dpp3_set_cursor_attributes(
        int cur_rom_en = 0;
 
        if (color_format == CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA ||
-               color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA)
-               cur_rom_en = 1;
+               color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA) {
+               if (cursor_attributes->attribute_flags.bits.ENABLE_CURSOR_DEGAMMA) {
+                       cur_rom_en = 1;
+               }
+       }
 
        REG_UPDATE_3(CURSOR0_CONTROL,
                        CUR0_MODE, color_format,
index ce41a83..222af2f 100644 (file)
@@ -1581,9 +1581,9 @@ static int smu_disable_dpms(struct smu_context *smu)
 
        /*
         * For SMU 13.0.4/11, PMFW will handle the features disablement properly
-        * for gpu reset case. Driver involvement is unnecessary.
+        * for gpu reset and S0i3 cases. Driver involvement is unnecessary.
         */
-       if (amdgpu_in_reset(adev)) {
+       if (amdgpu_in_reset(adev) || adev->in_s0ix) {
                switch (adev->ip_versions[MP1_HWIP][0]) {
                case IP_VERSION(13, 0, 4):
                case IP_VERSION(13, 0, 11):
index 3d18861..fddcd83 100644 (file)
@@ -331,11 +331,13 @@ static int smu_v13_0_0_check_powerplay_table(struct smu_context *smu)
        struct smu_13_0_0_powerplay_table *powerplay_table =
                table_context->power_play_table;
        struct smu_baco_context *smu_baco = &smu->smu_baco;
+#if 0
        PPTable_t *pptable = smu->smu_table.driver_pptable;
        const OverDriveLimits_t * const overdrive_upperlimits =
                                &pptable->SkuTable.OverDriveLimitsBasicMax;
        const OverDriveLimits_t * const overdrive_lowerlimits =
                                &pptable->SkuTable.OverDriveLimitsMin;
+#endif
 
        if (powerplay_table->platform_caps & SMU_13_0_0_PP_PLATFORM_CAP_HARDWAREDC)
                smu->dc_controlled_by_gpio = true;
@@ -347,18 +349,27 @@ static int smu_v13_0_0_check_powerplay_table(struct smu_context *smu)
        if (powerplay_table->platform_caps & SMU_13_0_0_PP_PLATFORM_CAP_MACO)
                smu_baco->maco_support = true;
 
+       /*
+        * We are in the transition to a new OD mechanism.
+        * Disable the OD feature support for SMU13 temporarily.
+        * TODO: get this reverted when new OD mechanism online
+        */
+#if 0
        if (!overdrive_lowerlimits->FeatureCtrlMask ||
            !overdrive_upperlimits->FeatureCtrlMask)
                smu->od_enabled = false;
 
-       table_context->thermal_controller_type =
-               powerplay_table->thermal_controller_type;
-
        /*
         * Instead of having its own buffer space and get overdrive_table copied,
         * smu->od_settings just points to the actual overdrive_table
         */
        smu->od_settings = &powerplay_table->overdrive_table;
+#else
+       smu->od_enabled = false;
+#endif
+
+       table_context->thermal_controller_type =
+               powerplay_table->thermal_controller_type;
 
        return 0;
 }
@@ -1140,7 +1151,6 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu,
                (OverDriveTableExternal_t *)smu->smu_table.overdrive_table;
        struct smu_13_0_dpm_table *single_dpm_table;
        struct smu_13_0_pcie_table *pcie_table;
-       const int link_width[] = {0, 1, 2, 4, 8, 12, 16};
        uint32_t gen_speed, lane_width;
        int i, curr_freq, size = 0;
        int32_t min_value, max_value;
@@ -1256,7 +1266,7 @@ static int smu_v13_0_0_print_clk_levels(struct smu_context *smu,
                                        (pcie_table->pcie_lane[i] == 6) ? "x16" : "",
                                        pcie_table->clk_freq[i],
                                        (gen_speed == DECODE_GEN_SPEED(pcie_table->pcie_gen[i])) &&
-                                       (lane_width == DECODE_LANE_WIDTH(link_width[pcie_table->pcie_lane[i]])) ?
+                                       (lane_width == DECODE_LANE_WIDTH(pcie_table->pcie_lane[i])) ?
                                        "*" : "");
                break;
 
index 1ac5521..fe4ee2d 100644 (file)
@@ -1993,9 +1993,8 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
 
        gpu_metrics->average_socket_power =
                SMUQ10_TO_UINT(metrics->SocketPower);
-       /* Energy is reported in 15.625mJ units */
-       gpu_metrics->energy_accumulator =
-               SMUQ10_TO_UINT(metrics->SocketEnergyAcc);
+       /* Energy counter reported in 15.259uJ (2^-16) units */
+       gpu_metrics->energy_accumulator = metrics->SocketEnergyAcc;
 
        gpu_metrics->current_gfxclk =
                SMUQ10_TO_UINT(metrics->GfxclkFrequency[xcc0]);
index b1f0937..62f2886 100644 (file)
@@ -323,10 +323,12 @@ static int smu_v13_0_7_check_powerplay_table(struct smu_context *smu)
        struct smu_baco_context *smu_baco = &smu->smu_baco;
        PPTable_t *smc_pptable = table_context->driver_pptable;
        BoardTable_t *BoardTable = &smc_pptable->BoardTable;
+#if 0
        const OverDriveLimits_t * const overdrive_upperlimits =
                                &smc_pptable->SkuTable.OverDriveLimitsBasicMax;
        const OverDriveLimits_t * const overdrive_lowerlimits =
                                &smc_pptable->SkuTable.OverDriveLimitsMin;
+#endif
 
        if (powerplay_table->platform_caps & SMU_13_0_7_PP_PLATFORM_CAP_HARDWAREDC)
                smu->dc_controlled_by_gpio = true;
@@ -338,18 +340,22 @@ static int smu_v13_0_7_check_powerplay_table(struct smu_context *smu)
        if (smu_baco->platform_support && (BoardTable->HsrEnabled || BoardTable->VddqOffEnabled))
                smu_baco->maco_support = true;
 
+#if 0
        if (!overdrive_lowerlimits->FeatureCtrlMask ||
            !overdrive_upperlimits->FeatureCtrlMask)
                smu->od_enabled = false;
 
-       table_context->thermal_controller_type =
-               powerplay_table->thermal_controller_type;
-
        /*
         * Instead of having its own buffer space and get overdrive_table copied,
         * smu->od_settings just points to the actual overdrive_table
         */
        smu->od_settings = &powerplay_table->overdrive_table;
+#else
+       smu->od_enabled = false;
+#endif
+
+       table_context->thermal_controller_type =
+               powerplay_table->thermal_controller_type;
 
        return 0;
 }
index 504d51c..aadb396 100644 (file)
@@ -2517,9 +2517,11 @@ static irqreturn_t it6505_int_threaded_handler(int unused, void *data)
        };
        int int_status[3], i;
 
-       if (it6505->enable_drv_hold || pm_runtime_get_if_in_use(dev) <= 0)
+       if (it6505->enable_drv_hold || !it6505->powered)
                return IRQ_HANDLED;
 
+       pm_runtime_get_sync(dev);
+
        int_status[0] = it6505_read(it6505, INT_STATUS_01);
        int_status[1] = it6505_read(it6505, INT_STATUS_02);
        int_status[2] = it6505_read(it6505, INT_STATUS_03);
index 5163e52..9663601 100644 (file)
@@ -774,9 +774,7 @@ static struct mipi_dsi_device *lt9611_attach_dsi(struct lt9611 *lt9611,
        dsi->lanes = 4;
        dsi->format = MIPI_DSI_FMT_RGB888;
        dsi->mode_flags = MIPI_DSI_MODE_VIDEO | MIPI_DSI_MODE_VIDEO_SYNC_PULSE |
-                         MIPI_DSI_MODE_VIDEO_HSE | MIPI_DSI_MODE_VIDEO_NO_HSA |
-                         MIPI_DSI_MODE_VIDEO_NO_HFP | MIPI_DSI_MODE_VIDEO_NO_HBP |
-                         MIPI_DSI_MODE_NO_EOT_PACKET;
+                         MIPI_DSI_MODE_VIDEO_HSE;
 
        ret = devm_mipi_dsi_attach(dev, dsi);
        if (ret < 0) {
index 4ea6507..baaf0e0 100644 (file)
@@ -623,7 +623,13 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct
        int ret;
 
        if (obj->import_attach) {
+               /* Reset both vm_ops and vm_private_data, so we don't end up with
+                * vm_ops pointing to our implementation if the dma-buf backend
+                * doesn't set those fields.
+                */
                vma->vm_private_data = NULL;
+               vma->vm_ops = NULL;
+
                ret = dma_buf_mmap(obj->dma_buf, vma, 0);
 
                /* Drop the reference drm_gem_mmap_obj() acquired.*/
index f75c6f0..a2e0033 100644 (file)
@@ -967,7 +967,7 @@ nouveau_connector_get_modes(struct drm_connector *connector)
        /* Determine display colour depth for everything except LVDS now,
         * DP requires this before mode_valid() is called.
         */
-       if (connector->connector_type != DRM_MODE_CONNECTOR_LVDS && nv_connector->native_mode)
+       if (connector->connector_type != DRM_MODE_CONNECTOR_LVDS)
                nouveau_connector_detect_depth(connector);
 
        /* Find the native mode if this is a digital panel, if we didn't
index 40c8ea4..b8ac66b 100644 (file)
@@ -26,6 +26,8 @@
 #include "head.h"
 #include "ior.h"
 
+#include <drm/display/drm_dp.h>
+
 #include <subdev/bios.h>
 #include <subdev/bios/init.h>
 #include <subdev/gpio.h>
@@ -634,6 +636,50 @@ nvkm_dp_enable_supported_link_rates(struct nvkm_outp *outp)
        return outp->dp.rates != 0;
 }
 
+/* XXX: This is a big fat hack, and this is just drm_dp_read_dpcd_caps()
+ * converted to work inside nvkm. This is a temporary holdover until we start
+ * passing the drm_dp_aux device through NVKM
+ */
+static int
+nvkm_dp_read_dpcd_caps(struct nvkm_outp *outp)
+{
+       struct nvkm_i2c_aux *aux = outp->dp.aux;
+       u8 dpcd_ext[DP_RECEIVER_CAP_SIZE];
+       int ret;
+
+       ret = nvkm_rdaux(aux, DPCD_RC00_DPCD_REV, outp->dp.dpcd, DP_RECEIVER_CAP_SIZE);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Prior to DP1.3 the bit represented by
+        * DP_EXTENDED_RECEIVER_CAP_FIELD_PRESENT was reserved.
+        * If it is set DP_DPCD_REV at 0000h could be at a value less than
+        * the true capability of the panel. The only way to check is to
+        * then compare 0000h and 2200h.
+        */
+       if (!(outp->dp.dpcd[DP_TRAINING_AUX_RD_INTERVAL] &
+             DP_EXTENDED_RECEIVER_CAP_FIELD_PRESENT))
+               return 0;
+
+       ret = nvkm_rdaux(aux, DP_DP13_DPCD_REV, dpcd_ext, sizeof(dpcd_ext));
+       if (ret < 0)
+               return ret;
+
+       if (outp->dp.dpcd[DP_DPCD_REV] > dpcd_ext[DP_DPCD_REV]) {
+               OUTP_DBG(outp, "Extended DPCD rev less than base DPCD rev (%d > %d)\n",
+                        outp->dp.dpcd[DP_DPCD_REV], dpcd_ext[DP_DPCD_REV]);
+               return 0;
+       }
+
+       if (!memcmp(outp->dp.dpcd, dpcd_ext, sizeof(dpcd_ext)))
+               return 0;
+
+       memcpy(outp->dp.dpcd, dpcd_ext, sizeof(dpcd_ext));
+
+       return 0;
+}
+
 void
 nvkm_dp_enable(struct nvkm_outp *outp, bool auxpwr)
 {
@@ -689,7 +735,7 @@ nvkm_dp_enable(struct nvkm_outp *outp, bool auxpwr)
                        memset(outp->dp.lttpr, 0x00, sizeof(outp->dp.lttpr));
                }
 
-               if (!nvkm_rdaux(aux, DPCD_RC00_DPCD_REV, outp->dp.dpcd, sizeof(outp->dp.dpcd))) {
+               if (!nvkm_dp_read_dpcd_caps(outp)) {
                        const u8 rates[] = { 0x1e, 0x14, 0x0a, 0x06, 0 };
                        const u8 *rate;
                        int rate_max;
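For context, the drm helper named in the XXX comment does exist in the DP helper library; once nvkm is handed a drm_dp_aux, the local copy above could presumably be replaced by a call along these lines (aux here is a hypothetical struct drm_dp_aux *, sketch only):

    u8 dpcd[DP_RECEIVER_CAP_SIZE];
    int ret;

    /* Reads DPCD 0000h, then the 2200h extended caps when advertised,
     * mirroring the logic open-coded in nvkm_dp_read_dpcd_caps() above.
     */
    ret = drm_dp_read_dpcd_caps(aux, dpcd);
    if (ret < 0)
            return ret;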
index 00dbeda..de161e7 100644 (file)
@@ -117,6 +117,7 @@ void gk104_grctx_generate_r418800(struct gf100_gr *);
 
 extern const struct gf100_grctx_func gk110_grctx;
 void gk110_grctx_generate_r419eb0(struct gf100_gr *);
+void gk110_grctx_generate_r419f78(struct gf100_gr *);
 
 extern const struct gf100_grctx_func gk110b_grctx;
 extern const struct gf100_grctx_func gk208_grctx;
index 94233d0..52a234b 100644 (file)
@@ -906,7 +906,9 @@ static void
 gk104_grctx_generate_r419f78(struct gf100_gr *gr)
 {
        struct nvkm_device *device = gr->base.engine.subdev.device;
-       nvkm_mask(device, 0x419f78, 0x00000001, 0x00000000);
+
+       /* bit 3 set disables loads in fp helper invocations, we need it enabled */
+       nvkm_mask(device, 0x419f78, 0x00000009, 0x00000000);
 }
 
 void
index 4391458..3acdd9e 100644 (file)
@@ -820,6 +820,15 @@ gk110_grctx_generate_r419eb0(struct gf100_gr *gr)
        nvkm_mask(device, 0x419eb0, 0x00001000, 0x00001000);
 }
 
+void
+gk110_grctx_generate_r419f78(struct gf100_gr *gr)
+{
+       struct nvkm_device *device = gr->base.engine.subdev.device;
+
+       /* bit 3 set disables loads in fp helper invocations, we need it enabled */
+       nvkm_mask(device, 0x419f78, 0x00000008, 0x00000000);
+}
+
 const struct gf100_grctx_func
 gk110_grctx = {
        .main  = gf100_grctx_generate_main,
@@ -854,4 +863,5 @@ gk110_grctx = {
        .gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
        .r418800 = gk104_grctx_generate_r418800,
        .r419eb0 = gk110_grctx_generate_r419eb0,
+       .r419f78 = gk110_grctx_generate_r419f78,
 };
index 7b9a34f..5597e87 100644 (file)
@@ -103,4 +103,5 @@ gk110b_grctx = {
        .gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
        .r418800 = gk104_grctx_generate_r418800,
        .r419eb0 = gk110_grctx_generate_r419eb0,
+       .r419f78 = gk110_grctx_generate_r419f78,
 };
index c78d07a..6126564 100644 (file)
@@ -568,4 +568,5 @@ gk208_grctx = {
        .dist_skip_table = gf117_grctx_generate_dist_skip_table,
        .gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
        .r418800 = gk104_grctx_generate_r418800,
+       .r419f78 = gk110_grctx_generate_r419f78,
 };
index beac66e..9906974 100644 (file)
@@ -988,4 +988,5 @@ gm107_grctx = {
        .r406500 = gm107_grctx_generate_r406500,
        .gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
        .r419e00 = gm107_grctx_generate_r419e00,
+       .r419f78 = gk110_grctx_generate_r419f78,
 };
index 3b6c810..a7775aa 100644 (file)
@@ -206,19 +206,6 @@ tu102_gr_av_to_init_veid(struct nvkm_blob *blob, struct gf100_gr_pack **ppack)
        return gk20a_gr_av_to_init_(blob, 64, 0x00100000, ppack);
 }
 
-int
-tu102_gr_load(struct gf100_gr *gr, int ver, const struct gf100_gr_fwif *fwif)
-{
-       int ret;
-
-       ret = gm200_gr_load(gr, ver, fwif);
-       if (ret)
-               return ret;
-
-       return gk20a_gr_load_net(gr, "gr/", "sw_veid_bundle_init", ver, tu102_gr_av_to_init_veid,
-                                &gr->bundle_veid);
-}
-
 static const struct gf100_gr_fwif
 tu102_gr_fwif[] = {
        {  0, gm200_gr_load, &tu102_gr, &gp108_gr_fecs_acr, &gp108_gr_gpccs_acr },
index a530ecc..bf34498 100644 (file)
@@ -833,12 +833,12 @@ static int vop_plane_atomic_check(struct drm_plane *plane,
         * need align with 2 pixel.
         */
        if (fb->format->is_yuv && ((new_plane_state->src.x1 >> 16) % 2)) {
-               DRM_ERROR("Invalid Source: Yuv format not support odd xpos\n");
+               DRM_DEBUG_KMS("Invalid Source: Yuv format not support odd xpos\n");
                return -EINVAL;
        }
 
        if (fb->format->is_yuv && new_plane_state->rotation & DRM_MODE_REFLECT_Y) {
-               DRM_ERROR("Invalid Source: Yuv format does not support this rotation\n");
+               DRM_DEBUG_KMS("Invalid Source: Yuv format does not support this rotation\n");
                return -EINVAL;
        }
 
@@ -846,7 +846,7 @@ static int vop_plane_atomic_check(struct drm_plane *plane,
                struct vop *vop = to_vop(crtc);
 
                if (!vop->data->afbc) {
-                       DRM_ERROR("vop does not support AFBC\n");
+                       DRM_DEBUG_KMS("vop does not support AFBC\n");
                        return -EINVAL;
                }
 
@@ -855,15 +855,16 @@ static int vop_plane_atomic_check(struct drm_plane *plane,
                        return ret;
 
                if (new_plane_state->src.x1 || new_plane_state->src.y1) {
-                       DRM_ERROR("AFBC does not support offset display, xpos=%d, ypos=%d, offset=%d\n",
-                                 new_plane_state->src.x1,
-                                 new_plane_state->src.y1, fb->offsets[0]);
+                       DRM_DEBUG_KMS("AFBC does not support offset display, " \
+                                     "xpos=%d, ypos=%d, offset=%d\n",
+                                     new_plane_state->src.x1, new_plane_state->src.y1,
+                                     fb->offsets[0]);
                        return -EINVAL;
                }
 
                if (new_plane_state->rotation && new_plane_state->rotation != DRM_MODE_ROTATE_0) {
-                       DRM_ERROR("No rotation support in AFBC, rotation=%d\n",
-                                 new_plane_state->rotation);
+                       DRM_DEBUG_KMS("No rotation support in AFBC, rotation=%d\n",
+                                     new_plane_state->rotation);
                        return -EINVAL;
                }
        }
index a997dbc..0238078 100644 (file)
 
 #include <linux/crc16.h>
 #include <linux/debugfs.h>
+#include <linux/delay.h>
 #include <linux/hid.h>
 #include <linux/hwmon.h>
 #include <linux/jiffies.h>
+#include <linux/ktime.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
@@ -63,6 +65,8 @@ static const char *const aqc_device_names[] = {
 #define CTRL_REPORT_ID                 0x03
 #define AQUAERO_CTRL_REPORT_ID         0x0b
 
+#define CTRL_REPORT_DELAY              200     /* ms */
+
 /* The HID report that the official software always sends
  * after writing values, currently same for all devices
  */
@@ -527,6 +531,9 @@ struct aqc_data {
        int secondary_ctrl_report_size;
        u8 *secondary_ctrl_report;
 
+       ktime_t last_ctrl_report_op;
+       int ctrl_report_delay;  /* Delay between two ctrl report operations, in ms */
+
        int buffer_size;
        u8 *buffer;
        int checksum_start;
@@ -611,17 +618,35 @@ static int aqc_aquastreamxt_convert_fan_rpm(u16 val)
        return 0;
 }
 
+static void aqc_delay_ctrl_report(struct aqc_data *priv)
+{
+       /*
+        * If previous read or write is too close to this one, delay the current operation
+        * to give the device enough time to process the previous one.
+        */
+       if (priv->ctrl_report_delay) {
+               s64 delta = ktime_ms_delta(ktime_get(), priv->last_ctrl_report_op);
+
+               if (delta < priv->ctrl_report_delay)
+                       msleep(priv->ctrl_report_delay - delta);
+       }
+}
+
 /* Expects the mutex to be locked */
 static int aqc_get_ctrl_data(struct aqc_data *priv)
 {
        int ret;
 
+       aqc_delay_ctrl_report(priv);
+
        memset(priv->buffer, 0x00, priv->buffer_size);
        ret = hid_hw_raw_request(priv->hdev, priv->ctrl_report_id, priv->buffer, priv->buffer_size,
                                 HID_FEATURE_REPORT, HID_REQ_GET_REPORT);
        if (ret < 0)
                ret = -ENODATA;
 
+       priv->last_ctrl_report_op = ktime_get();
+
        return ret;
 }
 
@@ -631,6 +656,8 @@ static int aqc_send_ctrl_data(struct aqc_data *priv)
        int ret;
        u16 checksum;
 
+       aqc_delay_ctrl_report(priv);
+
        /* Checksum is not needed for Aquaero */
        if (priv->kind != aquaero) {
                /* Init and xorout value for CRC-16/USB is 0xffff */
@@ -646,12 +673,16 @@ static int aqc_send_ctrl_data(struct aqc_data *priv)
        ret = hid_hw_raw_request(priv->hdev, priv->ctrl_report_id, priv->buffer, priv->buffer_size,
                                 HID_FEATURE_REPORT, HID_REQ_SET_REPORT);
        if (ret < 0)
-               return ret;
+               goto record_access_and_ret;
 
        /* The official software sends this report after every change, so do it here as well */
        ret = hid_hw_raw_request(priv->hdev, priv->secondary_ctrl_report_id,
                                 priv->secondary_ctrl_report, priv->secondary_ctrl_report_size,
                                 HID_FEATURE_REPORT, HID_REQ_SET_REPORT);
+
+record_access_and_ret:
+       priv->last_ctrl_report_op = ktime_get();
+
        return ret;
 }
 
@@ -1524,6 +1555,7 @@ static int aqc_probe(struct hid_device *hdev, const struct hid_device_id *id)
 
                priv->buffer_size = AQUAERO_CTRL_REPORT_SIZE;
                priv->temp_ctrl_offset = AQUAERO_TEMP_CTRL_OFFSET;
+               priv->ctrl_report_delay = CTRL_REPORT_DELAY;
 
                priv->temp_label = label_temp_sensors;
                priv->virtual_temp_label = label_virtual_temp_sensors;
@@ -1547,6 +1579,7 @@ static int aqc_probe(struct hid_device *hdev, const struct hid_device_id *id)
                priv->temp_ctrl_offset = D5NEXT_TEMP_CTRL_OFFSET;
 
                priv->buffer_size = D5NEXT_CTRL_REPORT_SIZE;
+               priv->ctrl_report_delay = CTRL_REPORT_DELAY;
 
                priv->power_cycle_count_offset = D5NEXT_POWER_CYCLES;
 
@@ -1597,6 +1630,7 @@ static int aqc_probe(struct hid_device *hdev, const struct hid_device_id *id)
                priv->temp_ctrl_offset = OCTO_TEMP_CTRL_OFFSET;
 
                priv->buffer_size = OCTO_CTRL_REPORT_SIZE;
+               priv->ctrl_report_delay = CTRL_REPORT_DELAY;
 
                priv->power_cycle_count_offset = OCTO_POWER_CYCLES;
 
@@ -1624,6 +1658,7 @@ static int aqc_probe(struct hid_device *hdev, const struct hid_device_id *id)
                priv->temp_ctrl_offset = QUADRO_TEMP_CTRL_OFFSET;
 
                priv->buffer_size = QUADRO_CTRL_REPORT_SIZE;
+               priv->ctrl_report_delay = CTRL_REPORT_DELAY;
 
                priv->flow_pulses_ctrl_offset = QUADRO_FLOW_PULSES_CTRL_OFFSET;
                priv->power_cycle_count_offset = QUADRO_POWER_CYCLES;
index fa5070a..7c5f4b1 100644 (file)
 enum chips {pfe1100, pfe3000};
 
 /*
- * Disable status check for pfe3000 devices, because some devices report
- * communication error (invalid command) for VOUT_MODE command (0x20)
- * although correct VOUT_MODE (0x16) is returned: it leads to incorrect
- * exponent in linear mode.
+ * Disable status check because some devices report communication error
+ * (invalid command) for VOUT_MODE command (0x20) although the correct
+ * VOUT_MODE (0x16) is returned: it leads to incorrect exponent in linear
+ * mode.
+ * This affects both pfe3000 and pfe1100.
  */
-static struct pmbus_platform_data pfe3000_plat_data = {
+static struct pmbus_platform_data pfe_plat_data = {
        .flags = PMBUS_SKIP_STATUS_CHECK,
 };
 
@@ -94,16 +95,15 @@ static int pfe_pmbus_probe(struct i2c_client *client)
        int model;
 
        model = (int)i2c_match_id(pfe_device_id, client)->driver_data;
+       client->dev.platform_data = &pfe_plat_data;
 
        /*
         * PFE3000-12-069RA devices may not stay in page 0 during device
         * probe which leads to probe failure (read status word failed).
         * So let's set the device to page 0 at the beginning.
         */
-       if (model == pfe3000) {
-               client->dev.platform_data = &pfe3000_plat_data;
+       if (model == pfe3000)
                i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0);
-       }
 
        return pmbus_do_probe(client, &pfe_driver_info[model]);
 }
index fa09d51..baf3125 100644 (file)
@@ -247,7 +247,7 @@ extern void dsp_cmx_hardware(struct dsp_conf *conf, struct dsp *dsp);
 extern int dsp_cmx_conf(struct dsp *dsp, u32 conf_id);
 extern void dsp_cmx_receive(struct dsp *dsp, struct sk_buff *skb);
 extern void dsp_cmx_hdlc(struct dsp *dsp, struct sk_buff *skb);
-extern void dsp_cmx_send(void *arg);
+extern void dsp_cmx_send(struct timer_list *arg);
 extern void dsp_cmx_transmit(struct dsp *dsp, struct sk_buff *skb);
 extern int dsp_cmx_del_conf_member(struct dsp *dsp);
 extern int dsp_cmx_del_conf(struct dsp_conf *conf);
index 357b875..61cb45c 100644 (file)
@@ -1614,7 +1614,7 @@ static u16        dsp_count; /* last sample count */
 static int     dsp_count_valid; /* if we have last sample count */
 
 void
-dsp_cmx_send(void *arg)
+dsp_cmx_send(struct timer_list *arg)
 {
        struct dsp_conf *conf;
        struct dsp_conf_member *member;
index 3860845..fae95f1 100644 (file)
@@ -1195,7 +1195,7 @@ static int __init dsp_init(void)
        }
 
        /* set sample timer */
-       timer_setup(&dsp_spl_tl, (void *)dsp_cmx_send, 0);
+       timer_setup(&dsp_spl_tl, dsp_cmx_send, 0);
        dsp_spl_tl.expires = jiffies + dsp_tics;
        dsp_spl_jiffies = dsp_spl_tl.expires;
        add_timer(&dsp_spl_tl);
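The signature change above brings dsp_cmx_send() in line with the timer API, which hands callbacks a struct timer_list * and expects no function-pointer casts at timer_setup() time. A minimal sketch of that convention, with made-up names (the mISDN timer happens to be a global, so it does not need the container lookup shown here):

    #include <linux/timer.h>

    struct my_state {
            struct timer_list tl;
            unsigned long ticks;
    };

    static void my_timer_fn(struct timer_list *t)
    {
            struct my_state *s = from_timer(s, t, tl);  /* recover the container */

            s->ticks++;
            mod_timer(&s->tl, jiffies + HZ);            /* re-arm if periodic */
    }

    /* setup: timer_setup(&s->tl, my_timer_fn, 0); */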
index 7f0802a..3418d2d 100644 (file)
@@ -251,8 +251,8 @@ int pkt_session_unset_buffers(struct hfi_session_release_buffer_pkt *pkt,
 
                pkt->extradata_size = 0;
                pkt->shdr.hdr.size =
-                       struct_size((struct hfi_session_set_buffers_pkt *)0,
-                                   buffer_info, bd->num_buffers);
+                       struct_size_t(struct hfi_session_set_buffers_pkt,
+                                     buffer_info, bd->num_buffers);
        }
 
        pkt->response_req = bd->response_required;
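struct_size_t() used above is the type-based variant of struct_size(): it computes the size of a structure with a flexible array member without needing an object pointer, which is what the old (struct hfi_session_set_buffers_pkt *)0 cast was faking. A sketch with a made-up struct:

    #include <linux/overflow.h>

    struct pkt {
            u32 hdr;
            u32 buffer_info[];              /* flexible array member */
    };

    /* Both evaluate to sizeof(struct pkt) + n * sizeof(u32), with overflow
     * checking; the second form needs no instance.
     */
    size_t from_obj  = struct_size(p, buffer_info, n);
    size_t from_type = struct_size_t(struct pkt, buffer_info, n);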
index 2d002c8..d0d6ffc 100644 (file)
@@ -338,13 +338,7 @@ static void moxart_transfer_pio(struct moxart_host *host)
                                return;
                        }
                        for (len = 0; len < remain && len < host->fifo_width;) {
-                               /* SCR data must be read in big endian. */
-                               if (data->mrq->cmd->opcode == SD_APP_SEND_SCR)
-                                       *sgp = ioread32be(host->base +
-                                                         REG_DATA_WINDOW);
-                               else
-                                       *sgp = ioread32(host->base +
-                                                       REG_DATA_WINDOW);
+                               *sgp = ioread32(host->base + REG_DATA_WINDOW);
                                sgp++;
                                len += 4;
                        }
index a202a69..b01ffb4 100644 (file)
@@ -29,9 +29,16 @@ struct f_sdhost_priv {
        bool enable_cmd_dat_delay;
 };
 
+static void *sdhci_f_sdhost_priv(struct sdhci_host *host)
+{
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+
+       return sdhci_pltfm_priv(pltfm_host);
+}
+
 static void sdhci_f_sdh30_soft_voltage_switch(struct sdhci_host *host)
 {
-       struct f_sdhost_priv *priv = sdhci_priv(host);
+       struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host);
        u32 ctrl = 0;
 
        usleep_range(2500, 3000);
@@ -64,7 +71,7 @@ static unsigned int sdhci_f_sdh30_get_min_clock(struct sdhci_host *host)
 
 static void sdhci_f_sdh30_reset(struct sdhci_host *host, u8 mask)
 {
-       struct f_sdhost_priv *priv = sdhci_priv(host);
+       struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host);
        u32 ctl;
 
        if (sdhci_readw(host, SDHCI_CLOCK_CONTROL) == 0)
@@ -95,30 +102,32 @@ static const struct sdhci_ops sdhci_f_sdh30_ops = {
        .set_uhs_signaling = sdhci_set_uhs_signaling,
 };
 
+static const struct sdhci_pltfm_data sdhci_f_sdh30_pltfm_data = {
+       .ops = &sdhci_f_sdh30_ops,
+       .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC
+               | SDHCI_QUIRK_INVERTED_WRITE_PROTECT,
+       .quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE
+               |  SDHCI_QUIRK2_TUNING_WORK_AROUND,
+};
+
 static int sdhci_f_sdh30_probe(struct platform_device *pdev)
 {
        struct sdhci_host *host;
        struct device *dev = &pdev->dev;
-       int irq, ctrl = 0, ret = 0;
+       int ctrl = 0, ret = 0;
        struct f_sdhost_priv *priv;
+       struct sdhci_pltfm_host *pltfm_host;
        u32 reg = 0;
 
-       irq = platform_get_irq(pdev, 0);
-       if (irq < 0)
-               return irq;
-
-       host = sdhci_alloc_host(dev, sizeof(struct f_sdhost_priv));
+       host = sdhci_pltfm_init(pdev, &sdhci_f_sdh30_pltfm_data,
+                               sizeof(struct f_sdhost_priv));
        if (IS_ERR(host))
                return PTR_ERR(host);
 
-       priv = sdhci_priv(host);
+       pltfm_host = sdhci_priv(host);
+       priv = sdhci_pltfm_priv(pltfm_host);
        priv->dev = dev;
 
-       host->quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC |
-                      SDHCI_QUIRK_INVERTED_WRITE_PROTECT;
-       host->quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE |
-                       SDHCI_QUIRK2_TUNING_WORK_AROUND;
-
        priv->enable_cmd_dat_delay = device_property_read_bool(dev,
                                                "fujitsu,cmd-dat-delay-select");
 
@@ -126,18 +135,6 @@ static int sdhci_f_sdh30_probe(struct platform_device *pdev)
        if (ret)
                goto err;
 
-       platform_set_drvdata(pdev, host);
-
-       host->hw_name = "f_sdh30";
-       host->ops = &sdhci_f_sdh30_ops;
-       host->irq = irq;
-
-       host->ioaddr = devm_platform_ioremap_resource(pdev, 0);
-       if (IS_ERR(host->ioaddr)) {
-               ret = PTR_ERR(host->ioaddr);
-               goto err;
-       }
-
        if (dev_of_node(dev)) {
                sdhci_get_of_property(pdev);
 
@@ -204,24 +201,21 @@ err_rst:
 err_clk:
        clk_disable_unprepare(priv->clk_iface);
 err:
-       sdhci_free_host(host);
+       sdhci_pltfm_free(pdev);
+
        return ret;
 }
 
 static int sdhci_f_sdh30_remove(struct platform_device *pdev)
 {
        struct sdhci_host *host = platform_get_drvdata(pdev);
-       struct f_sdhost_priv *priv = sdhci_priv(host);
-
-       sdhci_remove_host(host, readl(host->ioaddr + SDHCI_INT_STATUS) ==
-                         0xffffffff);
+       struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host);
 
        reset_control_assert(priv->rst);
        clk_disable_unprepare(priv->clk);
        clk_disable_unprepare(priv->clk_iface);
 
-       sdhci_free_host(host);
-       platform_set_drvdata(pdev, NULL);
+       sdhci_pltfm_unregister(pdev);
 
        return 0;
 }
index 484c9e3..447b06e 100644 (file)
@@ -5901,7 +5901,9 @@ void bond_setup(struct net_device *bond_dev)
 
        bond_dev->hw_features = BOND_VLAN_FEATURES |
                                NETIF_F_HW_VLAN_CTAG_RX |
-                               NETIF_F_HW_VLAN_CTAG_FILTER;
+                               NETIF_F_HW_VLAN_CTAG_FILTER |
+                               NETIF_F_HW_VLAN_STAG_RX |
+                               NETIF_F_HW_VLAN_STAG_FILTER;
 
        bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
        bond_dev->features |= bond_dev->hw_features;
index 8da46d2..bef879c 100644 (file)
@@ -1625,8 +1625,10 @@ static void felix_teardown(struct dsa_switch *ds)
        struct felix *felix = ocelot_to_felix(ocelot);
        struct dsa_port *dp;
 
+       rtnl_lock();
        if (felix->tag_proto_ops)
                felix->tag_proto_ops->teardown(ds);
+       rtnl_unlock();
 
        dsa_switch_for_each_available_port(dp, ds)
                ocelot_deinit_port(ocelot, dp->index);
index 1416262..e0a4cb7 100644 (file)
@@ -1186,14 +1186,9 @@ static int enetc_init_port_rss_memory(struct enetc_si *si)
 
 static int enetc_pf_register_with_ierb(struct pci_dev *pdev)
 {
-       struct device_node *node = pdev->dev.of_node;
        struct platform_device *ierb_pdev;
        struct device_node *ierb_node;
 
-       /* Don't register with the IERB if the PF itself is disabled */
-       if (!node || !of_device_is_available(node))
-               return 0;
-
        ierb_node = of_find_compatible_node(NULL, NULL,
                                            "fsl,ls1028a-enetc-ierb");
        if (!ierb_node || !of_device_is_available(ierb_node))
@@ -1208,56 +1203,81 @@ static int enetc_pf_register_with_ierb(struct pci_dev *pdev)
        return enetc_ierb_register_pf(ierb_pdev, pdev);
 }
 
-static int enetc_pf_probe(struct pci_dev *pdev,
-                         const struct pci_device_id *ent)
+static struct enetc_si *enetc_psi_create(struct pci_dev *pdev)
 {
-       struct device_node *node = pdev->dev.of_node;
-       struct enetc_ndev_priv *priv;
-       struct net_device *ndev;
        struct enetc_si *si;
-       struct enetc_pf *pf;
        int err;
 
-       err = enetc_pf_register_with_ierb(pdev);
-       if (err == -EPROBE_DEFER)
-               return err;
-       if (err)
-               dev_warn(&pdev->dev,
-                        "Could not register with IERB driver: %pe, please update the device tree\n",
-                        ERR_PTR(err));
-
-       err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(*pf));
-       if (err)
-               return dev_err_probe(&pdev->dev, err, "PCI probing failed\n");
+       err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(struct enetc_pf));
+       if (err) {
+               dev_err_probe(&pdev->dev, err, "PCI probing failed\n");
+               goto out;
+       }
 
        si = pci_get_drvdata(pdev);
        if (!si->hw.port || !si->hw.global) {
                err = -ENODEV;
                dev_err(&pdev->dev, "could not map PF space, probing a VF?\n");
-               goto err_map_pf_space;
+               goto out_pci_remove;
        }
 
        err = enetc_setup_cbdr(&pdev->dev, &si->hw, ENETC_CBDR_DEFAULT_SIZE,
                               &si->cbd_ring);
        if (err)
-               goto err_setup_cbdr;
+               goto out_pci_remove;
 
        err = enetc_init_port_rfs_memory(si);
        if (err) {
                dev_err(&pdev->dev, "Failed to initialize RFS memory\n");
-               goto err_init_port_rfs;
+               goto out_teardown_cbdr;
        }
 
        err = enetc_init_port_rss_memory(si);
        if (err) {
                dev_err(&pdev->dev, "Failed to initialize RSS memory\n");
-               goto err_init_port_rss;
+               goto out_teardown_cbdr;
        }
 
-       if (node && !of_device_is_available(node)) {
-               dev_info(&pdev->dev, "device is disabled, skipping\n");
-               err = -ENODEV;
-               goto err_device_disabled;
+       return si;
+
+out_teardown_cbdr:
+       enetc_teardown_cbdr(&si->cbd_ring);
+out_pci_remove:
+       enetc_pci_remove(pdev);
+out:
+       return ERR_PTR(err);
+}
+
+static void enetc_psi_destroy(struct pci_dev *pdev)
+{
+       struct enetc_si *si = pci_get_drvdata(pdev);
+
+       enetc_teardown_cbdr(&si->cbd_ring);
+       enetc_pci_remove(pdev);
+}
+
+static int enetc_pf_probe(struct pci_dev *pdev,
+                         const struct pci_device_id *ent)
+{
+       struct device_node *node = pdev->dev.of_node;
+       struct enetc_ndev_priv *priv;
+       struct net_device *ndev;
+       struct enetc_si *si;
+       struct enetc_pf *pf;
+       int err;
+
+       err = enetc_pf_register_with_ierb(pdev);
+       if (err == -EPROBE_DEFER)
+               return err;
+       if (err)
+               dev_warn(&pdev->dev,
+                        "Could not register with IERB driver: %pe, please update the device tree\n",
+                        ERR_PTR(err));
+
+       si = enetc_psi_create(pdev);
+       if (IS_ERR(si)) {
+               err = PTR_ERR(si);
+               goto err_psi_create;
        }
 
        pf = enetc_si_priv(si);
@@ -1339,15 +1359,9 @@ err_alloc_si_res:
        si->ndev = NULL;
        free_netdev(ndev);
 err_alloc_netdev:
-err_init_port_rss:
-err_init_port_rfs:
-err_device_disabled:
 err_setup_mac_addresses:
-       enetc_teardown_cbdr(&si->cbd_ring);
-err_setup_cbdr:
-err_map_pf_space:
-       enetc_pci_remove(pdev);
-
+       enetc_psi_destroy(pdev);
+err_psi_create:
        return err;
 }
 
@@ -1370,12 +1384,29 @@ static void enetc_pf_remove(struct pci_dev *pdev)
        enetc_free_msix(priv);
 
        enetc_free_si_resources(priv);
-       enetc_teardown_cbdr(&si->cbd_ring);
 
        free_netdev(si->ndev);
 
-       enetc_pci_remove(pdev);
+       enetc_psi_destroy(pdev);
+}
+
+static void enetc_fixup_clear_rss_rfs(struct pci_dev *pdev)
+{
+       struct device_node *node = pdev->dev.of_node;
+       struct enetc_si *si;
+
+       /* Only apply quirk for disabled functions. For the ones
+        * that are enabled, enetc_pf_probe() will apply it.
+        */
+       if (node && of_device_is_available(node))
+               return;
+
+       si = enetc_psi_create(pdev);
+       if (si)
+               enetc_psi_destroy(pdev);
 }
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF,
+                       enetc_fixup_clear_rss_rfs);
 
 static const struct pci_device_id enetc_pf_id_table[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF) },
index 52546f6..f276b5e 100644 (file)
@@ -464,9 +464,9 @@ static void hns3_dbg_fill_content(char *content, u16 len,
                if (result) {
                        if (item_len < strlen(result[i]))
                                break;
-                       strscpy(pos, result[i], strlen(result[i]));
+                       memcpy(pos, result[i], strlen(result[i]));
                } else {
-                       strscpy(pos, items[i].name, strlen(items[i].name));
+                       memcpy(pos, items[i].name, strlen(items[i].name));
                }
                pos += item_len;
                len -= item_len;
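The strscpy() -> memcpy() swap here (and in the matching hclge_dbg_fill_content() hunk below) is a correctness fix: strscpy() always NUL-terminates within the given size, so passing strlen(src) as the size drops the last character, whereas memcpy() copies the full string into the space-padded, fixed-width column. Illustration only:

    char col[8];

    memset(col, ' ', sizeof(col));              /* pre-padded column            */
    strscpy(col, "abcde", strlen("abcde"));     /* stores "abcd\0": truncated   */
    memcpy(col, "abcde", strlen("abcde"));      /* stores "abcde", padding kept */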
index 9f68900..b7b51e5 100644 (file)
@@ -5854,6 +5854,9 @@ void hns3_external_lb_prepare(struct net_device *ndev, bool if_running)
        if (!if_running)
                return;
 
+       if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state))
+               return;
+
        netif_carrier_off(ndev);
        netif_tx_disable(ndev);
 
@@ -5882,7 +5885,16 @@ void hns3_external_lb_restore(struct net_device *ndev, bool if_running)
        if (!if_running)
                return;
 
-       hns3_nic_reset_all_ring(priv->ae_handle);
+       if (hns3_nic_resetting(ndev))
+               return;
+
+       if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))
+               return;
+
+       if (hns3_nic_reset_all_ring(priv->ae_handle))
+               return;
+
+       clear_bit(HNS3_NIC_STATE_DOWN, &priv->state);
 
        for (i = 0; i < priv->vector_num; i++)
                hns3_vector_enable(&priv->tqp_vector[i]);
index 409db2e..0fb2eae 100644 (file)
@@ -111,9 +111,9 @@ static void hclge_dbg_fill_content(char *content, u16 len,
                if (result) {
                        if (item_len < strlen(result[i]))
                                break;
-                       strscpy(pos, result[i], strlen(result[i]));
+                       memcpy(pos, result[i], strlen(result[i]));
                } else {
-                       strscpy(pos, items[i].name, strlen(items[i].name));
+                       memcpy(pos, items[i].name, strlen(items[i].name));
                }
                pos += item_len;
                len -= item_len;
index bf675c1..a940e35 100644 (file)
@@ -72,6 +72,8 @@ static void hclge_restore_hw_table(struct hclge_dev *hdev);
 static void hclge_sync_promisc_mode(struct hclge_dev *hdev);
 static void hclge_sync_fd_table(struct hclge_dev *hdev);
 static void hclge_update_fec_stats(struct hclge_dev *hdev);
+static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
+                                     int wait_cnt);
 
 static struct hnae3_ae_algo ae_algo;
 
@@ -7558,6 +7560,8 @@ static void hclge_enable_fd(struct hnae3_handle *handle, bool enable)
 
 static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
 {
+#define HCLGE_LINK_STATUS_WAIT_CNT  3
+
        struct hclge_desc desc;
        struct hclge_config_mac_mode_cmd *req =
                (struct hclge_config_mac_mode_cmd *)desc.data;
@@ -7582,9 +7586,15 @@ static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
        req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
-       if (ret)
+       if (ret) {
                dev_err(&hdev->pdev->dev,
                        "mac enable fail, ret =%d.\n", ret);
+               return;
+       }
+
+       if (!enable)
+               hclge_mac_link_status_wait(hdev, HCLGE_LINK_STATUS_DOWN,
+                                          HCLGE_LINK_STATUS_WAIT_CNT);
 }
 
 static int hclge_config_switch_param(struct hclge_dev *hdev, int vfid,
@@ -7647,10 +7657,9 @@ static void hclge_phy_link_status_wait(struct hclge_dev *hdev,
        } while (++i < HCLGE_PHY_LINK_STATUS_NUM);
 }
 
-static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret)
+static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
+                                     int wait_cnt)
 {
-#define HCLGE_MAC_LINK_STATUS_NUM  100
-
        int link_status;
        int i = 0;
        int ret;
@@ -7663,13 +7672,15 @@ static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret)
                        return 0;
 
                msleep(HCLGE_LINK_STATUS_MS);
-       } while (++i < HCLGE_MAC_LINK_STATUS_NUM);
+       } while (++i < wait_cnt);
        return -EBUSY;
 }
 
 static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en,
                                          bool is_phy)
 {
+#define HCLGE_MAC_LINK_STATUS_NUM  100
+
        int link_ret;
 
        link_ret = en ? HCLGE_LINK_STATUS_UP : HCLGE_LINK_STATUS_DOWN;
@@ -7677,7 +7688,8 @@ static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en,
        if (is_phy)
                hclge_phy_link_status_wait(hdev, link_ret);
 
-       return hclge_mac_link_status_wait(hdev, link_ret);
+       return hclge_mac_link_status_wait(hdev, link_ret,
+                                         HCLGE_MAC_LINK_STATUS_NUM);
 }
 
 static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en)
@@ -10915,9 +10927,12 @@ int hclge_cfg_flowctrl(struct hclge_dev *hdev)
        u32 rx_pause, tx_pause;
        u8 flowctl;
 
-       if (!phydev->link || !phydev->autoneg)
+       if (!phydev->link)
                return 0;
 
+       if (!phydev->autoneg)
+               return hclge_mac_pause_setup_hw(hdev);
+
        local_advertising = linkmode_adv_to_lcl_adv_t(phydev->advertising);
 
        if (phydev->pause)
index de509e5..c58c312 100644 (file)
@@ -1553,7 +1553,7 @@ static int hclge_bp_setup_hw(struct hclge_dev *hdev, u8 tc)
        return 0;
 }
 
-static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
+int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
 {
        bool tx_en, rx_en;
 
index 45dcfef..53eec6d 100644 (file)
@@ -245,6 +245,7 @@ int hclge_pfc_pause_en_cfg(struct hclge_dev *hdev, u8 tx_rx_bitmap,
                           u8 pfc_bitmap);
 int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx);
 int hclge_pause_addr_cfg(struct hclge_dev *hdev, const u8 *mac_addr);
+int hclge_mac_pause_setup_hw(struct hclge_dev *hdev);
 void hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats);
 void hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats);
 int hclge_tm_qs_shaper_cfg(struct hclge_vport *vport, int max_tx_rate);
index 763d613..df76cda 100644 (file)
@@ -97,6 +97,8 @@ static int pending_scrq(struct ibmvnic_adapter *,
 static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *,
                                        struct ibmvnic_sub_crq_queue *);
 static int ibmvnic_poll(struct napi_struct *napi, int data);
+static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter);
+static inline void reinit_init_done(struct ibmvnic_adapter *adapter);
 static void send_query_map(struct ibmvnic_adapter *adapter);
 static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8);
 static int send_request_unmap(struct ibmvnic_adapter *, u8);
@@ -114,6 +116,7 @@ static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter,
 static void free_long_term_buff(struct ibmvnic_adapter *adapter,
                                struct ibmvnic_long_term_buff *ltb);
 static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter);
+static void flush_reset_queue(struct ibmvnic_adapter *adapter);
 
 struct ibmvnic_stat {
        char name[ETH_GSTRING_LEN];
@@ -1505,8 +1508,8 @@ static const char *adapter_state_to_string(enum vnic_state state)
 
 static int ibmvnic_login(struct net_device *netdev)
 {
+       unsigned long flags, timeout = msecs_to_jiffies(20000);
        struct ibmvnic_adapter *adapter = netdev_priv(netdev);
-       unsigned long timeout = msecs_to_jiffies(20000);
        int retry_count = 0;
        int retries = 10;
        bool retry;
@@ -1527,11 +1530,9 @@ static int ibmvnic_login(struct net_device *netdev)
 
                if (!wait_for_completion_timeout(&adapter->init_done,
                                                 timeout)) {
-                       netdev_warn(netdev, "Login timed out, retrying...\n");
-                       retry = true;
-                       adapter->init_done_rc = 0;
-                       retry_count++;
-                       continue;
+                       netdev_warn(netdev, "Login timed out\n");
+                       adapter->login_pending = false;
+                       goto partial_reset;
                }
 
                if (adapter->init_done_rc == ABORTED) {
@@ -1573,10 +1574,69 @@ static int ibmvnic_login(struct net_device *netdev)
                                            "SCRQ irq initialization failed\n");
                                return rc;
                        }
+               /* Default/timeout error handling, reset and start fresh */
                } else if (adapter->init_done_rc) {
                        netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n",
                                    adapter->init_done_rc);
-                       return -EIO;
+
+partial_reset:
+                       /* adapter login failed, so free any CRQs or sub-CRQs
+                        * and register again before attempting to login again.
+                        * If we don't do this then the VIOS may think that
+                        * we are already logged in and reject any subsequent
+                        * attempts
+                        */
+                       netdev_warn(netdev,
+                                   "Freeing and re-registering CRQs before attempting to login again\n");
+                       retry = true;
+                       adapter->init_done_rc = 0;
+                       release_sub_crqs(adapter, true);
+                       /* Much of this is similar logic as ibmvnic_probe(),
+                        * we are essentially re-initializing communication
+                        * with the server. We really should not run any
+                        * resets/failovers here because this is already a form
+                        * of reset and we do not want parallel resets occurring
+                        */
+                       do {
+                               reinit_init_done(adapter);
+                               /* Clear any failovers we got in the previous
+                                * pass since we are re-initializing the CRQ
+                                */
+                               adapter->failover_pending = false;
+                               release_crq_queue(adapter);
+                               /* If we don't sleep here then we risk an
+                                * unnecessary failover event from the VIOS.
+                                * This is a known VIOS issue caused by a vnic
+                                * device freeing and registering a CRQ too
+                                * quickly.
+                                */
+                               msleep(1500);
+                               /* Avoid any resets, since we are currently
+                                * resetting.
+                                */
+                               spin_lock_irqsave(&adapter->rwi_lock, flags);
+                               flush_reset_queue(adapter);
+                               spin_unlock_irqrestore(&adapter->rwi_lock,
+                                                      flags);
+
+                               rc = init_crq_queue(adapter);
+                               if (rc) {
+                                       netdev_err(netdev, "login recovery: init CRQ failed %d\n",
+                                                  rc);
+                                       return -EIO;
+                               }
+
+                               rc = ibmvnic_reset_init(adapter, false);
+                               if (rc)
+                                       netdev_err(netdev, "login recovery: Reset init failed %d\n",
+                                                  rc);
+                               /* IBMVNIC_CRQ_INIT will return EAGAIN if it
+                                * fails, since ibmvnic_reset_init will free
+                                * irq's in failure, we won't be able to receive
+                                * new CRQs so we need to keep trying. probe()
+                                * handles this similarly.
+                                */
+                       } while (rc == -EAGAIN && retry_count++ < retries);
                }
        } while (retry);
 
@@ -1588,12 +1648,22 @@ static int ibmvnic_login(struct net_device *netdev)
 
 static void release_login_buffer(struct ibmvnic_adapter *adapter)
 {
+       if (!adapter->login_buf)
+               return;
+
+       dma_unmap_single(&adapter->vdev->dev, adapter->login_buf_token,
+                        adapter->login_buf_sz, DMA_TO_DEVICE);
        kfree(adapter->login_buf);
        adapter->login_buf = NULL;
 }
 
 static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter)
 {
+       if (!adapter->login_rsp_buf)
+               return;
+
+       dma_unmap_single(&adapter->vdev->dev, adapter->login_rsp_buf_token,
+                        adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
        kfree(adapter->login_rsp_buf);
        adapter->login_rsp_buf = NULL;
 }
@@ -4830,11 +4900,14 @@ static int send_login(struct ibmvnic_adapter *adapter)
        if (rc) {
                adapter->login_pending = false;
                netdev_err(adapter->netdev, "Failed to send login, rc=%d\n", rc);
-               goto buf_rsp_map_failed;
+               goto buf_send_failed;
        }
 
        return 0;
 
+buf_send_failed:
+       dma_unmap_single(dev, rsp_buffer_token, rsp_buffer_size,
+                        DMA_FROM_DEVICE);
 buf_rsp_map_failed:
        kfree(login_rsp_buffer);
        adapter->login_rsp_buf = NULL;
@@ -5396,6 +5469,7 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq,
        int num_tx_pools;
        int num_rx_pools;
        u64 *size_array;
+       u32 rsp_len;
        int i;
 
        /* CHECK: Test/set of login_pending does not need to be atomic
@@ -5407,11 +5481,6 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq,
        }
        adapter->login_pending = false;
 
-       dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz,
-                        DMA_TO_DEVICE);
-       dma_unmap_single(dev, adapter->login_rsp_buf_token,
-                        adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
-
        /* If the number of queues requested can't be allocated by the
         * server, the login response will return with code 1. We will need
         * to resend the login buffer with fewer queues requested.
@@ -5447,6 +5516,23 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq,
                ibmvnic_reset(adapter, VNIC_RESET_FATAL);
                return -EIO;
        }
+
+       rsp_len = be32_to_cpu(login_rsp->len);
+       if (be32_to_cpu(login->login_rsp_len) < rsp_len ||
+           rsp_len <= be32_to_cpu(login_rsp->off_txsubm_subcrqs) ||
+           rsp_len <= be32_to_cpu(login_rsp->off_rxadd_subcrqs) ||
+           rsp_len <= be32_to_cpu(login_rsp->off_rxadd_buff_size) ||
+           rsp_len <= be32_to_cpu(login_rsp->off_supp_tx_desc)) {
+               /* This can happen if a login request times out and there are
+                * 2 outstanding login requests sent, the LOGIN_RSP crq
+                * could have been for the older login request. So we are
+                * parsing the newer response buffer which may be incomplete
+                */
+               dev_err(dev, "FATAL: Login rsp offsets/lengths invalid\n");
+               ibmvnic_reset(adapter, VNIC_RESET_FATAL);
+               return -EIO;
+       }
+
        size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
                be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size));
        /* variable buffer sizes are not supported, so just read the
index 2f47cfa..460ca56 100644 (file)
@@ -1401,14 +1401,15 @@ static int iavf_add_fdir_ethtool(struct iavf_adapter *adapter, struct ethtool_rx
        if (fsp->flow_type & FLOW_MAC_EXT)
                return -EINVAL;
 
+       spin_lock_bh(&adapter->fdir_fltr_lock);
        if (adapter->fdir_active_fltr >= IAVF_MAX_FDIR_FILTERS) {
+               spin_unlock_bh(&adapter->fdir_fltr_lock);
                dev_err(&adapter->pdev->dev,
                        "Unable to add Flow Director filter because VF reached the limit of max allowed filters (%u)\n",
                        IAVF_MAX_FDIR_FILTERS);
                return -ENOSPC;
        }
 
-       spin_lock_bh(&adapter->fdir_fltr_lock);
        if (iavf_find_fdir_fltr_by_loc(adapter, fsp->location)) {
                dev_err(&adapter->pdev->dev, "Failed to add Flow Director filter, it already exists\n");
                spin_unlock_bh(&adapter->fdir_fltr_lock);
@@ -1781,7 +1782,9 @@ static int iavf_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd,
        case ETHTOOL_GRXCLSRLCNT:
                if (!FDIR_FLTR_SUPPORT(adapter))
                        break;
+               spin_lock_bh(&adapter->fdir_fltr_lock);
                cmd->rule_cnt = adapter->fdir_active_fltr;
+               spin_unlock_bh(&adapter->fdir_fltr_lock);
                cmd->data = IAVF_MAX_FDIR_FILTERS;
                ret = 0;
                break;
index 6146203..505e82e 100644 (file)
@@ -722,7 +722,9 @@ void iavf_print_fdir_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *f
 bool iavf_fdir_is_dup_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *fltr)
 {
        struct iavf_fdir_fltr *tmp;
+       bool ret = false;
 
+       spin_lock_bh(&adapter->fdir_fltr_lock);
        list_for_each_entry(tmp, &adapter->fdir_list_head, list) {
                if (tmp->flow_type != fltr->flow_type)
                        continue;
@@ -732,11 +734,14 @@ bool iavf_fdir_is_dup_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *
                    !memcmp(&tmp->ip_data, &fltr->ip_data,
                            sizeof(fltr->ip_data)) &&
                    !memcmp(&tmp->ext_data, &fltr->ext_data,
-                           sizeof(fltr->ext_data)))
-                       return true;
+                           sizeof(fltr->ext_data))) {
+                       ret = true;
+                       break;
+               }
        }
+       spin_unlock_bh(&adapter->fdir_fltr_lock);
 
-       return false;
+       return ret;
 }
 
 /**
index 9db384f..38901d2 100644 (file)
@@ -195,6 +195,10 @@ struct igc_adapter {
        u32 qbv_config_change_errors;
        bool qbv_transition;
        unsigned int qbv_count;
+       /* Access to oper_gate_closed, admin_gate_closed and qbv_transition
+        * are protected by the qbv_tx_lock.
+        */
+       spinlock_t qbv_tx_lock;
 
        /* OS defined structs */
        struct pci_dev *pdev;
index bdeb367..6f557e8 100644 (file)
@@ -4801,6 +4801,7 @@ static int igc_sw_init(struct igc_adapter *adapter)
        adapter->nfc_rule_count = 0;
 
        spin_lock_init(&adapter->stats64_lock);
+       spin_lock_init(&adapter->qbv_tx_lock);
        /* Assume MSI-X interrupts, will be checked during IRQ allocation */
        adapter->flags |= IGC_FLAG_HAS_MSIX;
 
@@ -6119,15 +6120,15 @@ static int igc_tsn_enable_launchtime(struct igc_adapter *adapter,
        return igc_tsn_offload_apply(adapter);
 }
 
-static int igc_tsn_clear_schedule(struct igc_adapter *adapter)
+static int igc_qbv_clear_schedule(struct igc_adapter *adapter)
 {
+       unsigned long flags;
        int i;
 
        adapter->base_time = 0;
        adapter->cycle_time = NSEC_PER_SEC;
        adapter->taprio_offload_enable = false;
        adapter->qbv_config_change_errors = 0;
-       adapter->qbv_transition = false;
        adapter->qbv_count = 0;
 
        for (i = 0; i < adapter->num_tx_queues; i++) {
@@ -6136,10 +6137,28 @@ static int igc_tsn_clear_schedule(struct igc_adapter *adapter)
                ring->start_time = 0;
                ring->end_time = NSEC_PER_SEC;
                ring->max_sdu = 0;
+       }
+
+       spin_lock_irqsave(&adapter->qbv_tx_lock, flags);
+
+       adapter->qbv_transition = false;
+
+       for (i = 0; i < adapter->num_tx_queues; i++) {
+               struct igc_ring *ring = adapter->tx_ring[i];
+
                ring->oper_gate_closed = false;
                ring->admin_gate_closed = false;
        }
 
+       spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags);
+
+       return 0;
+}
+
+static int igc_tsn_clear_schedule(struct igc_adapter *adapter)
+{
+       igc_qbv_clear_schedule(adapter);
+
        return 0;
 }
 
@@ -6150,6 +6169,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter,
        struct igc_hw *hw = &adapter->hw;
        u32 start_time = 0, end_time = 0;
        struct timespec64 now;
+       unsigned long flags;
        size_t n;
        int i;
 
@@ -6217,6 +6237,8 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter,
                start_time += e->interval;
        }
 
+       spin_lock_irqsave(&adapter->qbv_tx_lock, flags);
+
        /* Check whether a queue gets configured.
         * If not, set the start and end time to be end time.
         */
@@ -6241,6 +6263,8 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter,
                }
        }
 
+       spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags);
+
        for (i = 0; i < adapter->num_tx_queues; i++) {
                struct igc_ring *ring = adapter->tx_ring[i];
                struct net_device *dev = adapter->netdev;
@@ -6619,8 +6643,11 @@ static enum hrtimer_restart igc_qbv_scheduling_timer(struct hrtimer *timer)
 {
        struct igc_adapter *adapter = container_of(timer, struct igc_adapter,
                                                   hrtimer);
+       unsigned long flags;
        unsigned int i;
 
+       spin_lock_irqsave(&adapter->qbv_tx_lock, flags);
+
        adapter->qbv_transition = true;
        for (i = 0; i < adapter->num_tx_queues; i++) {
                struct igc_ring *tx_ring = adapter->tx_ring[i];
@@ -6633,6 +6660,9 @@ static enum hrtimer_restart igc_qbv_scheduling_timer(struct hrtimer *timer)
                }
        }
        adapter->qbv_transition = false;
+
+       spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags);
+
        return HRTIMER_NORESTART;
 }
 
index a9a1028..de31717 100644 (file)
@@ -166,11 +166,11 @@ prestera_util_neigh2nc_key(struct prestera_switch *sw, struct neighbour *n,
 
 static bool __prestera_fi_is_direct(struct fib_info *fi)
 {
-       struct fib_nh *fib_nh;
+       struct fib_nh_common *fib_nhc;
 
        if (fib_info_num_path(fi) == 1) {
-               fib_nh = fib_info_nh(fi, 0);
-               if (fib_nh->fib_nh_gw_family == AF_UNSPEC)
+               fib_nhc = fib_info_nhc(fi, 0);
+               if (fib_nhc->nhc_gw_family == AF_UNSPEC)
                        return true;
        }
 
@@ -261,7 +261,7 @@ static bool
 __prestera_util_kern_n_is_reachable_v4(u32 tb_id, __be32 *addr,
                                       struct net_device *dev)
 {
-       struct fib_nh *fib_nh;
+       struct fib_nh_common *fib_nhc;
        struct fib_result res;
        bool reachable;
 
@@ -269,8 +269,8 @@ __prestera_util_kern_n_is_reachable_v4(u32 tb_id, __be32 *addr,
 
        if (!prestera_util_kern_get_route(&res, tb_id, addr))
                if (prestera_fi_is_direct(res.fi)) {
-                       fib_nh = fib_info_nh(res.fi, 0);
-                       if (dev == fib_nh->fib_nh_dev)
+                       fib_nhc = fib_info_nhc(res.fi, 0);
+                       if (dev == fib_nhc->nhc_dev)
                                reachable = true;
                }
 
@@ -324,7 +324,7 @@ prestera_kern_fib_info_nhc(struct fib_notifier_info *info, int n)
        if (info->family == AF_INET) {
                fen4_info = container_of(info, struct fib_entry_notifier_info,
                                         info);
-               return &fib_info_nh(fen4_info->fi, n)->nh_common;
+               return fib_info_nhc(fen4_info->fi, n);
        } else if (info->family == AF_INET6) {
                fen6_info = container_of(info, struct fib6_entry_notifier_info,
                                         info);
index b012833..e869c65 100644 (file)
@@ -2,6 +2,7 @@
 /* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */
 
 #include "reporter_vnic.h"
+#include "en_stats.h"
 #include "devlink.h"
 
 #define VNIC_ENV_GET64(vnic_env_stats, c) \
@@ -36,55 +37,72 @@ int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
        if (err)
                return err;
 
-       err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues",
-                                       VNIC_ENV_GET64(&vnic, total_error_queues));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow",
-                                       VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun",
-                                       VNIC_ENV_GET64(&vnic, comp_eq_overrun));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun",
-                                       VNIC_ENV_GET64(&vnic, async_eq_overrun));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun",
-                                       VNIC_ENV_GET64(&vnic, cq_overrun));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command",
-                                       VNIC_ENV_GET64(&vnic, invalid_command));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command",
-                                       VNIC_ENV_GET64(&vnic, quota_exceeded_command));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
-                                       VNIC_ENV_GET64(&vnic, nic_receive_steering_discard));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail",
-                                       VNIC_ENV_GET64(&vnic, generated_pkt_steering_fail));
-       if (err)
-               return err;
-
-       err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail",
-                                       VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail));
-       if (err)
-               return err;
+       if (MLX5_CAP_GEN(dev, vnic_env_queue_counters)) {
+               err = devlink_fmsg_u32_pair_put(fmsg, "total_error_queues",
+                                               VNIC_ENV_GET(&vnic, total_error_queues));
+               if (err)
+                       return err;
+
+               err = devlink_fmsg_u32_pair_put(fmsg, "send_queue_priority_update_flow",
+                                               VNIC_ENV_GET(&vnic,
+                                                            send_queue_priority_update_flow));
+               if (err)
+                       return err;
+       }
+
+       if (MLX5_CAP_GEN(dev, eq_overrun_count)) {
+               err = devlink_fmsg_u32_pair_put(fmsg, "comp_eq_overrun",
+                                               VNIC_ENV_GET(&vnic, comp_eq_overrun));
+               if (err)
+                       return err;
+
+               err = devlink_fmsg_u32_pair_put(fmsg, "async_eq_overrun",
+                                               VNIC_ENV_GET(&vnic, async_eq_overrun));
+               if (err)
+                       return err;
+       }
+
+       if (MLX5_CAP_GEN(dev, vnic_env_cq_overrun)) {
+               err = devlink_fmsg_u32_pair_put(fmsg, "cq_overrun",
+                                               VNIC_ENV_GET(&vnic, cq_overrun));
+               if (err)
+                       return err;
+       }
+
+       if (MLX5_CAP_GEN(dev, invalid_command_count)) {
+               err = devlink_fmsg_u32_pair_put(fmsg, "invalid_command",
+                                               VNIC_ENV_GET(&vnic, invalid_command));
+               if (err)
+                       return err;
+       }
+
+       if (MLX5_CAP_GEN(dev, quota_exceeded_count)) {
+               err = devlink_fmsg_u32_pair_put(fmsg, "quota_exceeded_command",
+                                               VNIC_ENV_GET(&vnic, quota_exceeded_command));
+               if (err)
+                       return err;
+       }
+
+       if (MLX5_CAP_GEN(dev, nic_receive_steering_discard)) {
+               err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
+                                               VNIC_ENV_GET64(&vnic,
+                                                              nic_receive_steering_discard));
+               if (err)
+                       return err;
+       }
+
+       if (MLX5_CAP_GEN(dev, vnic_env_cnt_steering_fail)) {
+               err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail",
+                                               VNIC_ENV_GET64(&vnic,
+                                                              generated_pkt_steering_fail));
+               if (err)
+                       return err;
+
+               err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail",
+                                               VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail));
+               if (err)
+                       return err;
+       }
 
        err = devlink_fmsg_obj_nest_end(fmsg);
        if (err)
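
The reworked reporter above only emits a vNIC counter when the firmware advertises the matching capability, and the per-queue counters become u32 pairs. Below is a small standalone sketch of that capability-gated reporting pattern; the capability flags and the plain printf() are stand-ins for the real MLX5_CAP_GEN() checks and devlink fmsg helpers.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical capability bits; the driver queries the real ones from
 * firmware with MLX5_CAP_GEN(). */
#define CAP_QUEUE_COUNTERS (1u << 0)
#define CAP_EQ_OVERRUN     (1u << 1)

struct vnic_env {
        uint32_t total_error_queues;
        uint32_t comp_eq_overrun;
};

/* Emit a counter only when the device claims to implement it, so
 * unsupported counters are never reported as misleading zeros. */
static void report_vnic_env(uint32_t caps, const struct vnic_env *env)
{
        if (caps & CAP_QUEUE_COUNTERS)
                printf("total_error_queues: %u\n", env->total_error_queues);
        if (caps & CAP_EQ_OVERRUN)
                printf("comp_eq_overrun: %u\n", env->comp_eq_overrun);
}

int main(void)
{
        struct vnic_env env = { .total_error_queues = 2, .comp_eq_overrun = 1 };

        /* Only the queue counters are advertised, so only they are shown. */
        report_vnic_env(CAP_QUEUE_COUNTERS, &env);
        return 0;
}
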
index 0c88cf4..1730f6a 100644 (file)
@@ -1461,10 +1461,12 @@ static void mlx5e_invalidate_encap(struct mlx5e_priv *priv,
                attr = mlx5e_tc_get_encap_attr(flow);
                esw_attr = attr->esw_attr;
 
-               if (flow_flag_test(flow, SLOW))
+               if (flow_flag_test(flow, SLOW)) {
                        mlx5e_tc_unoffload_from_slow_path(esw, flow);
-               else
+               } else {
                        mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
+                       mlx5e_tc_unoffload_flow_post_acts(flow);
+               }
 
                mlx5e_tc_detach_mod_hdr(priv, flow, attr);
                attr->modify_hdr = NULL;
index 1c82011..c27df14 100644 (file)
@@ -5266,6 +5266,7 @@ void mlx5e_destroy_q_counters(struct mlx5e_priv *priv)
 static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
                          struct net_device *netdev)
 {
+       const bool take_rtnl = netdev->reg_state == NETREG_REGISTERED;
        struct mlx5e_priv *priv = netdev_priv(netdev);
        struct mlx5e_flow_steering *fs;
        int err;
@@ -5294,9 +5295,19 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
                mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
 
        mlx5e_health_create_reporters(priv);
+
+       /* If netdev is already registered (e.g. move from uplink to nic profile),
+        * RTNL lock must be held before triggering netdev notifiers.
+        */
+       if (take_rtnl)
+               rtnl_lock();
+
        /* update XDP supported features */
        mlx5e_set_xdp_feature(netdev);
 
+       if (take_rtnl)
+               rtnl_unlock();
+
        return 0;
 }
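
The mlx5e_nic_init() hunk takes the RTNL lock only when the netdev is already registered, because the feature update then fires netdev notifiers that expect RTNL, while the initial-registration path must not grab it. A minimal sketch of that conditional-locking shape, with a pthread mutex standing in for rtnl_lock()/rtnl_unlock():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t fake_rtnl = PTHREAD_MUTEX_INITIALIZER;

/* Take the lock only when the object is already visible to the rest of
 * the system; during initial setup nobody else can reach it yet. */
static void update_features(bool already_registered)
{
        if (already_registered)
                pthread_mutex_lock(&fake_rtnl);

        printf("updating features%s\n",
               already_registered ? " under the lock" : "");

        if (already_registered)
                pthread_mutex_unlock(&fake_rtnl);
}

int main(void)
{
        update_features(false);  /* first registration, no lock needed */
        update_features(true);   /* profile change on a live device    */
        return 0;
}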
 
index 9237763..31708d5 100644 (file)
@@ -1943,9 +1943,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 {
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct mlx5_flow_attr *attr = flow->attr;
-       struct mlx5_esw_flow_attr *esw_attr;
 
-       esw_attr = attr->esw_attr;
        mlx5e_put_flow_tunnel_id(flow);
 
        remove_unready_flow(flow);
@@ -1966,12 +1964,6 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 
        mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr);
 
-       if (esw_attr->int_port)
-               mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->int_port);
-
-       if (esw_attr->dest_int_port)
-               mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->dest_int_port);
-
        if (flow_flag_test(flow, L3_TO_L2_DECAP))
                mlx5e_detach_decap(priv, flow);
 
@@ -4268,6 +4260,7 @@ static void
 mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr)
 {
        struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow);
+       struct mlx5_esw_flow_attr *esw_attr;
 
        if (!attr)
                return;
@@ -4285,6 +4278,18 @@ mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *a
                mlx5e_tc_detach_mod_hdr(flow->priv, flow, attr);
        }
 
+       if (mlx5e_is_eswitch_flow(flow)) {
+               esw_attr = attr->esw_attr;
+
+               if (esw_attr->int_port)
+                       mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv),
+                                             esw_attr->int_port);
+
+               if (esw_attr->dest_int_port)
+                       mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv),
+                                             esw_attr->dest_int_port);
+       }
+
        mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr);
 
        free_branch_attr(flow, attr->branch_true);
index af779c7..fdf2be5 100644 (file)
@@ -60,7 +60,7 @@ static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16
        }  else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) {
                memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len);
                dl_port->attrs.switch_id.id_len = ppid.id_len;
-               devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum,
+               devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum,
                                              vport_num - 1, false);
        }
        return dl_port;
index d3a3fe4..7d9bbb4 100644 (file)
@@ -574,7 +574,7 @@ static int __mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev,
        for (i = 0; i < ldev->ports; i++) {
                for (j = 0; j < ldev->buckets; j++) {
                        idx = i * ldev->buckets + j;
-                       if (ldev->v2p_map[i] == ports[i])
+                       if (ldev->v2p_map[idx] == ports[idx])
                                continue;
 
                        dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev,
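
The lag fix compares v2p_map[idx] against ports[idx]: both arrays are flattened port-by-bucket maps indexed as idx = i * buckets + j, so the old comparison by i alone ignored every bucket past the first. A tiny self-contained illustration of the flattened indexing, assuming two ports and two buckets purely for the example:

#include <stdio.h>

#define PORTS   2
#define BUCKETS 2

int main(void)
{
        int v2p_map[PORTS * BUCKETS] = { 1, 1, 2, 2 };
        int ports[PORTS * BUCKETS]   = { 1, 2, 2, 1 };

        for (int i = 0; i < PORTS; i++) {
                for (int j = 0; j < BUCKETS; j++) {
                        int idx = i * BUCKETS + j;

                        /* Comparing ports[i] here would miss the changes
                         * in buckets with j > 0 (idx 1 and 3 above). */
                        if (v2p_map[idx] == ports[idx])
                                continue;
                        printf("port %d bucket %d needs an update\n", i, j);
                }
        }
        return 0;
}
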
index 973babf..377372f 100644 (file)
@@ -227,10 +227,15 @@ static void mlx5_timestamp_overflow(struct work_struct *work)
        clock = container_of(timer, struct mlx5_clock, timer);
        mdev = container_of(clock, struct mlx5_core_dev, clock);
 
+       if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+               goto out;
+
        write_seqlock_irqsave(&clock->lock, flags);
        timecounter_read(&timer->tc);
        mlx5_update_clock_info_page(mdev);
        write_sequnlock_irqrestore(&clock->lock, flags);
+
+out:
        schedule_delayed_work(&timer->overflow_work, timer->overflow_period);
 }
 
index f42abc2..72ae560 100644 (file)
@@ -1989,7 +1989,7 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
 
        mlx5_enter_error_state(dev, false);
        mlx5_error_sw_reset(dev);
-       mlx5_unload_one(dev, true);
+       mlx5_unload_one(dev, false);
        mlx5_drain_health_wq(dev);
        mlx5_pci_disable_device(dev);
 
index c4be257..682d3dc 100644 (file)
@@ -361,7 +361,7 @@ static inline bool mlx5_core_is_ec_vf_vport(const struct mlx5_core_dev *dev, u16
 
 static inline int mlx5_vport_to_func_id(const struct mlx5_core_dev *dev, u16 vport, bool ec_vf_func)
 {
-       return ec_vf_func ? vport - mlx5_core_ec_vf_vport_base(dev)
+       return ec_vf_func ? vport - mlx5_core_ec_vf_vport_base(dev) + 1
                          : vport;
 }
 
index 4e42a3b..a2fc937 100644 (file)
@@ -285,8 +285,7 @@ static u16 mlx5_get_max_vfs(struct mlx5_core_dev *dev)
                host_total_vfs = MLX5_GET(query_esw_functions_out, out,
                                          host_params_context.host_total_vfs);
                kvfree(out);
-               if (host_total_vfs)
-                       return host_total_vfs;
+               return host_total_vfs;
        }
 
 done:
index d6947fe..8ca534e 100644 (file)
@@ -82,7 +82,7 @@ dr_ptrn_alloc_pattern(struct mlx5dr_ptrn_mgr *mgr,
        u32 chunk_size;
        u32 index;
 
-       chunk_size = ilog2(num_of_actions);
+       chunk_size = ilog2(roundup_pow_of_two(num_of_actions));
        /* HW modify action index granularity is at least 64B */
        chunk_size = max_t(u32, chunk_size, DR_CHUNK_SIZE_8);
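
ilog2() rounds down, so a non-power-of-two num_of_actions could previously land in a chunk one order too small; rounding up to the next power of two first guarantees the chunk fits. A quick userspace check of the arithmetic, with local helpers standing in for the kernel's ilog2() and roundup_pow_of_two():

#include <stdio.h>

/* Floor log2 for nonzero values, like the kernel's ilog2(). */
static unsigned int ilog2_u32(unsigned int v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

static unsigned int roundup_pow_of_two_u32(unsigned int v)
{
        unsigned int p = 1;

        while (p < v)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned int num_of_actions = 5;

        /* 5 actions: ilog2(5) = 2, a chunk of 4, too small.
         * ilog2(roundup_pow_of_two(5)) = 3, a chunk of 8, fits. */
        printf("old order %u, fixed order %u\n",
               ilog2_u32(num_of_actions),
               ilog2_u32(roundup_pow_of_two_u32(num_of_actions)));
        return 0;
}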
 
index a499e46..c2ad092 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/ethtool.h>
 #include <linux/filter.h>
 #include <linux/mm.h>
+#include <linux/pci.h>
 
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
@@ -2345,9 +2346,12 @@ int mana_attach(struct net_device *ndev)
 static int mana_dealloc_queues(struct net_device *ndev)
 {
        struct mana_port_context *apc = netdev_priv(ndev);
+       unsigned long timeout = jiffies + 120 * HZ;
        struct gdma_dev *gd = apc->ac->gdma_dev;
        struct mana_txq *txq;
+       struct sk_buff *skb;
        int i, err;
+       u32 tsleep;
 
        if (apc->port_is_up)
                return -EINVAL;
@@ -2363,15 +2367,40 @@ static int mana_dealloc_queues(struct net_device *ndev)
         * to false, but it doesn't matter since mana_start_xmit() drops any
         * new packets due to apc->port_is_up being false.
         *
-        * Drain all the in-flight TX packets
+        * Drain all the in-flight TX packets.
+        * A single 120 second timeout covers all the queues; it breaks
+        * the loop if the hardware stops responding, and was chosen
+        * with the maximum number of queues in mind.
         */
+
        for (i = 0; i < apc->num_queues; i++) {
                txq = &apc->tx_qp[i].txq;
-
-               while (atomic_read(&txq->pending_sends) > 0)
-                       usleep_range(1000, 2000);
+               tsleep = 1000;
+               while (atomic_read(&txq->pending_sends) > 0 &&
+                      time_before(jiffies, timeout)) {
+                       usleep_range(tsleep, tsleep + 1000);
+                       tsleep <<= 1;
+               }
+               if (atomic_read(&txq->pending_sends)) {
+                       err = pcie_flr(to_pci_dev(gd->gdma_context->dev));
+                       if (err) {
+                               netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n",
+                                          err, atomic_read(&txq->pending_sends),
+                                          txq->gdma_txq_id);
+                       }
+                       break;
+               }
        }
 
+       for (i = 0; i < apc->num_queues; i++) {
+               txq = &apc->tx_qp[i].txq;
+               while ((skb = skb_dequeue(&txq->pending_skbs))) {
+                       mana_unmap_skb(skb, apc);
+                       dev_kfree_skb_any(skb);
+               }
+               atomic_set(&txq->pending_sends, 0);
+       }
        /* We're 100% sure the queues can no longer be woken up, because
         * we're sure now mana_poll_tx_cq() can't be running.
         */
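
The drain loop now backs off exponentially (1 ms, 2 ms, 4 ms, ...) under an overall deadline instead of polling every millisecond forever, and the caller falls back to an FLR plus explicit skb cleanup if the deadline expires. A minimal sketch of the backoff-with-deadline pattern, with a stub pending counter in place of the per-queue atomics:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Stub: the driver reads atomic_read(&txq->pending_sends) here. */
static int pending_sends(void)
{
        return 0;
}

static bool drain_with_backoff(unsigned int deadline_sec)
{
        time_t deadline = time(NULL) + deadline_sec;
        unsigned int sleep_us = 1000;

        while (pending_sends() > 0 && time(NULL) < deadline) {
                usleep(sleep_us);
                sleep_us <<= 1;         /* double the wait each round */
        }
        return pending_sends() == 0;    /* false: caller must recover */
}

int main(void)
{
        printf("drained: %d\n", drain_with_backoff(120));
        return 0;
}
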
index 612b001..432fb93 100644 (file)
@@ -1817,6 +1817,7 @@ static int ionic_change_mtu(struct net_device *netdev, int new_mtu)
 static void ionic_tx_timeout_work(struct work_struct *ws)
 {
        struct ionic_lif *lif = container_of(ws, struct ionic_lif, tx_timeout_work);
+       int err;
 
        if (test_bit(IONIC_LIF_F_FW_RESET, lif->state))
                return;
@@ -1829,8 +1830,11 @@ static void ionic_tx_timeout_work(struct work_struct *ws)
 
        mutex_lock(&lif->queue_lock);
        ionic_stop_queues_reconfig(lif);
-       ionic_start_queues_reconfig(lif);
+       err = ionic_start_queues_reconfig(lif);
        mutex_unlock(&lif->queue_lock);
+
+       if (err)
+               dev_err(lif->ionic->dev, "%s: Restarting queues failed\n", __func__);
 }
 
 static void ionic_tx_timeout(struct net_device *netdev, unsigned int txqueue)
@@ -2800,17 +2804,22 @@ static int ionic_cmb_reconfig(struct ionic_lif *lif,
                        if (err) {
                                dev_err(lif->ionic->dev,
                                        "CMB restore failed: %d\n", err);
-                               goto errout;
+                               goto err_out;
                        }
                }
 
-               ionic_start_queues_reconfig(lif);
-       } else {
-               /* This was detached in ionic_stop_queues_reconfig() */
-               netif_device_attach(lif->netdev);
+               err = ionic_start_queues_reconfig(lif);
+               if (err) {
+                       dev_err(lif->ionic->dev,
+                               "CMB reconfig failed: %d\n", err);
+                       goto err_out;
+               }
        }
 
-errout:
+err_out:
+       /* This was detached in ionic_stop_queues_reconfig() */
+       netif_device_attach(lif->netdev);
+
        return err;
 }
 
index 984dfa5..144ec75 100644 (file)
@@ -743,7 +743,7 @@ static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u
                u64_stats_update_begin(&rxsc_stats->syncp);
                rxsc_stats->stats.InPktsLate++;
                u64_stats_update_end(&rxsc_stats->syncp);
-               secy->netdev->stats.rx_dropped++;
+               DEV_STATS_INC(secy->netdev, rx_dropped);
                return false;
        }
 
@@ -767,7 +767,7 @@ static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u
                        rxsc_stats->stats.InPktsNotValid++;
                        u64_stats_update_end(&rxsc_stats->syncp);
                        this_cpu_inc(rx_sa->stats->InPktsNotValid);
-                       secy->netdev->stats.rx_errors++;
+                       DEV_STATS_INC(secy->netdev, rx_errors);
                        return false;
                }
 
@@ -1069,7 +1069,7 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
                        u64_stats_update_begin(&secy_stats->syncp);
                        secy_stats->stats.InPktsNoTag++;
                        u64_stats_update_end(&secy_stats->syncp);
-                       macsec->secy.netdev->stats.rx_dropped++;
+                       DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
                        continue;
                }
 
@@ -1179,7 +1179,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
                u64_stats_update_begin(&secy_stats->syncp);
                secy_stats->stats.InPktsBadTag++;
                u64_stats_update_end(&secy_stats->syncp);
-               secy->netdev->stats.rx_errors++;
+               DEV_STATS_INC(secy->netdev, rx_errors);
                goto drop_nosa;
        }
 
@@ -1196,7 +1196,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
                        u64_stats_update_begin(&rxsc_stats->syncp);
                        rxsc_stats->stats.InPktsNotUsingSA++;
                        u64_stats_update_end(&rxsc_stats->syncp);
-                       secy->netdev->stats.rx_errors++;
+                       DEV_STATS_INC(secy->netdev, rx_errors);
                        if (active_rx_sa)
                                this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA);
                        goto drop_nosa;
@@ -1230,7 +1230,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
                        u64_stats_update_begin(&rxsc_stats->syncp);
                        rxsc_stats->stats.InPktsLate++;
                        u64_stats_update_end(&rxsc_stats->syncp);
-                       macsec->secy.netdev->stats.rx_dropped++;
+                       DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
                        goto drop;
                }
        }
@@ -1271,7 +1271,7 @@ deliver:
        if (ret == NET_RX_SUCCESS)
                count_rx(dev, len);
        else
-               macsec->secy.netdev->stats.rx_dropped++;
+               DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
 
        rcu_read_unlock();
 
@@ -1308,7 +1308,7 @@ nosci:
                        u64_stats_update_begin(&secy_stats->syncp);
                        secy_stats->stats.InPktsNoSCI++;
                        u64_stats_update_end(&secy_stats->syncp);
-                       macsec->secy.netdev->stats.rx_errors++;
+                       DEV_STATS_INC(macsec->secy.netdev, rx_errors);
                        continue;
                }
 
@@ -1327,7 +1327,7 @@ nosci:
                        secy_stats->stats.InPktsUnknownSCI++;
                        u64_stats_update_end(&secy_stats->syncp);
                } else {
-                       macsec->secy.netdev->stats.rx_dropped++;
+                       DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
                }
        }
 
@@ -3422,7 +3422,7 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
 
        if (!secy->operational) {
                kfree_skb(skb);
-               dev->stats.tx_dropped++;
+               DEV_STATS_INC(dev, tx_dropped);
                return NETDEV_TX_OK;
        }
 
@@ -3430,7 +3430,7 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
        skb = macsec_encrypt(skb, dev);
        if (IS_ERR(skb)) {
                if (PTR_ERR(skb) != -EINPROGRESS)
-                       dev->stats.tx_dropped++;
+                       DEV_STATS_INC(dev, tx_dropped);
                return NETDEV_TX_OK;
        }
 
@@ -3667,9 +3667,9 @@ static void macsec_get_stats64(struct net_device *dev,
 
        dev_fetch_sw_netstats(s, dev->tstats);
 
-       s->rx_dropped = dev->stats.rx_dropped;
-       s->tx_dropped = dev->stats.tx_dropped;
-       s->rx_errors = dev->stats.rx_errors;
+       s->rx_dropped = atomic_long_read(&dev->stats.__rx_dropped);
+       s->tx_dropped = atomic_long_read(&dev->stats.__tx_dropped);
+       s->rx_errors = atomic_long_read(&dev->stats.__rx_errors);
 }
 
 static int macsec_get_iflink(const struct net_device *dev)
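
The macsec conversion replaces plain dev->stats increments, which can lose counts when several CPUs drop packets concurrently, with DEV_STATS_INC(), and the read side switches to atomic_long_read() of the per-device fields. A compact userspace analogue using C11 atomics to show the shape of the atomic counter:

#include <stdatomic.h>
#include <stdio.h>

struct dev_stats {
        atomic_long rx_dropped;  /* analogue of the per-device core stats */
};

/* Analogue of DEV_STATS_INC(dev, rx_dropped): safe from any context
 * without extra locking, unlike a plain ++ on a long. */
static void stats_inc_rx_dropped(struct dev_stats *s)
{
        atomic_fetch_add_explicit(&s->rx_dropped, 1, memory_order_relaxed);
}

int main(void)
{
        struct dev_stats s = { .rx_dropped = 0 };

        stats_inc_rx_dropped(&s);
        printf("rx_dropped = %ld\n",
               atomic_load_explicit(&s.rx_dropped, memory_order_relaxed));
        return 0;
}
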
index c1f307d..8a77ec3 100644 (file)
@@ -459,21 +459,27 @@ static int at803x_set_wol(struct phy_device *phydev,
                        phy_write_mmd(phydev, MDIO_MMD_PCS, offsets[i],
                                      mac[(i * 2) + 1] | (mac[(i * 2)] << 8));
 
-               /* Enable WOL function */
-               ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL,
-                               0, AT803X_WOL_EN);
-               if (ret)
-                       return ret;
+               /* Enable WOL function for 1588 */
+               if (phydev->drv->phy_id == ATH8031_PHY_ID) {
+                       ret = phy_modify_mmd(phydev, MDIO_MMD_PCS,
+                                            AT803X_PHY_MMD3_WOL_CTRL,
+                                            0, AT803X_WOL_EN);
+                       if (ret)
+                               return ret;
+               }
                /* Enable WOL interrupt */
                ret = phy_modify(phydev, AT803X_INTR_ENABLE, 0, AT803X_INTR_ENABLE_WOL);
                if (ret)
                        return ret;
        } else {
-               /* Disable WoL function */
-               ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL,
-                               AT803X_WOL_EN, 0);
-               if (ret)
-                       return ret;
+               /* Disable WoL function for 1588 */
+               if (phydev->drv->phy_id == ATH8031_PHY_ID) {
+                       ret = phy_modify_mmd(phydev, MDIO_MMD_PCS,
+                                            AT803X_PHY_MMD3_WOL_CTRL,
+                                            AT803X_WOL_EN, 0);
+                       if (ret)
+                               return ret;
+               }
                /* Disable WOL interrupt */
                ret = phy_modify(phydev, AT803X_INTR_ENABLE, AT803X_INTR_ENABLE_WOL, 0);
                if (ret)
@@ -508,11 +514,11 @@ static void at803x_get_wol(struct phy_device *phydev,
        wol->supported = WAKE_MAGIC;
        wol->wolopts = 0;
 
-       value = phy_read_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL);
+       value = phy_read(phydev, AT803X_INTR_ENABLE);
        if (value < 0)
                return;
 
-       if (value & AT803X_WOL_EN)
+       if (value & AT803X_INTR_ENABLE_WOL)
                wol->wolopts |= WAKE_MAGIC;
 }
 
@@ -858,9 +864,6 @@ static int at803x_probe(struct phy_device *phydev)
        if (phydev->drv->phy_id == ATH8031_PHY_ID) {
                int ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG);
                int mode_cfg;
-               struct ethtool_wolinfo wol = {
-                       .wolopts = 0,
-               };
 
                if (ccr < 0)
                        return ccr;
@@ -877,12 +880,14 @@ static int at803x_probe(struct phy_device *phydev)
                        break;
                }
 
-               /* Disable WOL by default */
-               ret = at803x_set_wol(phydev, &wol);
-               if (ret < 0) {
-                       phydev_err(phydev, "failed to disable WOL on probe: %d\n", ret);
+               /* Disable WoL in 1588 register which is enabled
+                * by default
+                */
+               ret = phy_modify_mmd(phydev, MDIO_MMD_PCS,
+                                    AT803X_PHY_MMD3_WOL_CTRL,
+                                    AT803X_WOL_EN, 0);
+               if (ret)
                        return ret;
-               }
        }
 
        return 0;
@@ -2059,8 +2064,6 @@ static struct phy_driver at803x_driver[] = {
        .flags                  = PHY_POLL_CABLE_TEST,
        .config_init            = at803x_config_init,
        .link_change_notify     = at803x_link_change_notify,
-       .set_wol                = at803x_set_wol,
-       .get_wol                = at803x_get_wol,
        .suspend                = at803x_suspend,
        .resume                 = at803x_resume,
        /* PHY_BASIC_FEATURES */
index 25f0191..100339b 100644 (file)
@@ -1594,7 +1594,7 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
        if (zerocopy)
                return false;
 
-       if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+       if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
            SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
                return false;
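
The tun check now also budgets for XDP_PACKET_HEADROOM: a small packet built on this path must fit the XDP headroom, the padded payload and the skb_shared_info tail in a single page, otherwise the XDP program could run past the buffer. A quick arithmetic sketch of that bound; all constants here are illustrative stand-ins, the real values come from the kernel headers:

#include <stdio.h>

#define PAGE_SIZE_EX           4096UL
#define XDP_PACKET_HEADROOM_EX  256UL  /* illustrative */
#define TUN_RX_PAD_EX            96UL  /* illustrative */
#define SHARED_INFO_EX          320UL  /* rough sizeof(struct skb_shared_info) */

/* Cache-line rounding, standing in for SKB_DATA_ALIGN(). */
static unsigned long align_up(unsigned long v)
{
        return (v + 63) & ~63UL;
}

int main(void)
{
        unsigned long len = 3000;
        unsigned long need;

        need = align_up(len + TUN_RX_PAD_EX + XDP_PACKET_HEADROOM_EX) +
               align_up(SHARED_INFO_EX);

        printf("need %lu of %lu bytes: %s\n", need, PAGE_SIZE_EX,
               need > PAGE_SIZE_EX ? "fall back to a regular skb"
                                   : "can build in one page");
        return 0;
}
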
 
index a3de081..c3ff30a 100644 (file)
@@ -713,6 +713,12 @@ static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan,
        return vninode;
 }
 
+static void vxlan_vni_free(struct vxlan_vni_node *vninode)
+{
+       free_percpu(vninode->stats);
+       kfree(vninode);
+}
+
 static int vxlan_vni_add(struct vxlan_dev *vxlan,
                         struct vxlan_vni_group *vg,
                         u32 vni, union vxlan_addr *group,
@@ -740,7 +746,7 @@ static int vxlan_vni_add(struct vxlan_dev *vxlan,
                                            &vninode->vnode,
                                            vxlan_vni_rht_params);
        if (err) {
-               kfree(vninode);
+               vxlan_vni_free(vninode);
                return err;
        }
 
@@ -763,8 +769,7 @@ static void vxlan_vni_node_rcu_free(struct rcu_head *rcu)
        struct vxlan_vni_node *v;
 
        v = container_of(rcu, struct vxlan_vni_node, rcu);
-       free_percpu(v->stats);
-       kfree(v);
+       vxlan_vni_free(v);
 }
 
 static int vxlan_vni_del(struct vxlan_dev *vxlan,
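
vxlan_vni_free() gives the error path in vxlan_vni_add() and the RCU callback one common teardown, where previously the error path freed the node alone and leaked the per-cpu stats. A small sketch of the shared-free-helper pattern under that reading, with malloc()/free() standing in for the kernel allocators:

#include <stdlib.h>

struct vni_node {
        long *stats;    /* stands in for the per-cpu stats allocation */
};

/* One teardown helper used by every path, so none of them can forget
 * the embedded allocation. */
static void vni_free(struct vni_node *n)
{
        free(n->stats);
        free(n);
}

static struct vni_node *vni_add(int fail_insert)
{
        struct vni_node *n = calloc(1, sizeof(*n));

        if (!n)
                return NULL;
        n->stats = calloc(16, sizeof(*n->stats));
        if (!n->stats || fail_insert) {
                vni_free(n);            /* not a bare free(n) */
                return NULL;
        }
        return n;
}

int main(void)
{
        struct vni_node *n = vni_add(0);

        if (n)
                vni_free(n);
        return 0;
}
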
index 5bf7822..0ba714c 100644 (file)
@@ -6,7 +6,7 @@
 #include "allowedips.h"
 #include "peer.h"
 
-enum { MAX_ALLOWEDIPS_BITS = 128 };
+enum { MAX_ALLOWEDIPS_DEPTH = 129 };
 
 static struct kmem_cache *node_cache;
 
@@ -42,7 +42,7 @@ static void push_rcu(struct allowedips_node **stack,
                     struct allowedips_node __rcu *p, unsigned int *len)
 {
        if (rcu_access_pointer(p)) {
-               if (WARN_ON(IS_ENABLED(DEBUG) && *len >= MAX_ALLOWEDIPS_BITS))
+               if (WARN_ON(IS_ENABLED(DEBUG) && *len >= MAX_ALLOWEDIPS_DEPTH))
                        return;
                stack[(*len)++] = rcu_dereference_raw(p);
        }
@@ -55,7 +55,7 @@ static void node_free_rcu(struct rcu_head *rcu)
 
 static void root_free_rcu(struct rcu_head *rcu)
 {
-       struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_BITS] = {
+       struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = {
                container_of(rcu, struct allowedips_node, rcu) };
        unsigned int len = 1;
 
@@ -68,7 +68,7 @@ static void root_free_rcu(struct rcu_head *rcu)
 
 static void root_remove_peer_lists(struct allowedips_node *root)
 {
-       struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_BITS] = { root };
+       struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = { root };
        unsigned int len = 1;
 
        while (len > 0 && (node = stack[--len])) {
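
Renaming MAX_ALLOWEDIPS_BITS (128) to MAX_ALLOWEDIPS_DEPTH (129) matches what the stack actually has to hold: walking a binary trie keyed on 128-bit addresses can push the root plus one node per bit, so a 128-entry stack can be one short. A toy walk showing where the extra slot comes from:

#include <stdio.h>

#define ADDR_BITS 128
#define MAX_DEPTH (ADDR_BITS + 1)  /* the root plus one node per bit */

int main(void)
{
        int depth = 1;  /* the root itself */

        /* Worst case: every one of the 128 bits adds another node on
         * the path being walked, so the stack holds 129 entries. */
        for (int bit = 0; bit < ADDR_BITS; bit++)
                depth++;

        printf("worst-case stack entries: %d (limit %d)\n", depth, MAX_DEPTH);
        return 0;
}
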
index 78ebe28..3d1f64f 100644 (file)
@@ -593,16 +593,20 @@ bool __init wg_allowedips_selftest(void)
        wg_allowedips_remove_by_peer(&t, a, &mutex);
        test_negative(4, a, 192, 168, 0, 1);
 
-       /* These will hit the WARN_ON(len >= MAX_ALLOWEDIPS_BITS) in free_node
+       /* These will hit the WARN_ON(len >= MAX_ALLOWEDIPS_DEPTH) in free_node
         * if something goes wrong.
         */
-       for (i = 0; i < MAX_ALLOWEDIPS_BITS; ++i) {
-               part = cpu_to_be64(~(1LLU << (i % 64)));
-               memset(&ip, 0xff, 16);
-               memcpy((u8 *)&ip + (i < 64) * 8, &part, 8);
+       for (i = 0; i < 64; ++i) {
+               part = cpu_to_be64(~0LLU << i);
+               memset(&ip, 0xff, 8);
+               memcpy((u8 *)&ip + 8, &part, 8);
+               wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
+               memcpy(&ip, &part, 8);
+               memset((u8 *)&ip + 8, 0, 8);
                wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
        }
-
+       memset(&ip, 0, 16);
+       wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
        wg_allowedips_free(&t, &mutex);
 
        wg_allowedips_init(&t);
index 6512267..4928e4e 100644 (file)
@@ -2144,8 +2144,7 @@ int ath12k_wmi_send_scan_start_cmd(struct ath12k *ar,
        struct wmi_tlv *tlv;
        void *ptr;
        int i, ret, len;
-       u32 *tmp_ptr;
-       u8 extraie_len_with_pad = 0;
+       u32 *tmp_ptr, extraie_len_with_pad = 0;
        struct ath12k_wmi_hint_short_ssid_arg *s_ssid = NULL;
        struct ath12k_wmi_hint_bssid_arg *hint_bssid = NULL;
 
index de8a2e2..2a90bb2 100644 (file)
@@ -1456,6 +1456,10 @@ brcmf_run_escan(struct brcmf_cfg80211_info *cfg, struct brcmf_if *ifp,
                params_size -= BRCMF_SCAN_PARAMS_V2_FIXED_SIZE;
                params_size += BRCMF_SCAN_PARAMS_FIXED_SIZE;
                params_v1 = kzalloc(params_size, GFP_KERNEL);
+               if (!params_v1) {
+                       err = -ENOMEM;
+                       goto exit_params;
+               }
                params_v1->version = cpu_to_le32(BRCMF_ESCAN_REQ_VERSION);
                brcmf_scan_params_v2_to_v1(&params->params_v2_le, &params_v1->params_le);
                kfree(params);
@@ -1473,6 +1477,7 @@ brcmf_run_escan(struct brcmf_cfg80211_info *cfg, struct brcmf_if *ifp,
                        bphy_err(drvr, "error (%d)\n", err);
        }
 
+exit_params:
        kfree(params);
 exit:
        return err;
index b114bab..c93e625 100644 (file)
@@ -2524,7 +2524,7 @@ static int cmac_dma_init(struct rtw89_dev *rtwdev, u8 mac_idx)
        u32 reg;
        int ret;
 
-       if (chip_id != RTL8852A && chip_id != RTL8852B)
+       if (chip_id != RTL8852B)
                return 0;
 
        ret = rtw89_mac_check_mac_en(rtwdev, mac_idx, RTW89_CMAC_SEL);
index c8d20cd..88f760a 100644 (file)
@@ -396,7 +396,7 @@ static void xenvif_get_requests(struct xenvif_queue *queue,
        struct gnttab_map_grant_ref *gop = queue->tx_map_ops + *map_ops;
        struct xen_netif_tx_request *txp = first;
 
-       nr_slots = shinfo->nr_frags + 1;
+       nr_slots = shinfo->nr_frags + frag_overflow + 1;
 
        copy_count(skb) = 0;
        XENVIF_TX_CB(skb)->split_mask = 0;
@@ -462,8 +462,8 @@ static void xenvif_get_requests(struct xenvif_queue *queue,
                }
        }
 
-       for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots;
-            shinfo->nr_frags++, gop++) {
+       for (shinfo->nr_frags = 0; nr_slots > 0 && shinfo->nr_frags < MAX_SKB_FRAGS;
+            shinfo->nr_frags++, gop++, nr_slots--) {
                index = pending_index(queue->pending_cons++);
                pending_idx = queue->pending_ring[index];
                xenvif_tx_create_map_op(queue, pending_idx, txp,
@@ -476,12 +476,12 @@ static void xenvif_get_requests(struct xenvif_queue *queue,
                        txp++;
        }
 
-       if (frag_overflow) {
+       if (nr_slots > 0) {
 
                shinfo = skb_shinfo(nskb);
                frags = shinfo->frags;
 
-               for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+               for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots;
                     shinfo->nr_frags++, txp++, gop++) {
                        index = pending_index(queue->pending_cons++);
                        pending_idx = queue->pending_ring[index];
@@ -492,6 +492,11 @@ static void xenvif_get_requests(struct xenvif_queue *queue,
                }
 
                skb_shinfo(skb)->frag_list = nskb;
+       } else if (nskb) {
+               /* A frag_list skb was allocated but it is no longer needed
+                * because enough slots were converted to copy ops above.
+                */
+               kfree_skb(nskb);
        }
 
        (*copy_ops) = cop - queue->tx_copy_ops;
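
With the fix, nr_slots counts every remaining slot including the overflow ones: the first loop fills the main skb up to MAX_SKB_FRAGS while consuming nr_slots, anything left spills into the preallocated frag_list skb, and if nothing is left that spare skb is freed rather than attached empty. A condensed model of splitting N items between a fixed-size primary array and an optional overflow buffer:

#include <stdio.h>
#include <stdlib.h>

#define MAX_PRIMARY 4   /* stands in for MAX_SKB_FRAGS */

int main(void)
{
        int nr_slots = 3;                         /* try 6 to see overflow used */
        int *overflow = malloc(8 * sizeof(int));  /* preallocated, like nskb */
        int primary_used = 0, overflow_used = 0;

        if (!overflow)
                return 1;

        for (; nr_slots > 0 && primary_used < MAX_PRIMARY; nr_slots--)
                primary_used++;

        if (nr_slots > 0) {
                for (; overflow_used < nr_slots; overflow_used++)
                        overflow[overflow_used] = overflow_used;
        } else {
                free(overflow);                   /* spare buffer not needed */
                overflow = NULL;
        }

        printf("primary=%d overflow=%d\n", primary_used, overflow_used);
        free(overflow);
        return 0;
}
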
index 37b6fa7..f3a01b7 100644 (file)
@@ -3933,6 +3933,12 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
         */
        nvme_mpath_clear_ctrl_paths(ctrl);
 
+       /*
+        * Unquiesce io queues so any pending IO won't hang, especially
+        * those submitted from scan work
+        */
+       nvme_unquiesce_io_queues(ctrl);
+
        /* prevent racing with ns scanning */
        flush_work(&ctrl->scan_work);
 
@@ -3942,10 +3948,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
         * removing the namespaces' disks; fail all the queues now to avoid
         * potentially having to clean up the failed sync later.
         */
-       if (ctrl->state == NVME_CTRL_DEAD) {
+       if (ctrl->state == NVME_CTRL_DEAD)
                nvme_mark_namespaces_dead(ctrl);
-               nvme_unquiesce_io_queues(ctrl);
-       }
 
        /* this is a no-op when called from the controller reset handler */
        nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
index 5c3250f..d39f321 100644 (file)
@@ -786,11 +786,9 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
        if (!(ioucmd->flags & IORING_URING_CMD_POLLED))
                return 0;
 
-       rcu_read_lock();
        req = READ_ONCE(ioucmd->cookie);
        if (req && blk_rq_is_poll(req))
                ret = blk_rq_poll(req, iob, poll_flags);
-       rcu_read_unlock();
        return ret;
 }
 #ifdef CONFIG_NVME_MULTIPATH
index baf69af..2f57da1 100644 (file)
@@ -3402,7 +3402,8 @@ static const struct pci_device_id nvme_id_table[] = {
        { PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x144d, 0xa80b),   /* Samsung PM9B1 256G and 512G */
-               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES |
+                               NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x144d, 0xa809),   /* Samsung MZALQ256HBJD 256G */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x144d, 0xa802),   /* Samsung SM953 */
index d433b2e..337a624 100644 (file)
@@ -883,6 +883,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
                goto out_cleanup_tagset;
 
        if (!new) {
+               nvme_start_freeze(&ctrl->ctrl);
                nvme_unquiesce_io_queues(&ctrl->ctrl);
                if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
                        /*
@@ -891,6 +892,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
                         * to be safe.
                         */
                        ret = -ENODEV;
+                       nvme_unfreeze(&ctrl->ctrl);
                        goto out_wait_freeze_timed_out;
                }
                blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
@@ -940,7 +942,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
                bool remove)
 {
        if (ctrl->ctrl.queue_count > 1) {
-               nvme_start_freeze(&ctrl->ctrl);
                nvme_quiesce_io_queues(&ctrl->ctrl);
                nvme_sync_io_queues(&ctrl->ctrl);
                nvme_rdma_stop_io_queues(ctrl);
index 9ce417c..5b332d9 100644 (file)
@@ -1868,6 +1868,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
                goto out_cleanup_connect_q;
 
        if (!new) {
+               nvme_start_freeze(ctrl);
                nvme_unquiesce_io_queues(ctrl);
                if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
                        /*
@@ -1876,6 +1877,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
                         * to be safe.
                         */
                        ret = -ENODEV;
+                       nvme_unfreeze(ctrl);
                        goto out_wait_freeze_timed_out;
                }
                blk_mq_update_nr_hw_queues(ctrl->tagset,
@@ -1980,7 +1982,6 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
        if (ctrl->queue_count <= 1)
                return;
        nvme_quiesce_admin_queue(ctrl);
-       nvme_start_freeze(ctrl);
        nvme_quiesce_io_queues(ctrl);
        nvme_sync_io_queues(ctrl);
        nvme_tcp_stop_io_queues(ctrl);
index bf3405f..8b1dcd5 100644 (file)
@@ -121,6 +121,8 @@ module_param(sba_reserve_agpgart, int, 0444);
 MODULE_PARM_DESC(sba_reserve_agpgart, "Reserve half of IO pdir as AGPGART");
 #endif
 
+struct proc_dir_entry *proc_runway_root __ro_after_init;
+struct proc_dir_entry *proc_mckinley_root __ro_after_init;
 
 /************************************
 ** SBA register read and write support
@@ -1968,11 +1970,15 @@ static int __init sba_driver_callback(struct parisc_device *dev)
 #ifdef CONFIG_PROC_FS
        switch (dev->id.hversion) {
        case PLUTO_MCKINLEY_PORT:
+               if (!proc_mckinley_root)
+                       proc_mckinley_root = proc_mkdir("bus/mckinley", NULL);
                root = proc_mckinley_root;
                break;
        case ASTRO_RUNWAY_PORT:
        case IKE_MERCED_PORT:
        default:
+               if (!proc_runway_root)
+                       proc_runway_root = proc_mkdir("bus/runway", NULL);
                root = proc_runway_root;
                break;
        }
index 5bc81cc..46b252b 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
+#include <linux/of.h>
 #include <linux/proc_fs.h>
 #include <linux/slab.h>
 
@@ -332,6 +333,7 @@ void __weak pcibios_bus_add_device(struct pci_dev *pdev) { }
  */
 void pci_bus_add_device(struct pci_dev *dev)
 {
+       struct device_node *dn = dev->dev.of_node;
        int retval;
 
        /*
@@ -344,7 +346,7 @@ void pci_bus_add_device(struct pci_dev *dev)
        pci_proc_attach_device(dev);
        pci_bridge_d3_update(dev);
 
-       dev->match_driver = true;
+       dev->match_driver = !dn || of_device_is_available(dn);
        retval = device_attach(&dev->dev);
        if (retval < 0 && retval != -EPROBE_DEFER)
                pci_warn(dev, "device attach failed (%d)\n", retval);
index 8d49bad..0859be8 100644 (file)
@@ -179,7 +179,6 @@ config PCI_MVEBU
        depends on MVEBU_MBUS
        depends on ARM
        depends on OF
-       depends on BROKEN
        select PCI_BRIDGE_EMUL
        help
         Add support for Marvell EBU PCIe controller. This PCIe controller
index cf61733..9952057 100644 (file)
@@ -485,20 +485,15 @@ int dw_pcie_host_init(struct dw_pcie_rp *pp)
        if (ret)
                goto err_remove_edma;
 
-       if (dw_pcie_link_up(pci)) {
-               dw_pcie_print_link_status(pci);
-       } else {
+       if (!dw_pcie_link_up(pci)) {
                ret = dw_pcie_start_link(pci);
                if (ret)
                        goto err_remove_edma;
-
-               if (pci->ops && pci->ops->start_link) {
-                       ret = dw_pcie_wait_for_link(pci);
-                       if (ret)
-                               goto err_stop_link;
-               }
        }
 
+       /* Ignore errors, the link may come up later */
+       dw_pcie_wait_for_link(pci);
+
        bridge->sysdata = pp;
 
        ret = pci_host_probe(bridge);
index c87848c..1f2ee71 100644 (file)
@@ -644,20 +644,9 @@ void dw_pcie_disable_atu(struct dw_pcie *pci, u32 dir, int index)
        dw_pcie_writel_atu(pci, dir, index, PCIE_ATU_REGION_CTRL2, 0);
 }
 
-void dw_pcie_print_link_status(struct dw_pcie *pci)
-{
-       u32 offset, val;
-
-       offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
-       val = dw_pcie_readw_dbi(pci, offset + PCI_EXP_LNKSTA);
-
-       dev_info(pci->dev, "PCIe Gen.%u x%u link up\n",
-                FIELD_GET(PCI_EXP_LNKSTA_CLS, val),
-                FIELD_GET(PCI_EXP_LNKSTA_NLW, val));
-}
-
 int dw_pcie_wait_for_link(struct dw_pcie *pci)
 {
+       u32 offset, val;
        int retries;
 
        /* Check if the link is up or not */
@@ -673,7 +662,12 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci)
                return -ETIMEDOUT;
        }
 
-       dw_pcie_print_link_status(pci);
+       offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+       val = dw_pcie_readw_dbi(pci, offset + PCI_EXP_LNKSTA);
+
+       dev_info(pci->dev, "PCIe Gen.%u x%u link up\n",
+                FIELD_GET(PCI_EXP_LNKSTA_CLS, val),
+                FIELD_GET(PCI_EXP_LNKSTA_NLW, val));
 
        return 0;
 }
index 6156606..79713ce 100644 (file)
@@ -429,7 +429,6 @@ void dw_pcie_setup(struct dw_pcie *pci);
 void dw_pcie_iatu_detect(struct dw_pcie *pci);
 int dw_pcie_edma_detect(struct dw_pcie *pci);
 void dw_pcie_edma_remove(struct dw_pcie *pci);
-void dw_pcie_print_link_status(struct dw_pcie *pci);
 
 static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
 {
index 328d1e4..6011297 100644 (file)
@@ -498,6 +498,7 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
                                acpiphp_native_scan_bridge(dev);
                }
        } else {
+               LIST_HEAD(add_list);
                int max, pass;
 
                acpiphp_rescan_slot(slot);
@@ -511,10 +512,15 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge)
                                if (pass && dev->subordinate) {
                                        check_hotplug_bridge(slot, dev);
                                        pcibios_resource_survey_bus(dev->subordinate);
+                                       if (pci_is_root_bus(bus))
+                                               __pci_bus_size_bridges(dev->subordinate, &add_list);
                                }
                        }
                }
-               pci_assign_unassigned_bridge_resources(bus->self);
+               if (pci_is_root_bus(bus))
+                       __pci_bus_assign_resources(bus, &add_list, NULL);
+               else
+                       pci_assign_unassigned_bridge_resources(bus->self);
        }
 
        acpiphp_sanitize_bus(bus);
index e51219f..3c158b1 100644 (file)
@@ -34,11 +34,6 @@ int pci_set_of_node(struct pci_dev *dev)
        if (!node)
                return 0;
 
-       if (!of_device_is_available(node)) {
-               of_node_put(node);
-               return -ENODEV;
-       }
-
        device_set_node(&dev->dev, of_fwnode_handle(node));
        return 0;
 }
index 1bf3c44..b43fa8b 100644 (file)
@@ -1578,7 +1578,7 @@ const struct file_operations gfs2_file_fops = {
        .fsync          = gfs2_fsync,
        .lock           = gfs2_lock,
        .flock          = gfs2_flock,
-       .splice_read    = filemap_splice_read,
+       .splice_read    = copy_splice_read,
        .splice_write   = gfs2_file_splice_write,
        .setlease       = simple_nosetlease,
        .fallocate      = gfs2_fallocate,
@@ -1609,7 +1609,7 @@ const struct file_operations gfs2_file_fops_nolock = {
        .open           = gfs2_open,
        .release        = gfs2_release,
        .fsync          = gfs2_fsync,
-       .splice_read    = filemap_splice_read,
+       .splice_read    = copy_splice_read,
        .splice_write   = gfs2_file_splice_write,
        .setlease       = generic_setlease,
        .fallocate      = gfs2_fallocate,
index ec16312..7e835be 100644 (file)
@@ -230,9 +230,11 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 {
 
        struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+       struct super_block *sb = sdp->sd_vfs;
        struct gfs2_bufdata *bd;
        struct gfs2_meta_header *mh;
        struct gfs2_trans *tr = current->journal_info;
+       bool withdraw = false;
 
        lock_buffer(bh);
        if (buffer_pinned(bh)) {
@@ -266,13 +268,15 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
                       (unsigned long long)bd->bd_bh->b_blocknr);
                BUG();
        }
-       if (unlikely(test_bit(SDF_FROZEN, &sdp->sd_flags))) {
-               fs_info(sdp, "GFS2:adding buf while frozen\n");
-               gfs2_assert_withdraw(sdp, 0);
-       }
        if (unlikely(gfs2_withdrawn(sdp))) {
                fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
                        (unsigned long long)bd->bd_bh->b_blocknr);
+               goto out_unlock;
+       }
+       if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) {
+               fs_info(sdp, "GFS2:adding buf while frozen\n");
+               withdraw = true;
+               goto out_unlock;
        }
        gfs2_pin(sdp, bd->bd_bh);
        mh->__pad0 = cpu_to_be64(0);
@@ -281,6 +285,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
        tr->tr_num_buf_new++;
 out_unlock:
        gfs2_log_unlock(sdp);
+       if (withdraw)
+               gfs2_assert_withdraw(sdp, 0);
 out:
        unlock_buffer(bh);
 }
index a8ce522..35bc793 100644 (file)
@@ -1101,9 +1101,17 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
 
 int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
 {
+       struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
        struct buffer_head *ibh;
        int err;
 
+       /*
+        * Do not dirty inodes after the log writer has been detached
+        * and its nilfs_root struct has been freed.
+        */
+       if (unlikely(nilfs_purging(nilfs)))
+               return 0;
+
        err = nilfs_load_inode_block(inode, &ibh);
        if (unlikely(err)) {
                nilfs_warn(inode->i_sb,
index c255302..581691e 100644 (file)
@@ -2845,6 +2845,7 @@ void nilfs_detach_log_writer(struct super_block *sb)
                nilfs_segctor_destroy(nilfs->ns_writer);
                nilfs->ns_writer = NULL;
        }
+       set_nilfs_purging(nilfs);
 
        /* Force to free the list of dirty files */
        spin_lock(&nilfs->ns_inode_lock);
@@ -2857,4 +2858,5 @@ void nilfs_detach_log_writer(struct super_block *sb)
        up_write(&nilfs->ns_segctor_sem);
 
        nilfs_dispose_list(nilfs, &garbage_list, 1);
+       clear_nilfs_purging(nilfs);
 }
index 47c7dfb..cd4ae1b 100644 (file)
@@ -29,6 +29,7 @@ enum {
        THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
        THE_NILFS_GC_RUNNING,   /* gc process is running */
        THE_NILFS_SB_DIRTY,     /* super block is dirty */
+       THE_NILFS_PURGING,      /* disposing dirty files for cleanup */
 };
 
 /**
@@ -208,6 +209,7 @@ THE_NILFS_FNS(INIT, init)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
+THE_NILFS_FNS(PURGING, purging)
 
 /*
  * Mount option operations
index 9cb32e1..23fc24d 100644 (file)
@@ -309,6 +309,8 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
 
 static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
+       struct file *file = iocb->ki_filp;
+       char *buf = file->private_data;
        loff_t *fpos = &iocb->ki_pos;
        size_t phdrs_offset, notes_offset, data_offset;
        size_t page_offline_frozen = 1;
@@ -555,10 +557,21 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
                case KCORE_VMEMMAP:
                case KCORE_TEXT:
                        /*
-                        * We use _copy_to_iter() to bypass usermode hardening
-                        * which would otherwise prevent this operation.
+                        * Sadly we must use a bounce buffer here to be able to
+                        * make use of copy_from_kernel_nofault(), as these
+                        * memory regions might not always be mapped on all
+                        * architectures.
                         */
-                       if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+                       if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+                               if (iov_iter_zero(tsz, iter) != tsz) {
+                                       ret = -EFAULT;
+                                       goto out;
+                               }
+                       /*
+                        * We know the bounce buffer is safe to copy from, so
+                        * use _copy_to_iter() directly.
+                        */
+                       } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
                                ret = -EFAULT;
                                goto out;
                        }
@@ -595,6 +608,10 @@ static int open_kcore(struct inode *inode, struct file *filp)
        if (ret)
                return ret;
 
+       filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!filp->private_data)
+               return -ENOMEM;
+
        if (kcore_need_update)
                kcore_update_ram();
        if (i_size_read(inode) != proc_root_kcore->size) {
@@ -605,9 +622,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
        return 0;
 }
 
+static int release_kcore(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       return 0;
+}
+
 static const struct proc_ops kcore_proc_ops = {
        .proc_read_iter = read_kcore_iter,
        .proc_open      = open_kcore,
+       .proc_release   = release_kcore,
        .proc_lseek     = default_llseek,
 };
 
index 33b7e6c..e881df1 100644 (file)
@@ -380,13 +380,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
        }
 
        if (smb2_req_struct_sizes[command] != pdu->StructureSize2) {
-               if (command == SMB2_OPLOCK_BREAK_HE &&
-                   le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 &&
-                   le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) {
+               if (!(command == SMB2_OPLOCK_BREAK_HE &&
+                   (le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_20 ||
+                   le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_21))) {
                        /* special case for SMB2.1 lease break message */
                        ksmbd_debug(SMB,
-                                   "Illegal request size %d for oplock break\n",
-                                   le16_to_cpu(pdu->StructureSize2));
+                               "Illegal request size %u for command %d\n",
+                               le16_to_cpu(pdu->StructureSize2), command);
                        return 1;
                }
        }
index 9849d74..7cc1b0c 100644 (file)
@@ -2324,9 +2324,16 @@ next:
                        break;
                buf_len -= next;
                eabuf = (struct smb2_ea_info *)((char *)eabuf + next);
-               if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength))
+               if (buf_len < sizeof(struct smb2_ea_info)) {
+                       rc = -EINVAL;
                        break;
+               }
 
+               if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength +
+                               le16_to_cpu(eabuf->EaValueLength)) {
+                       rc = -EINVAL;
+                       break;
+               }
        } while (next != 0);
 
        kfree(attr_name);
index aca8290..069a019 100644 (file)
@@ -68,9 +68,9 @@ struct shfl_string {
 
        /** UTF-8 or UTF-16 string. Nul terminated. */
        union {
-               u8 utf8[2];
-               u16 utf16[1];
-               u16 ucs2[1]; /* misnomer, use utf16. */
+               u8 legacy_padding[2];
+               DECLARE_FLEX_ARRAY(u8, utf8);
+               DECLARE_FLEX_ARRAY(u16, utf16);
        } string;
 };
 VMMDEV_ASSERT_SIZE(shfl_string, 6);
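
Switching shfl_string to DECLARE_FLEX_ARRAY() members lets the fortified string and memcpy helpers see a real flexible array instead of a fake bound of one or two bytes. A minimal standalone equivalent of a flexible array inside a union, written out by hand roughly the way the kernel macro expands (the empty struct is the same GNU extension the macro relies on):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct shared_string {
        unsigned short size;
        union {
                unsigned char legacy_padding[2];
                struct {
                        struct { } __empty;  /* keeps the flexible array
                                              * legal, as the macro does */
                        unsigned char utf8[];
                };
        };
};

int main(void)
{
        const char *msg = "hello";
        size_t len = strlen(msg) + 1;
        struct shared_string *s = malloc(sizeof(*s) + len);

        if (!s)
                return 1;
        s->size = (unsigned short)len;
        memcpy(s->utf8, msg, len);      /* bound is the allocation, not [2] */
        printf("%s (%u bytes)\n", (char *)s->utf8, (unsigned int)s->size);
        free(s);
        return 0;
}
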
index c4f5b52..11984ed 100644 (file)
@@ -791,7 +791,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 {
        bio->bi_opf |= REQ_POLLED;
-       if (!is_sync_kiocb(kiocb))
+       if (kiocb->ki_flags & IOCB_NOWAIT)
                bio->bi_opf |= REQ_NOWAIT;
 }
 
index ed44a99..87d94be 100644 (file)
@@ -969,7 +969,6 @@ struct blk_plug {
 
        bool multiple_queues;
        bool has_elevator;
-       bool nowait;
 
        struct list_head cb_list; /* md requires an unplug callback */
 };
index 6e6e57e..23ac87b 100644 (file)
@@ -70,6 +70,8 @@ extern ssize_t cpu_show_mmio_stale_data(struct device *dev,
                                        char *buf);
 extern ssize_t cpu_show_retbleed(struct device *dev,
                                 struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev,
+                                            struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
index 054d791..c163751 100644 (file)
@@ -62,6 +62,7 @@ struct sk_psock_progs {
 
 enum sk_psock_state_bits {
        SK_PSOCK_TX_ENABLED,
+       SK_PSOCK_RX_STRP_ENABLED,
 };
 
 struct sk_psock_link {
index 6a1e8f1..4ee9d13 100644 (file)
@@ -283,6 +283,7 @@ enum tpm_chip_flags {
        TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED    = BIT(6),
        TPM_CHIP_FLAG_FIRMWARE_UPGRADE          = BIT(7),
        TPM_CHIP_FLAG_SUSPENDED                 = BIT(8),
+       TPM_CHIP_FLAG_HWRNG_DISABLED            = BIT(9),
 };
 
 #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
index 7c7d03a..d6fa7c8 100644 (file)
@@ -562,6 +562,9 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
        if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
                return NULL;
 
+       if (iftype == NL80211_IFTYPE_AP_VLAN)
+               iftype = NL80211_IFTYPE_AP;
+
        for (i = 0; i < sband->n_iftype_data; i++)  {
                const struct ieee80211_sband_iftype_data *data =
                        &sband->iftype_data[i];
index 640441a..3587085 100644 (file)
@@ -512,6 +512,7 @@ struct nft_set_elem_expr {
  *
  *     @list: table set list node
  *     @bindings: list of set bindings
+ *     @refs: internal refcounting for async set destruction
  *     @table: table this set belongs to
  *     @net: netnamespace this set belongs to
  *     @name: name of the set
@@ -541,6 +542,7 @@ struct nft_set_elem_expr {
 struct nft_set {
        struct list_head                list;
        struct list_head                bindings;
+       refcount_t                      refs;
        struct nft_table                *table;
        possible_net_t                  net;
        char                            *name;
@@ -562,7 +564,8 @@ struct nft_set {
        struct list_head                pending_update;
        /* runtime data below here */
        const struct nft_set_ops        *ops ____cacheline_aligned;
-       u16                             flags:14,
+       u16                             flags:13,
+                                       dead:1,
                                        genmask:2;
        u8                              klen;
        u8                              dlen;
@@ -596,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
 
 struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
                                            const struct nft_set *set);
-void *nft_set_catchall_gc(const struct nft_set *set);
 
 static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
 {
@@ -813,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
 void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
                                const struct nft_set *set, void *elem);
 
-/**
- *     struct nft_set_gc_batch_head - nf_tables set garbage collection batch
- *
- *     @rcu: rcu head
- *     @set: set the elements belong to
- *     @cnt: count of elements
- */
-struct nft_set_gc_batch_head {
-       struct rcu_head                 rcu;
-       const struct nft_set            *set;
-       unsigned int                    cnt;
-};
-
-#define NFT_SET_GC_BATCH_SIZE  ((PAGE_SIZE -                             \
-                                 sizeof(struct nft_set_gc_batch_head)) / \
-                                sizeof(void *))
-
-/**
- *     struct nft_set_gc_batch - nf_tables set garbage collection batch
- *
- *     @head: GC batch head
- *     @elems: garbage collection elements
- */
-struct nft_set_gc_batch {
-       struct nft_set_gc_batch_head    head;
-       void                            *elems[NFT_SET_GC_BATCH_SIZE];
-};
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
-                                               gfp_t gfp);
-void nft_set_gc_batch_release(struct rcu_head *rcu);
-
-static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb)
-{
-       if (gcb != NULL)
-               call_rcu(&gcb->head.rcu, nft_set_gc_batch_release);
-}
-
-static inline struct nft_set_gc_batch *
-nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb,
-                      gfp_t gfp)
-{
-       if (gcb != NULL) {
-               if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems))
-                       return gcb;
-               nft_set_gc_batch_complete(gcb);
-       }
-       return nft_set_gc_batch_alloc(set, gfp);
-}
-
-static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb,
-                                       void *elem)
-{
-       gcb->elems[gcb->head.cnt++] = elem;
-}
-
 struct nft_expr_ops;
 /**
  *     struct nft_expr_type - nf_tables expression type
@@ -1557,39 +1503,30 @@ static inline void nft_set_elem_change_active(const struct net *net,
 
 #endif /* IS_ENABLED(CONFIG_NF_TABLES) */
 
-/*
- * We use a free bit in the genmask field to indicate the element
- * is busy, meaning it is currently being processed either by
- * the netlink API or GC.
- *
- * Even though the genmask is only a single byte wide, this works
- * because the extension structure if fully constant once initialized,
- * so there are no non-atomic write accesses unless it is already
- * marked busy.
- */
-#define NFT_SET_ELEM_BUSY_MASK (1 << 2)
+#define NFT_SET_ELEM_DEAD_MASK (1 << 2)
 
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_BUSY_BIT  2
+#define NFT_SET_ELEM_DEAD_BIT  2
 #elif defined(__BIG_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_BUSY_BIT  (BITS_PER_LONG - BITS_PER_BYTE + 2)
+#define NFT_SET_ELEM_DEAD_BIT  (BITS_PER_LONG - BITS_PER_BYTE + 2)
 #else
 #error
 #endif
 
-static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext)
+static inline void nft_set_elem_dead(struct nft_set_ext *ext)
 {
        unsigned long *word = (unsigned long *)ext;
 
        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
-       return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word);
+       set_bit(NFT_SET_ELEM_DEAD_BIT, word);
 }
 
-static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
+static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
 {
        unsigned long *word = (unsigned long *)ext;
 
-       clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
+       BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
+       return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
 }
 
 /**
@@ -1732,6 +1669,38 @@ struct nft_trans_flowtable {
 #define nft_trans_flowtable_flags(trans)       \
        (((struct nft_trans_flowtable *)trans->data)->flags)
 
+#define NFT_TRANS_GC_BATCHCOUNT        256
+
+struct nft_trans_gc {
+       struct list_head        list;
+       struct net              *net;
+       struct nft_set          *set;
+       u32                     seq;
+       u8                      count;
+       void                    *priv[NFT_TRANS_GC_BATCHCOUNT];
+       struct rcu_head         rcu;
+};
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+                                       unsigned int gc_seq, gfp_t gfp);
+void nft_trans_gc_destroy(struct nft_trans_gc *trans);
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+                                             unsigned int gc_seq, gfp_t gfp);
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
+
+struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
+                                          unsigned int gc_seq);
+
+void nft_setelem_data_deactivate(const struct net *net,
+                                const struct nft_set *set,
+                                struct nft_set_elem *elem);
+
 int __init nft_chain_filter_init(void);
 void nft_chain_filter_fini(void);
 
@@ -1758,6 +1727,7 @@ struct nftables_pernet {
        struct mutex            commit_mutex;
        u64                     table_handle;
        unsigned int            base_seq;
+       unsigned int            gc_seq;
 };
 
 extern unsigned int nf_tables_net_id;
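
The declarations above replace the old fixed-size nft_set_gc_batch with reference-counted, per-set GC transactions that are validated against gc_seq. As a rough orientation only, the intended call pattern for the asynchronous path (mirroring what the rhash backend does further down in this series) is sketched below; it is not compilable on its own, and for_each_expired_element() is a made-up placeholder for the backend's own walk over its elements.

/* Hedged sketch of the async GC batching API; not standalone kernel code. */
static void example_set_gc(struct nft_set *set, struct nftables_pernet *nft_net)
{
	unsigned int gc_seq = READ_ONCE(nft_net->gc_seq);
	struct nft_trans_gc *gc;
	struct nft_set_ext *ext;
	void *priv;

	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
	if (!gc)
		return;

	for_each_expired_element(set, ext, priv) {	/* placeholder iterator */
		nft_set_elem_dead(ext);			/* mark dead, do not free yet */
		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
		if (!gc)
			return;		/* full batch was queued, fresh alloc failed */
		nft_trans_gc_elem_add(gc, priv);
	}

	/* Also pick up expired catchall elements, then hand the batch over. */
	gc = nft_trans_gc_catchall(gc, gc_seq);
	if (gc)
		nft_trans_gc_queue_async_done(gc);
}
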
index bf06db8..7b1ddff 100644 (file)
@@ -381,6 +381,7 @@ TRACE_EVENT(tcp_cong_state_set,
                __field(const void *, skaddr)
                __field(__u16, sport)
                __field(__u16, dport)
+               __field(__u16, family)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
@@ -396,6 +397,7 @@ TRACE_EVENT(tcp_cong_state_set,
 
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
+               __entry->family = sk->sk_family;
 
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;
@@ -409,7 +411,8 @@ TRACE_EVENT(tcp_cong_state_set,
                __entry->cong_state = ca_state;
        ),
 
-       TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u",
+       TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u",
+                 show_family_name(__entry->family),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
index f4591b9..93db3e4 100644 (file)
@@ -3470,6 +3470,8 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
         * - use the kernel virtual address of the shared io_uring context
         *   (instead of the userspace-provided address, which has to be 0UL
         *   anyway).
+        * - use the same pgoff which the get_unmapped_area() uses to
+        *   calculate the page colouring.
         * For architectures without such aliasing requirements, the
         * architecture will return any suitable mapping because addr is 0.
         */
@@ -3478,6 +3480,7 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
        pgoff = 0;      /* has been translated to ptr above */
 #ifdef SHM_COLOUR
        addr = (uintptr_t) ptr;
+       pgoff = addr >> PAGE_SHIFT;
 #else
        addr = 0UL;
 #endif
index 10ca57f..e3fae26 100644 (file)
@@ -35,9 +35,11 @@ static bool io_openat_force_async(struct io_open *open)
 {
        /*
         * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
-        * it'll always -EAGAIN
+        * it'll always -EAGAIN. Note that we test for __O_TMPFILE because
+        * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force
+        * async for.
         */
-       return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE);
+       return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE);
 }
 
 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
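
For context on the mask change above: in the generic kernel UAPI, O_TMPFILE is defined as (__O_TMPFILE | O_DIRECTORY), so testing against the full O_TMPFILE mask also matches a plain O_DIRECTORY open. The small standalone program below illustrates this with the asm-generic octal values current at the time of this merge; these constants are architecture dependent, so treat them as illustrative only.

#include <stdio.h>

/* Illustrative asm-generic UAPI values; several architectures use different ones. */
#define O_DIRECTORY	00200000
#define __O_TMPFILE	020000000
#define O_TMPFILE	(__O_TMPFILE | O_DIRECTORY)

int main(void)
{
	unsigned int dir_open = O_DIRECTORY;	/* a plain directory open */

	printf("dir_open & O_TMPFILE   = %#o\n", dir_open & O_TMPFILE);	/* non-zero */
	printf("dir_open & __O_TMPFILE = %#o\n", dir_open & __O_TMPFILE);	/* zero */
	return 0;
}
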
index e1b4bfa..2b4a946 100644 (file)
@@ -1166,7 +1166,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
        int error;
 
        if (!hibernation_available())
-               return 0;
+               return n;
 
        if (len && buf[len-1] == '\n')
                len--;
index 02a8f40..800b420 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/kvm_para.h>
+#include <linux/delay.h>
 
 #include "workqueue_internal.h"
 
@@ -338,8 +339,10 @@ static cpumask_var_t *wq_numa_possible_cpumask;
  * Per-cpu work items which run for longer than the following threshold are
  * automatically considered CPU intensive and excluded from concurrency
  * management to prevent them from noticeably delaying other per-cpu work items.
+ * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
+ * The actual value is initialized in wq_cpu_intensive_thresh_init().
  */
-static unsigned long wq_cpu_intensive_thresh_us = 10000;
+static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
 module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
 
 static bool wq_disable_numa;
@@ -6513,6 +6516,42 @@ void __init workqueue_init_early(void)
               !system_freezable_power_efficient_wq);
 }
 
+static void __init wq_cpu_intensive_thresh_init(void)
+{
+       unsigned long thresh;
+       unsigned long bogo;
+
+       /* if the user set it to a specific value, keep it */
+       if (wq_cpu_intensive_thresh_us != ULONG_MAX)
+               return;
+
+       /*
+        * The default of 10ms is derived from the fact that most modern (as of
+        * 2023) processors can do a lot in 10ms and that it's just below what
+        * most consider human-perceivable. However, the kernel also runs on
+        * much slower CPUs, including microcontrollers, where the threshold is
+        * way too low.
+        *
+        * Let's scale the threshold up to 1 second if BogoMIPS is below 4000.
+        * This is by no means accurate but it doesn't have to be. The mechanism
+        * is still useful even when the threshold is fully scaled up. Also, as
+        * the reports would usually be applicable to everyone, the fact that
+        * some machines operate with longer thresholds won't significantly
+        * diminish their usefulness.
+        */
+       thresh = 10 * USEC_PER_MSEC;
+
+       /* see init/calibrate.c for lpj -> BogoMIPS calculation */
+       bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
+       if (bogo < 4000)
+               thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
+
+       pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
+                loops_per_jiffy, bogo, thresh);
+
+       wq_cpu_intensive_thresh_us = thresh;
+}
+
 /**
  * workqueue_init - bring workqueue subsystem fully online
  *
@@ -6528,6 +6567,8 @@ void __init workqueue_init(void)
        struct worker_pool *pool;
        int cpu, bkt;
 
+       wq_cpu_intensive_thresh_init();
+
        /*
         * It'd be simpler to initialize NUMA in workqueue_init_early() but
         * CPU to node mapping may not be available that early on some
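
To make the scaling in wq_cpu_intensive_thresh_init() concrete, the standalone model below reproduces the arithmetic with made-up loops_per_jiffy and HZ inputs; only the formula is taken from the code above.

#include <stdio.h>

#define USEC_PER_MSEC	1000UL
#define USEC_PER_SEC	1000000UL

/* Mirrors wq_cpu_intensive_thresh_init(): BogoMIPS ~= loops_per_jiffy / 500000 * HZ. */
static unsigned long thresh_us(unsigned long loops_per_jiffy, unsigned long hz)
{
	unsigned long thresh = 10 * USEC_PER_MSEC;
	unsigned long bogo = loops_per_jiffy / 500000 * hz;

	if (bogo < 1)
		bogo = 1;
	if (bogo < 4000) {
		thresh = thresh * 4000 / bogo;
		if (thresh > USEC_PER_SEC)
			thresh = USEC_PER_SEC;
	}
	return thresh;
}

int main(void)
{
	/* ~4800 BogoMIPS, e.g. a desktop CPU: the 10ms default is kept. */
	printf("fast: %lu us\n", thresh_us(24000000UL, 100));
	/* ~200 BogoMIPS, e.g. a slow embedded CPU: scaled up to 200ms. */
	printf("slow: %lu us\n", thresh_us(1000000UL, 100));
	return 0;
}
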
index fbc89ba..d679851 100644 (file)
@@ -1200,7 +1200,7 @@ config WQ_CPU_INTENSIVE_REPORT
        help
          Say Y here to enable reporting of concurrency-managed per-cpu work
          items that hog CPUs for longer than
-         workqueue.cpu_intensive_threshold_us. Workqueue automatically
+         workqueue.cpu_intensive_thresh_us. Workqueue automatically
          detects and excludes them from concurrency management to prevent
          them from stalling other per-cpu work items. Occasional
          triggering may not necessarily indicate a problem. Repeated
index e86231a..c65566b 100644 (file)
@@ -1148,7 +1148,7 @@ static ssize_t extract_user_to_sg(struct iov_iter *iter,
 
 failed:
        while (sgtable->nents > sgtable->orig_nents)
-               put_page(sg_page(&sgtable->sgl[--sgtable->nents]));
+               unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents]));
        return res;
 }
 
index dbc9f86..eacca27 100644 (file)
@@ -912,11 +912,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
                /*
                 * Check if the pageblock has already been marked skipped.
-                * Only the aligned PFN is checked as the caller isolates
+                * Only the first PFN is checked as the caller isolates
                 * COMPACT_CLUSTER_MAX at a time so the second call must
                 * not falsely conclude that the block should be skipped.
                 */
-               if (!valid_page && pageblock_aligned(low_pfn)) {
+               if (!valid_page && (pageblock_aligned(low_pfn) ||
+                                   low_pfn == cc->zone->zone_start_pfn)) {
                        if (!isolation_suitable(cc, page)) {
                                low_pfn = end_pfn;
                                folio = NULL;
@@ -2002,7 +2003,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
                 * before making it "skip" so other compaction instances do
                 * not scan the same block.
                 */
-               if (pageblock_aligned(low_pfn) &&
+               if ((pageblock_aligned(low_pfn) ||
+                    low_pfn == cc->zone->zone_start_pfn) &&
                    !fast_find_block && !isolation_suitable(cc, page))
                        continue;
 
index 91cff7f..eb95809 100644 (file)
@@ -273,6 +273,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
                return NULL;
        filter->type = type;
        filter->matching = matching;
+       INIT_LIST_HEAD(&filter->list);
        return filter;
 }
 
index 64a3239..6da626b 100644 (file)
@@ -1579,9 +1579,37 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
                                                unsigned int order) { }
 #endif
 
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+                                               struct folio *folio)
+{
+       lockdep_assert_held(&hugetlb_lock);
+
+       /*
+        * Very subtle
+        *
+        * For non-gigantic pages set the destructor to the normal compound
+        * page dtor.  This is needed in case someone takes an additional
+        * temporary ref to the page, and freeing is delayed until they drop
+        * their reference.
+        *
+        * For gigantic pages set the destructor to the null dtor.  This
+        * destructor will never be called.  Before freeing the gigantic
+        * page destroy_compound_gigantic_folio will turn the folio into a
+        * simple group of pages.  After this the destructor does not
+        * apply.
+        *
+        */
+       if (hstate_is_gigantic(h))
+               folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+       else
+               folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+
 /*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page.  Otherwise, wait until after allocating vmemmap
+ * to update dtor.
  *
  * A reference is held on the folio, except in the case of demote.
  *
@@ -1612,31 +1640,19 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
        }
 
        /*
-        * Very subtle
-        *
-        * For non-gigantic pages set the destructor to the normal compound
-        * page dtor.  This is needed in case someone takes an additional
-        * temporary ref to the page, and freeing is delayed until they drop
-        * their reference.
-        *
-        * For gigantic pages set the destructor to the null dtor.  This
-        * destructor will never be called.  Before freeing the gigantic
-        * page destroy_compound_gigantic_folio will turn the folio into a
-        * simple group of pages.  After this the destructor does not
-        * apply.
-        *
-        * This handles the case where more than one ref is held when and
-        * after update_and_free_hugetlb_folio is called.
-        *
-        * In the case of demote we do not ref count the page as it will soon
-        * be turned into a page of smaller size.
+        * We can only clear the hugetlb destructor after allocating vmemmap
+        * pages.  Otherwise, someone (memory error handling) may try to write
+        * to tail struct pages.
+        */
+       if (!folio_test_hugetlb_vmemmap_optimized(folio))
+               __clear_hugetlb_destructor(h, folio);
+
+        /*
+         * In the case of demote we do not ref count the page as it will soon
+         * be turned into a page of smaller size.
         */
        if (!demote)
                folio_ref_unfreeze(folio, 1);
-       if (hstate_is_gigantic(h))
-               folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
-       else
-               folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
 
        h->nr_huge_pages--;
        h->nr_huge_pages_node[nid]--;
@@ -1705,6 +1721,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 {
        int i;
        struct page *subpage;
+       bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
 
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
@@ -1735,6 +1752,16 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
        if (unlikely(folio_test_hwpoison(folio)))
                folio_clear_hugetlb_hwpoison(folio);
 
+       /*
+        * If vmemmap pages were allocated above, then we need to clear the
+        * hugetlb destructor under the hugetlb lock.
+        */
+       if (clear_dtor) {
+               spin_lock_irq(&hugetlb_lock);
+               __clear_hugetlb_destructor(h, folio);
+               spin_unlock_irq(&hugetlb_lock);
+       }
+
        for (i = 0; i < pages_per_huge_page(h); i++) {
                subpage = folio_page(folio, i);
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
index ba26635..d20d766 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2784,6 +2784,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
                        anon_vma->root == vma->anon_vma->root) {
                return page;            /* still no need to copy it */
        }
+       if (PageHWPoison(page))
+               return ERR_PTR(-EHWPOISON);
        if (!PageUptodate(page))
                return page;            /* let do_swap_page report the error */
 
index ece5d48..9a28503 100644 (file)
@@ -2466,7 +2466,7 @@ int unpoison_memory(unsigned long pfn)
 {
        struct folio *folio;
        struct page *p;
-       int ret = -EBUSY;
+       int ret = -EBUSY, ghp;
        unsigned long count = 1;
        bool huge = false;
        static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -2499,6 +2499,13 @@ int unpoison_memory(unsigned long pfn)
                goto unlock_mutex;
        }
 
+       if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+               goto unlock_mutex;
+
+       /*
+        * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+        * in folio_mapped() has to be done after folio_test_slab() is checked.
+        */
        if (folio_mapped(folio)) {
                unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
                                 pfn, &unpoison_rs);
@@ -2511,32 +2518,28 @@ int unpoison_memory(unsigned long pfn)
                goto unlock_mutex;
        }
 
-       if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
-               goto unlock_mutex;
-
-       ret = get_hwpoison_page(p, MF_UNPOISON);
-       if (!ret) {
+       ghp = get_hwpoison_page(p, MF_UNPOISON);
+       if (!ghp) {
                if (PageHuge(p)) {
                        huge = true;
                        count = folio_free_raw_hwp(folio, false);
-                       if (count == 0) {
-                               ret = -EBUSY;
+                       if (count == 0)
                                goto unlock_mutex;
-                       }
                }
                ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
-       } else if (ret < 0) {
-               if (ret == -EHWPOISON) {
+       } else if (ghp < 0) {
+               if (ghp == -EHWPOISON) {
                        ret = put_page_back_buddy(p) ? 0 : -EBUSY;
-               } else
+               } else {
+                       ret = ghp;
                        unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
                                         pfn, &unpoison_rs);
+               }
        } else {
                if (PageHuge(p)) {
                        huge = true;
                        count = folio_free_raw_hwp(folio, false);
                        if (count == 0) {
-                               ret = -EBUSY;
                                folio_put(folio);
                                goto unlock_mutex;
                        }
index 603b2f4..1ec1ef3 100644 (file)
@@ -5705,6 +5705,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
        if (mmap_read_lock_killable(mm))
                return 0;
 
+       /* Untag the address before looking up the VMA */
+       addr = untagged_addr_remote(mm, addr);
+
        /* Avoid triggering the temporary warning in __get_user_pages */
        if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
                return 0;
index 8e6dde6..b15112b 100644 (file)
@@ -1746,7 +1746,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        struct page *swapcache;
        spinlock_t *ptl;
        pte_t *pte, new_pte, old_pte;
-       bool hwposioned = false;
+       bool hwpoisoned = PageHWPoison(page);
        int ret = 1;
 
        swapcache = page;
@@ -1754,7 +1754,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        if (unlikely(!page))
                return -ENOMEM;
        else if (unlikely(PTR_ERR(page) == -EHWPOISON))
-               hwposioned = true;
+               hwpoisoned = true;
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
@@ -1765,11 +1765,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 
        old_pte = ptep_get(pte);
 
-       if (unlikely(hwposioned || !PageUptodate(page))) {
+       if (unlikely(hwpoisoned || !PageUptodate(page))) {
                swp_entry_t swp_entry;
 
                dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-               if (hwposioned) {
+               if (hwpoisoned) {
                        swp_entry = make_hwpoison_entry(swapcache);
                        page = swapcache;
                } else {
index 3f05797..32916d2 100644 (file)
@@ -1798,6 +1798,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 
 static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 {
+       struct zs_pool *pool;
        struct zspage *zspage;
 
        /*
@@ -1807,9 +1808,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
        VM_BUG_ON_PAGE(PageIsolated(page), page);
 
        zspage = get_zspage(page);
-       migrate_write_lock(zspage);
+       pool = zspage->pool;
+       spin_lock(&pool->lock);
        inc_zspage_isolation(zspage);
-       migrate_write_unlock(zspage);
+       spin_unlock(&pool->lock);
 
        return true;
 }
@@ -1875,12 +1877,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
        kunmap_atomic(s_addr);
 
        replace_sub_page(class, zspage, newpage, page);
+       dec_zspage_isolation(zspage);
        /*
         * Since we complete the data copy and set up new zspage structure,
         * it's okay to release the pool's lock.
         */
        spin_unlock(&pool->lock);
-       dec_zspage_isolation(zspage);
        migrate_write_unlock(zspage);
 
        get_page(newpage);
@@ -1897,14 +1899,16 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 
 static void zs_page_putback(struct page *page)
 {
+       struct zs_pool *pool;
        struct zspage *zspage;
 
        VM_BUG_ON_PAGE(!PageIsolated(page), page);
 
        zspage = get_zspage(page);
-       migrate_write_lock(zspage);
+       pool = zspage->pool;
+       spin_lock(&pool->lock);
        dec_zspage_isolation(zspage);
-       migrate_write_unlock(zspage);
+       spin_unlock(&pool->lock);
 }
 
 static const struct movable_operations zsmalloc_mops = {
index e40aa3e..b366211 100644 (file)
@@ -384,8 +384,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
                        dev->name);
                vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
        }
-       if (event == NETDEV_DOWN &&
-           (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+       if (event == NETDEV_DOWN)
                vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
 
        vlan_info = rtnl_dereference(dev->vlan_info);
index 06ba0e5..28a5959 100644 (file)
@@ -4116,12 +4116,6 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
        if (unlikely(data_end > data_hard_end))
                return -EINVAL;
 
-       /* ALL drivers MUST init xdp->frame_sz, chicken check below */
-       if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
-               WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
-               return -EINVAL;
-       }
-
        if (unlikely(data_end < xdp->data + ETH_HLEN))
                return -EINVAL;
 
index a29508e..ef1a2eb 100644 (file)
@@ -1120,13 +1120,19 @@ static void sk_psock_strp_data_ready(struct sock *sk)
 
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
 {
+       int ret;
+
        static const struct strp_callbacks cb = {
                .rcv_msg        = sk_psock_strp_read,
                .read_sock_done = sk_psock_strp_read_done,
                .parse_msg      = sk_psock_strp_parse,
        };
 
-       return strp_init(&psock->strp, sk, &cb);
+       ret = strp_init(&psock->strp, sk, &cb);
+       if (!ret)
+               sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+
+       return ret;
 }
 
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
@@ -1154,7 +1160,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
 static void sk_psock_done_strp(struct sk_psock *psock)
 {
        /* Parser has been stopped */
-       if (psock->progs.stream_parser)
+       if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED))
                strp_done(&psock->strp);
 }
 #else
index 6d4f28e..732fc37 100644 (file)
@@ -1778,7 +1778,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
                spin_unlock(&sk->sk_peer_lock);
 
                if (!peer_pid)
-                       return -ESRCH;
+                       return -ENODATA;
 
                pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
                put_pid(peer_pid);
index 08ab108..8f07fea 100644 (file)
@@ -146,13 +146,13 @@ static void sock_map_del_link(struct sock *sk,
        list_for_each_entry_safe(link, tmp, &psock->link, list) {
                if (link->link_raw == link_raw) {
                        struct bpf_map *map = link->map;
-                       struct bpf_stab *stab = container_of(map, struct bpf_stab,
-                                                            map);
-                       if (psock->saved_data_ready && stab->progs.stream_parser)
+                       struct sk_psock_progs *progs = sock_map_progs(map);
+
+                       if (psock->saved_data_ready && progs->stream_parser)
                                strp_stop = true;
-                       if (psock->saved_data_ready && stab->progs.stream_verdict)
+                       if (psock->saved_data_ready && progs->stream_verdict)
                                verdict_stop = true;
-                       if (psock->saved_data_ready && stab->progs.skb_verdict)
+                       if (psock->saved_data_ready && progs->skb_verdict)
                                verdict_stop = true;
                        list_del(&link->list);
                        sk_psock_free_link(link);
index b8a2473..fd2eb14 100644 (file)
@@ -187,7 +187,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 
        /* And store cached results */
        icsk->icsk_pmtu_cookie = pmtu;
-       dp->dccps_mss_cache = cur_mps;
+       WRITE_ONCE(dp->dccps_mss_cache, cur_mps);
 
        return cur_mps;
 }
index f331e59..4e3266e 100644 (file)
@@ -630,7 +630,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
                return dccp_getsockopt_service(sk, len,
                                               (__be32 __user *)optval, optlen);
        case DCCP_SOCKOPT_GET_CUR_MPS:
-               val = dp->dccps_mss_cache;
+               val = READ_ONCE(dp->dccps_mss_cache);
                break;
        case DCCP_SOCKOPT_AVAILABLE_CCIDS:
                return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
@@ -739,7 +739,7 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
        trace_dccp_probe(sk, len);
 
-       if (len > dp->dccps_mss_cache)
+       if (len > READ_ONCE(dp->dccps_mss_cache))
                return -EMSGSIZE;
 
        lock_sock(sk);
@@ -772,6 +772,12 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
                goto out_discard;
        }
 
+       /* We need to check dccps_mss_cache after socket is locked. */
+       if (len > dp->dccps_mss_cache) {
+               rc = -EMSGSIZE;
+               goto out_discard;
+       }
+
        skb_reserve(skb, sk->sk_prot->max_header);
        rc = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (rc != 0)
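
The dccps_mss_cache changes above follow the usual pattern for a field written under the socket lock but read locklessly: the writer uses WRITE_ONCE(), lockless readers use READ_ONCE(), and the sendmsg() path re-checks the value once it holds the lock. The fragment below models that pattern in plain C; the WRITE_ONCE/READ_ONCE definitions are simplified userspace stand-ins, not the kernel macros.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's READ_ONCE/WRITE_ONCE. */
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

static uint32_t mss_cache = 1400;	/* models dp->dccps_mss_cache */

/* Writer side (dccp_sync_mss) publishes the new value with WRITE_ONCE. */
static void sync_mss(uint32_t new_mss)
{
	WRITE_ONCE(mss_cache, new_mss);
}

/* Lockless reader (getsockopt, early sendmsg length check) uses READ_ONCE;
 * the real sendmsg path re-checks the length after lock_sock().
 */
static int sendmsg_len_ok(size_t len)
{
	return len <= READ_ONCE(mss_cache);
}

int main(void)
{
	sync_mss(1200);
	printf("%d\n", sendmsg_len_ok(1300));	/* 0: larger than the cached MSS */
	printf("%d\n", sendmsg_len_ok(1000));	/* 1: fits */
	return 0;
}
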
index 92c02c8..586b1b3 100644 (file)
@@ -224,7 +224,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
                .un.frag.__unused       = 0,
                .un.frag.mtu            = htons(mtu),
        };
-       icmph->checksum = ip_compute_csum(icmph, len);
+       icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
        skb_reset_transport_header(skb);
 
        niph = skb_push(skb, sizeof(*niph));
index f95142e..be5498f 100644 (file)
@@ -3221,13 +3221,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
                                     &rtm_dump_nexthop_cb, &filter);
        if (err < 0) {
                if (likely(skb->len))
-                       goto out;
-               goto out_err;
+                       err = skb->len;
        }
 
-out:
-       err = skb->len;
-out_err:
        cb->seq = net->nexthop.seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        return err;
@@ -3367,25 +3363,19 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
                    dd->filter.res_bucket_nh_id != nhge->nh->id)
                        continue;
 
+               dd->ctx->bucket_index = bucket_index;
                err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
                                         RTM_NEWNEXTHOPBUCKET, portid,
                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                         cb->extack);
-               if (err < 0) {
-                       if (likely(skb->len))
-                               goto out;
-                       goto out_err;
-               }
+               if (err)
+                       return err;
        }
 
        dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
-       bucket_index = 0;
+       dd->ctx->bucket_index = 0;
 
-out:
-       err = skb->len;
-out_err:
-       dd->ctx->bucket_index = bucket_index;
-       return err;
+       return 0;
 }
 
 static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
@@ -3434,13 +3424,9 @@ static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
 
        if (err < 0) {
                if (likely(skb->len))
-                       goto out;
-               goto out_err;
+                       err = skb->len;
        }
 
-out:
-       err = skb->len;
-out_err:
        cb->seq = net->nexthop.seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        return err;
index 18634eb..a42be96 100644 (file)
@@ -197,7 +197,8 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
 static inline int ndisc_is_useropt(const struct net_device *dev,
                                   struct nd_opt_hdr *opt)
 {
-       return opt->nd_opt_type == ND_OPT_RDNSS ||
+       return opt->nd_opt_type == ND_OPT_PREFIX_INFO ||
+               opt->nd_opt_type == ND_OPT_RDNSS ||
                opt->nd_opt_type == ND_OPT_DNSSL ||
                opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
                opt->nd_opt_type == ND_OPT_PREF64 ||
index 3317d1c..d806585 100644 (file)
@@ -2335,7 +2335,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 
        lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
 
-       if (flags & MPTCP_CF_FASTCLOSE) {
+       if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
                /* be sure to force the tcp_disconnect() path,
                 * to generate the egress reset
                 */
@@ -3328,7 +3328,7 @@ static void mptcp_release_cb(struct sock *sk)
 
        if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
                __mptcp_clean_una_wakeup(sk);
-       if (unlikely(&msk->cb_flags)) {
+       if (unlikely(msk->cb_flags)) {
                /* be sure to set the current sk state before tacking actions
                 * depending on sk_state, that is processing MPTCP_ERROR_REPORT
                 */
index 37fbe22..ba2a873 100644 (file)
@@ -325,7 +325,6 @@ struct mptcp_sock {
        u32             subflow_id;
        u32             setsockopt_seq;
        char            ca_name[TCP_CA_NAME_MAX];
-       struct mptcp_sock       *dl_next;
 };
 
 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
index 9ee3b7a..94ae7dd 100644 (file)
@@ -1793,16 +1793,31 @@ static void subflow_state_change(struct sock *sk)
 void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
 {
        struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
-       struct mptcp_sock *msk, *next, *head = NULL;
-       struct request_sock *req;
-       struct sock *sk;
+       struct request_sock *req, *head, *tail;
+       struct mptcp_subflow_context *subflow;
+       struct sock *sk, *ssk;
 
-       /* build a list of all unaccepted mptcp sockets */
+       /* Due to lock dependencies, no relevant lock can be acquired under rskq_lock.
+        * Splice the req list, so that accept() cannot reach the pending ssk after
+        * the listener socket is released below.
+        */
        spin_lock_bh(&queue->rskq_lock);
-       for (req = queue->rskq_accept_head; req; req = req->dl_next) {
-               struct mptcp_subflow_context *subflow;
-               struct sock *ssk = req->sk;
+       head = queue->rskq_accept_head;
+       tail = queue->rskq_accept_tail;
+       queue->rskq_accept_head = NULL;
+       queue->rskq_accept_tail = NULL;
+       spin_unlock_bh(&queue->rskq_lock);
+
+       if (!head)
+               return;
 
+       /* can't acquire the msk socket lock under the subflow one,
+        * or will cause ABBA deadlock
+        */
+       release_sock(listener_ssk);
+
+       for (req = head; req; req = req->dl_next) {
+               ssk = req->sk;
                if (!sk_is_mptcp(ssk))
                        continue;
 
@@ -1810,32 +1825,10 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s
                if (!subflow || !subflow->conn)
                        continue;
 
-               /* skip if already in list */
                sk = subflow->conn;
-               msk = mptcp_sk(sk);
-               if (msk->dl_next || msk == head)
-                       continue;
-
                sock_hold(sk);
-               msk->dl_next = head;
-               head = msk;
-       }
-       spin_unlock_bh(&queue->rskq_lock);
-       if (!head)
-               return;
-
-       /* can't acquire the msk socket lock under the subflow one,
-        * or will cause ABBA deadlock
-        */
-       release_sock(listener_ssk);
-
-       for (msk = head; msk; msk = next) {
-               sk = (struct sock *)msk;
 
                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
-               next = msk->dl_next;
-               msk->dl_next = NULL;
-
                __mptcp_unaccepted_force_close(sk);
                release_sock(sk);
 
@@ -1859,6 +1852,13 @@ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_s
 
        /* we are still under the listener msk socket lock */
        lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+
+       /* restore the listener queue, to let the TCP code clean it up */
+       spin_lock_bh(&queue->rskq_lock);
+       WARN_ON_ONCE(queue->rskq_accept_head);
+       queue->rskq_accept_head = head;
+       queue->rskq_accept_tail = tail;
+       spin_unlock_bh(&queue->rskq_lock);
 }
 
 static int subflow_ulp_init(struct sock *sk)
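
The rework above follows a splice-and-restore pattern: detach the whole pending request list while holding rskq_lock, drop the lock so the msk sockets can be locked safely, walk the detached list, and finally splice it back so the normal TCP teardown still finds it. A minimal userspace model of that pattern is shown below; the types and the lock are hypothetical stand-ins, not the kernel's request_sock machinery.

#include <stdio.h>

struct req {
	struct req *dl_next;
	int id;
};

struct queue {
	struct req *head, *tail;	/* protected by rskq_lock in the kernel */
};

int main(void)
{
	struct req r2 = { NULL, 2 }, r1 = { &r2, 1 };
	struct queue q = { &r1, &r2 };

	/* 1) Under the lock: splice the list out, leaving the queue empty. */
	struct req *head = q.head, *tail = q.tail;
	q.head = q.tail = NULL;

	/* 2) Without the lock: walk the detached list. */
	for (struct req *req = head; req; req = req->dl_next)
		printf("processing req %d\n", req->id);

	/* 3) Under the lock again: restore the queue for normal cleanup. */
	q.head = head;
	q.tail = tail;
	return 0;
}
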
index d3c6ecd..c62227a 100644 (file)
@@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions);
 static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
 static LIST_HEAD(nf_tables_destroy_list);
+static LIST_HEAD(nf_tables_gc_list);
 static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
+static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
 
 enum {
        NFT_VALIDATE_SKIP       = 0,
@@ -120,6 +122,9 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s
 static void nf_tables_trans_destroy_work(struct work_struct *w);
 static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
 
+static void nft_trans_gc_work(struct work_struct *work);
+static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
+
 static void nft_ctx_init(struct nft_ctx *ctx,
                         struct net *net,
                         const struct sk_buff *skb,
@@ -582,10 +587,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
        return __nft_trans_set_add(ctx, msg_type, set, NULL);
 }
 
-static void nft_setelem_data_deactivate(const struct net *net,
-                                       const struct nft_set *set,
-                                       struct nft_set_elem *elem);
-
 static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
                                  struct nft_set *set,
                                  const struct nft_set_iter *iter,
@@ -5055,6 +5056,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
 
        INIT_LIST_HEAD(&set->bindings);
        INIT_LIST_HEAD(&set->catchall_list);
+       refcount_set(&set->refs, 1);
        set->table = table;
        write_pnet(&set->net, net);
        set->ops = ops;
@@ -5122,6 +5124,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
        }
 }
 
+static void nft_set_put(struct nft_set *set)
+{
+       if (refcount_dec_and_test(&set->refs)) {
+               kfree(set->name);
+               kvfree(set);
+       }
+}
+
 static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 {
        int i;
@@ -5134,8 +5144,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 
        set->ops->destroy(ctx, set);
        nft_set_catchall_destroy(ctx, set);
-       kfree(set->name);
-       kvfree(set);
+       nft_set_put(set);
 }
 
 static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
@@ -5602,8 +5611,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
                                  const struct nft_set_iter *iter,
                                  struct nft_set_elem *elem)
 {
+       const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
        struct nft_set_dump_args *args;
 
+       if (nft_set_elem_expired(ext))
+               return 0;
+
        args = container_of(iter, struct nft_set_dump_args, iter);
        return nf_tables_fill_setelem(args->skb, set, elem, args->reset);
 }
@@ -6274,7 +6287,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, genmask) &&
-                   !nft_set_elem_expired(ext))
+                   !nft_set_elem_expired(ext) &&
+                   !nft_set_elem_is_dead(ext))
                        return ext;
        }
 
@@ -6282,29 +6296,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
 }
 EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
 
-void *nft_set_catchall_gc(const struct nft_set *set)
-{
-       struct nft_set_elem_catchall *catchall, *next;
-       struct nft_set_ext *ext;
-       void *elem = NULL;
-
-       list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
-               ext = nft_set_elem_ext(set, catchall->elem);
-
-               if (!nft_set_elem_expired(ext) ||
-                   nft_set_elem_mark_busy(ext))
-                       continue;
-
-               elem = catchall->elem;
-               list_del_rcu(&catchall->list);
-               kfree_rcu(catchall, rcu);
-               break;
-       }
-
-       return elem;
-}
-EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
-
 static int nft_setelem_catchall_insert(const struct net *net,
                                       struct nft_set *set,
                                       const struct nft_set_elem *elem,
@@ -6366,7 +6357,6 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set,
 
        if (nft_setelem_is_catchall(set, elem)) {
                nft_set_elem_change_active(net, set, ext);
-               nft_set_elem_clear_busy(ext);
        } else {
                set->ops->activate(net, set, elem);
        }
@@ -6381,8 +6371,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net,
 
        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
-               if (!nft_is_active(net, ext) ||
-                   nft_set_elem_mark_busy(ext))
+               if (!nft_is_active(net, ext))
                        continue;
 
                kfree(elem->priv);
@@ -6777,7 +6766,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
                goto err_elem_free;
        }
 
-       ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
+       ext->genmask = nft_genmask_cur(ctx->net);
 
        err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
        if (err) {
@@ -6929,9 +6918,9 @@ static void nft_setelem_data_activate(const struct net *net,
                nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
 }
 
-static void nft_setelem_data_deactivate(const struct net *net,
-                                       const struct nft_set *set,
-                                       struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+                                const struct nft_set *set,
+                                struct nft_set_elem *elem)
 {
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
 
@@ -7095,8 +7084,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
 
        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
-               if (!nft_set_elem_active(ext, genmask) ||
-                   nft_set_elem_mark_busy(ext))
+               if (!nft_set_elem_active(ext, genmask))
                        continue;
 
                elem.priv = catchall->elem;
@@ -7170,29 +7158,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
        return err;
 }
 
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
-       struct nft_set_gc_batch *gcb;
-       unsigned int i;
-
-       gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
-       for (i = 0; i < gcb->head.cnt; i++)
-               nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
-       kfree(gcb);
-}
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
-                                               gfp_t gfp)
-{
-       struct nft_set_gc_batch *gcb;
-
-       gcb = kzalloc(sizeof(*gcb), gfp);
-       if (gcb == NULL)
-               return gcb;
-       gcb->head.set = set;
-       return gcb;
-}
-
 /*
  * Stateful objects
  */
@@ -9414,6 +9379,207 @@ void nft_chain_del(struct nft_chain *chain)
        list_del_rcu(&chain->list);
 }
 
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+                                       struct nft_trans_gc *trans)
+{
+       void **priv = trans->priv;
+       unsigned int i;
+
+       for (i = 0; i < trans->count; i++) {
+               struct nft_set_elem elem = {
+                       .priv = priv[i],
+               };
+
+               nft_setelem_data_deactivate(ctx->net, trans->set, &elem);
+               nft_setelem_remove(ctx->net, trans->set, &elem);
+       }
+}
+
+void nft_trans_gc_destroy(struct nft_trans_gc *trans)
+{
+       nft_set_put(trans->set);
+       put_net(trans->net);
+       kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu)
+{
+       struct nft_set_elem elem = {};
+       struct nft_trans_gc *trans;
+       struct nft_ctx ctx = {};
+       unsigned int i;
+
+       trans = container_of(rcu, struct nft_trans_gc, rcu);
+       ctx.net = read_pnet(&trans->set->net);
+
+       for (i = 0; i < trans->count; i++) {
+               elem.priv = trans->priv[i];
+               if (!nft_setelem_is_catchall(trans->set, &elem))
+                       atomic_dec(&trans->set->nelems);
+
+               nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv);
+       }
+
+       nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
+{
+       struct nftables_pernet *nft_net;
+       struct nft_ctx ctx = {};
+
+       nft_net = nft_pernet(trans->net);
+
+       mutex_lock(&nft_net->commit_mutex);
+
+       /* Check for a race with the transaction path, otherwise this batch
+        * refers to stale objects that might not be there anymore. Also skip
+        * the batch if the set has been destroyed by a control plane
+        * transaction, in case the gc worker loses that race.
+        */
+       if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
+               mutex_unlock(&nft_net->commit_mutex);
+               return false;
+       }
+
+       ctx.net = trans->net;
+       ctx.table = trans->set->table;
+
+       nft_trans_gc_setelem_remove(&ctx, trans);
+       mutex_unlock(&nft_net->commit_mutex);
+
+       return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work)
+{
+       struct nft_trans_gc *trans, *next;
+       LIST_HEAD(trans_gc_list);
+
+       spin_lock(&nf_tables_destroy_list_lock);
+       list_splice_init(&nf_tables_gc_list, &trans_gc_list);
+       spin_unlock(&nf_tables_destroy_list_lock);
+
+       list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+               list_del(&trans->list);
+               if (!nft_trans_gc_work_done(trans)) {
+                       nft_trans_gc_destroy(trans);
+                       continue;
+               }
+               call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+       }
+}
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+                                       unsigned int gc_seq, gfp_t gfp)
+{
+       struct net *net = read_pnet(&set->net);
+       struct nft_trans_gc *trans;
+
+       trans = kzalloc(sizeof(*trans), gfp);
+       if (!trans)
+               return NULL;
+
+       refcount_inc(&set->refs);
+       trans->set = set;
+       trans->net = get_net(net);
+       trans->seq = gc_seq;
+
+       return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
+{
+       trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
+{
+       spin_lock(&nf_tables_gc_list_lock);
+       list_add_tail(&trans->list, &nf_tables_gc_list);
+       spin_unlock(&nf_tables_gc_list_lock);
+
+       schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans)
+{
+       return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+                                             unsigned int gc_seq, gfp_t gfp)
+{
+       if (nft_trans_gc_space(gc))
+               return gc;
+
+       nft_trans_gc_queue_work(gc);
+
+       return nft_trans_gc_alloc(gc->set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
+{
+       if (trans->count == 0) {
+               nft_trans_gc_destroy(trans);
+               return;
+       }
+
+       nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
+{
+       if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
+               return NULL;
+
+       if (nft_trans_gc_space(gc))
+               return gc;
+
+       call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+       return nft_trans_gc_alloc(gc->set, 0, gfp);
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
+{
+       WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
+
+       if (trans->count == 0) {
+               nft_trans_gc_destroy(trans);
+               return;
+       }
+
+       call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
+                                          unsigned int gc_seq)
+{
+       struct nft_set_elem_catchall *catchall;
+       const struct nft_set *set = gc->set;
+       struct nft_set_ext *ext;
+
+       list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+               ext = nft_set_elem_ext(set, catchall->elem);
+
+               if (!nft_set_elem_expired(ext))
+                       continue;
+               if (nft_set_elem_is_dead(ext))
+                       goto dead_elem;
+
+               nft_set_elem_dead(ext);
+dead_elem:
+               gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+               if (!gc)
+                       return NULL;
+
+               nft_trans_gc_elem_add(gc, catchall->elem);
+       }
+
+       return gc;
+}
+
 static void nf_tables_module_autoload_cleanup(struct net *net)
 {
        struct nftables_pernet *nft_net = nft_pernet(net);
@@ -9576,11 +9742,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans, *next;
+       unsigned int base_seq, gc_seq;
        LIST_HEAD(set_update_list);
        struct nft_trans_elem *te;
        struct nft_chain *chain;
        struct nft_table *table;
-       unsigned int base_seq;
        LIST_HEAD(adl);
        int err;
 
@@ -9657,6 +9823,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 
        WRITE_ONCE(nft_net->base_seq, base_seq);
 
+       /* Bump the gc counter; an odd value is the busy mark. */
+       gc_seq = READ_ONCE(nft_net->gc_seq);
+       WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+
        /* step 3. Start new generation, rules_gen_X now in use. */
        net->nft.gencursor = nft_gencursor_next(net);
 
@@ -9764,6 +9934,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
                        break;
                case NFT_MSG_DELSET:
                case NFT_MSG_DESTROYSET:
+                       nft_trans_set(trans)->dead = 1;
                        list_del_rcu(&nft_trans_set(trans)->list);
                        nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
                                             trans->msg_type, GFP_KERNEL);
@@ -9866,6 +10037,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
        nft_commit_notify(net, NETLINK_CB(skb).portid);
        nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
        nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+
+       WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
        nf_tables_commit_release(net);
 
        return 0;
@@ -10915,6 +11088,7 @@ static int __net_init nf_tables_init_net(struct net *net)
        INIT_LIST_HEAD(&nft_net->notify_list);
        mutex_init(&nft_net->commit_mutex);
        nft_net->base_seq = 1;
+       nft_net->gc_seq = 0;
 
        return 0;
 }
@@ -10943,10 +11117,16 @@ static void __net_exit nf_tables_exit_net(struct net *net)
        WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
 }
 
+static void nf_tables_exit_batch(struct list_head *net_exit_list)
+{
+       flush_work(&trans_gc_work);
+}
+
 static struct pernet_operations nf_tables_net_ops = {
        .init           = nf_tables_init_net,
        .pre_exit       = nf_tables_pre_exit_net,
        .exit           = nf_tables_exit_net,
+       .exit_batch     = nf_tables_exit_batch,
        .id             = &nf_tables_net_id,
        .size           = sizeof(struct nftables_pernet),
 };
@@ -11018,6 +11198,7 @@ static void __exit nf_tables_module_exit(void)
        nft_chain_filter_fini();
        nft_chain_route_fini();
        unregister_pernet_subsys(&nf_tables_net_ops);
+       cancel_work_sync(&trans_gc_work);
        cancel_work_sync(&trans_destroy_work);
        rcu_barrier();
        rhltable_destroy(&nft_objname_ht);
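
The gc_seq handling added to the commit path behaves like a sequence counter: the counter is bumped to an odd value while a new generation is being installed and bumped back to an even value when the commit is done, and the asynchronous GC worker samples it when it starts collecting and drops its batch if the value changed underneath it. The small userspace model below captures that ordering only; it is not kernel code and ignores locking and memory ordering details.

#include <stdbool.h>
#include <stdio.h>

static unsigned int gc_seq;	/* models nftables_pernet::gc_seq */

/* Commit path: odd value marks "ruleset update in progress". */
static void commit_begin(void) { gc_seq++; }	/* becomes odd: busy */
static void commit_end(void)   { gc_seq++; }	/* back to even: idle */

/* GC worker: remember the sequence seen while collecting the batch... */
static unsigned int gc_collect(void) { return gc_seq; }

/* ...and only apply the batch if nothing changed in between. */
static bool gc_apply(unsigned int batch_seq)
{
	if (gc_seq != batch_seq) {
		printf("stale batch (seq %u -> %u), dropped\n", batch_seq, gc_seq);
		return false;
	}
	printf("batch applied at seq %u\n", batch_seq);
	return true;
}

int main(void)
{
	unsigned int seq = gc_collect();	/* worker starts collecting */
	commit_begin();				/* a transaction races in */
	commit_end();
	gc_apply(seq);				/* stale batch is thrown away */

	seq = gc_collect();
	gc_apply(seq);				/* no race: batch goes through */
	return 0;
}
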
index 0b73cb0..cef5df8 100644 (file)
@@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
 
        if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
                return 1;
+       if (nft_set_elem_is_dead(&he->ext))
+               return 1;
        if (nft_set_elem_expired(&he->ext))
                return 1;
        if (!nft_set_elem_active(&he->ext, x->genmask))
@@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
        struct nft_rhash_elem *he = elem->priv;
 
        nft_set_elem_change_active(net, set, &he->ext);
-       nft_set_elem_clear_busy(&he->ext);
 }
 
 static bool nft_rhash_flush(const struct net *net,
@@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net,
 {
        struct nft_rhash_elem *he = priv;
 
-       if (!nft_set_elem_mark_busy(&he->ext) ||
-           !nft_is_active(net, &he->ext)) {
-               nft_set_elem_change_active(net, set, &he->ext);
-               return true;
-       }
-       return false;
+       nft_set_elem_change_active(net, set, &he->ext);
+
+       return true;
 }
 
 static void *nft_rhash_deactivate(const struct net *net,
@@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net,
 
        rcu_read_lock();
        he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
-       if (he != NULL &&
-           !nft_rhash_flush(net, set, he))
-               he = NULL;
+       if (he)
+               nft_set_elem_change_active(net, set, &he->ext);
 
        rcu_read_unlock();
 
@@ -252,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set,
        if (he == NULL)
                return false;
 
-       return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
+       nft_set_elem_dead(&he->ext);
+
+       return true;
 }
 
 static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
@@ -278,8 +277,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 
                if (iter->count < iter->skip)
                        goto cont;
-               if (nft_set_elem_expired(&he->ext))
-                       goto cont;
                if (!nft_set_elem_active(&he->ext, iter->genmask))
                        goto cont;
 
@@ -314,25 +311,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
 
 static void nft_rhash_gc(struct work_struct *work)
 {
+       struct nftables_pernet *nft_net;
        struct nft_set *set;
        struct nft_rhash_elem *he;
        struct nft_rhash *priv;
-       struct nft_set_gc_batch *gcb = NULL;
        struct rhashtable_iter hti;
+       struct nft_trans_gc *gc;
+       struct net *net;
+       u32 gc_seq;
 
        priv = container_of(work, struct nft_rhash, gc_work.work);
        set  = nft_set_container_of(priv);
+       net  = read_pnet(&set->net);
+       nft_net = nft_pernet(net);
+       gc_seq = READ_ONCE(nft_net->gc_seq);
+
+       gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+       if (!gc)
+               goto done;
 
        rhashtable_walk_enter(&priv->ht, &hti);
        rhashtable_walk_start(&hti);
 
        while ((he = rhashtable_walk_next(&hti))) {
                if (IS_ERR(he)) {
-                       if (PTR_ERR(he) != -EAGAIN)
-                               break;
+                       if (PTR_ERR(he) != -EAGAIN) {
+                               nft_trans_gc_destroy(gc);
+                               gc = NULL;
+                               goto try_later;
+                       }
                        continue;
                }
 
+               /* Ruleset has been updated, try later. */
+               if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+                       nft_trans_gc_destroy(gc);
+                       gc = NULL;
+                       goto try_later;
+               }
+
+               if (nft_set_elem_is_dead(&he->ext))
+                       goto dead_elem;
+
                if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
                    nft_rhash_expr_needs_gc_run(set, &he->ext))
                        goto needs_gc_run;
@@ -340,26 +360,26 @@ static void nft_rhash_gc(struct work_struct *work)
                if (!nft_set_elem_expired(&he->ext))
                        continue;
 needs_gc_run:
-               if (nft_set_elem_mark_busy(&he->ext))
-                       continue;
+               nft_set_elem_dead(&he->ext);
+dead_elem:
+               gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+               if (!gc)
+                       goto try_later;
 
-               gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
-               if (gcb == NULL)
-                       break;
-               rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
-               atomic_dec(&set->nelems);
-               nft_set_gc_batch_add(gcb, he);
+               nft_trans_gc_elem_add(gc, he);
        }
+
+       gc = nft_trans_gc_catchall(gc, gc_seq);
+
+try_later:
+       /* catchall list iteration requires rcu read side lock. */
        rhashtable_walk_stop(&hti);
        rhashtable_walk_exit(&hti);
 
-       he = nft_set_catchall_gc(set);
-       if (he) {
-               gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
-               if (gcb)
-                       nft_set_gc_batch_add(gcb, he);
-       }
-       nft_set_gc_batch_complete(gcb);
+       if (gc)
+               nft_trans_gc_queue_async_done(gc);
+
+done:
        queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
                           nft_set_gc_interval(set));
 }
@@ -394,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set,
                return err;
 
        INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
-       if (set->flags & NFT_SET_TIMEOUT)
+       if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
                nft_rhash_gc_init(set);
 
        return 0;
@@ -422,7 +442,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx,
        };
 
        cancel_delayed_work_sync(&priv->gc_work);
-       rcu_barrier();
        rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
                                    (void *)&rhash_ctx);
 }
index 49915a2..a5b8301 100644 (file)
@@ -566,8 +566,7 @@ next_match:
                        goto out;
 
                if (last) {
-                       if (nft_set_elem_expired(&f->mt[b].e->ext) ||
-                           (genmask &&
+                       if ((genmask &&
                             !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
                                goto next_match;
 
@@ -601,8 +600,17 @@ out:
 static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
                            const struct nft_set_elem *elem, unsigned int flags)
 {
-       return pipapo_get(net, set, (const u8 *)elem->key.val.data,
-                         nft_genmask_cur(net));
+       struct nft_pipapo_elem *ret;
+
+       ret = pipapo_get(net, set, (const u8 *)elem->key.val.data,
+                        nft_genmask_cur(net));
+       if (IS_ERR(ret))
+               return ret;
+
+       if (nft_set_elem_expired(&ret->ext))
+               return ERR_PTR(-ENOENT);
+
+       return ret;
 }
 
 /**
@@ -1528,16 +1536,34 @@ static void pipapo_drop(struct nft_pipapo_match *m,
        }
 }
 
+static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
+                                    struct nft_pipapo_elem *e)
+
+{
+       struct nft_set_elem elem = {
+               .priv   = e,
+       };
+
+       nft_setelem_data_deactivate(net, set, &elem);
+}
+
 /**
  * pipapo_gc() - Drop expired entries from set, destroy start and end elements
  * @set:       nftables API set representation
  * @m:         Matching data
  */
-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
 {
+       struct nft_set *set = (struct nft_set *) _set;
        struct nft_pipapo *priv = nft_set_priv(set);
+       struct net *net = read_pnet(&set->net);
        int rules_f0, first_rule = 0;
        struct nft_pipapo_elem *e;
+       struct nft_trans_gc *gc;
+
+       gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+       if (!gc)
+               return;
 
        while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
                union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
@@ -1561,13 +1587,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
                f--;
                i--;
                e = f->mt[rulemap[i].to].e;
-               if (nft_set_elem_expired(&e->ext) &&
-                   !nft_set_elem_mark_busy(&e->ext)) {
+
+               /* Synchronous gc never fails, so there is no need to set the
+                * NFT_SET_ELEM_DEAD_BIT.
+                */
+               if (nft_set_elem_expired(&e->ext)) {
                        priv->dirty = true;
-                       pipapo_drop(m, rulemap);
 
-                       rcu_barrier();
-                       nft_set_elem_destroy(set, e, true);
+                       gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+                       if (!gc)
+                               break;
+
+                       nft_pipapo_gc_deactivate(net, set, e);
+                       pipapo_drop(m, rulemap);
+                       nft_trans_gc_elem_add(gc, e);
 
                        /* And check again current first rule, which is now the
                         * first we haven't checked.
@@ -1577,11 +1610,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
                }
        }
 
-       e = nft_set_catchall_gc(set);
-       if (e)
-               nft_set_elem_destroy(set, e, true);
-
-       priv->last_gc = jiffies;
+       gc = nft_trans_gc_catchall(gc, 0);
+       if (gc) {
+               nft_trans_gc_queue_sync_done(gc);
+               priv->last_gc = jiffies;
+       }
 }
 
 /**
@@ -1706,7 +1739,6 @@ static void nft_pipapo_activate(const struct net *net,
                return;
 
        nft_set_elem_change_active(net, set, &e->ext);
-       nft_set_elem_clear_busy(&e->ext);
 }
 
 /**
@@ -2005,8 +2037,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
                        goto cont;
 
                e = f->mt[r].e;
-               if (nft_set_elem_expired(&e->ext))
-                       goto cont;
 
                elem.priv = e;
 
index 8d73fff..f9d4c8f 100644 (file)
@@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set,
                      set->klen);
 }
 
+static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
+{
+       return nft_set_elem_expired(&rbe->ext) ||
+              nft_set_elem_is_dead(&rbe->ext);
+}
+
 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
                                const u32 *key, const struct nft_set_ext **ext,
                                unsigned int seq)
@@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
                                continue;
                        }
 
-                       if (nft_set_elem_expired(&rbe->ext))
+                       if (nft_rbtree_elem_expired(rbe))
                                return false;
 
                        if (nft_rbtree_interval_end(rbe)) {
@@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 
        if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
            nft_set_elem_active(&interval->ext, genmask) &&
-           !nft_set_elem_expired(&interval->ext) &&
+           !nft_rbtree_elem_expired(interval) &&
            nft_rbtree_interval_start(interval)) {
                *ext = &interval->ext;
                return true;
@@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
        return rbe;
 }
 
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+                                struct nft_rbtree *priv,
+                                struct nft_rbtree_elem *rbe)
+{
+       struct nft_set_elem elem = {
+               .priv   = rbe,
+       };
+
+       nft_setelem_data_deactivate(net, set, &elem);
+       rb_erase(&rbe->node, &priv->root);
+}
+
 static int nft_rbtree_gc_elem(const struct nft_set *__set,
                              struct nft_rbtree *priv,
                              struct nft_rbtree_elem *rbe,
@@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
 {
        struct nft_set *set = (struct nft_set *)__set;
        struct rb_node *prev = rb_prev(&rbe->node);
+       struct net *net = read_pnet(&set->net);
        struct nft_rbtree_elem *rbe_prev;
-       struct nft_set_gc_batch *gcb;
+       struct nft_trans_gc *gc;
 
-       gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
-       if (!gcb)
+       gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
+       if (!gc)
                return -ENOMEM;
 
        /* search for end interval coming before this element.
@@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
 
        if (prev) {
                rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+               nft_rbtree_gc_remove(net, set, priv, rbe_prev);
 
-               rb_erase(&rbe_prev->node, &priv->root);
-               atomic_dec(&set->nelems);
-               nft_set_gc_batch_add(gcb, rbe_prev);
+               /* There is always room in this trans gc for this element, so
+                * memory allocation never actually happens and the warning
+                * splat flags a bug. No need to set NFT_SET_ELEM_DEAD_BIT,
+                * this is synchronous gc which never fails.
+                */
+               gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+               if (WARN_ON_ONCE(!gc))
+                       return -ENOMEM;
+
+               nft_trans_gc_elem_add(gc, rbe_prev);
        }
 
-       rb_erase(&rbe->node, &priv->root);
-       atomic_dec(&set->nelems);
+       nft_rbtree_gc_remove(net, set, priv, rbe);
+       gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+       if (WARN_ON_ONCE(!gc))
+               return -ENOMEM;
+
+       nft_trans_gc_elem_add(gc, rbe);
 
-       nft_set_gc_batch_add(gcb, rbe);
-       nft_set_gc_batch_complete(gcb);
+       nft_trans_gc_queue_sync_done(gc);
 
        return 0;
 }
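
The WARN_ON_ONCE() above leans on the guarantee spelled out in the comment: the batch allocated at the top of the function already has room for the element(s) this path can queue, so nft_trans_gc_queue_sync() is not expected to allocate or fail here. A rough userspace sketch of that reserve-up-front idea, with illustrative names (sync_gc, GC_BATCH_ROOM) rather than kernel API:

    #include <assert.h>
    #include <stddef.h>
    #include <stdlib.h>

    #define GC_BATCH_ROOM 2         /* e.g. an interval's start and end element */

    struct sync_gc {
            size_t used;
            void *elems[GC_BATCH_ROOM];
    };

    /* the only step that can fail happens before anything is queued */
    static struct sync_gc *sync_gc_alloc(void)
    {
            return calloc(1, sizeof(struct sync_gc));
    }

    /* queueing never allocates; exceeding the reserved room is a bug */
    static void sync_gc_queue(struct sync_gc *gc, void *elem)
    {
            assert(gc->used < GC_BATCH_ROOM);       /* mirrors the WARN_ON_ONCE() */
            gc->elems[gc->used++] = elem;
    }
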
@@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net,
        struct nft_rbtree_elem *rbe = elem->priv;
 
        nft_set_elem_change_active(net, set, &rbe->ext);
-       nft_set_elem_clear_busy(&rbe->ext);
 }
 
 static bool nft_rbtree_flush(const struct net *net,
@@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net,
 {
        struct nft_rbtree_elem *rbe = priv;
 
-       if (!nft_set_elem_mark_busy(&rbe->ext) ||
-           !nft_is_active(net, &rbe->ext)) {
-               nft_set_elem_change_active(net, set, &rbe->ext);
-               return true;
-       }
-       return false;
+       nft_set_elem_change_active(net, set, &rbe->ext);
+
+       return true;
 }
 
 static void *nft_rbtree_deactivate(const struct net *net,
@@ -552,8 +578,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 
                if (iter->count < iter->skip)
                        goto cont;
-               if (nft_set_elem_expired(&rbe->ext))
-                       goto cont;
                if (!nft_set_elem_active(&rbe->ext, iter->genmask))
                        goto cont;
 
@@ -572,26 +596,40 @@ cont:
 
 static void nft_rbtree_gc(struct work_struct *work)
 {
-       struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
-       struct nft_set_gc_batch *gcb = NULL;
+       struct nft_rbtree_elem *rbe, *rbe_end = NULL;
+       struct nftables_pernet *nft_net;
        struct nft_rbtree *priv;
+       struct nft_trans_gc *gc;
        struct rb_node *node;
        struct nft_set *set;
+       unsigned int gc_seq;
        struct net *net;
-       u8 genmask;
 
        priv = container_of(work, struct nft_rbtree, gc_work.work);
        set  = nft_set_container_of(priv);
        net  = read_pnet(&set->net);
-       genmask = nft_genmask_cur(net);
+       nft_net = nft_pernet(net);
+       gc_seq  = READ_ONCE(nft_net->gc_seq);
+
+       gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+       if (!gc)
+               goto done;
 
        write_lock_bh(&priv->lock);
        write_seqcount_begin(&priv->count);
        for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+
+               /* Ruleset has been updated, try later. */
+               if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+                       nft_trans_gc_destroy(gc);
+                       gc = NULL;
+                       goto try_later;
+               }
+
                rbe = rb_entry(node, struct nft_rbtree_elem, node);
 
-               if (!nft_set_elem_active(&rbe->ext, genmask))
-                       continue;
+               if (nft_set_elem_is_dead(&rbe->ext))
+                       goto dead_elem;
 
                /* elements are reversed in the rbtree for historical reasons,
                 * from highest to lowest value, that is why end element is
@@ -604,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work)
                if (!nft_set_elem_expired(&rbe->ext))
                        continue;
 
-               if (nft_set_elem_mark_busy(&rbe->ext)) {
-                       rbe_end = NULL;
+               nft_set_elem_dead(&rbe->ext);
+
+               if (!rbe_end)
                        continue;
-               }
 
-               if (rbe_prev) {
-                       rb_erase(&rbe_prev->node, &priv->root);
-                       rbe_prev = NULL;
-               }
-               gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
-               if (!gcb)
-                       break;
+               nft_set_elem_dead(&rbe_end->ext);
 
-               atomic_dec(&set->nelems);
-               nft_set_gc_batch_add(gcb, rbe);
-               rbe_prev = rbe;
+               gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+               if (!gc)
+                       goto try_later;
 
-               if (rbe_end) {
-                       atomic_dec(&set->nelems);
-                       nft_set_gc_batch_add(gcb, rbe_end);
-                       rb_erase(&rbe_end->node, &priv->root);
-                       rbe_end = NULL;
-               }
-               node = rb_next(node);
-               if (!node)
-                       break;
+               nft_trans_gc_elem_add(gc, rbe_end);
+               rbe_end = NULL;
+dead_elem:
+               gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+               if (!gc)
+                       goto try_later;
+
+               nft_trans_gc_elem_add(gc, rbe);
        }
-       if (rbe_prev)
-               rb_erase(&rbe_prev->node, &priv->root);
+
+       gc = nft_trans_gc_catchall(gc, gc_seq);
+
+try_later:
        write_seqcount_end(&priv->count);
        write_unlock_bh(&priv->lock);
 
-       rbe = nft_set_catchall_gc(set);
-       if (rbe) {
-               gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
-               if (gcb)
-                       nft_set_gc_batch_add(gcb, rbe);
-       }
-       nft_set_gc_batch_complete(gcb);
-
+       if (gc)
+               nft_trans_gc_queue_async_done(gc);
+done:
        queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
                           nft_set_gc_interval(set));
 }
index a4631cb..a2935bd 100644 (file)
@@ -401,18 +401,20 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
        union tpacket_uhdr h;
 
+       /* WRITE_ONCE() stores are paired with READ_ONCE() in __packet_get_status */
+
        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
-               h.h1->tp_status = status;
+               WRITE_ONCE(h.h1->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
-               h.h2->tp_status = status;
+               WRITE_ONCE(h.h2->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        case TPACKET_V3:
-               h.h3->tp_status = status;
+               WRITE_ONCE(h.h3->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
                break;
        default:
@@ -429,17 +431,19 @@ static int __packet_get_status(const struct packet_sock *po, void *frame)
 
        smp_rmb();
 
+       /* READ_ONCE() loads are paired with WRITE_ONCE() in __packet_set_status */
+
        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
-               return h.h1->tp_status;
+               return READ_ONCE(h.h1->tp_status);
        case TPACKET_V2:
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
-               return h.h2->tp_status;
+               return READ_ONCE(h.h2->tp_status);
        case TPACKET_V3:
                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
-               return h.h3->tp_status;
+               return READ_ONCE(h.h3->tp_status);
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
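
tp_status lives in a ring that user space maps and polls concurrently, so the accesses are now annotated to keep the compiler from tearing or fusing them; the existing barriers and flush_dcache_page() calls still handle ordering and cache maintenance. A rough userspace analogue of the WRITE_ONCE()/READ_ONCE() pairing using C11 relaxed atomics (ring_slot and the helpers below are illustrative, not part of the af_packet ABI):

    #include <stdatomic.h>
    #include <stdint.h>

    struct ring_slot {
            _Atomic uint32_t tp_status;     /* shared with the other side via mmap */
    };

    /* producer side: roughly what WRITE_ONCE(h.h1->tp_status, status) guarantees */
    static void slot_set_status(struct ring_slot *s, uint32_t status)
    {
            atomic_store_explicit(&s->tp_status, status, memory_order_relaxed);
    }

    /* consumer side: roughly what READ_ONCE(h.h1->tp_status) guarantees */
    static uint32_t slot_get_status(struct ring_slot *s)
    {
            return atomic_load_explicit(&s->tp_status, memory_order_relaxed);
    }
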
index 0c013d2..f5834af 100644 (file)
@@ -378,8 +378,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
        sk->sk_state = SMC_INIT;
        sk->sk_destruct = smc_destruct;
        sk->sk_protocol = protocol;
-       WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
-       WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
+       WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
+       WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
        smc = smc_sk(sk);
        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
        INIT_WORK(&smc->connect_work, smc_connect_work);
@@ -436,13 +436,60 @@ out:
        return rc;
 }
 
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+                            (1UL << SOCK_KEEPOPEN) | \
+                            (1UL << SOCK_LINGER) | \
+                            (1UL << SOCK_BROADCAST) | \
+                            (1UL << SOCK_TIMESTAMP) | \
+                            (1UL << SOCK_DBG) | \
+                            (1UL << SOCK_RCVTSTAMP) | \
+                            (1UL << SOCK_RCVTSTAMPNS) | \
+                            (1UL << SOCK_LOCALROUTE) | \
+                            (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+                            (1UL << SOCK_RXQ_OVFL) | \
+                            (1UL << SOCK_WIFI_STATUS) | \
+                            (1UL << SOCK_NOFCS) | \
+                            (1UL << SOCK_FILTER_LOCKED) | \
+                            (1UL << SOCK_TSTAMP_NEW))
+
+/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
+static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
+                                    unsigned long mask)
+{
+       struct net *nnet = sock_net(nsk);
+
+       nsk->sk_userlocks = osk->sk_userlocks;
+       if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
+               nsk->sk_sndbuf = osk->sk_sndbuf;
+       } else {
+               if (mask == SK_FLAGS_SMC_TO_CLC)
+                       WRITE_ONCE(nsk->sk_sndbuf,
+                                  READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
+               else
+                       WRITE_ONCE(nsk->sk_sndbuf,
+                                  2 * READ_ONCE(nnet->smc.sysctl_wmem));
+       }
+       if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
+               nsk->sk_rcvbuf = osk->sk_rcvbuf;
+       } else {
+               if (mask == SK_FLAGS_SMC_TO_CLC)
+                       WRITE_ONCE(nsk->sk_rcvbuf,
+                                  READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
+               else
+                       WRITE_ONCE(nsk->sk_rcvbuf,
+                                  2 * READ_ONCE(nnet->smc.sysctl_rmem));
+       }
+}
+
 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
                                   unsigned long mask)
 {
        /* options we don't get control via setsockopt for */
        nsk->sk_type = osk->sk_type;
-       nsk->sk_sndbuf = osk->sk_sndbuf;
-       nsk->sk_rcvbuf = osk->sk_rcvbuf;
        nsk->sk_sndtimeo = osk->sk_sndtimeo;
        nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
        nsk->sk_mark = READ_ONCE(osk->sk_mark);
@@ -453,26 +500,10 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 
        nsk->sk_flags &= ~mask;
        nsk->sk_flags |= osk->sk_flags & mask;
+
+       smc_adjust_sock_bufsizes(nsk, osk, mask);
 }
 
-#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
-                            (1UL << SOCK_KEEPOPEN) | \
-                            (1UL << SOCK_LINGER) | \
-                            (1UL << SOCK_BROADCAST) | \
-                            (1UL << SOCK_TIMESTAMP) | \
-                            (1UL << SOCK_DBG) | \
-                            (1UL << SOCK_RCVTSTAMP) | \
-                            (1UL << SOCK_RCVTSTAMPNS) | \
-                            (1UL << SOCK_LOCALROUTE) | \
-                            (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
-                            (1UL << SOCK_RXQ_OVFL) | \
-                            (1UL << SOCK_WIFI_STATUS) | \
-                            (1UL << SOCK_NOFCS) | \
-                            (1UL << SOCK_FILTER_LOCKED) | \
-                            (1UL << SOCK_TSTAMP_NEW))
-/* copy only relevant settings and flags of SOL_SOCKET level from smc to
- * clc socket (since smc is not called for these options from net/core)
- */
 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
 {
        smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
@@ -2479,8 +2510,6 @@ static void smc_tcp_listen_work(struct work_struct *work)
                sock_hold(lsk); /* sock_put in smc_listen_work */
                INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
                smc_copy_sock_settings_to_smc(new_smc);
-               new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
-               new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
                sock_hold(&new_smc->sk); /* sock_put in passive closing */
                if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
                        sock_put(&new_smc->sk);
index 2eeea4c..1f2b912 100644 (file)
@@ -161,7 +161,7 @@ struct smc_connection {
 
        struct smc_buf_desc     *sndbuf_desc;   /* send buffer descriptor */
        struct smc_buf_desc     *rmb_desc;      /* RMBE descriptor */
-       int                     rmbe_size_short;/* compressed notation */
+       int                     rmbe_size_comp; /* compressed notation */
        int                     rmbe_update_limit;
                                                /* lower limit for consumer
                                                 * cursor update
index b9b8b07..c90d9e5 100644 (file)
@@ -1007,7 +1007,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
                clc->d0.gid =
                        conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd);
                clc->d0.token = conn->rmb_desc->token;
-               clc->d0.dmbe_size = conn->rmbe_size_short;
+               clc->d0.dmbe_size = conn->rmbe_size_comp;
                clc->d0.dmbe_idx = 0;
                memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
                if (version == SMC_V1) {
@@ -1050,7 +1050,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
                        clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
                        break;
                }
-               clc->r0.rmbe_size = conn->rmbe_size_short;
+               clc->r0.rmbe_size = conn->rmbe_size_comp;
                clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
                        cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
                        cpu_to_be64((u64)sg_dma_address
index 3f465fa..6b78075 100644 (file)
@@ -2309,31 +2309,30 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
        struct smc_connection *conn = &smc->conn;
        struct smc_link_group *lgr = conn->lgr;
        struct list_head *buf_list;
-       int bufsize, bufsize_short;
+       int bufsize, bufsize_comp;
        struct rw_semaphore *lock;      /* lock buffer list */
        bool is_dgraded = false;
-       int sk_buf_size;
 
        if (is_rmb)
                /* use socket recv buffer size (w/o overhead) as start value */
-               sk_buf_size = smc->sk.sk_rcvbuf;
+               bufsize = smc->sk.sk_rcvbuf / 2;
        else
                /* use socket send buffer size (w/o overhead) as start value */
-               sk_buf_size = smc->sk.sk_sndbuf;
+               bufsize = smc->sk.sk_sndbuf / 2;
 
-       for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
-            bufsize_short >= 0; bufsize_short--) {
+       for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb);
+            bufsize_comp >= 0; bufsize_comp--) {
                if (is_rmb) {
                        lock = &lgr->rmbs_lock;
-                       buf_list = &lgr->rmbs[bufsize_short];
+                       buf_list = &lgr->rmbs[bufsize_comp];
                } else {
                        lock = &lgr->sndbufs_lock;
-                       buf_list = &lgr->sndbufs[bufsize_short];
+                       buf_list = &lgr->sndbufs[bufsize_comp];
                }
-               bufsize = smc_uncompress_bufsize(bufsize_short);
+               bufsize = smc_uncompress_bufsize(bufsize_comp);
 
                /* check for reusable slot in the link group */
-               buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
+               buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
                if (buf_desc) {
                        buf_desc->is_dma_need_sync = 0;
                        SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
@@ -2377,8 +2376,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
 
        if (is_rmb) {
                conn->rmb_desc = buf_desc;
-               conn->rmbe_size_short = bufsize_short;
-               smc->sk.sk_rcvbuf = bufsize;
+               conn->rmbe_size_comp = bufsize_comp;
+               smc->sk.sk_rcvbuf = bufsize * 2;
                atomic_set(&conn->bytes_to_rcv, 0);
                conn->rmbe_update_limit =
                        smc_rmb_wnd_update_limit(buf_desc->len);
@@ -2386,7 +2385,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
                        smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
        } else {
                conn->sndbuf_desc = buf_desc;
-               smc->sk.sk_sndbuf = bufsize;
+               smc->sk.sk_sndbuf = bufsize * 2;
                atomic_set(&conn->sndbuf_space, bufsize);
        }
        return 0;
index b6f79fa..0b2a957 100644 (file)
 
 static int min_sndbuf = SMC_BUF_MIN_SIZE;
 static int min_rcvbuf = SMC_BUF_MIN_SIZE;
+static int max_sndbuf = INT_MAX / 2;
+static int max_rcvbuf = INT_MAX / 2;
+static const int net_smc_wmem_init = (64 * 1024);
+static const int net_smc_rmem_init = (64 * 1024);
 
 static struct ctl_table smc_table[] = {
        {
@@ -53,6 +57,7 @@ static struct ctl_table smc_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_sndbuf,
+               .extra2         = &max_sndbuf,
        },
        {
                .procname       = "rmem",
@@ -61,6 +66,7 @@ static struct ctl_table smc_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_rcvbuf,
+               .extra2         = &max_rcvbuf,
        },
        {  }
 };
@@ -88,8 +94,8 @@ int __net_init smc_sysctl_net_init(struct net *net)
        net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
        net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
        net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
-       WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]));
-       WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]));
+       WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init);
+       WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
 
        return 0;
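
Taken together with the smc.c and smc_core.c changes above, the sizing now works in halves: the sysctl feeds sk_sndbuf/sk_rcvbuf with twice its value, and __smc_buf_create() halves it again to obtain the usable SMC buffer size. That also appears to be why the new extra2 clamps sit at INT_MAX / 2 — doubling anything larger would overflow an int — though that rationale is a reading of the patch rather than a statement from it. The arithmetic, assuming the 64 KiB default introduced here:

    #include <stdio.h>

    int main(void)
    {
            int sysctl_wmem = 64 * 1024;            /* net_smc_wmem_init above */
            int sk_sndbuf   = 2 * sysctl_wmem;      /* stored by smc_sock_alloc() */
            int smc_bufsize = sk_sndbuf / 2;        /* start value in __smc_buf_create() */

            printf("sysctl=%d sk_sndbuf=%d smc bufsize=%d\n",
                   sysctl_wmem, sk_sndbuf, smc_bufsize);
            return 0;
    }
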
 
index 2021fe5..529101e 100644 (file)
@@ -52,6 +52,8 @@ static LIST_HEAD(tls_device_list);
 static LIST_HEAD(tls_device_down_list);
 static DEFINE_SPINLOCK(tls_device_lock);
 
+static struct page *dummy_page;
+
 static void tls_device_free_ctx(struct tls_context *ctx)
 {
        if (ctx->tx_conf == TLS_HW) {
@@ -312,36 +314,33 @@ static int tls_push_record(struct sock *sk,
        return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
 }
 
-static int tls_device_record_close(struct sock *sk,
-                                  struct tls_context *ctx,
-                                  struct tls_record_info *record,
-                                  struct page_frag *pfrag,
-                                  unsigned char record_type)
+static void tls_device_record_close(struct sock *sk,
+                                   struct tls_context *ctx,
+                                   struct tls_record_info *record,
+                                   struct page_frag *pfrag,
+                                   unsigned char record_type)
 {
        struct tls_prot_info *prot = &ctx->prot_info;
-       int ret;
+       struct page_frag dummy_tag_frag;
 
        /* append tag
         * device will fill in the tag, we just need to append a placeholder
         * use socket memory to improve coalescing (re-using a single buffer
         * increases frag count)
-        * if we can't allocate memory now, steal some back from data
+        * if we can't allocate memory now use the dummy page
         */
-       if (likely(skb_page_frag_refill(prot->tag_size, pfrag,
-                                       sk->sk_allocation))) {
-               ret = 0;
-               tls_append_frag(record, pfrag, prot->tag_size);
-       } else {
-               ret = prot->tag_size;
-               if (record->len <= prot->overhead_size)
-                       return -ENOMEM;
+       if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) &&
+           !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) {
+               dummy_tag_frag.page = dummy_page;
+               dummy_tag_frag.offset = 0;
+               pfrag = &dummy_tag_frag;
        }
+       tls_append_frag(record, pfrag, prot->tag_size);
 
        /* fill prepend */
        tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
                         record->len - prot->overhead_size,
                         record_type);
-       return ret;
 }
 
 static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
@@ -541,18 +540,8 @@ last_record:
 
                if (done || record->len >= max_open_record_len ||
                    (record->num_frags >= MAX_SKB_FRAGS - 1)) {
-                       rc = tls_device_record_close(sk, tls_ctx, record,
-                                                    pfrag, record_type);
-                       if (rc) {
-                               if (rc > 0) {
-                                       size += rc;
-                               } else {
-                                       size = orig_size;
-                                       destroy_record(record);
-                                       ctx->open_record = NULL;
-                                       break;
-                               }
-                       }
+                       tls_device_record_close(sk, tls_ctx, record,
+                                               pfrag, record_type);
 
                        rc = tls_push_record(sk,
                                             tls_ctx,
@@ -1450,14 +1439,26 @@ int __init tls_device_init(void)
 {
        int err;
 
-       destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
-       if (!destruct_wq)
+       dummy_page = alloc_page(GFP_KERNEL);
+       if (!dummy_page)
                return -ENOMEM;
 
+       destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
+       if (!destruct_wq) {
+               err = -ENOMEM;
+               goto err_free_dummy;
+       }
+
        err = register_netdevice_notifier(&tls_dev_notifier);
        if (err)
-               destroy_workqueue(destruct_wq);
+               goto err_destroy_wq;
 
+       return 0;
+
+err_destroy_wq:
+       destroy_workqueue(destruct_wq);
+err_free_dummy:
+       put_page(dummy_page);
        return err;
 }
 
@@ -1466,4 +1467,5 @@ void __exit tls_device_cleanup(void)
        unregister_netdevice_notifier(&tls_dev_notifier);
        destroy_workqueue(destruct_wq);
        clean_acked_data_flush();
+       put_page(dummy_page);
 }
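
tls_device_record_close() can no longer fail: when the current page frag cannot hold the tag and a refill is not possible, the placeholder simply points at the preallocated dummy_page — per the existing comment, the device fills in the real tag, so the placeholder's contents never matter. A compact sketch of that fallback shape, with made-up frag/pick_tag_frag types standing in for the kernel's page_frag handling:

    #include <stdbool.h>
    #include <stddef.h>

    struct frag { void *page; size_t offset; size_t size; };

    static void *dummy_page;        /* stands in for the page set up at init time */

    /* pick where the tag placeholder comes from; this path cannot fail */
    static struct frag *pick_tag_frag(struct frag *pfrag, struct frag *tmp,
                                      size_t tag_size, bool refill_ok)
    {
            if (pfrag->size - pfrag->offset >= tag_size || refill_ok)
                    return pfrag;           /* normal path: socket memory */

            tmp->page = dummy_page;         /* fallback: shared dummy page */
            tmp->offset = 0;
            tmp->size = tag_size;
            return tmp;
    }
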
index b689612..4a8ee2f 100644 (file)
@@ -139,9 +139,6 @@ int tls_push_sg(struct sock *sk,
 
        ctx->splicing_pages = true;
        while (1) {
-               if (sg_is_last(sg))
-                       msg.msg_flags = flags;
-
                /* is sending application-limited? */
                tcp_rate_check_app_limited(sk);
                p = sg_page(sg);
index 0da2e6a..8bcf8e2 100644 (file)
@@ -5430,8 +5430,11 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs)
        if (!wiphy->mbssid_max_interfaces)
                return ERR_PTR(-EINVAL);
 
-       nla_for_each_nested(nl_elems, attrs, rem_elems)
+       nla_for_each_nested(nl_elems, attrs, rem_elems) {
+               if (num_elems >= 255)
+                       return ERR_PTR(-EINVAL);
                num_elems++;
+       }
 
        elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL);
        if (!elems)
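
The parse loop now refuses more than 255 MBSSID elements instead of counting them all and allocating afterwards; the 255 looks like it comes from the element count being carried in an 8-bit field further along, though that is an inference rather than something stated in this hunk. The bound-while-counting shape, sketched with illustrative types:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define MBSSID_MAX_ELEMS 255    /* cap from the hunk above */

    struct elem_list {
            uint8_t cnt;            /* 8-bit count is an assumption, see note */
            const void *elem[];
    };

    static struct elem_list *parse_elems(const void *const *in, size_t n_in)
    {
            struct elem_list *out;
            size_t num = 0;

            for (size_t i = 0; i < n_in; i++) {
                    if (num >= MBSSID_MAX_ELEMS)
                            return NULL;    /* reject before sizing the allocation */
                    num++;
            }

            out = calloc(1, sizeof(*out) + num * sizeof(out->elem[0]));
            if (!out)
                    return NULL;
            for (size_t i = 0; i < num; i++)
                    out->elem[i] = in[i];
            out->cnt = (uint8_t)num;
            return out;
    }
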
index b89adb5..10ea85c 100644 (file)
@@ -994,6 +994,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                                err = xp_alloc_tx_descs(xs->pool, xs);
                                if (err) {
                                        xp_put_pool(xs->pool);
+                                       xs->pool = NULL;
                                        sockfd_put(sock);
                                        goto out_unlock;
                                }
index b72b82b..b348e16 100644 (file)
@@ -9,7 +9,7 @@
 #include <linux/sysctl.h>
 #include "internal.h"
 
-struct ctl_table key_sysctls[] = {
+static struct ctl_table key_sysctls[] = {
        {
                .procname = "maxkeys",
                .data = &key_quota_maxkeys,
index cb8ca46..1f6d904 100644 (file)
@@ -14,7 +14,7 @@
  * Defines x86 CPU feature bits
  */
 #define NCAPINTS                       21         /* N 32-bit words worth of info */
-#define NBUGINTS                       1          /* N 32-bit bug flags */
+#define NBUGINTS                       2          /* N 32-bit bug flags */
 
 /*
  * Note: If the comment begins with a quoted string, that string is used
index 3aedae6..a00a53e 100644 (file)
 #define MSR_AMD64_DE_CFG               0xc0011029
 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT   1
 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE      BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)
+#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9
 
 #define MSR_AMD64_BU_CFG2              0xc001102a
 #define MSR_AMD64_IBSFETCHCTL          0xc0011030
index 2e1caab..2d51fa8 100644 (file)
@@ -824,5 +824,8 @@ bool arch_is_retpoline(struct symbol *sym)
 
 bool arch_is_rethunk(struct symbol *sym)
 {
-       return !strcmp(sym->name, "__x86_return_thunk");
+       return !strcmp(sym->name, "__x86_return_thunk") ||
+              !strcmp(sym->name, "srso_untrain_ret") ||
+              !strcmp(sym->name, "srso_safe_ret") ||
+              !strcmp(sym->name, "__ret");
 }
index 4e62843..f4cb41e 100644 (file)
@@ -45,7 +45,6 @@
 
 static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd,
                                     struct thread *th, bool lock);
-static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip);
 
 static struct dso *machine__kernel_dso(struct machine *machine)
 {
@@ -2385,10 +2384,6 @@ static int add_callchain_ip(struct thread *thread,
        ms.maps = maps__get(al.maps);
        ms.map = map__get(al.map);
        ms.sym = al.sym;
-
-       if (!branch && append_inlines(cursor, &ms, ip) == 0)
-               goto out;
-
        srcline = callchain_srcline(&ms, al.addr);
        err = callchain_cursor_append(cursor, ip, &ms,
                                      branch, flags, nr_loop_iter,
index 7329b33..d45d5dc 100644 (file)
@@ -931,6 +931,11 @@ static bool should_skip_zero_counter(struct perf_stat_config *config,
         */
        if (config->aggr_mode == AGGR_THREAD && config->system_wide)
                return true;
+
+       /* Tool events have the software PMU but are only gathered on 1. */
+       if (evsel__is_tool(counter))
+               return true;
+
        /*
         * Skip value 0 when it's an uncore event and the given aggr id
         * does not belong to the PMU cpumask.
index a61c7bc..63f468b 100644 (file)
@@ -177,7 +177,7 @@ void regression1_test(void)
        nr_threads = 2;
        pthread_barrier_init(&worker_barrier, NULL, nr_threads);
 
-       threads = malloc(nr_threads * sizeof(pthread_t *));
+       threads = malloc(nr_threads * sizeof(*threads));
 
        for (i = 0; i < nr_threads; i++) {
                arg = i;
index b4f6f3a..5674a9d 100644 (file)
@@ -869,6 +869,77 @@ static void test_msg_redir_to_listening(struct test_sockmap_listen *skel,
        xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT);
 }
 
+static void redir_partial(int family, int sotype, int sock_map, int parser_map)
+{
+       int s, c0, c1, p0, p1;
+       int err, n, key, value;
+       char buf[] = "abc";
+
+       key = 0;
+       value = sizeof(buf) - 1;
+       err = xbpf_map_update_elem(parser_map, &key, &value, 0);
+       if (err)
+               return;
+
+       s = socket_loopback(family, sotype | SOCK_NONBLOCK);
+       if (s < 0)
+               goto clean_parser_map;
+
+       err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1);
+       if (err)
+               goto close_srv;
+
+       err = add_to_sockmap(sock_map, p0, p1);
+       if (err)
+               goto close;
+
+       n = xsend(c1, buf, sizeof(buf), 0);
+       if (n < sizeof(buf))
+               FAIL("incomplete write");
+
+       n = xrecv_nonblock(c0, buf, sizeof(buf), 0);
+       if (n != sizeof(buf) - 1)
+               FAIL("expect %zu, received %d", sizeof(buf) - 1, n);
+
+close:
+       xclose(c0);
+       xclose(p0);
+       xclose(c1);
+       xclose(p1);
+close_srv:
+       xclose(s);
+
+clean_parser_map:
+       key = 0;
+       value = 0;
+       xbpf_map_update_elem(parser_map, &key, &value, 0);
+}
+
+static void test_skb_redir_partial(struct test_sockmap_listen *skel,
+                                  struct bpf_map *inner_map, int family,
+                                  int sotype)
+{
+       int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+       int parser = bpf_program__fd(skel->progs.prog_stream_parser);
+       int parser_map = bpf_map__fd(skel->maps.parser_map);
+       int sock_map = bpf_map__fd(inner_map);
+       int err;
+
+       err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0);
+       if (err)
+               return;
+
+       err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0);
+       if (err)
+               goto detach;
+
+       redir_partial(family, sotype, sock_map, parser_map);
+
+       xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT);
+detach:
+       xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER);
+}
+
 static void test_reuseport_select_listening(int family, int sotype,
                                            int sock_map, int verd_map,
                                            int reuseport_prog)
@@ -1243,6 +1314,7 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
        } tests[] = {
                TEST(test_skb_redir_to_connected),
                TEST(test_skb_redir_to_listening),
+               TEST(test_skb_redir_partial),
                TEST(test_msg_redir_to_connected),
                TEST(test_msg_redir_to_listening),
        };
@@ -1432,7 +1504,7 @@ static void vsock_unix_redir_connectible(int sock_mapfd, int verd_mapfd,
        if (n < 1)
                goto out;
 
-       n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT);
+       n = xrecv_nonblock(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), 0);
        if (n < 0)
                FAIL("%s: recv() err, errno=%d", log_prefix, errno);
        if (n == 0)
index 325c9f1..464d35b 100644 (file)
@@ -28,12 +28,26 @@ struct {
        __type(value, unsigned int);
 } verdict_map SEC(".maps");
 
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, int);
+       __type(value, int);
+} parser_map SEC(".maps");
+
 bool test_sockmap = false; /* toggled by user-space */
 bool test_ingress = false; /* toggled by user-space */
 
 SEC("sk_skb/stream_parser")
 int prog_stream_parser(struct __sk_buff *skb)
 {
+       int *value;
+       __u32 key = 0;
+
+       value = bpf_map_lookup_elem(&parser_map, &key);
+       if (value && *value)
+               return *value;
+
        return skb->len;
 }
 
index 258ddc5..1b2cec9 100644 (file)
@@ -70,6 +70,10 @@ static int test_kmem_basic(const char *root)
                goto cleanup;
 
        cg_write(cg, "memory.high", "1M");
+
+       /* wait for RCU freeing */
+       sleep(1);
+
        slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
        if (slab1 <= 0)
                goto cleanup;
index 435aceb..380b691 100644 (file)
@@ -831,6 +831,7 @@ int main(int argc, char *argv[])
                                printf("Size must be greater than 0\n");
                                return KSFT_FAIL;
                        }
+                       break;
                case 't':
                        {
                                int tmp = atoi(optarg);
index 0f5e88c..df8d90b 100755 (executable)
@@ -1981,6 +1981,11 @@ basic()
 
        run_cmd "$IP link set dev lo up"
 
+       # Dump should not loop endlessly when maximum nexthop ID is configured.
+       run_cmd "$IP nexthop add id $((2**32-1)) blackhole"
+       run_cmd "timeout 5 $IP nexthop"
+       log_test $? 0 "Maximum nexthop ID dump"
+
        #
        # groups
        #
@@ -2201,6 +2206,11 @@ basic_res()
        run_cmd "$IP nexthop bucket list fdb"
        log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword"
 
+       # Dump should not loop endlessly when maximum nexthop ID is configured.
+       run_cmd "$IP nexthop add id $((2**32-1)) group 1/2 type resilient buckets 4"
+       run_cmd "timeout 5 $IP nexthop bucket"
+       log_test $? 0 "Maximum nexthop ID dump"
+
        #
        # resilient nexthop buckets get requests
        #
index ae3f946..d0c6c49 100755 (executable)
@@ -617,7 +617,7 @@ __cfg_test_port_ip_sg()
                grep -q "permanent"
        check_err $? "Entry not added as \"permanent\" when should"
        bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
-               grep -q "0.00"
+               grep -q " 0.00"
        check_err $? "\"permanent\" entry has a pending group timer"
        bridge mdb del dev br0 port $swp1 $grp_key vid 10
 
@@ -626,7 +626,7 @@ __cfg_test_port_ip_sg()
                grep -q "temp"
        check_err $? "Entry not added as \"temp\" when should"
        bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
-               grep -q "0.00"
+               grep -q " 0.00"
        check_fail $? "\"temp\" entry has an unpending group timer"
        bridge mdb del dev br0 port $swp1 $grp_key vid 10
 
@@ -659,7 +659,7 @@ __cfg_test_port_ip_sg()
                grep -q "permanent"
        check_err $? "Entry not marked as \"permanent\" after replace"
        bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
-               grep -q "0.00"
+               grep -q " 0.00"
        check_err $? "Entry has a pending group timer after replace"
 
        bridge mdb replace dev br0 port $swp1 $grp_key vid 10 temp
@@ -667,7 +667,7 @@ __cfg_test_port_ip_sg()
                grep -q "temp"
        check_err $? "Entry not marked as \"temp\" after replace"
        bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
-               grep -q "0.00"
+               grep -q " 0.00"
        check_fail $? "Entry has an unpending group timer after replace"
        bridge mdb del dev br0 port $swp1 $grp_key vid 10
 
@@ -850,6 +850,7 @@ cfg_test()
 __fwd_test_host_ip()
 {
        local grp=$1; shift
+       local dmac=$1; shift
        local src=$1; shift
        local mode=$1; shift
        local name
@@ -872,27 +873,27 @@ __fwd_test_host_ip()
        # Packet should only be flooded to multicast router ports when there is
        # no matching MDB entry. The bridge is not configured as a multicast
        # router port.
-       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
        tc_check_packets "dev br0 ingress" 1 0
        check_err $? "Packet locally received after flood"
 
        # Install a regular port group entry and expect the packet to not be
        # locally received.
        bridge mdb add dev br0 port $swp2 grp $grp temp vid 10
-       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
        tc_check_packets "dev br0 ingress" 1 0
        check_err $? "Packet locally received after installing a regular entry"
 
        # Add a host entry and expect the packet to be locally received.
        bridge mdb add dev br0 port br0 grp $grp temp vid 10
-       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
        tc_check_packets "dev br0 ingress" 1 1
        check_err $? "Packet not locally received after adding a host entry"
 
        # Remove the host entry and expect the packet to not be locally
        # received.
        bridge mdb del dev br0 port br0 grp $grp vid 10
-       $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
        tc_check_packets "dev br0 ingress" 1 1
        check_err $? "Packet locally received after removing a host entry"
 
@@ -905,8 +906,8 @@ __fwd_test_host_ip()
 
 fwd_test_host_ip()
 {
-       __fwd_test_host_ip "239.1.1.1" "192.0.2.1" "-4"
-       __fwd_test_host_ip "ff0e::1" "2001:db8:1::1" "-6"
+       __fwd_test_host_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "-4"
+       __fwd_test_host_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "-6"
 }
 
 fwd_test_host_l2()
@@ -966,6 +967,7 @@ fwd_test_host()
 __fwd_test_port_ip()
 {
        local grp=$1; shift
+       local dmac=$1; shift
        local valid_src=$1; shift
        local invalid_src=$1; shift
        local mode=$1; shift
@@ -999,43 +1001,43 @@ __fwd_test_port_ip()
                vlan_ethtype $eth_type vlan_id 10 dst_ip $grp \
                src_ip $invalid_src action drop
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 1 0
        check_err $? "Packet from valid source received on H2 before adding entry"
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 2 0
        check_err $? "Packet from invalid source received on H2 before adding entry"
 
        bridge mdb add dev br0 port $swp2 grp $grp vid 10 \
                filter_mode $filter_mode source_list $src_list
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 1 1
        check_err $? "Packet from valid source not received on H2 after adding entry"
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 2 0
        check_err $? "Packet from invalid source received on H2 after adding entry"
 
        bridge mdb replace dev br0 port $swp2 grp $grp vid 10 \
                filter_mode exclude
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 1 2
        check_err $? "Packet from valid source not received on H2 after allowing all sources"
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 2 1
        check_err $? "Packet from invalid source not received on H2 after allowing all sources"
 
        bridge mdb del dev br0 port $swp2 grp $grp vid 10
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 1 2
        check_err $? "Packet from valid source received on H2 after deleting entry"
 
-       $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+       $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
        tc_check_packets "dev $h2 ingress" 2 1
        check_err $? "Packet from invalid source received on H2 after deleting entry"
 
@@ -1047,11 +1049,11 @@ __fwd_test_port_ip()
 
 fwd_test_port_ip()
 {
-       __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "exclude"
-       __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \
+       __fwd_test_port_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "192.0.2.2" "-4" "exclude"
+       __fwd_test_port_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "2001:db8:1::2" "-6" \
                "exclude"
-       __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "include"
-       __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \
+       __fwd_test_port_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "192.0.2.2" "-4" "include"
+       __fwd_test_port_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "2001:db8:1::2" "-6" \
                "include"
 }
 
@@ -1127,7 +1129,7 @@ ctrl_igmpv3_is_in_test()
                filter_mode include source_list 192.0.2.1
 
        # IS_IN ( 192.0.2.2 )
-       $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
+       $MZ $h1.10 -c 1 -a own -b 01:00:5e:01:01:01 -A 192.0.2.1 -B 239.1.1.1 \
                -t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q
 
        bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -q 192.0.2.2
@@ -1140,7 +1142,7 @@ ctrl_igmpv3_is_in_test()
                filter_mode include source_list 192.0.2.1
 
        # IS_IN ( 192.0.2.2 )
-       $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
+       $MZ $h1.10 -a own -b 01:00:5e:01:01:01 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
                -t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q
 
        bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -v "src" | \
@@ -1167,7 +1169,7 @@ ctrl_mldv2_is_in_test()
 
        # IS_IN ( 2001:db8:1::2 )
        local p=$(mldv2_is_in_get fe80::1 ff0e::1 2001:db8:1::2)
-       $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
+       $MZ -6 $h1.10 -a own -b 33:33:00:00:00:01 -c 1 -A fe80::1 -B ff0e::1 \
                -t ip hop=1,next=0,p="$p" -q
 
        bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | \
@@ -1181,7 +1183,7 @@ ctrl_mldv2_is_in_test()
                filter_mode include source_list 2001:db8:1::1
 
        # IS_IN ( 2001:db8:1::2 )
-       $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
+       $MZ -6 $h1.10 -a own -b 33:33:00:00:00:01 -c 1 -A fe80::1 -B ff0e::1 \
                -t ip hop=1,next=0,p="$p" -q
 
        bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | grep -v "src" | \
@@ -1206,6 +1208,11 @@ ctrl_test()
        ctrl_mldv2_is_in_test
 }
 
+if ! bridge mdb help 2>&1 | grep -q "replace"; then
+       echo "SKIP: iproute2 too old, missing bridge mdb replace support"
+       exit $ksft_skip
+fi
+
 trap cleanup EXIT
 
 setup_prepare
index ae255b6..3da9d93 100755 (executable)
@@ -252,7 +252,8 @@ ctl4_entries_add()
        local IPs=$(seq -f 192.0.2.%g 1 $((n - 1)))
        local peer=$(locus_dev_peer $locus)
        local GRP=239.1.1.${grp}
-       $MZ $peer -c 1 -A 192.0.2.1 -B $GRP \
+       local dmac=01:00:5e:01:01:$(printf "%02x" $grp)
+       $MZ $peer -a own -b $dmac -c 1 -A 192.0.2.1 -B $GRP \
                -t ip proto=2,p=$(igmpv3_is_in_get $GRP $IPs) -q
        sleep 1
 
@@ -272,7 +273,8 @@ ctl4_entries_del()
 
        local peer=$(locus_dev_peer $locus)
        local GRP=239.1.1.${grp}
-       $MZ $peer -c 1 -A 192.0.2.1 -B 224.0.0.2 \
+       local dmac=01:00:5e:00:00:02
+       $MZ $peer -a own -b $dmac -c 1 -A 192.0.2.1 -B 224.0.0.2 \
                -t ip proto=2,p=$(igmpv2_leave_get $GRP) -q
        sleep 1
        ! bridge mdb show dev br0 | grep -q $GRP
@@ -289,8 +291,10 @@ ctl6_entries_add()
        local peer=$(locus_dev_peer $locus)
        local SIP=fe80::1
        local GRP=ff0e::${grp}
+       local dmac=33:33:00:00:00:$(printf "%02x" $grp)
        local p=$(mldv2_is_in_get $SIP $GRP $IPs)
-       $MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q
+       $MZ -6 $peer -a own -b $dmac -c 1 -A $SIP -B $GRP \
+               -t ip hop=1,next=0,p="$p" -q
        sleep 1
 
        local nn=$(bridge mdb show dev br0 | grep $GRP | wc -l)
@@ -310,8 +314,10 @@ ctl6_entries_del()
        local peer=$(locus_dev_peer $locus)
        local SIP=fe80::1
        local GRP=ff0e::${grp}
+       local dmac=33:33:00:00:00:$(printf "%02x" $grp)
        local p=$(mldv1_done_get $SIP $GRP)
-       $MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q
+       $MZ -6 $peer -a own -b $dmac -c 1 -A $SIP -B $GRP \
+               -t ip hop=1,next=0,p="$p" -q
        sleep 1
        ! bridge mdb show dev br0 | grep -q $GRP
 }
@@ -1328,6 +1334,11 @@ test_8021qvs()
        switch_destroy
 }
 
+if ! bridge link help 2>&1 | grep -q "mcast_max_groups"; then
+       echo "SKIP: iproute2 too old, missing bridge \"mcast_max_groups\" support"
+       exit $ksft_skip
+fi
+
 trap cleanup EXIT
 
 setup_prepare
index dbb9fcf..aa2eafb 100755 (executable)
@@ -286,6 +286,8 @@ different_speeds_autoneg_on()
        ethtool -s $h1 autoneg on
 }
 
+skip_on_veth
+
 trap cleanup EXIT
 
 setup_prepare
index c580ad6..39e736f 100755 (executable)
@@ -258,11 +258,6 @@ h2_destroy()
 
 setup_prepare()
 {
-       check_ethtool_mm_support
-       check_tc_fp_support
-       require_command lldptool
-       bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually"
-
        h1=${NETIFS[p1]}
        h2=${NETIFS[p2]}
 
@@ -278,6 +273,19 @@ cleanup()
        h1_destroy
 }
 
+check_ethtool_mm_support
+check_tc_fp_support
+require_command lldptool
+bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually"
+
+for netif in ${NETIFS[@]}; do
+       ethtool --show-mm $netif 2>&1 &> /dev/null
+       if [[ $? -ne 0 ]]; then
+               echo "SKIP: $netif does not support MAC Merge"
+               exit $ksft_skip
+       fi
+done
+
 trap cleanup EXIT
 
 setup_prepare
index eb9ec4a..7594bbb 100755 (executable)
@@ -99,6 +99,8 @@ test_stats_rx()
        test_stats g2a rx
 }
 
+skip_on_veth
+
 trap cleanup EXIT
 
 setup_prepare
index 9f5b3e2..49fa94b 100755 (executable)
@@ -14,6 +14,8 @@ ALL_TESTS="
 NUM_NETIFS=4
 source lib.sh
 
+require_command $TROUTE6
+
 h1_create()
 {
        simple_if_init $h1 2001:1:1::2/64
index 9ddb68d..f69015b 100755 (executable)
@@ -30,6 +30,7 @@ REQUIRE_MZ=${REQUIRE_MZ:=yes}
 REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no}
 STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
 TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=}
+TROUTE6=${TROUTE6:=traceroute6}
 
 relative_path="${BASH_SOURCE%/*}"
 if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
@@ -163,6 +164,17 @@ check_port_mab_support()
        fi
 }
 
+skip_on_veth()
+{
+       local kind=$(ip -j -d link show dev ${NETIFS[p1]} |
+               jq -r '.[].linkinfo.info_kind')
+
+       if [[ $kind == veth ]]; then
+               echo "SKIP: Test cannot be run with veth pairs"
+               exit $ksft_skip
+       fi
+}
+
 if [[ "$(id -u)" -ne 0 ]]; then
        echo "SKIP: need root privileges"
        exit $ksft_skip
@@ -225,6 +237,11 @@ create_netif_veth()
        for ((i = 1; i <= NUM_NETIFS; ++i)); do
                local j=$((i+1))
 
+               if [ -z ${NETIFS[p$i]} ]; then
+                       echo "SKIP: Cannot create interface. Name not specified"
+                       exit $ksft_skip
+               fi
+
                ip link show dev ${NETIFS[p$i]} &> /dev/null
                if [[ $? -ne 0 ]]; then
                        ip link add ${NETIFS[p$i]} type veth \
diff --git a/tools/testing/selftests/net/forwarding/settings b/tools/testing/selftests/net/forwarding/settings
new file mode 100644 (file)
index 0000000..e7b9417
--- /dev/null
@@ -0,0 +1 @@
+timeout=0
index a96cff8..b0f5e55 100755 (executable)
@@ -9,6 +9,8 @@ NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
 
+require_command ncat
+
 tcflags="skip_hw"
 
 h1_create()
@@ -220,9 +222,9 @@ mirred_egress_to_ingress_tcp_test()
                ip_proto icmp \
                        action drop
 
-       ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2  &
+       ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 &
        local rpid=$!
-       ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1
+       ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1
        wait -n $rpid
        cmp -s $mirred_e2i_tf1 $mirred_e2i_tf2
        check_err $? "server output check failed"
index 683711f..b1daad1 100755 (executable)
@@ -52,8 +52,8 @@ match_dst_mac_test()
        tc_check_packets "dev $h2 ingress" 101 1
        check_fail $? "Matched on a wrong filter"
 
-       tc_check_packets "dev $h2 ingress" 102 1
-       check_err $? "Did not match on correct filter"
+       tc_check_packets "dev $h2 ingress" 102 0
+       check_fail $? "Did not match on correct filter"
 
        tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
        tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
@@ -78,8 +78,8 @@ match_src_mac_test()
        tc_check_packets "dev $h2 ingress" 101 1
        check_fail $? "Matched on a wrong filter"
 
-       tc_check_packets "dev $h2 ingress" 102 1
-       check_err $? "Did not match on correct filter"
+       tc_check_packets "dev $h2 ingress" 102 0
+       check_fail $? "Did not match on correct filter"
 
        tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
        tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
index e22c2d2..20a7cb7 100755 (executable)
@@ -127,6 +127,7 @@ test_l2_miss_multicast_common()
        local proto=$1; shift
        local sip=$1; shift
        local dip=$1; shift
+       local dmac=$1; shift
        local mode=$1; shift
        local name=$1; shift
 
@@ -142,7 +143,7 @@ test_l2_miss_multicast_common()
           action pass
 
        # Before adding MDB entry.
-       $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q
+       $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q
 
        tc_check_packets "dev $swp2 egress" 101 1
        check_err $? "Unregistered multicast filter was not hit before adding MDB entry"
@@ -153,7 +154,7 @@ test_l2_miss_multicast_common()
        # Adding MDB entry.
        bridge mdb replace dev br1 port $swp2 grp $dip permanent
 
-       $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q
+       $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q
 
        tc_check_packets "dev $swp2 egress" 101 1
        check_err $? "Unregistered multicast filter was hit after adding MDB entry"
@@ -164,7 +165,7 @@ test_l2_miss_multicast_common()
        # Deleting MDB entry.
        bridge mdb del dev br1 port $swp2 grp $dip
 
-       $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q
+       $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q
 
        tc_check_packets "dev $swp2 egress" 101 2
        check_err $? "Unregistered multicast filter was not hit after deleting MDB entry"
@@ -183,10 +184,11 @@ test_l2_miss_multicast_ipv4()
        local proto="ipv4"
        local sip=192.0.2.1
        local dip=239.1.1.1
+       local dmac=01:00:5e:01:01:01
        local mode="-4"
        local name="IPv4"
 
-       test_l2_miss_multicast_common $proto $sip $dip $mode $name
+       test_l2_miss_multicast_common $proto $sip $dip $dmac $mode $name
 }
 
 test_l2_miss_multicast_ipv6()
@@ -194,10 +196,11 @@ test_l2_miss_multicast_ipv6()
        local proto="ipv6"
        local sip=2001:db8:1::1
        local dip=ff0e::1
+       local dmac=33:33:00:00:00:01
        local mode="-6"
        local name="IPv6"
 
-       test_l2_miss_multicast_common $proto $sip $dip $mode $name
+       test_l2_miss_multicast_common $proto $sip $dip $dmac $mode $name
 }
 
 test_l2_miss_multicast()
index 5ac184d..5a5dd90 100755 (executable)
@@ -104,11 +104,14 @@ tunnel_key_nofrag_test()
        local i
 
        tc filter add dev $swp1 ingress protocol ip pref 100 handle 100 \
-               flower ip_flags nofrag action drop
+               flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \
+               ip_flags nofrag action drop
        tc filter add dev $swp1 ingress protocol ip pref 101 handle 101 \
-               flower ip_flags firstfrag action drop
+               flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \
+               ip_flags firstfrag action drop
        tc filter add dev $swp1 ingress protocol ip pref 102 handle 102 \
-               flower ip_flags nofirstfrag action drop
+               flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \
+               ip_flags nofirstfrag action drop
 
        # test 'nofrag' set
        tc filter add dev h1-et egress protocol all pref 1 handle 1 matchall $tcflags \
index 3c2096a..d01b73a 100755 (executable)
@@ -705,6 +705,7 @@ pm_nl_del_endpoint()
        local addr=$3
 
        if [ $ip_mptcp -eq 1 ]; then
+               [ $id -ne 0 ] && addr=''
                ip -n $ns mptcp endpoint delete id $id $addr
        else
                ip netns exec $ns ./pm_nl_ctl del $id $addr
@@ -795,10 +796,11 @@ pm_nl_check_endpoint()
        fi
 
        if [ $ip_mptcp -eq 1 ]; then
+               # get line and trim trailing whitespace
                line=$(ip -n $ns mptcp endpoint show $id)
+               line="${line% }"
                # the dump order is: address id flags port dev
-               expected_line="$addr"
-               [ -n "$addr" ] && expected_line="$expected_line $addr"
+               [ -n "$addr" ] && expected_line="$addr"
                expected_line="$expected_line $id"
                [ -n "$_flags" ] && expected_line="$expected_line ${_flags//","/" "}"
                [ -n "$dev" ] && expected_line="$expected_line $dev"
index dfe3d28..f838dd3 100755 (executable)
@@ -361,6 +361,7 @@ err_buf=
 tcpdump_pids=
 nettest_pids=
 socat_pids=
+tmpoutfile=
 
 err() {
        err_buf="${err_buf}${1}
@@ -951,6 +952,7 @@ cleanup() {
        ip link del veth_A-R1                   2>/dev/null
        ovs-vsctl --if-exists del-port vxlan_a  2>/dev/null
        ovs-vsctl --if-exists del-br ovs_br0    2>/dev/null
+       rm -f "$tmpoutfile"
 }
 
 mtu() {
@@ -1328,6 +1330,39 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() {
        check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on bridged ${type} interface"
        pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
        check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on locally bridged ${type} interface"
+
+       tmpoutfile=$(mktemp)
+
+       # Flush Exceptions, retry with TCP
+       run_cmd ${ns_a} ip route flush cached ${dst}
+       run_cmd ${ns_b} ip route flush cached ${dst}
+       run_cmd ${ns_c} ip route flush cached ${dst}
+
+       for target in "${ns_a}" "${ns_c}" ; do
+               if [ ${family} -eq 4 ]; then
+                       TCPDST=TCP:${dst}:50000
+               else
+                       TCPDST="TCP:[${dst}]:50000"
+               fi
+               ${ns_b} socat -T 3 -u -6 TCP-LISTEN:50000 STDOUT > $tmpoutfile &
+
+               sleep 1
+
+               dd if=/dev/zero of=/dev/stdout status=none bs=1M count=1 | ${target} socat -T 3 -u STDIN $TCPDST,connect-timeout=3
+
+               size=$(du -sb $tmpoutfile)
+               size=${size%%/tmp/*}
+
+               [ $size -ne 1048576 ] && err "File size $size does not match expected value in locally bridged vxlan test" && return 1
+       done
+
+       rm -f "$tmpoutfile"
+
+       # Check that exceptions were created
+       pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
+       check_pmtu_value ${exp_mtu} "${pmtu}" "tcp: exceeding link layer MTU on bridged ${type} interface"
+       pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+       check_pmtu_value ${exp_mtu} "${pmtu}" "tcp exceeding link layer MTU on locally bridged ${type} interface"
 }
 
 test_pmtu_ipv4_br_vxlan4_exception() {
index b357ba2..7a957c7 100644 (file)
@@ -4,8 +4,10 @@ ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
 CLANG_FLAGS += -no-integrated-as
 endif
 
+top_srcdir = ../../../..
+
 CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -L$(OUTPUT) -Wl,-rpath=./ \
-         $(CLANG_FLAGS)
+         $(CLANG_FLAGS) -I$(top_srcdir)/tools/include
 LDLIBS += -lpthread -ldl
 
 # Own dependencies because we only want to build against 1st prerequisite, but
index a723da2..96e812b 100644 (file)
@@ -31,6 +31,8 @@
 #include <sys/auxv.h>
 #include <linux/auxvec.h>
 
+#include <linux/compiler.h>
+
 #include "../kselftest.h"
 #include "rseq.h"